-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathtranscriptor.py
More file actions
46 lines (43 loc) · 1.47 KB
/
transcriptor.py
File metadata and controls
46 lines (43 loc) · 1.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
"""
This module extracts word onset times from recording files.
whisper_approx_char offers a faster but coarser estimate; align_chinese gives more accurate timestamps.
"""
import re
from faster_whisper import WhisperModel
from funasr import AutoModel
### For Faster extraction
def whisper_approx_char(audio_path, model_size="tiny", device="cuda", compute_type=None):
    """Estimate per-character onset times from faster-whisper word timestamps.

    Every recognised word is reduced to its CJK ideographs (U+4E00-U+9FFF),
    ASCII letters and digits.  The word's time span is then split evenly
    among the remaining characters, so the result is only a coarse estimate:
    characters inside one word are assumed to be equally long.

    Args:
        audio_path: Audio input forwarded to ``WhisperModel.transcribe``.
        model_size: Model name or path forwarded to ``WhisperModel``.
        device: Device forwarded to ``WhisperModel``.
        compute_type: Compute type forwarded to ``WhisperModel``.  When
            ``None`` (the default), ``"int8"`` is used for ``device="cpu"``
            and ``"float16"`` for every other device, which keeps the
            previous behaviour for GPU callers.

    Returns:
        A flat list with one onset time per kept character, in the same time
        unit as the word timestamps reported by the model.
    """
    if compute_type is None:
        # float16 was previously hard-coded; the CTranslate2 backend rejects
        # it on CPU, so fall back to the CPU setting shown in the
        # faster-whisper README.  NOTE(review): confirm against the
        # installed faster-whisper / CTranslate2 version.
        compute_type = "int8" if device == "cpu" else "float16"

    model = WhisperModel(model_size, device=device, compute_type=compute_type)
    segments, _ = model.transcribe(audio_path, word_timestamps=True, language="zh")

    # Matches any character that is NOT a Chinese character, letter, or
    # number.  Compiled once instead of once per word.
    non_content = re.compile(r'[^\u4e00-\u9fffa-zA-Z0-9]')

    onsets = []
    for seg in segments:
        # Guard against a segment that carries no word list.
        for word in seg.words or ():
            clean_text = non_content.sub('', word.word.strip())
            if not clean_text:
                # Pure punctuation / whitespace: nothing to time-stamp.
                continue
            n = len(clean_text)
            step = (word.end - word.start) / n
            # Spread the characters uniformly over the word's duration.
            onsets.extend(word.start + i * step for i in range(n))
    return onsets
### For accurate timestamp
def align_chinese(audio_path: str, device: str = "cpu"):
    """Return onset times for one recording using a FunASR pipeline.

    Builds an ``AutoModel`` from the "paraformer-zh" model with the
    "fsmn-vad" VAD model and the "ct-punc" punctuation model, runs it on
    ``audio_path`` with alignment requested, and reads the ``"timestamp"``
    entries of the first result.

    Args:
        audio_path: Audio input forwarded to ``AutoModel.generate``.
        device: Device forwarded to ``AutoModel``.

    Returns:
        A list holding the first element of every timestamp entry divided
        by 1000 (presumably start times converted from milliseconds to
        seconds; verify against the FunASR output format).
    """
    aligner = AutoModel(
        model="paraformer-zh",
        vad_model="fsmn-vad",
        punc_model="ct-punc",
        device=device,
    )
    results = aligner.generate(
        input=audio_path,
        output_dir=None,
        param_dict={"align": True},
    )
    # Only the first result is used; each entry's leading value is scaled
    # by 1/1000.
    first_result = results[0]
    return [stamp[0] / 1000 for stamp in first_result["timestamp"]]