HackathonCOSN2025/transcriptor.py at main · 10eit/HackathonCOSN2025 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
"""
This module extracts the exact word onset times from each recording file.
whisper_approx_char offers a faster but coarser estimate.

"""
import re
from faster_whisper import WhisperModel
from funasr import AutoModel

### For Faster extraction
def whisper_approx_char(audio_path, model_size="tiny", device="cuda"):
    """Estimate per-character onset times from faster-whisper word timestamps.

    The audio is transcribed with ``language="zh"`` and word-level timestamps.
    Each word is reduced to its CJK ideographs (U+4E00-U+9FFF), ASCII letters
    and digits, and the word's time span is split evenly among the remaining
    characters. Onsets inside a multi-character word are therefore
    interpolated, not measured.

    Args:
        audio_path: Audio input forwarded to ``WhisperModel.transcribe``.
        model_size: Model size (or path) forwarded to ``WhisperModel``.
        device: Device forwarded to ``WhisperModel``.

    Returns:
        A list with one onset per retained character, in transcription order,
        in the same unit as the word timestamps (seconds, per the
        faster-whisper documentation).
    """
    # The CTranslate2 backend rejects an explicit float16 request on CPU, so
    # use int8 there; every other device keeps the original float16 setting.
    compute_type = "int8" if device == "cpu" else "float16"
    model = WhisperModel(model_size, device=device, compute_type=compute_type)
    segments, _info = model.transcribe(audio_path, word_timestamps=True, language="zh")

    # Matches any character that is NOT a Chinese character, letter, or number.
    non_char = re.compile(r'[^\u4e00-\u9fffa-zA-Z0-9]')

    onsets = []
    for seg in segments:
        # `words` may be None when a segment carries no word-level timing.
        for word in seg.words or ():
            clean_text = non_char.sub('', word.word.strip())
            if not clean_text:
                # Pure punctuation/whitespace token: contributes no onsets.
                continue
            n = len(clean_text)
            step = (word.end - word.start) / n
            onsets.extend(word.start + i * step for i in range(n))
    return onsets

### For accurate timestamp
def align_chinese(audio_path: str, device: str = "cpu"):
    """Return onset times for a Chinese recording using FunASR timestamps.

    Builds a FunASR ``AutoModel`` pipeline (``paraformer-zh`` recognizer,
    ``fsmn-vad`` voice-activity detection, ``ct-punc`` punctuation), runs it
    on ``audio_path`` and converts the first element of every entry in the
    first result's ``"timestamp"`` list by dividing it by 1000.

    Args:
        audio_path: Path of the audio file passed as ``input`` to
            ``model.generate``.
        device: Device string forwarded to ``AutoModel``.

    Returns:
        A list of onset times (timestamp start divided by 1000; presumably
        milliseconds converted to seconds -- confirm against FunASR docs).
        Empty when the recognizer yields no result or no timestamps.
    """
    model = AutoModel(
        model="paraformer-zh",
        vad_model="fsmn-vad",
        punc_model="ct-punc",
        device=device
    )
    res = model.generate(
        input=audio_path,
        output_dir=None,
        param_dict={"align": True}
    )
    # Nothing recognized (e.g. silent audio): report no onsets instead of
    # failing on res[0] / the missing "timestamp" key.
    if not res:
        return []
    timestamps = res[0].get("timestamp") or []
    # Only the start of each timestamp entry is needed for the onset.
    return [item[0] / 1000 for item in timestamps]