diff --git a/requirements.txt b/requirements.txt index eb56c39..15e5118 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ numpy>=1.24.0 customtkinter>=5.2.0 pystray>=0.19.5 Pillow>=10.0.0 +transformers>=4.23.0 diff --git a/src/hearsay/app.py b/src/hearsay/app.py index a7b78ba..4bd7552 100644 --- a/src/hearsay/app.py +++ b/src/hearsay/app.py @@ -254,7 +254,7 @@ def _teardown_recording( for seg in result.segments: from hearsay.output.formatter import format_timestamp ts = format_timestamp( - result.chunk_index * 30 + seg["start"] + result.start_time + seg["start"] ) safe_after(self._root, 0, lambda t=f"[{ts}] {seg['text']}": ( @@ -307,7 +307,7 @@ def _poll_transcripts(self) -> None: for seg in result.segments: from hearsay.output.formatter import format_timestamp ts = format_timestamp( - result.chunk_index * 30 + seg["start"] + result.start_time + seg["start"] ) self._live_view.append_text(f"[{ts}] {seg['text']}") except queue.Empty: diff --git a/src/hearsay/audio/recorder.py b/src/hearsay/audio/recorder.py index 845b31c..bdcc3a3 100644 --- a/src/hearsay/audio/recorder.py +++ b/src/hearsay/audio/recorder.py @@ -4,7 +4,6 @@ import logging import queue -import time import numpy as np @@ -14,20 +13,121 @@ AUDIO_SOURCE_BOTH, AUDIO_SOURCE_MIC, AUDIO_SOURCE_SYSTEM, - CHUNK_DURATION_S, + MAX_CHUNK_DURATION_S, + MIN_CHUNK_DURATION_S, OVERLAP_DURATION_S, SAMPLE_RATE, + SILENCE_DURATION_S, + SILENCE_RMS_THRESHOLD, ) from hearsay.utils.threading_utils import StoppableThread log = logging.getLogger(__name__) +class _ChunkAccumulator: + """Accumulates mono 16 kHz float32 audio and decides chunk boundaries. + + A chunk becomes ready when either: + * the buffer reaches ``MAX_CHUNK_DURATION_S`` (hard cap), or + * at least ``MIN_CHUNK_DURATION_S`` has accumulated AND the trailing + ``SILENCE_DURATION_S`` of audio is near-silent. + + Consecutive chunks share ``OVERLAP_DURATION_S`` of audio so the + transcription pipeline can stitch words across boundaries. Each emitted + chunk carries its absolute start time (seconds from the start of the + recording), so downstream timestamps stay correct despite variable lengths. + """ + + def __init__(self) -> None: + self._buffer: list[np.ndarray] = [] + self._total = 0 # samples currently buffered + self._silence_run = 0 # consecutive trailing near-silent samples + self._start_sample = 0 # absolute index of buffer[0] in the recording + self.chunk_index = 0 + + self._min = int(MIN_CHUNK_DURATION_S * SAMPLE_RATE) + self._max = int(MAX_CHUNK_DURATION_S * SAMPLE_RATE) + self._silence_needed = int(SILENCE_DURATION_S * SAMPLE_RATE) + self._overlap = int(OVERLAP_DURATION_S * SAMPLE_RATE) + + def add(self, mono: np.ndarray, silent: bool | None = None) -> None: + """Append a mono frame, updating the trailing-silence run. + + If *silent* is None, silence is computed from this frame's RMS. + Callers mixing multiple sources (Both mode) pass an explicit flag. + """ + if mono is None or len(mono) == 0: + return + self._buffer.append(mono) + self._total += len(mono) + + if silent is None: + rms = float(np.sqrt(np.mean(mono ** 2))) + silent = rms < SILENCE_RMS_THRESHOLD + + if silent: + self._silence_run += len(mono) + else: + self._silence_run = 0 + + def ready(self) -> bool: + """True when the current buffer should be emitted as a chunk.""" + if self._total >= self._max: + return True + return self._total >= self._min and self._silence_run >= self._silence_needed + + def pop(self) -> tuple[int, float, np.ndarray]: + """Emit a chunk and retain the overlap tail. Returns (index, start_s, audio).""" + data = np.concatenate(self._buffer) + emitted_len = min(len(data), self._max) + chunk = data[:emitted_len] + start_time = self._start_sample / SAMPLE_RATE + idx = self.chunk_index + + # Advance by the unique (non-overlapping) audio we just consumed. + advance = max(0, emitted_len - self._overlap) + self._start_sample += advance + + if self._overlap > 0: + leftover = data[emitted_len - self._overlap:] + else: + leftover = data[emitted_len:] + self._buffer = [leftover] if len(leftover) else [] + self._total = int(len(leftover)) + self._silence_run = 0 + self.chunk_index += 1 + return idx, start_time, chunk + + def flush(self) -> tuple[int, float, np.ndarray] | None: + """Emit whatever remains (if > 1s) when recording stops.""" + if self._total <= SAMPLE_RATE: # less than 1 second — discard + return None + data = np.concatenate(self._buffer) + start_time = self._start_sample / SAMPLE_RATE + idx = self.chunk_index + self._buffer = [] + self._total = 0 + self.chunk_index += 1 + return idx, start_time, data + + +def _rms(mono: np.ndarray) -> float: + """Root-mean-square level of a mono float32 frame.""" + if mono is None or len(mono) == 0: + return 0.0 + return float(np.sqrt(np.mean(mono ** 2))) + + class AudioRecorder(StoppableThread): - """Record audio and push 30-second chunks to a queue. + """Record audio and push variable-length chunks to a queue. + + Each queue item is a ``(chunk_index, start_time_s, np.ndarray)`` tuple, + where ``start_time_s`` is the chunk's absolute offset from the start of the + recording. Args: - audio_queue: Queue to push (chunk_index, np.ndarray) tuples. + audio_queue: Queue to push chunks to. source: One of 'system', 'microphone', 'both'. loopback_device_index: PyAudioWPatch device index for loopback. mic_device_index: sounddevice device index for mic. @@ -108,32 +208,16 @@ def _record_mic(self) -> None: """Record microphone via sounddevice.""" import sounddevice as sd - buffer: list[np.ndarray] = [] - chunk_samples = int(CHUNK_DURATION_S * SAMPLE_RATE) - overlap_samples = int(OVERLAP_DURATION_S * SAMPLE_RATE) - chunk_index = 0 + acc = _ChunkAccumulator() def callback(indata: np.ndarray, frames: int, time_info: object, status: object) -> None: - nonlocal chunk_index mono = resample(indata.copy(), self.mic_rate, self.mic_channels) - buffer.append(mono) - - total = sum(len(b) for b in buffer) - if total >= chunk_samples: - chunk = np.concatenate(buffer)[:chunk_samples] - self.audio_queue.put((chunk_index, chunk)) - chunk_index += 1 - # Keep overlap - if overlap_samples > 0: - leftover = np.concatenate(buffer)[chunk_samples - overlap_samples:] - buffer.clear() - buffer.append(leftover) - else: - buffer.clear() - - device = self.mic_device_index + acc.add(mono) + if acc.ready(): + self.audio_queue.put(acc.pop()) + with sd.InputStream( - device=device, + device=self.mic_device_index, samplerate=self.mic_rate, channels=self.mic_channels, dtype="float32", @@ -142,11 +226,9 @@ def callback(indata: np.ndarray, frames: int, time_info: object, status: object) while not self.stopped(): self.wait(timeout=0.5) - # Flush remaining audio - if buffer: - chunk = np.concatenate(buffer) - if len(chunk) > SAMPLE_RATE: # Only if > 1 second - self.audio_queue.put((chunk_index, chunk)) + final = acc.flush() + if final is not None: + self.audio_queue.put(final) def _record_both(self) -> None: """Record both loopback and mic, mix them. @@ -156,7 +238,8 @@ def _record_both(self) -> None: occurs when PyAudioWPatch and sounddevice run on the same thread. The mic stream uses PyAudio's callback mode so it accumulates data asynchronously while the main loop drives off blocking loopback - reads. + reads. Chunk boundaries are decided on the *combined* activity, so a + chunk is only cut when both sources fall silent. """ import pyaudiowpatch as pyaudio @@ -230,10 +313,15 @@ def mic_callback(in_data, frame_count, time_info, status_flags): mic_stream.start_stream() # --- Main loop (driven by blocking loopback reads) --- - chunk_samples = int(CHUNK_DURATION_S * SAMPLE_RATE) - overlap_samples = int(OVERLAP_DURATION_S * SAMPLE_RATE) - loopback_buf: list[np.ndarray] = [] - chunk_index = 0 + acc = _ChunkAccumulator() + + def mix_with_mic(lb_chunk: np.ndarray) -> np.ndarray: + if not mic_buffer: + return lb_chunk + mic_chunk = np.concatenate(mic_buffer)[:len(lb_chunk)] + if len(mic_chunk) < len(lb_chunk): + mic_chunk = np.pad(mic_chunk, (0, len(lb_chunk) - len(mic_chunk))) + return mix_streams(lb_chunk, mic_chunk) while not self.stopped(): try: @@ -241,49 +329,24 @@ def mic_callback(in_data, frame_count, time_info, status_flags): except Exception: break audio = np.frombuffer(raw, dtype=np.int16) - mono = resample(audio, self.loopback_rate, self.loopback_channels) - loopback_buf.append(mono) - - total = sum(len(b) for b in loopback_buf) - if total >= chunk_samples: - lb_chunk = np.concatenate(loopback_buf)[:chunk_samples] - mic_samples = sum(len(b) for b in mic_buffer) - log.debug( - "Mixing chunk %d: loopback=%d mic=%d samples", - chunk_index, len(lb_chunk), mic_samples, - ) - - if mic_buffer: - mic_chunk = np.concatenate(mic_buffer)[:chunk_samples] - if len(mic_chunk) < chunk_samples: - mic_chunk = np.pad(mic_chunk, (0, chunk_samples - len(mic_chunk))) - mixed = mix_streams(lb_chunk, mic_chunk) - else: - mixed = lb_chunk - - self.audio_queue.put((chunk_index, mixed)) - chunk_index += 1 - - if overlap_samples > 0: - leftover = np.concatenate(loopback_buf)[chunk_samples - overlap_samples:] - loopback_buf.clear() - loopback_buf.append(leftover) - else: - loopback_buf.clear() + lb_mono = resample(audio, self.loopback_rate, self.loopback_channels) + + # Combined silence: silent only when both sources are quiet. + # The latest mic frame approximates current mic activity. + mic_silent = _rms(mic_buffer[-1]) < SILENCE_RMS_THRESHOLD if mic_buffer else True + silent = (_rms(lb_mono) < SILENCE_RMS_THRESHOLD) and mic_silent + + acc.add(lb_mono, silent=silent) + if acc.ready(): + idx, start_time, lb_chunk = acc.pop() + self.audio_queue.put((idx, start_time, mix_with_mic(lb_chunk))) mic_buffer.clear() # --- Flush remaining audio --- - if loopback_buf: - lb_chunk = np.concatenate(loopback_buf) - if len(lb_chunk) > SAMPLE_RATE: # Only if > 1 second - if mic_buffer: - mic_chunk = np.concatenate(mic_buffer)[:len(lb_chunk)] - if len(mic_chunk) < len(lb_chunk): - mic_chunk = np.pad(mic_chunk, (0, len(lb_chunk) - len(mic_chunk))) - mixed = mix_streams(lb_chunk, mic_chunk) - else: - mixed = lb_chunk - self.audio_queue.put((chunk_index, mixed)) + final = acc.flush() + if final is not None: + idx, start_time, lb_chunk = final + self.audio_queue.put((idx, start_time, mix_with_mic(lb_chunk))) mic_stream.stop_stream() mic_stream.close() @@ -298,11 +361,8 @@ def _chunk_loop( sr: int, channels: int, ) -> None: - """Generic chunking loop for loopback-style streams.""" - chunk_samples = int(CHUNK_DURATION_S * SAMPLE_RATE) - overlap_samples = int(OVERLAP_DURATION_S * SAMPLE_RATE) - buffer: list[np.ndarray] = [] - chunk_index = 0 + """Generic chunking loop for loopback-style (blocking-read) streams.""" + acc = _ChunkAccumulator() while not self.stopped(): try: @@ -311,25 +371,18 @@ def _chunk_loop( break audio = np.frombuffer(raw, dtype=np.int16) mono = resample(audio, sr, channels) - buffer.append(mono) - - total = sum(len(b) for b in buffer) - if total >= chunk_samples: - chunk = np.concatenate(buffer)[:chunk_samples] - self.audio_queue.put((chunk_index, chunk)) - chunk_index += 1 - log.debug("Audio chunk %d queued (%d samples)", chunk_index - 1, len(chunk)) - - if overlap_samples > 0: - leftover = np.concatenate(buffer)[chunk_samples - overlap_samples:] - buffer.clear() - buffer.append(leftover) - else: - buffer.clear() - - # Flush remaining audio - if buffer: - chunk = np.concatenate(buffer) - if len(chunk) > SAMPLE_RATE: # Only if > 1 second - self.audio_queue.put((chunk_index, chunk)) - log.debug("Final audio chunk %d queued (%d samples)", chunk_index, len(chunk)) + acc.add(mono) + + if acc.ready(): + idx, start_time, chunk = acc.pop() + self.audio_queue.put((idx, start_time, chunk)) + log.debug( + "Audio chunk %d queued (%d samples, t=%.1fs)", + idx, len(chunk), start_time, + ) + + final = acc.flush() + if final is not None: + idx, start_time, chunk = final + self.audio_queue.put((idx, start_time, chunk)) + log.debug("Final audio chunk %d queued (%d samples)", idx, len(chunk)) diff --git a/src/hearsay/constants.py b/src/hearsay/constants.py index 710dfca..2a6dfd0 100644 --- a/src/hearsay/constants.py +++ b/src/hearsay/constants.py @@ -7,10 +7,34 @@ # Audio settings SAMPLE_RATE = 16000 # Whisper expects 16kHz CHANNELS = 1 # Whisper expects mono -CHUNK_DURATION_S = 30 # Whisper's native context window -OVERLAP_DURATION_S = 1 # Overlap between chunks to prevent word splitting +# Variable-length chunking driven by trailing-silence detection. +# A chunk is cut once at least MIN_CHUNK_DURATION_S has accumulated AND the +# trailing SILENCE_DURATION_S of audio is near-silent — or unconditionally once +# MAX_CHUNK_DURATION_S (Whisper's native context window) is reached. +MIN_CHUNK_DURATION_S = 5 # Minimum audio buffered before an early (silence) cut +MAX_CHUNK_DURATION_S = 30 # Hard cap — Whisper's native context window +SILENCE_DURATION_S = 1.0 # Trailing near-silence (seconds) that triggers a cut +SILENCE_RMS_THRESHOLD = 0.01 # RMS on [-1, 1] float audio below which ≈ silence +OVERLAP_DURATION_S = 1 # Overlap between chunks to prevent word splitting AUDIO_DTYPE = "float32" +# Custom HuggingFace models: short name -> {repo_id, parameters, vram_gb, english_only} +# These models are in Transformers format and must be converted to CTranslate2 on first use. +HF_CUSTOM_MODELS: dict[str, dict] = { + "small-ko": { + "repo_id": "SungBeom/whisper-small-ko", + "parameters": "244M", + "vram_gb": 2, + "english_only": False, + }, + "medium-ko-zeroth": { + "repo_id": "seastar105/whisper-medium-ko-zeroth", + "parameters": "769M", + "vram_gb": 5, + "english_only": False, + }, +} + # Model table: name -> (parameters, vram_gb, english_only) MODEL_TABLE = { "tiny": ("39M", 1, False), @@ -23,6 +47,9 @@ "medium.en": ("769M", 5, True), "large-v3": ("1550M", 10, False), "turbo": ("809M", 6, False), + # Korean fine-tuned models (HuggingFace, converted to CTranslate2 on first use) + "small-ko": ("244M", 2, False), + "medium-ko-zeroth": ("769M", 5, False), } # Default model recommendations diff --git a/src/hearsay/output/markdown_writer.py b/src/hearsay/output/markdown_writer.py index 912585a..4f13afa 100644 --- a/src/hearsay/output/markdown_writer.py +++ b/src/hearsay/output/markdown_writer.py @@ -50,7 +50,7 @@ def append(self, result: TranscriptionResult) -> None: self._append_fallback(result) return - chunk_offset = result.chunk_index * 30 # seconds offset for this chunk + chunk_offset = result.start_time # absolute seconds offset for this chunk pieces: list[str] = [] for seg in result.segments: diff --git a/src/hearsay/transcription/engine.py b/src/hearsay/transcription/engine.py index 8495de7..e5dc224 100644 --- a/src/hearsay/transcription/engine.py +++ b/src/hearsay/transcription/engine.py @@ -21,6 +21,7 @@ class TranscriptionResult: language: str language_probability: float chunk_index: int + start_time: float = 0.0 # absolute offset (s) of this chunk from recording start class TranscriptionEngine: @@ -44,7 +45,9 @@ def __init__( def load(self) -> None: """Load the Whisper model into memory.""" from faster_whisper import WhisperModel + from hearsay.transcription.model_manager import resolve_model_path + model_path = resolve_model_path(self.model_name) log.info( "Loading model '%s' (device=%s, compute=%s)", self.model_name, @@ -52,7 +55,7 @@ def load(self) -> None: self.compute_type, ) self._model = WhisperModel( - self.model_name, + model_path, device=self.device, compute_type=self.compute_type, download_root=str(get_models_dir()), @@ -63,12 +66,14 @@ def transcribe( self, audio: np.ndarray, chunk_index: int = 0, + start_time: float = 0.0, ) -> TranscriptionResult: """Transcribe a float32 16kHz mono audio array. Args: audio: Audio data as float32 numpy array at 16kHz. chunk_index: Index of this chunk (for ordering). + start_time: Absolute offset (s) of this chunk from recording start. Returns: TranscriptionResult with text and segment details. @@ -110,6 +115,7 @@ def transcribe( language=info.language, language_probability=info.language_probability, chunk_index=chunk_index, + start_time=start_time, ) def unload(self) -> None: diff --git a/src/hearsay/transcription/model_manager.py b/src/hearsay/transcription/model_manager.py index ed6150c..3fee329 100644 --- a/src/hearsay/transcription/model_manager.py +++ b/src/hearsay/transcription/model_manager.py @@ -3,9 +3,12 @@ from __future__ import annotations import logging +import shutil +import subprocess +import sys from pathlib import Path -from hearsay.constants import MODEL_TABLE +from hearsay.constants import HF_CUSTOM_MODELS, MODEL_TABLE from hearsay.utils.paths import get_models_dir log = logging.getLogger(__name__) @@ -21,57 +24,162 @@ def get_model_info(name: str) -> tuple[str, int, bool] | None: return MODEL_TABLE.get(name) +def is_hf_custom_model(name: str) -> bool: + """Return True if this model requires HuggingFace download + CTranslate2 conversion.""" + return name in HF_CUSTOM_MODELS + + +def get_hf_model_local_path(name: str) -> Path: + """Return the local CTranslate2 directory path for a custom HF model.""" + return get_models_dir() / f"hf-ct2-{name}" + + +def resolve_model_path(name: str) -> str: + """Return the model name or local path string for WhisperModel(). + + For standard models, returns the name as-is (faster-whisper handles download). + For custom HF models, returns the local CTranslate2 directory path. + """ + if is_hf_custom_model(name): + return str(get_hf_model_local_path(name)) + return name + + def is_model_downloaded(name: str) -> bool: """Check if a model is already cached locally.""" + if is_hf_custom_model(name): + local_path = get_hf_model_local_path(name) + return local_path.exists() and (local_path / "model.bin").exists() + model_dir = get_models_dir() - # faster-whisper stores models in subdirectories named after the model - # Check for the CTranslate2 model file model_path = model_dir / f"models--Systran--faster-whisper-{name}" if model_path.exists(): return True - # Also check for direct directory naming alt_path = model_dir / name return alt_path.exists() and any(alt_path.iterdir()) +def _get_converter_cmd() -> str: + """Find the ct2-transformers-converter executable.""" + converter = shutil.which("ct2-transformers-converter") + if converter: + return converter + + import site + candidate_dirs: list[Path] = [Path(sys.executable).parent] + + # pip --user installs scripts under {userbase}/PythonXY/Scripts on Windows + user_base = Path(site.getuserbase()) + for child in user_base.iterdir() if user_base.exists() else []: + if child.is_dir() and child.name.startswith("Python"): + candidate_dirs.append(child / "Scripts") + candidate_dirs.append(user_base / "Scripts") + candidate_dirs.append(user_base / "bin") + + for d in candidate_dirs: + for exe_name in ["ct2-transformers-converter", "ct2-transformers-converter.exe"]: + p = d / exe_name + if p.exists(): + return str(p) + + raise RuntimeError( + "ct2-transformers-converter not found.\n" + "Install required packages:\n" + " pip install ctranslate2 transformers torch" + ) + + +def _download_and_convert_hf_model( + name: str, + progress_callback: callable | None = None, +) -> None: + """Download a HuggingFace Whisper model and convert it to CTranslate2 format.""" + info = HF_CUSTOM_MODELS[name] + repo_id = info["repo_id"] + local_path = get_hf_model_local_path(name) + + log.info("Downloading and converting HF model '%s' -> %s", repo_id, local_path) + + try: + converter = _get_converter_cmd() + except RuntimeError as exc: + raise RuntimeError(str(exc)) from exc + + local_path.mkdir(parents=True, exist_ok=True) + + if progress_callback: + progress_callback(f"Downloading '{repo_id}' from HuggingFace...") + + result = subprocess.run( + [ + converter, + "--model", repo_id, + "--output_dir", str(local_path), + "--quantization", "int8", + "--force", + ], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + shutil.rmtree(local_path, ignore_errors=True) + stderr_tail = result.stderr[-600:] if result.stderr else "(no output)" + raise RuntimeError( + f"CTranslate2 conversion failed for '{repo_id}':\n{stderr_tail}\n\n" + "Make sure torch is installed: pip install torch" + ) + + log.info("HF model '%s' converted successfully to %s", repo_id, local_path) + + if progress_callback: + progress_callback(f"Model '{name}' ready!") + + def download_model( name: str, progress_callback: callable | None = None, ) -> str: - """Download a model if not cached. Returns the model size string for faster-whisper. + """Download (and convert if needed) a model. Returns model path/name for WhisperModel(). Args: - name: Model name (e.g., 'turbo', 'small.en'). + name: Model name from MODEL_TABLE. progress_callback: Optional callable(status_text) for progress updates. Returns: - The model name/path string to pass to WhisperModel(). + The model name or local path string to pass to WhisperModel(). """ if name not in MODEL_TABLE: raise ValueError(f"Unknown model: {name}") + if is_hf_custom_model(name): + if not is_model_downloaded(name): + if progress_callback: + progress_callback(f"Converting '{name}' to CTranslate2 format (this may take several minutes)...") + _download_and_convert_hf_model(name, progress_callback) + elif progress_callback: + progress_callback(f"Model '{name}' already converted.") + return str(get_hf_model_local_path(name)) + + # Standard faster-whisper model if progress_callback: progress_callback(f"Preparing model '{name}'...") model_dir = get_models_dir() log.info("Downloading/loading model '%s' to %s", name, model_dir) - # faster-whisper downloads models from Hugging Face on first use. - # We trigger this by importing and constructing the model. - # The download_root parameter controls where models are cached. from faster_whisper import WhisperModel if progress_callback: progress_callback(f"Downloading '{name}' (this may take a few minutes)...") - # This will download if not cached _model = WhisperModel( name, device="cpu", compute_type="int8", download_root=str(model_dir), ) - del _model # Free memory; the real model will be loaded by the engine + del _model if progress_callback: progress_callback(f"Model '{name}' ready!") diff --git a/src/hearsay/transcription/pipeline.py b/src/hearsay/transcription/pipeline.py index 7f96ced..06e6a2f 100644 --- a/src/hearsay/transcription/pipeline.py +++ b/src/hearsay/transcription/pipeline.py @@ -42,10 +42,10 @@ def run(self) -> None: log.info("TranscriptionPipeline started") while not self.stopped(): try: - chunk_index, audio = self.audio_queue.get(timeout=1.0) + chunk_index, start_time, audio = self.audio_queue.get(timeout=1.0) except queue.Empty: continue - self._process_chunk(chunk_index, audio) + self._process_chunk(chunk_index, start_time, audio) # Drain any audio chunks still in the queue after stop signal. # The recorder flushes its buffer before exiting, so these chunks @@ -53,18 +53,20 @@ def run(self) -> None: log.info("TranscriptionPipeline draining remaining audio chunks") while True: try: - chunk_index, audio = self.audio_queue.get_nowait() + chunk_index, start_time, audio = self.audio_queue.get_nowait() except queue.Empty: break - self._process_chunk(chunk_index, audio) + self._process_chunk(chunk_index, start_time, audio) log.info("TranscriptionPipeline stopped") - def _process_chunk(self, chunk_index: int, audio) -> None: + def _process_chunk(self, chunk_index: int, start_time: float, audio) -> None: """Transcribe a single audio chunk and enqueue the result.""" try: t0 = time.perf_counter() - result = self.engine.transcribe(audio, chunk_index=chunk_index) + result = self.engine.transcribe( + audio, chunk_index=chunk_index, start_time=start_time + ) elapsed = time.perf_counter() - t0 log.info( "Chunk %d transcribed in %.1fs: %s", @@ -125,6 +127,7 @@ def _deduplicate(self, result: TranscriptionResult) -> TranscriptionResult: language=result.language, language_probability=result.language_probability, chunk_index=result.chunk_index, + start_time=result.start_time, ) # Rebuild text and trim leading segments that were fully covered by the overlap. @@ -147,4 +150,5 @@ def _deduplicate(self, result: TranscriptionResult) -> TranscriptionResult: language=result.language, language_probability=result.language_probability, chunk_index=result.chunk_index, + start_time=result.start_time, ) diff --git a/src/hearsay/ui/settings_window.py b/src/hearsay/ui/settings_window.py index a7f386b..a4327ff 100644 --- a/src/hearsay/ui/settings_window.py +++ b/src/hearsay/ui/settings_window.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +import threading from tkinter import filedialog import customtkinter as ctk @@ -15,6 +16,11 @@ AUDIO_SOURCE_SYSTEM, MODEL_TABLE, ) +from hearsay.transcription.model_manager import ( + download_model, + is_hf_custom_model, + is_model_downloaded, +) log = logging.getLogger(__name__) @@ -30,6 +36,7 @@ def __init__(self, master: ctk.CTk, config_manager: ConfigManager) -> None: self._config_manager = config_manager self._config = config_manager.config + self._dl_frame: ctk.CTkFrame | None = None self._build_ui() self.grab_set() @@ -70,9 +77,16 @@ def _build_ui(self) -> None: variable=self._model_var, values=list(MODEL_TABLE.keys()), width=200, + command=self._on_model_changed, ) self._model_menu.pack(anchor="w", padx=15) + self._model_hint = ctk.CTkLabel( + scroll, text="", font=("Segoe UI", 10), text_color="gray" + ) + self._model_hint.pack(anchor="w", padx=15) + self._update_model_hint(self._config.model_name) + # ── Compute Type ── ctk.CTkLabel(scroll, text="Compute Type", font=("Segoe UI", 14, "bold")).pack( anchor="w", pady=(15, 5) @@ -106,7 +120,7 @@ def _build_ui(self) -> None: self._lang_entry = ctk.CTkEntry(scroll, textvariable=self._lang_var, width=100) self._lang_entry.pack(anchor="w", padx=15) ctk.CTkLabel( - scroll, text="ISO 639-1 code (e.g., en, es, fr) or empty for auto-detect", + scroll, text="ISO 639-1 code (e.g., en, ko, fr) or empty for auto-detect", font=("Segoe UI", 10), text_color="gray" ).pack(anchor="w", padx=15) @@ -132,16 +146,32 @@ def _build_ui(self) -> None: ).pack(side="left") # ── Buttons ── - btn_frame = ctk.CTkFrame(self) - btn_frame.pack(fill="x", padx=20, pady=(0, 15)) + self._btn_frame = ctk.CTkFrame(self) + self._btn_frame.pack(fill="x", padx=20, pady=(0, 15)) - ctk.CTkButton( - btn_frame, text="Save", width=100, command=self._save - ).pack(side="right", padx=5) - ctk.CTkButton( - btn_frame, text="Cancel", width=100, fg_color="gray", + self._save_btn = ctk.CTkButton( + self._btn_frame, text="Save", width=100, command=self._save + ) + self._save_btn.pack(side="right", padx=5) + self._cancel_btn = ctk.CTkButton( + self._btn_frame, text="Cancel", width=100, fg_color="gray", command=self._cancel - ).pack(side="right", padx=5) + ) + self._cancel_btn.pack(side="right", padx=5) + + def _on_model_changed(self, name: str) -> None: + self._update_model_hint(name) + + def _update_model_hint(self, name: str) -> None: + if is_hf_custom_model(name): + if is_model_downloaded(name): + self._model_hint.configure(text="Korean model (converted, ready)", text_color="green") + else: + self._model_hint.configure( + text="Korean model — will download & convert on Save", text_color="#e07800" + ) + else: + self._model_hint.configure(text="") def _browse(self) -> None: path = filedialog.askdirectory( @@ -152,6 +182,13 @@ def _browse(self) -> None: self._dir_var.set(path) def _save(self) -> None: + new_model = self._model_var.get() + if is_hf_custom_model(new_model) and not is_model_downloaded(new_model): + self._start_download(new_model) + return + self._apply_and_close() + + def _apply_and_close(self) -> None: self._config.audio_source = self._source_var.get() self._config.model_name = self._model_var.get() self._config.compute_type = self._compute_var.get() @@ -164,6 +201,68 @@ def _save(self) -> None: self.grab_release() self.destroy() + def _start_download(self, model_name: str) -> None: + """Expand window, show progress, and download + convert the model.""" + self.geometry("550x640") + + self._save_btn.configure(state="disabled") + self._cancel_btn.configure(state="disabled") + + if self._dl_frame: + self._dl_frame.destroy() + + self._dl_frame = ctk.CTkFrame(self) + self._dl_frame.pack(fill="x", padx=20, pady=(0, 10)) + + ctk.CTkLabel( + self._dl_frame, + text=f"Downloading model '{model_name}'", + font=("Segoe UI", 13, "bold"), + ).pack(pady=(10, 2)) + + self._dl_status = ctk.CTkLabel( + self._dl_frame, + text="Starting...", + font=("Segoe UI", 11), + text_color="gray", + ) + self._dl_status.pack(pady=4) + + self._dl_bar = ctk.CTkProgressBar(self._dl_frame, width=460) + self._dl_bar.pack(pady=(4, 10)) + self._dl_bar.configure(mode="indeterminate") + self._dl_bar.start() + + threading.Thread( + target=self._download_bg, args=(model_name,), daemon=True + ).start() + + def _download_bg(self, model_name: str) -> None: + def set_status(text: str) -> None: + self.after(0, lambda: self._dl_status.configure(text=text)) + + try: + download_model(model_name, progress_callback=set_status) + self.after(0, self._download_complete) + except Exception as exc: + log.error("Model download/conversion failed", exc_info=True) + self.after(0, lambda: self._download_failed(str(exc))) + + def _download_complete(self) -> None: + self._dl_bar.stop() + self._dl_bar.set(1) + self._dl_bar.configure(mode="determinate") + self._dl_status.configure(text="Done! Saving settings...", text_color="green") + self.after(600, self._apply_and_close) + + def _download_failed(self, error: str) -> None: + self._dl_bar.stop() + self._dl_bar.set(0) + short_error = error.splitlines()[0][:80] + self._dl_status.configure(text=f"Error: {short_error}", text_color="red") + self._save_btn.configure(state="normal") + self._cancel_btn.configure(state="normal") + def _cancel(self) -> None: self.grab_release() self.destroy()