From ab67b1977af95e4594ae78fb6a0a135bde5dffa1 Mon Sep 17 00:00:00 2001 From: hoiyada7-maker Date: Mon, 1 Jun 2026 12:57:15 +0900 Subject: [PATCH 01/17] fix: detect GPU via ctranslate2 instead of torch faster-whisper uses ctranslate2 as its inference backend, not PyTorch. The previous detection relied on `import torch` which was never listed as a dependency, causing GPU detection to silently fall back to CPU for all users regardless of their hardware. Switch to `ctranslate2.get_cuda_device_count()` so detection reflects the same CUDA stack that actually runs inference. torch is still used opportunistically for GPU name and VRAM info when available, with a name-based VRAM lookup table as a fallback. Co-Authored-By: Claude Sonnet 4.6 --- src/hearsay/transcription/gpu_detect.py | 79 +++++++++++++++++++++---- 1 file changed, 69 insertions(+), 10 deletions(-) diff --git a/src/hearsay/transcription/gpu_detect.py b/src/hearsay/transcription/gpu_detect.py index 1c33a68..2c2eef7 100644 --- a/src/hearsay/transcription/gpu_detect.py +++ b/src/hearsay/transcription/gpu_detect.py @@ -27,16 +27,74 @@ class GPUInfo: recommended_device: str +def _vram_gb_from_name(name: str) -> float: + """Estimate VRAM from GPU name when ctranslate2 doesn't expose memory info.""" + name_lower = name.lower() + # RTX 40xx series + if "4090" in name_lower: + return 24.0 + if "4080" in name_lower: + return 16.0 + if "4070 ti" in name_lower: + return 12.0 + if "4070" in name_lower: + return 12.0 + if "4060 ti" in name_lower: + return 8.0 + if "4060" in name_lower: + return 8.0 + # RTX 30xx series + if "3090" in name_lower: + return 24.0 + if "3080" in name_lower: + return 10.0 + if "3070" in name_lower: + return 8.0 + if "3060 ti" in name_lower: + return 8.0 + if "3060" in name_lower: + return 12.0 + if "3050" in name_lower: + return 8.0 + # RTX 20xx series + if "2080 ti" in name_lower: + return 11.0 + if "2080" in name_lower: + return 8.0 + if "2070" in name_lower: + return 8.0 + if "2060" in name_lower: + return 6.0 + return 4.0 # conservative default + + def detect_gpu() -> GPUInfo: - """Detect CUDA GPU and return recommendation.""" + """Detect CUDA GPU via ctranslate2 (same backend faster-whisper uses).""" try: - import torch + import ctranslate2 + + cuda_count = ctranslate2.get_cuda_device_count() + if cuda_count > 0: + # Try to get GPU name via torch if available; otherwise fall back gracefully + gpu_name = "" + vram_gb = 0.0 + try: + import torch + if torch.cuda.is_available(): + gpu_name = torch.cuda.get_device_name(0) + vram_bytes = torch.cuda.get_device_properties(0).total_mem + vram_gb = round(vram_bytes / (1024**3), 1) + except Exception: + pass + + if not gpu_name: + # ctranslate2 doesn't expose device names; use a generic label + gpu_name = f"CUDA Device 0" + + if vram_gb == 0.0: + vram_gb = _vram_gb_from_name(gpu_name) - if torch.cuda.is_available(): - name = torch.cuda.get_device_name(0) - vram_bytes = torch.cuda.get_device_properties(0).total_mem - vram_gb = vram_bytes / (1024**3) - log.info("CUDA GPU found: %s (%.1f GB VRAM)", name, vram_gb) + log.info("CUDA GPU found: %s (%.1f GB VRAM)", gpu_name, vram_gb) if vram_gb >= 6: model = DEFAULT_GPU_MODEL @@ -47,14 +105,15 @@ def detect_gpu() -> GPUInfo: return GPUInfo( cuda_available=True, - gpu_name=name, - vram_gb=round(vram_gb, 1), + gpu_name=gpu_name, + vram_gb=vram_gb, recommended_model=model, recommended_compute=DEFAULT_GPU_COMPUTE, recommended_device="cuda", ) + log.info("No CUDA devices found via ctranslate2") except ImportError: - log.info("PyTorch not installed, assuming CPU-only") + log.info("ctranslate2 not installed, assuming CPU-only") except Exception: log.warning("GPU detection failed", exc_info=True) From 5db72cc8a2a8aa9b598671601917e7b5ac6c4cdd Mon Sep 17 00:00:00 2001 From: hoiyada7-maker Date: Mon, 1 Jun 2026 13:11:21 +0900 Subject: [PATCH 02/17] fix: use nvidia-smi to resolve GPU name and VRAM without torch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When torch is absent, fall back to nvidia-smi for the actual GPU name and VRAM (MiB → GB), so the UI shows the real device name instead of the generic "CUDA Device 0" label. Co-Authored-By: Claude Sonnet 4.6 --- src/hearsay/transcription/gpu_detect.py | 36 ++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/src/hearsay/transcription/gpu_detect.py b/src/hearsay/transcription/gpu_detect.py index 2c2eef7..3e77b1f 100644 --- a/src/hearsay/transcription/gpu_detect.py +++ b/src/hearsay/transcription/gpu_detect.py @@ -27,6 +27,37 @@ class GPUInfo: recommended_device: str +def _gpu_name_from_nvidia_smi() -> str: + """Query GPU name via nvidia-smi without requiring torch.""" + try: + import subprocess + result = subprocess.run( + ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], + capture_output=True, text=True, timeout=5, + ) + if result.returncode == 0: + return result.stdout.strip().splitlines()[0].strip() + except Exception: + pass + return "" + + +def _vram_gb_from_nvidia_smi() -> float: + """Query total VRAM in GB via nvidia-smi.""" + try: + import subprocess + result = subprocess.run( + ["nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits"], + capture_output=True, text=True, timeout=5, + ) + if result.returncode == 0: + mib = float(result.stdout.strip().splitlines()[0].strip()) + return round(mib / 1024, 1) + except Exception: + pass + return 0.0 + + def _vram_gb_from_name(name: str) -> float: """Estimate VRAM from GPU name when ctranslate2 doesn't expose memory info.""" name_lower = name.lower() @@ -88,11 +119,10 @@ def detect_gpu() -> GPUInfo: pass if not gpu_name: - # ctranslate2 doesn't expose device names; use a generic label - gpu_name = f"CUDA Device 0" + gpu_name = _gpu_name_from_nvidia_smi() or "CUDA Device 0" if vram_gb == 0.0: - vram_gb = _vram_gb_from_name(gpu_name) + vram_gb = _vram_gb_from_nvidia_smi() or _vram_gb_from_name(gpu_name) log.info("CUDA GPU found: %s (%.1f GB VRAM)", gpu_name, vram_gb) From 097e2fafff91055d3fc3df1888753d955e677a9a Mon Sep 17 00:00:00 2001 From: hoiyada7-maker Date: Mon, 1 Jun 2026 14:19:37 +0900 Subject: [PATCH 03/17] fix: gracefully handle missing CUDA runtime DLLs When the CUDA driver is present but CUDA Toolkit 12.x is not installed (cublas64_12.dll etc. missing), ctranslate2 and faster-whisper crash at runtime rather than at device detection time. - gpu_detect: probe the CUDA runtime with a tiny StorageView allocation before reporting cuda_available=True; warns and falls back to CPU when runtime DLLs are absent. - engine: catch RuntimeError on load() for "cannot be loaded" and automatically retry with device=cpu / compute_type=int8 so the app stays functional without a hard crash. Co-Authored-By: Claude Sonnet 4.6 --- src/hearsay/transcription/engine.py | 32 +++++++-- src/hearsay/transcription/gpu_detect.py | 91 ++++++++++++++++--------- 2 files changed, 82 insertions(+), 41 deletions(-) diff --git a/src/hearsay/transcription/engine.py b/src/hearsay/transcription/engine.py index 8495de7..87955b6 100644 --- a/src/hearsay/transcription/engine.py +++ b/src/hearsay/transcription/engine.py @@ -51,13 +51,31 @@ def load(self) -> None: self.device, self.compute_type, ) - self._model = WhisperModel( - self.model_name, - device=self.device, - compute_type=self.compute_type, - download_root=str(get_models_dir()), - ) - log.info("Model loaded successfully") + try: + self._model = WhisperModel( + self.model_name, + device=self.device, + compute_type=self.compute_type, + download_root=str(get_models_dir()), + ) + except RuntimeError as exc: + # CUDA runtime DLLs missing (e.g. cublas64_12.dll) — driver present + # but CUDA Toolkit not installed. Fall back to CPU automatically. + if self.device != "cpu" and "cannot be loaded" in str(exc): + log.warning( + "CUDA runtime unavailable (%s). Falling back to CPU.", exc + ) + self.device = "cpu" + self.compute_type = "int8" + self._model = WhisperModel( + self.model_name, + device="cpu", + compute_type="int8", + download_root=str(get_models_dir()), + ) + else: + raise + log.info("Model loaded successfully (device=%s)", self.device) def transcribe( self, diff --git a/src/hearsay/transcription/gpu_detect.py b/src/hearsay/transcription/gpu_detect.py index 3e77b1f..acfb283 100644 --- a/src/hearsay/transcription/gpu_detect.py +++ b/src/hearsay/transcription/gpu_detect.py @@ -99,6 +99,22 @@ def _vram_gb_from_name(name: str) -> float: return 4.0 # conservative default +def _cuda_runtime_usable() -> bool: + """Probe the CUDA runtime by allocating a tiny CTranslate2 storage object. + + ctranslate2.get_cuda_device_count() only checks the driver; the actual + runtime DLLs (cublas64_12.dll etc.) are loaded lazily on first use. + This call forces that load so we can detect a broken installation early. + """ + try: + import ctranslate2 + ctranslate2.StorageView([1], ctranslate2.DataType.int8, ctranslate2.Device.cuda) + return True + except Exception as exc: + log.warning("CUDA runtime probe failed: %s", exc) + return False + + def detect_gpu() -> GPUInfo: """Detect CUDA GPU via ctranslate2 (same backend faster-whisper uses).""" try: @@ -106,41 +122,48 @@ def detect_gpu() -> GPUInfo: cuda_count = ctranslate2.get_cuda_device_count() if cuda_count > 0: - # Try to get GPU name via torch if available; otherwise fall back gracefully - gpu_name = "" - vram_gb = 0.0 - try: - import torch - if torch.cuda.is_available(): - gpu_name = torch.cuda.get_device_name(0) - vram_bytes = torch.cuda.get_device_properties(0).total_mem - vram_gb = round(vram_bytes / (1024**3), 1) - except Exception: - pass - - if not gpu_name: - gpu_name = _gpu_name_from_nvidia_smi() or "CUDA Device 0" - - if vram_gb == 0.0: - vram_gb = _vram_gb_from_nvidia_smi() or _vram_gb_from_name(gpu_name) - - log.info("CUDA GPU found: %s (%.1f GB VRAM)", gpu_name, vram_gb) - - if vram_gb >= 6: - model = DEFAULT_GPU_MODEL - elif vram_gb >= 2: - model = "small.en" + if not _cuda_runtime_usable(): + log.warning( + "CUDA device found but runtime DLLs are missing " + "(install CUDA Toolkit 12.x). Falling back to CPU." + ) + # Fall through to CPU return below else: - model = "tiny.en" - - return GPUInfo( - cuda_available=True, - gpu_name=gpu_name, - vram_gb=vram_gb, - recommended_model=model, - recommended_compute=DEFAULT_GPU_COMPUTE, - recommended_device="cuda", - ) + # Try to get GPU name via torch if available; otherwise fall back gracefully + gpu_name = "" + vram_gb = 0.0 + try: + import torch + if torch.cuda.is_available(): + gpu_name = torch.cuda.get_device_name(0) + vram_bytes = torch.cuda.get_device_properties(0).total_mem + vram_gb = round(vram_bytes / (1024**3), 1) + except Exception: + pass + + if not gpu_name: + gpu_name = _gpu_name_from_nvidia_smi() or "CUDA Device 0" + + if vram_gb == 0.0: + vram_gb = _vram_gb_from_nvidia_smi() or _vram_gb_from_name(gpu_name) + + log.info("CUDA GPU found: %s (%.1f GB VRAM)", gpu_name, vram_gb) + + if vram_gb >= 6: + model = DEFAULT_GPU_MODEL + elif vram_gb >= 2: + model = "small.en" + else: + model = "tiny.en" + + return GPUInfo( + cuda_available=True, + gpu_name=gpu_name, + vram_gb=vram_gb, + recommended_model=model, + recommended_compute=DEFAULT_GPU_COMPUTE, + recommended_device="cuda", + ) log.info("No CUDA devices found via ctranslate2") except ImportError: log.info("ctranslate2 not installed, assuming CPU-only") From 61b844f49ccb7520122debad270f54b9dd9a419b Mon Sep 17 00:00:00 2001 From: hoiyada7-maker Date: Mon, 1 Jun 2026 14:32:52 +0900 Subject: [PATCH 04/17] feat: show dialog when GPU unavailable instead of silent CPU fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When device=cuda is configured but CUDA runtime DLLs are missing, engine.load() now raises CudaUnavailableError instead of silently switching to CPU. app.py catches this on the background loader thread and posts a dialog to the main thread offering two actions: - "CPU로 변경": updates and saves config, restarts recording on CPU - "CUDA Toolkit 설치": opens the NVIDIA download page in the browser Co-Authored-By: Claude Sonnet 4.6 --- src/hearsay/app.py | 77 +++++++++++++++++++++++++++-- src/hearsay/transcription/engine.py | 21 +++----- 2 files changed, 80 insertions(+), 18 deletions(-) diff --git a/src/hearsay/app.py b/src/hearsay/app.py index a7b78ba..2199bd6 100644 --- a/src/hearsay/app.py +++ b/src/hearsay/app.py @@ -9,13 +9,15 @@ import threading import time +import webbrowser + import customtkinter as ctk from hearsay.audio.recorder import AudioRecorder from hearsay.config import ConfigManager -from hearsay.constants import APP_NAME, LIVE_VIEW_POLL_MS +from hearsay.constants import APP_NAME, DEFAULT_CPU_COMPUTE, LIVE_VIEW_POLL_MS from hearsay.output.markdown_writer import MarkdownWriter -from hearsay.transcription.engine import TranscriptionEngine +from hearsay.transcription.engine import CudaUnavailableError, TranscriptionEngine from hearsay.transcription.pipeline import TranscriptionPipeline from hearsay.ui.about_window import AboutWindow from hearsay.ui.live_view import LiveTranscriptWindow @@ -138,7 +140,11 @@ def load_and_start() -> None: except queue.Empty: break - self._engine.load() + try: + self._engine.load() + except CudaUnavailableError: + safe_after(self._root, 0, lambda: self._handle_cuda_error(source)) + return # Start pipeline self._pipeline = TranscriptionPipeline( @@ -343,6 +349,71 @@ def _open_about(self) -> None: lambda: AboutWindow(self._root), ) + def _handle_cuda_error(self, source: str) -> None: + """Called on main thread when CUDA runtime DLLs are missing.""" + self._recording = False + self._engine = None + if self._tray: + self._tray.set_recording(False) + if self._live_view: + self._live_view.set_status("Idle") + self._show_cuda_error_dialog(source) + + def _show_cuda_error_dialog(self, source: str) -> None: + """Show a dialog offering CPU fallback or CUDA Toolkit install link.""" + dialog = ctk.CTkToplevel(self._root) + dialog.title("GPU를 사용할 수 없습니다") + dialog.resizable(False, False) + dialog.grab_set() + + # Center on screen + dialog.update_idletasks() + w, h = 420, 220 + x = (dialog.winfo_screenwidth() - w) // 2 + y = (dialog.winfo_screenheight() - h) // 2 + dialog.geometry(f"{w}x{h}+{x}+{y}") + + ctk.CTkLabel( + dialog, + text="CUDA 런타임 라이브러리를 찾을 수 없습니다.", + font=ctk.CTkFont(size=14, weight="bold"), + ).pack(pady=(20, 4)) + + ctk.CTkLabel( + dialog, + text=( + "GPU 설정이 선택되어 있지만 CUDA Toolkit 12.x가\n" + "설치되어 있지 않아 GPU로 실행할 수 없습니다.\n\n" + "계속하려면 CPU로 변경하거나 CUDA Toolkit을 설치하세요." + ), + justify="center", + ).pack(pady=(0, 16)) + + btn_frame = ctk.CTkFrame(dialog, fg_color="transparent") + btn_frame.pack() + + def switch_to_cpu() -> None: + dialog.destroy() + self._config.device = "cpu" + self._config.compute_type = DEFAULT_CPU_COMPUTE + self._config_manager.save() + log.info("Switched to CPU per user request after CUDA error") + self._start_recording(source) + + def open_cuda_download() -> None: + dialog.destroy() + webbrowser.open("https://developer.nvidia.com/cuda-downloads") + + ctk.CTkButton( + btn_frame, text="CPU로 변경", width=160, command=switch_to_cpu, + ).pack(side="left", padx=8) + + ctk.CTkButton( + btn_frame, text="CUDA Toolkit 설치", width=160, + fg_color="transparent", border_width=1, + command=open_cuda_download, + ).pack(side="left", padx=8) + def _open_output_dir(self) -> None: """Open the output directory in file explorer.""" path = self._config.output_dir diff --git a/src/hearsay/transcription/engine.py b/src/hearsay/transcription/engine.py index 87955b6..543ea53 100644 --- a/src/hearsay/transcription/engine.py +++ b/src/hearsay/transcription/engine.py @@ -12,6 +12,10 @@ log = logging.getLogger(__name__) +class CudaUnavailableError(RuntimeError): + """Raised when GPU is configured but CUDA runtime DLLs are missing.""" + + @dataclass class TranscriptionResult: """Result from transcribing one audio chunk.""" @@ -59,22 +63,9 @@ def load(self) -> None: download_root=str(get_models_dir()), ) except RuntimeError as exc: - # CUDA runtime DLLs missing (e.g. cublas64_12.dll) — driver present - # but CUDA Toolkit not installed. Fall back to CPU automatically. if self.device != "cpu" and "cannot be loaded" in str(exc): - log.warning( - "CUDA runtime unavailable (%s). Falling back to CPU.", exc - ) - self.device = "cpu" - self.compute_type = "int8" - self._model = WhisperModel( - self.model_name, - device="cpu", - compute_type="int8", - download_root=str(get_models_dir()), - ) - else: - raise + raise CudaUnavailableError(str(exc)) from exc + raise log.info("Model loaded successfully (device=%s)", self.device) def transcribe( From ee58bb8c2d2ab97a9f24cf814f65408dbed8ffcd Mon Sep 17 00:00:00 2001 From: hoiyada7-maker Date: Mon, 1 Jun 2026 14:52:30 +0900 Subject: [PATCH 05/17] feat: auto-register NVIDIA pip-package DLL dirs on Windows startup Users who install nvidia-cublas-cu12 / nvidia-cuda-runtime-cu12 via pip no longer need the full CUDA Toolkit. On startup, cuda_dlls.py scans all site-packages roots for nvidia/*/bin directories and registers each one with os.add_dll_directory() before ctranslate2 is imported. Works for any user regardless of Python install path (user site-packages, venv, or system), so cublas64_12.dll and friends are always discoverable. Co-Authored-By: Claude Sonnet 4.6 --- src/hearsay/__main__.py | 5 +++ src/hearsay/utils/cuda_dlls.py | 70 ++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 src/hearsay/utils/cuda_dlls.py diff --git a/src/hearsay/__main__.py b/src/hearsay/__main__.py index 1ae94cf..eeede52 100644 --- a/src/hearsay/__main__.py +++ b/src/hearsay/__main__.py @@ -8,6 +8,11 @@ def main() -> None: setup_logging() + # Must run before any ctranslate2 / faster-whisper import on Windows + from hearsay.utils.cuda_dlls import register_nvidia_dlls + + register_nvidia_dlls() + from hearsay.app import HearsayApp app = HearsayApp() diff --git a/src/hearsay/utils/cuda_dlls.py b/src/hearsay/utils/cuda_dlls.py new file mode 100644 index 0000000..19b1998 --- /dev/null +++ b/src/hearsay/utils/cuda_dlls.py @@ -0,0 +1,70 @@ +"""Register NVIDIA pip-package DLL directories on Windows before ctranslate2 loads.""" + +from __future__ import annotations + +import logging +import os +import site +import sys +from pathlib import Path + +log = logging.getLogger(__name__) + + +def _nvidia_bin_dirs() -> list[Path]: + """Yield every nvidia//bin directory found in any site-packages.""" + search_roots: list[Path] = [] + + # user site-packages (pip install --user) + try: + user_site = site.getusersitepackages() + if user_site: + search_roots.append(Path(user_site)) + except Exception: + pass + + # system / venv site-packages + for p in site.getsitepackages(): + search_roots.append(Path(p)) + + found: list[Path] = [] + seen: set[Path] = set() + for root in search_roots: + nvidia_root = root / "nvidia" + if not nvidia_root.is_dir(): + continue + for bin_dir in nvidia_root.glob("*/bin"): + if bin_dir.is_dir() and bin_dir not in seen: + seen.add(bin_dir) + found.append(bin_dir) + + return found + + +def register_nvidia_dlls() -> bool: + """Add NVIDIA pip-package bin dirs to the Windows DLL search path. + + Returns True if at least one directory was registered. + No-op on non-Windows platforms. + """ + if sys.platform != "win32": + return False + + dirs = _nvidia_bin_dirs() + if not dirs: + log.debug("No nvidia pip-package bin dirs found; skipping DLL registration") + return False + + registered = 0 + for d in dirs: + try: + os.add_dll_directory(str(d)) + log.debug("Registered DLL dir: %s", d) + registered += 1 + except Exception as exc: + log.warning("Could not register DLL dir %s: %s", d, exc) + + if registered: + log.info("Registered %d NVIDIA DLL director%s from pip packages", + registered, "y" if registered == 1 else "ies") + return registered > 0 From 4134d224fa4432ee9494598bd9090ea8413cd11a Mon Sep 17 00:00:00 2001 From: hoiyada7-maker Date: Mon, 1 Jun 2026 14:58:03 +0900 Subject: [PATCH 06/17] chore: add nvidia-cublas-cu12 and nvidia-cuda-runtime-cu12 to requirements Allows pip install -r requirements.txt to pull CUDA runtime DLLs automatically, enabling GPU inference without a full CUDA Toolkit install. Co-Authored-By: Claude Sonnet 4.6 --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index eb56c39..f12b01a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,5 @@ numpy>=1.24.0 customtkinter>=5.2.0 pystray>=0.19.5 Pillow>=10.0.0 +nvidia-cublas-cu12>=12.0 +nvidia-cuda-runtime-cu12>=12.0 From 403c802ec495a73c80a372ab3bdc998828b6ea8f Mon Sep 17 00:00:00 2001 From: hoiyada7-maker Date: Mon, 1 Jun 2026 15:07:30 +0900 Subject: [PATCH 07/17] fix: prepend nvidia DLL dirs to PATH so ctranslate2 ctypes calls find them os.add_dll_directory() covers Python extension module loading but not ctranslate2's internal ctypes.CDLL("cublas64_12") calls, which only search PATH on Windows. Now both mechanisms are set so cublas64_12.dll is found at inference time as well as at import time. Co-Authored-By: Claude Sonnet 4.6 --- src/hearsay/utils/cuda_dlls.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/hearsay/utils/cuda_dlls.py b/src/hearsay/utils/cuda_dlls.py index 19b1998..b29df87 100644 --- a/src/hearsay/utils/cuda_dlls.py +++ b/src/hearsay/utils/cuda_dlls.py @@ -44,6 +44,10 @@ def _nvidia_bin_dirs() -> list[Path]: def register_nvidia_dlls() -> bool: """Add NVIDIA pip-package bin dirs to the Windows DLL search path. + Uses both os.add_dll_directory() (for Python extension modules) and + prepends to PATH (for ctranslate2's ctypes.CDLL calls, which only + respect PATH on Windows). + Returns True if at least one directory was registered. No-op on non-Windows platforms. """ @@ -56,14 +60,19 @@ def register_nvidia_dlls() -> bool: return False registered = 0 + path_entries: list[str] = [] for d in dirs: try: os.add_dll_directory(str(d)) + path_entries.append(str(d)) log.debug("Registered DLL dir: %s", d) registered += 1 except Exception as exc: log.warning("Could not register DLL dir %s: %s", d, exc) + if path_entries: + os.environ["PATH"] = os.pathsep.join(path_entries) + os.pathsep + os.environ.get("PATH", "") + if registered: log.info("Registered %d NVIDIA DLL director%s from pip packages", registered, "y" if registered == 1 else "ies") From 06bb7db14d7f9147f6cff6999ad0073a7b7c67fa Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 30 May 2026 11:07:48 +0900 Subject: [PATCH 08/17] Add custom HuggingFace Whisper models with CTranslate2 conversion - Add HF_CUSTOM_MODELS dict with 2 Korean models: * SungBeom/whisper-small-ko (small-ko) * seastar105/whisper-medium-ko-zeroth (medium-ko-zeroth) - Implement automatic CTranslate2 int8 conversion on first use - Add model_manager functions: is_hf_custom_model, resolve_model_path, download_and_convert - Update engine.load() to use resolve_model_path for local CTranslate2 models - Enhance SettingsWindow with model download progress UI and status hints - Skip re-download if model already converted (caching) - Fix ct2-transformers-converter discovery for pip --user installs - Add transformers>=4.23.0 dependency Co-Authored-By: Claude Haiku 4.5 --- requirements.txt | 1 + src/hearsay/constants.py | 20 ++++ src/hearsay/transcription/engine.py | 4 +- src/hearsay/transcription/model_manager.py | 132 +++++++++++++++++++-- src/hearsay/ui/settings_window.py | 117 ++++++++++++++++-- 5 files changed, 252 insertions(+), 22 deletions(-) diff --git a/requirements.txt b/requirements.txt index f12b01a..6dee6b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ pystray>=0.19.5 Pillow>=10.0.0 nvidia-cublas-cu12>=12.0 nvidia-cuda-runtime-cu12>=12.0 +transformers>=4.23.0 diff --git a/src/hearsay/constants.py b/src/hearsay/constants.py index 710dfca..302903b 100644 --- a/src/hearsay/constants.py +++ b/src/hearsay/constants.py @@ -11,6 +11,23 @@ OVERLAP_DURATION_S = 1 # Overlap between chunks to prevent word splitting AUDIO_DTYPE = "float32" +# Custom HuggingFace models: short name -> {repo_id, parameters, vram_gb, english_only} +# These models are in Transformers format and must be converted to CTranslate2 on first use. +HF_CUSTOM_MODELS: dict[str, dict] = { + "small-ko": { + "repo_id": "SungBeom/whisper-small-ko", + "parameters": "244M", + "vram_gb": 2, + "english_only": False, + }, + "medium-ko-zeroth": { + "repo_id": "seastar105/whisper-medium-ko-zeroth", + "parameters": "769M", + "vram_gb": 5, + "english_only": False, + }, +} + # Model table: name -> (parameters, vram_gb, english_only) MODEL_TABLE = { "tiny": ("39M", 1, False), @@ -23,6 +40,9 @@ "medium.en": ("769M", 5, True), "large-v3": ("1550M", 10, False), "turbo": ("809M", 6, False), + # Korean fine-tuned models (HuggingFace, converted to CTranslate2 on first use) + "small-ko": ("244M", 2, False), + "medium-ko-zeroth": ("769M", 5, False), } # Default model recommendations diff --git a/src/hearsay/transcription/engine.py b/src/hearsay/transcription/engine.py index 543ea53..3589932 100644 --- a/src/hearsay/transcription/engine.py +++ b/src/hearsay/transcription/engine.py @@ -48,7 +48,9 @@ def __init__( def load(self) -> None: """Load the Whisper model into memory.""" from faster_whisper import WhisperModel + from hearsay.transcription.model_manager import resolve_model_path + model_path = resolve_model_path(self.model_name) log.info( "Loading model '%s' (device=%s, compute=%s)", self.model_name, @@ -57,7 +59,7 @@ def load(self) -> None: ) try: self._model = WhisperModel( - self.model_name, + model_path, device=self.device, compute_type=self.compute_type, download_root=str(get_models_dir()), diff --git a/src/hearsay/transcription/model_manager.py b/src/hearsay/transcription/model_manager.py index ed6150c..3fee329 100644 --- a/src/hearsay/transcription/model_manager.py +++ b/src/hearsay/transcription/model_manager.py @@ -3,9 +3,12 @@ from __future__ import annotations import logging +import shutil +import subprocess +import sys from pathlib import Path -from hearsay.constants import MODEL_TABLE +from hearsay.constants import HF_CUSTOM_MODELS, MODEL_TABLE from hearsay.utils.paths import get_models_dir log = logging.getLogger(__name__) @@ -21,57 +24,162 @@ def get_model_info(name: str) -> tuple[str, int, bool] | None: return MODEL_TABLE.get(name) +def is_hf_custom_model(name: str) -> bool: + """Return True if this model requires HuggingFace download + CTranslate2 conversion.""" + return name in HF_CUSTOM_MODELS + + +def get_hf_model_local_path(name: str) -> Path: + """Return the local CTranslate2 directory path for a custom HF model.""" + return get_models_dir() / f"hf-ct2-{name}" + + +def resolve_model_path(name: str) -> str: + """Return the model name or local path string for WhisperModel(). + + For standard models, returns the name as-is (faster-whisper handles download). + For custom HF models, returns the local CTranslate2 directory path. + """ + if is_hf_custom_model(name): + return str(get_hf_model_local_path(name)) + return name + + def is_model_downloaded(name: str) -> bool: """Check if a model is already cached locally.""" + if is_hf_custom_model(name): + local_path = get_hf_model_local_path(name) + return local_path.exists() and (local_path / "model.bin").exists() + model_dir = get_models_dir() - # faster-whisper stores models in subdirectories named after the model - # Check for the CTranslate2 model file model_path = model_dir / f"models--Systran--faster-whisper-{name}" if model_path.exists(): return True - # Also check for direct directory naming alt_path = model_dir / name return alt_path.exists() and any(alt_path.iterdir()) +def _get_converter_cmd() -> str: + """Find the ct2-transformers-converter executable.""" + converter = shutil.which("ct2-transformers-converter") + if converter: + return converter + + import site + candidate_dirs: list[Path] = [Path(sys.executable).parent] + + # pip --user installs scripts under {userbase}/PythonXY/Scripts on Windows + user_base = Path(site.getuserbase()) + for child in user_base.iterdir() if user_base.exists() else []: + if child.is_dir() and child.name.startswith("Python"): + candidate_dirs.append(child / "Scripts") + candidate_dirs.append(user_base / "Scripts") + candidate_dirs.append(user_base / "bin") + + for d in candidate_dirs: + for exe_name in ["ct2-transformers-converter", "ct2-transformers-converter.exe"]: + p = d / exe_name + if p.exists(): + return str(p) + + raise RuntimeError( + "ct2-transformers-converter not found.\n" + "Install required packages:\n" + " pip install ctranslate2 transformers torch" + ) + + +def _download_and_convert_hf_model( + name: str, + progress_callback: callable | None = None, +) -> None: + """Download a HuggingFace Whisper model and convert it to CTranslate2 format.""" + info = HF_CUSTOM_MODELS[name] + repo_id = info["repo_id"] + local_path = get_hf_model_local_path(name) + + log.info("Downloading and converting HF model '%s' -> %s", repo_id, local_path) + + try: + converter = _get_converter_cmd() + except RuntimeError as exc: + raise RuntimeError(str(exc)) from exc + + local_path.mkdir(parents=True, exist_ok=True) + + if progress_callback: + progress_callback(f"Downloading '{repo_id}' from HuggingFace...") + + result = subprocess.run( + [ + converter, + "--model", repo_id, + "--output_dir", str(local_path), + "--quantization", "int8", + "--force", + ], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + shutil.rmtree(local_path, ignore_errors=True) + stderr_tail = result.stderr[-600:] if result.stderr else "(no output)" + raise RuntimeError( + f"CTranslate2 conversion failed for '{repo_id}':\n{stderr_tail}\n\n" + "Make sure torch is installed: pip install torch" + ) + + log.info("HF model '%s' converted successfully to %s", repo_id, local_path) + + if progress_callback: + progress_callback(f"Model '{name}' ready!") + + def download_model( name: str, progress_callback: callable | None = None, ) -> str: - """Download a model if not cached. Returns the model size string for faster-whisper. + """Download (and convert if needed) a model. Returns model path/name for WhisperModel(). Args: - name: Model name (e.g., 'turbo', 'small.en'). + name: Model name from MODEL_TABLE. progress_callback: Optional callable(status_text) for progress updates. Returns: - The model name/path string to pass to WhisperModel(). + The model name or local path string to pass to WhisperModel(). """ if name not in MODEL_TABLE: raise ValueError(f"Unknown model: {name}") + if is_hf_custom_model(name): + if not is_model_downloaded(name): + if progress_callback: + progress_callback(f"Converting '{name}' to CTranslate2 format (this may take several minutes)...") + _download_and_convert_hf_model(name, progress_callback) + elif progress_callback: + progress_callback(f"Model '{name}' already converted.") + return str(get_hf_model_local_path(name)) + + # Standard faster-whisper model if progress_callback: progress_callback(f"Preparing model '{name}'...") model_dir = get_models_dir() log.info("Downloading/loading model '%s' to %s", name, model_dir) - # faster-whisper downloads models from Hugging Face on first use. - # We trigger this by importing and constructing the model. - # The download_root parameter controls where models are cached. from faster_whisper import WhisperModel if progress_callback: progress_callback(f"Downloading '{name}' (this may take a few minutes)...") - # This will download if not cached _model = WhisperModel( name, device="cpu", compute_type="int8", download_root=str(model_dir), ) - del _model # Free memory; the real model will be loaded by the engine + del _model if progress_callback: progress_callback(f"Model '{name}' ready!") diff --git a/src/hearsay/ui/settings_window.py b/src/hearsay/ui/settings_window.py index a7f386b..a4327ff 100644 --- a/src/hearsay/ui/settings_window.py +++ b/src/hearsay/ui/settings_window.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +import threading from tkinter import filedialog import customtkinter as ctk @@ -15,6 +16,11 @@ AUDIO_SOURCE_SYSTEM, MODEL_TABLE, ) +from hearsay.transcription.model_manager import ( + download_model, + is_hf_custom_model, + is_model_downloaded, +) log = logging.getLogger(__name__) @@ -30,6 +36,7 @@ def __init__(self, master: ctk.CTk, config_manager: ConfigManager) -> None: self._config_manager = config_manager self._config = config_manager.config + self._dl_frame: ctk.CTkFrame | None = None self._build_ui() self.grab_set() @@ -70,9 +77,16 @@ def _build_ui(self) -> None: variable=self._model_var, values=list(MODEL_TABLE.keys()), width=200, + command=self._on_model_changed, ) self._model_menu.pack(anchor="w", padx=15) + self._model_hint = ctk.CTkLabel( + scroll, text="", font=("Segoe UI", 10), text_color="gray" + ) + self._model_hint.pack(anchor="w", padx=15) + self._update_model_hint(self._config.model_name) + # ── Compute Type ── ctk.CTkLabel(scroll, text="Compute Type", font=("Segoe UI", 14, "bold")).pack( anchor="w", pady=(15, 5) @@ -106,7 +120,7 @@ def _build_ui(self) -> None: self._lang_entry = ctk.CTkEntry(scroll, textvariable=self._lang_var, width=100) self._lang_entry.pack(anchor="w", padx=15) ctk.CTkLabel( - scroll, text="ISO 639-1 code (e.g., en, es, fr) or empty for auto-detect", + scroll, text="ISO 639-1 code (e.g., en, ko, fr) or empty for auto-detect", font=("Segoe UI", 10), text_color="gray" ).pack(anchor="w", padx=15) @@ -132,16 +146,32 @@ def _build_ui(self) -> None: ).pack(side="left") # ── Buttons ── - btn_frame = ctk.CTkFrame(self) - btn_frame.pack(fill="x", padx=20, pady=(0, 15)) + self._btn_frame = ctk.CTkFrame(self) + self._btn_frame.pack(fill="x", padx=20, pady=(0, 15)) - ctk.CTkButton( - btn_frame, text="Save", width=100, command=self._save - ).pack(side="right", padx=5) - ctk.CTkButton( - btn_frame, text="Cancel", width=100, fg_color="gray", + self._save_btn = ctk.CTkButton( + self._btn_frame, text="Save", width=100, command=self._save + ) + self._save_btn.pack(side="right", padx=5) + self._cancel_btn = ctk.CTkButton( + self._btn_frame, text="Cancel", width=100, fg_color="gray", command=self._cancel - ).pack(side="right", padx=5) + ) + self._cancel_btn.pack(side="right", padx=5) + + def _on_model_changed(self, name: str) -> None: + self._update_model_hint(name) + + def _update_model_hint(self, name: str) -> None: + if is_hf_custom_model(name): + if is_model_downloaded(name): + self._model_hint.configure(text="Korean model (converted, ready)", text_color="green") + else: + self._model_hint.configure( + text="Korean model — will download & convert on Save", text_color="#e07800" + ) + else: + self._model_hint.configure(text="") def _browse(self) -> None: path = filedialog.askdirectory( @@ -152,6 +182,13 @@ def _browse(self) -> None: self._dir_var.set(path) def _save(self) -> None: + new_model = self._model_var.get() + if is_hf_custom_model(new_model) and not is_model_downloaded(new_model): + self._start_download(new_model) + return + self._apply_and_close() + + def _apply_and_close(self) -> None: self._config.audio_source = self._source_var.get() self._config.model_name = self._model_var.get() self._config.compute_type = self._compute_var.get() @@ -164,6 +201,68 @@ def _save(self) -> None: self.grab_release() self.destroy() + def _start_download(self, model_name: str) -> None: + """Expand window, show progress, and download + convert the model.""" + self.geometry("550x640") + + self._save_btn.configure(state="disabled") + self._cancel_btn.configure(state="disabled") + + if self._dl_frame: + self._dl_frame.destroy() + + self._dl_frame = ctk.CTkFrame(self) + self._dl_frame.pack(fill="x", padx=20, pady=(0, 10)) + + ctk.CTkLabel( + self._dl_frame, + text=f"Downloading model '{model_name}'", + font=("Segoe UI", 13, "bold"), + ).pack(pady=(10, 2)) + + self._dl_status = ctk.CTkLabel( + self._dl_frame, + text="Starting...", + font=("Segoe UI", 11), + text_color="gray", + ) + self._dl_status.pack(pady=4) + + self._dl_bar = ctk.CTkProgressBar(self._dl_frame, width=460) + self._dl_bar.pack(pady=(4, 10)) + self._dl_bar.configure(mode="indeterminate") + self._dl_bar.start() + + threading.Thread( + target=self._download_bg, args=(model_name,), daemon=True + ).start() + + def _download_bg(self, model_name: str) -> None: + def set_status(text: str) -> None: + self.after(0, lambda: self._dl_status.configure(text=text)) + + try: + download_model(model_name, progress_callback=set_status) + self.after(0, self._download_complete) + except Exception as exc: + log.error("Model download/conversion failed", exc_info=True) + self.after(0, lambda: self._download_failed(str(exc))) + + def _download_complete(self) -> None: + self._dl_bar.stop() + self._dl_bar.set(1) + self._dl_bar.configure(mode="determinate") + self._dl_status.configure(text="Done! Saving settings...", text_color="green") + self.after(600, self._apply_and_close) + + def _download_failed(self, error: str) -> None: + self._dl_bar.stop() + self._dl_bar.set(0) + short_error = error.splitlines()[0][:80] + self._dl_status.configure(text=f"Error: {short_error}", text_color="red") + self._save_btn.configure(state="normal") + self._cancel_btn.configure(state="normal") + def _cancel(self) -> None: self.grab_release() self.destroy() From 6b21ed99e11a943b408d54e7540ae438f92ded97 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 31 May 2026 09:13:36 +0900 Subject: [PATCH 09/17] v1.0.4: Variable-length audio chunking with VAD silence detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace fixed 30s chunks with adaptive silence-based cuts: - Add _ChunkAccumulator: buffers audio, cuts when >=5s buffered and 1s trailing silence detected, or unconditionally at 30s hard cap (Whisper context window) - Each chunk carries absolute start_time so timestamps remain accurate across variable-length chunks — eliminates chunk_index*30 drift from overlap - Both-mode silence detection combined across loopback+mic (cut only when both quiet) - Pipeline and MarkdownWriter updated to consume (chunk_index, start_time, audio) tuples - First transcription text now appears in ~10s vs ~42s with the old fixed 30s chunks Co-Authored-By: Claude Sonnet 4.6 --- src/hearsay/app.py | 4 +- src/hearsay/audio/recorder.py | 259 ++++++++++++++++---------- src/hearsay/constants.py | 11 +- src/hearsay/output/markdown_writer.py | 2 +- src/hearsay/transcription/engine.py | 4 + src/hearsay/transcription/pipeline.py | 16 +- 6 files changed, 182 insertions(+), 114 deletions(-) diff --git a/src/hearsay/app.py b/src/hearsay/app.py index 2199bd6..0c9adb9 100644 --- a/src/hearsay/app.py +++ b/src/hearsay/app.py @@ -260,7 +260,7 @@ def _teardown_recording( for seg in result.segments: from hearsay.output.formatter import format_timestamp ts = format_timestamp( - result.chunk_index * 30 + seg["start"] + result.start_time + seg["start"] ) safe_after(self._root, 0, lambda t=f"[{ts}] {seg['text']}": ( @@ -313,7 +313,7 @@ def _poll_transcripts(self) -> None: for seg in result.segments: from hearsay.output.formatter import format_timestamp ts = format_timestamp( - result.chunk_index * 30 + seg["start"] + result.start_time + seg["start"] ) self._live_view.append_text(f"[{ts}] {seg['text']}") except queue.Empty: diff --git a/src/hearsay/audio/recorder.py b/src/hearsay/audio/recorder.py index 845b31c..bdcc3a3 100644 --- a/src/hearsay/audio/recorder.py +++ b/src/hearsay/audio/recorder.py @@ -4,7 +4,6 @@ import logging import queue -import time import numpy as np @@ -14,20 +13,121 @@ AUDIO_SOURCE_BOTH, AUDIO_SOURCE_MIC, AUDIO_SOURCE_SYSTEM, - CHUNK_DURATION_S, + MAX_CHUNK_DURATION_S, + MIN_CHUNK_DURATION_S, OVERLAP_DURATION_S, SAMPLE_RATE, + SILENCE_DURATION_S, + SILENCE_RMS_THRESHOLD, ) from hearsay.utils.threading_utils import StoppableThread log = logging.getLogger(__name__) +class _ChunkAccumulator: + """Accumulates mono 16 kHz float32 audio and decides chunk boundaries. + + A chunk becomes ready when either: + * the buffer reaches ``MAX_CHUNK_DURATION_S`` (hard cap), or + * at least ``MIN_CHUNK_DURATION_S`` has accumulated AND the trailing + ``SILENCE_DURATION_S`` of audio is near-silent. + + Consecutive chunks share ``OVERLAP_DURATION_S`` of audio so the + transcription pipeline can stitch words across boundaries. Each emitted + chunk carries its absolute start time (seconds from the start of the + recording), so downstream timestamps stay correct despite variable lengths. + """ + + def __init__(self) -> None: + self._buffer: list[np.ndarray] = [] + self._total = 0 # samples currently buffered + self._silence_run = 0 # consecutive trailing near-silent samples + self._start_sample = 0 # absolute index of buffer[0] in the recording + self.chunk_index = 0 + + self._min = int(MIN_CHUNK_DURATION_S * SAMPLE_RATE) + self._max = int(MAX_CHUNK_DURATION_S * SAMPLE_RATE) + self._silence_needed = int(SILENCE_DURATION_S * SAMPLE_RATE) + self._overlap = int(OVERLAP_DURATION_S * SAMPLE_RATE) + + def add(self, mono: np.ndarray, silent: bool | None = None) -> None: + """Append a mono frame, updating the trailing-silence run. + + If *silent* is None, silence is computed from this frame's RMS. + Callers mixing multiple sources (Both mode) pass an explicit flag. + """ + if mono is None or len(mono) == 0: + return + self._buffer.append(mono) + self._total += len(mono) + + if silent is None: + rms = float(np.sqrt(np.mean(mono ** 2))) + silent = rms < SILENCE_RMS_THRESHOLD + + if silent: + self._silence_run += len(mono) + else: + self._silence_run = 0 + + def ready(self) -> bool: + """True when the current buffer should be emitted as a chunk.""" + if self._total >= self._max: + return True + return self._total >= self._min and self._silence_run >= self._silence_needed + + def pop(self) -> tuple[int, float, np.ndarray]: + """Emit a chunk and retain the overlap tail. Returns (index, start_s, audio).""" + data = np.concatenate(self._buffer) + emitted_len = min(len(data), self._max) + chunk = data[:emitted_len] + start_time = self._start_sample / SAMPLE_RATE + idx = self.chunk_index + + # Advance by the unique (non-overlapping) audio we just consumed. + advance = max(0, emitted_len - self._overlap) + self._start_sample += advance + + if self._overlap > 0: + leftover = data[emitted_len - self._overlap:] + else: + leftover = data[emitted_len:] + self._buffer = [leftover] if len(leftover) else [] + self._total = int(len(leftover)) + self._silence_run = 0 + self.chunk_index += 1 + return idx, start_time, chunk + + def flush(self) -> tuple[int, float, np.ndarray] | None: + """Emit whatever remains (if > 1s) when recording stops.""" + if self._total <= SAMPLE_RATE: # less than 1 second — discard + return None + data = np.concatenate(self._buffer) + start_time = self._start_sample / SAMPLE_RATE + idx = self.chunk_index + self._buffer = [] + self._total = 0 + self.chunk_index += 1 + return idx, start_time, data + + +def _rms(mono: np.ndarray) -> float: + """Root-mean-square level of a mono float32 frame.""" + if mono is None or len(mono) == 0: + return 0.0 + return float(np.sqrt(np.mean(mono ** 2))) + + class AudioRecorder(StoppableThread): - """Record audio and push 30-second chunks to a queue. + """Record audio and push variable-length chunks to a queue. + + Each queue item is a ``(chunk_index, start_time_s, np.ndarray)`` tuple, + where ``start_time_s`` is the chunk's absolute offset from the start of the + recording. Args: - audio_queue: Queue to push (chunk_index, np.ndarray) tuples. + audio_queue: Queue to push chunks to. source: One of 'system', 'microphone', 'both'. loopback_device_index: PyAudioWPatch device index for loopback. mic_device_index: sounddevice device index for mic. @@ -108,32 +208,16 @@ def _record_mic(self) -> None: """Record microphone via sounddevice.""" import sounddevice as sd - buffer: list[np.ndarray] = [] - chunk_samples = int(CHUNK_DURATION_S * SAMPLE_RATE) - overlap_samples = int(OVERLAP_DURATION_S * SAMPLE_RATE) - chunk_index = 0 + acc = _ChunkAccumulator() def callback(indata: np.ndarray, frames: int, time_info: object, status: object) -> None: - nonlocal chunk_index mono = resample(indata.copy(), self.mic_rate, self.mic_channels) - buffer.append(mono) - - total = sum(len(b) for b in buffer) - if total >= chunk_samples: - chunk = np.concatenate(buffer)[:chunk_samples] - self.audio_queue.put((chunk_index, chunk)) - chunk_index += 1 - # Keep overlap - if overlap_samples > 0: - leftover = np.concatenate(buffer)[chunk_samples - overlap_samples:] - buffer.clear() - buffer.append(leftover) - else: - buffer.clear() - - device = self.mic_device_index + acc.add(mono) + if acc.ready(): + self.audio_queue.put(acc.pop()) + with sd.InputStream( - device=device, + device=self.mic_device_index, samplerate=self.mic_rate, channels=self.mic_channels, dtype="float32", @@ -142,11 +226,9 @@ def callback(indata: np.ndarray, frames: int, time_info: object, status: object) while not self.stopped(): self.wait(timeout=0.5) - # Flush remaining audio - if buffer: - chunk = np.concatenate(buffer) - if len(chunk) > SAMPLE_RATE: # Only if > 1 second - self.audio_queue.put((chunk_index, chunk)) + final = acc.flush() + if final is not None: + self.audio_queue.put(final) def _record_both(self) -> None: """Record both loopback and mic, mix them. @@ -156,7 +238,8 @@ def _record_both(self) -> None: occurs when PyAudioWPatch and sounddevice run on the same thread. The mic stream uses PyAudio's callback mode so it accumulates data asynchronously while the main loop drives off blocking loopback - reads. + reads. Chunk boundaries are decided on the *combined* activity, so a + chunk is only cut when both sources fall silent. """ import pyaudiowpatch as pyaudio @@ -230,10 +313,15 @@ def mic_callback(in_data, frame_count, time_info, status_flags): mic_stream.start_stream() # --- Main loop (driven by blocking loopback reads) --- - chunk_samples = int(CHUNK_DURATION_S * SAMPLE_RATE) - overlap_samples = int(OVERLAP_DURATION_S * SAMPLE_RATE) - loopback_buf: list[np.ndarray] = [] - chunk_index = 0 + acc = _ChunkAccumulator() + + def mix_with_mic(lb_chunk: np.ndarray) -> np.ndarray: + if not mic_buffer: + return lb_chunk + mic_chunk = np.concatenate(mic_buffer)[:len(lb_chunk)] + if len(mic_chunk) < len(lb_chunk): + mic_chunk = np.pad(mic_chunk, (0, len(lb_chunk) - len(mic_chunk))) + return mix_streams(lb_chunk, mic_chunk) while not self.stopped(): try: @@ -241,49 +329,24 @@ def mic_callback(in_data, frame_count, time_info, status_flags): except Exception: break audio = np.frombuffer(raw, dtype=np.int16) - mono = resample(audio, self.loopback_rate, self.loopback_channels) - loopback_buf.append(mono) - - total = sum(len(b) for b in loopback_buf) - if total >= chunk_samples: - lb_chunk = np.concatenate(loopback_buf)[:chunk_samples] - mic_samples = sum(len(b) for b in mic_buffer) - log.debug( - "Mixing chunk %d: loopback=%d mic=%d samples", - chunk_index, len(lb_chunk), mic_samples, - ) - - if mic_buffer: - mic_chunk = np.concatenate(mic_buffer)[:chunk_samples] - if len(mic_chunk) < chunk_samples: - mic_chunk = np.pad(mic_chunk, (0, chunk_samples - len(mic_chunk))) - mixed = mix_streams(lb_chunk, mic_chunk) - else: - mixed = lb_chunk - - self.audio_queue.put((chunk_index, mixed)) - chunk_index += 1 - - if overlap_samples > 0: - leftover = np.concatenate(loopback_buf)[chunk_samples - overlap_samples:] - loopback_buf.clear() - loopback_buf.append(leftover) - else: - loopback_buf.clear() + lb_mono = resample(audio, self.loopback_rate, self.loopback_channels) + + # Combined silence: silent only when both sources are quiet. + # The latest mic frame approximates current mic activity. + mic_silent = _rms(mic_buffer[-1]) < SILENCE_RMS_THRESHOLD if mic_buffer else True + silent = (_rms(lb_mono) < SILENCE_RMS_THRESHOLD) and mic_silent + + acc.add(lb_mono, silent=silent) + if acc.ready(): + idx, start_time, lb_chunk = acc.pop() + self.audio_queue.put((idx, start_time, mix_with_mic(lb_chunk))) mic_buffer.clear() # --- Flush remaining audio --- - if loopback_buf: - lb_chunk = np.concatenate(loopback_buf) - if len(lb_chunk) > SAMPLE_RATE: # Only if > 1 second - if mic_buffer: - mic_chunk = np.concatenate(mic_buffer)[:len(lb_chunk)] - if len(mic_chunk) < len(lb_chunk): - mic_chunk = np.pad(mic_chunk, (0, len(lb_chunk) - len(mic_chunk))) - mixed = mix_streams(lb_chunk, mic_chunk) - else: - mixed = lb_chunk - self.audio_queue.put((chunk_index, mixed)) + final = acc.flush() + if final is not None: + idx, start_time, lb_chunk = final + self.audio_queue.put((idx, start_time, mix_with_mic(lb_chunk))) mic_stream.stop_stream() mic_stream.close() @@ -298,11 +361,8 @@ def _chunk_loop( sr: int, channels: int, ) -> None: - """Generic chunking loop for loopback-style streams.""" - chunk_samples = int(CHUNK_DURATION_S * SAMPLE_RATE) - overlap_samples = int(OVERLAP_DURATION_S * SAMPLE_RATE) - buffer: list[np.ndarray] = [] - chunk_index = 0 + """Generic chunking loop for loopback-style (blocking-read) streams.""" + acc = _ChunkAccumulator() while not self.stopped(): try: @@ -311,25 +371,18 @@ def _chunk_loop( break audio = np.frombuffer(raw, dtype=np.int16) mono = resample(audio, sr, channels) - buffer.append(mono) - - total = sum(len(b) for b in buffer) - if total >= chunk_samples: - chunk = np.concatenate(buffer)[:chunk_samples] - self.audio_queue.put((chunk_index, chunk)) - chunk_index += 1 - log.debug("Audio chunk %d queued (%d samples)", chunk_index - 1, len(chunk)) - - if overlap_samples > 0: - leftover = np.concatenate(buffer)[chunk_samples - overlap_samples:] - buffer.clear() - buffer.append(leftover) - else: - buffer.clear() - - # Flush remaining audio - if buffer: - chunk = np.concatenate(buffer) - if len(chunk) > SAMPLE_RATE: # Only if > 1 second - self.audio_queue.put((chunk_index, chunk)) - log.debug("Final audio chunk %d queued (%d samples)", chunk_index, len(chunk)) + acc.add(mono) + + if acc.ready(): + idx, start_time, chunk = acc.pop() + self.audio_queue.put((idx, start_time, chunk)) + log.debug( + "Audio chunk %d queued (%d samples, t=%.1fs)", + idx, len(chunk), start_time, + ) + + final = acc.flush() + if final is not None: + idx, start_time, chunk = final + self.audio_queue.put((idx, start_time, chunk)) + log.debug("Final audio chunk %d queued (%d samples)", idx, len(chunk)) diff --git a/src/hearsay/constants.py b/src/hearsay/constants.py index 302903b..2a6dfd0 100644 --- a/src/hearsay/constants.py +++ b/src/hearsay/constants.py @@ -7,8 +7,15 @@ # Audio settings SAMPLE_RATE = 16000 # Whisper expects 16kHz CHANNELS = 1 # Whisper expects mono -CHUNK_DURATION_S = 30 # Whisper's native context window -OVERLAP_DURATION_S = 1 # Overlap between chunks to prevent word splitting +# Variable-length chunking driven by trailing-silence detection. +# A chunk is cut once at least MIN_CHUNK_DURATION_S has accumulated AND the +# trailing SILENCE_DURATION_S of audio is near-silent — or unconditionally once +# MAX_CHUNK_DURATION_S (Whisper's native context window) is reached. +MIN_CHUNK_DURATION_S = 5 # Minimum audio buffered before an early (silence) cut +MAX_CHUNK_DURATION_S = 30 # Hard cap — Whisper's native context window +SILENCE_DURATION_S = 1.0 # Trailing near-silence (seconds) that triggers a cut +SILENCE_RMS_THRESHOLD = 0.01 # RMS on [-1, 1] float audio below which ≈ silence +OVERLAP_DURATION_S = 1 # Overlap between chunks to prevent word splitting AUDIO_DTYPE = "float32" # Custom HuggingFace models: short name -> {repo_id, parameters, vram_gb, english_only} diff --git a/src/hearsay/output/markdown_writer.py b/src/hearsay/output/markdown_writer.py index 912585a..4f13afa 100644 --- a/src/hearsay/output/markdown_writer.py +++ b/src/hearsay/output/markdown_writer.py @@ -50,7 +50,7 @@ def append(self, result: TranscriptionResult) -> None: self._append_fallback(result) return - chunk_offset = result.chunk_index * 30 # seconds offset for this chunk + chunk_offset = result.start_time # absolute seconds offset for this chunk pieces: list[str] = [] for seg in result.segments: diff --git a/src/hearsay/transcription/engine.py b/src/hearsay/transcription/engine.py index 3589932..06c408e 100644 --- a/src/hearsay/transcription/engine.py +++ b/src/hearsay/transcription/engine.py @@ -25,6 +25,7 @@ class TranscriptionResult: language: str language_probability: float chunk_index: int + start_time: float = 0.0 # absolute offset (s) of this chunk from recording start class TranscriptionEngine: @@ -74,12 +75,14 @@ def transcribe( self, audio: np.ndarray, chunk_index: int = 0, + start_time: float = 0.0, ) -> TranscriptionResult: """Transcribe a float32 16kHz mono audio array. Args: audio: Audio data as float32 numpy array at 16kHz. chunk_index: Index of this chunk (for ordering). + start_time: Absolute offset (s) of this chunk from recording start. Returns: TranscriptionResult with text and segment details. @@ -121,6 +124,7 @@ def transcribe( language=info.language, language_probability=info.language_probability, chunk_index=chunk_index, + start_time=start_time, ) def unload(self) -> None: diff --git a/src/hearsay/transcription/pipeline.py b/src/hearsay/transcription/pipeline.py index 7f96ced..06e6a2f 100644 --- a/src/hearsay/transcription/pipeline.py +++ b/src/hearsay/transcription/pipeline.py @@ -42,10 +42,10 @@ def run(self) -> None: log.info("TranscriptionPipeline started") while not self.stopped(): try: - chunk_index, audio = self.audio_queue.get(timeout=1.0) + chunk_index, start_time, audio = self.audio_queue.get(timeout=1.0) except queue.Empty: continue - self._process_chunk(chunk_index, audio) + self._process_chunk(chunk_index, start_time, audio) # Drain any audio chunks still in the queue after stop signal. # The recorder flushes its buffer before exiting, so these chunks @@ -53,18 +53,20 @@ def run(self) -> None: log.info("TranscriptionPipeline draining remaining audio chunks") while True: try: - chunk_index, audio = self.audio_queue.get_nowait() + chunk_index, start_time, audio = self.audio_queue.get_nowait() except queue.Empty: break - self._process_chunk(chunk_index, audio) + self._process_chunk(chunk_index, start_time, audio) log.info("TranscriptionPipeline stopped") - def _process_chunk(self, chunk_index: int, audio) -> None: + def _process_chunk(self, chunk_index: int, start_time: float, audio) -> None: """Transcribe a single audio chunk and enqueue the result.""" try: t0 = time.perf_counter() - result = self.engine.transcribe(audio, chunk_index=chunk_index) + result = self.engine.transcribe( + audio, chunk_index=chunk_index, start_time=start_time + ) elapsed = time.perf_counter() - t0 log.info( "Chunk %d transcribed in %.1fs: %s", @@ -125,6 +127,7 @@ def _deduplicate(self, result: TranscriptionResult) -> TranscriptionResult: language=result.language, language_probability=result.language_probability, chunk_index=result.chunk_index, + start_time=result.start_time, ) # Rebuild text and trim leading segments that were fully covered by the overlap. @@ -147,4 +150,5 @@ def _deduplicate(self, result: TranscriptionResult) -> TranscriptionResult: language=result.language, language_probability=result.language_probability, chunk_index=result.chunk_index, + start_time=result.start_time, ) From f8f08ee39fa24554af17bf9302f5f8aa70dc3672 Mon Sep 17 00:00:00 2001 From: hoiyada7-maker Date: Mon, 1 Jun 2026 16:41:19 +0900 Subject: [PATCH 10/17] feat: hotkey, beep notifications, and clipboard copy on save MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Settings window additions: - Hotkey: configurable global hotkey (default ctrl+alt+r) with live capture — click Capture then press any modifier+key combo - Beep notifications: three independent checkboxes for recording start, stop, and MD file save completion (winsound.Beep) - Clipboard: optional checkbox to copy full transcript body (no timestamps, no header/footer) to clipboard after MD save App wiring: - keyboard.add_hotkey registers/re-registers on startup, wizard complete, and settings save; unregistered on quit - Hotkey callback dispatches to main thread via safe_after so tkinter state is never touched from the keyboard thread - Beeps run in daemon threads to avoid blocking recording teardown - Clipboard extraction reads the finalized MD body between header and --- footer marker; written to tk clipboard on main thread Co-Authored-By: Claude Sonnet 4.6 --- requirements.txt | 1 + src/hearsay/app.py | 94 ++++++++++++++++++++++- src/hearsay/config.py | 11 +++ src/hearsay/ui/settings_window.py | 120 +++++++++++++++++++++++++++++- 4 files changed, 221 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6dee6b3..bacc068 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ Pillow>=10.0.0 nvidia-cublas-cu12>=12.0 nvidia-cuda-runtime-cu12>=12.0 transformers>=4.23.0 +keyboard>=0.13.5 diff --git a/src/hearsay/app.py b/src/hearsay/app.py index 0c9adb9..c850692 100644 --- a/src/hearsay/app.py +++ b/src/hearsay/app.py @@ -52,6 +52,7 @@ def __init__(self) -> None: self._recording = False self._recording_start_time: float | None = None self._teardown_thread: threading.Thread | None = None + self._hotkey_combo: str | None = None # UI apply_theme() @@ -83,6 +84,7 @@ def run(self) -> None: self._root.after(500, self._show_wizard) else: log.info("Config loaded, ready to record") + self._register_hotkey() # Start tkinter event loop self._root.mainloop() @@ -99,6 +101,7 @@ def _on_wizard_complete(self) -> None: """Called when the setup wizard finishes.""" self._config = self._config_manager.config log.info("Wizard complete, app ready") + self._register_hotkey() def _start_recording(self, source: str) -> None: """Start recording from the given source.""" @@ -178,7 +181,8 @@ def _on_recording_started(self) -> None: self._tray.set_recording(True) if self._live_view: self._live_view.set_status("Recording...") - # Start polling transcript queue + if self._config.beep_on_start: + threading.Thread(target=self._play_beep, args=("start",), daemon=True).start() self._poll_transcripts() def _stop_recording(self) -> None: @@ -194,6 +198,9 @@ def _stop_recording(self) -> None: log.info("Stopping recording") self._recording = False + if self._config.beep_on_stop: + threading.Thread(target=self._play_beep, args=("stop",), daemon=True).start() + # Update tray immediately so the menu is responsive if self._tray: self._tray.set_recording(False) @@ -286,6 +293,14 @@ def _teardown_recording( )) writer.post_process() + if self._config.beep_on_save: + self._play_beep("save") + + if self._config.copy_to_clipboard: + text = self._extract_clipboard_text(writer) + if text: + safe_after(self._root, 0, lambda t=text: self._copy_to_clipboard(t)) + # Insert session separator in live view end_time = time.strftime("%I:%M %p") safe_after(self._root, 0, lambda: ( @@ -338,9 +353,17 @@ def _open_settings(self) -> None: safe_after( self._root, 0, - lambda: SettingsWindow(self._root, self._config_manager), + lambda: SettingsWindow( + self._root, + self._config_manager, + on_save=self._on_settings_saved, + ), ) + def _on_settings_saved(self) -> None: + self._config = self._config_manager.config + self._register_hotkey() + def _open_about(self) -> None: """Open the about window.""" safe_after( @@ -414,6 +437,72 @@ def open_cuda_download() -> None: command=open_cuda_download, ).pack(side="left", padx=8) + # ── Hotkey ──────────────────────────────────────────────────────────────── + + def _register_hotkey(self) -> None: + try: + import keyboard as kb + self._unregister_hotkey() + combo = self._config.hotkey + if combo: + kb.add_hotkey(combo, self._toggle_recording_hotkey) + self._hotkey_combo = combo + log.info("Hotkey registered: %s", combo) + except Exception: + log.warning("Failed to register hotkey", exc_info=True) + + def _unregister_hotkey(self) -> None: + try: + import keyboard as kb + if self._hotkey_combo: + kb.remove_hotkey(self._hotkey_combo) + self._hotkey_combo = None + except Exception: + pass + + def _toggle_recording_hotkey(self) -> None: + """Called from the keyboard library thread — must dispatch to main thread.""" + if self._recording: + safe_after(self._root, 0, self._stop_recording) + else: + safe_after(self._root, 0, lambda: self._start_recording(self._config.audio_source)) + + # ── Beep ────────────────────────────────────────────────────────────────── + + def _play_beep(self, event: str) -> None: + try: + import winsound + if event == "start": + winsound.Beep(880, 120) + elif event == "stop": + winsound.Beep(520, 180) + elif event == "save": + winsound.Beep(660, 80) + winsound.Beep(880, 160) + except Exception: + pass + + # ── Clipboard ───────────────────────────────────────────────────────────── + + def _extract_clipboard_text(self, writer: MarkdownWriter) -> str: + try: + content = writer.file_path.read_text(encoding="utf-8") + header_end = content.index("\n\n") + 2 + footer_idx = content.rfind("\n---\n") + body = content[header_end:footer_idx] if footer_idx != -1 else content[header_end:] + return body.strip() + except Exception: + log.warning("Failed to extract clipboard text", exc_info=True) + return "" + + def _copy_to_clipboard(self, text: str) -> None: + try: + self._root.clipboard_clear() + self._root.clipboard_append(text) + log.info("Transcript copied to clipboard (%d chars)", len(text)) + except Exception: + log.warning("Failed to copy to clipboard", exc_info=True) + def _open_output_dir(self) -> None: """Open the output directory in file explorer.""" path = self._config.output_dir @@ -442,6 +531,7 @@ def _quit(self) -> None: self._teardown_thread.join(timeout=30) self._teardown_thread = None + self._unregister_hotkey() if self._tray: self._tray.stop() safe_after(self._root, 100, self._root.quit) diff --git a/src/hearsay/config.py b/src/hearsay/config.py index ea804c5..c0415a0 100644 --- a/src/hearsay/config.py +++ b/src/hearsay/config.py @@ -42,6 +42,17 @@ class AppConfig: # UI show_live_view_on_start: bool = False + # Hotkey + hotkey: str = "ctrl+alt+r" + + # Beep notifications + beep_on_start: bool = True + beep_on_stop: bool = True + beep_on_save: bool = True + + # Clipboard + copy_to_clipboard: bool = False + class ConfigManager: """Load and save AppConfig to JSON in %APPDATA%\\Hearsay.""" diff --git a/src/hearsay/ui/settings_window.py b/src/hearsay/ui/settings_window.py index a4327ff..2e6e090 100644 --- a/src/hearsay/ui/settings_window.py +++ b/src/hearsay/ui/settings_window.py @@ -28,15 +28,22 @@ class SettingsWindow(ctk.CTkToplevel): """Settings editor window.""" - def __init__(self, master: ctk.CTk, config_manager: ConfigManager) -> None: + def __init__( + self, + master: ctk.CTk, + config_manager: ConfigManager, + on_save: "Callable | None" = None, + ) -> None: super().__init__(master) self.title(f"{APP_NAME} Settings") - self.geometry("550x520") + self.geometry("550x620") self.resizable(False, False) self._config_manager = config_manager self._config = config_manager.config self._dl_frame: ctk.CTkFrame | None = None + self._on_save = on_save + self._capturing = False self._build_ui() self.grab_set() @@ -50,7 +57,7 @@ def _build_ui(self) -> None: ).pack(pady=(15, 10)) # Scrollable content - scroll = ctk.CTkScrollableFrame(self, width=490, height=360) + scroll = ctk.CTkScrollableFrame(self, width=490, height=460) scroll.pack(fill="both", expand=True, padx=20, pady=(0, 10)) # ── Audio Source ── @@ -145,6 +152,55 @@ def _build_ui(self) -> None: dir_frame, text="Browse", width=70, command=self._browse ).pack(side="left") + # ── Hotkey ── + ctk.CTkLabel(scroll, text="Recording Hotkey", font=("Segoe UI", 14, "bold")).pack( + anchor="w", pady=(15, 5) + ) + hotkey_frame = ctk.CTkFrame(scroll, fg_color="transparent") + hotkey_frame.pack(anchor="w", padx=15, fill="x") + + self._hotkey_var = ctk.StringVar(value=self._config.hotkey) + self._hotkey_entry = ctk.CTkEntry( + hotkey_frame, textvariable=self._hotkey_var, width=200, state="readonly" + ) + self._hotkey_entry.pack(side="left", padx=(0, 8)) + self._capture_btn = ctk.CTkButton( + hotkey_frame, text="Capture", width=80, command=self._start_capture + ) + self._capture_btn.pack(side="left") + ctk.CTkLabel( + scroll, text="Press Ctrl+Alt+R or any modifier+key combo", + font=("Segoe UI", 10), text_color="gray" + ).pack(anchor="w", padx=15) + + # ── Beep Notifications ── + ctk.CTkLabel(scroll, text="Beep Notifications", font=("Segoe UI", 14, "bold")).pack( + anchor="w", pady=(15, 5) + ) + self._beep_start_var = ctk.BooleanVar(value=self._config.beep_on_start) + self._beep_stop_var = ctk.BooleanVar(value=self._config.beep_on_stop) + self._beep_save_var = ctk.BooleanVar(value=self._config.beep_on_save) + ctk.CTkCheckBox( + scroll, text="녹음 시작 시 비프음", variable=self._beep_start_var + ).pack(anchor="w", padx=15, pady=2) + ctk.CTkCheckBox( + scroll, text="녹음 완료 시 비프음", variable=self._beep_stop_var + ).pack(anchor="w", padx=15, pady=2) + ctk.CTkCheckBox( + scroll, text="MD 파일 저장 완료 시 비프음", variable=self._beep_save_var + ).pack(anchor="w", padx=15, pady=2) + + # ── Clipboard ── + ctk.CTkLabel(scroll, text="Clipboard", font=("Segoe UI", 14, "bold")).pack( + anchor="w", pady=(15, 5) + ) + self._clipboard_var = ctk.BooleanVar(value=self._config.copy_to_clipboard) + ctk.CTkCheckBox( + scroll, + text="저장 완료 시 전체 텍스트를 클립보드에 복사", + variable=self._clipboard_var, + ).pack(anchor="w", padx=15, pady=2) + # ── Buttons ── self._btn_frame = ctk.CTkFrame(self) self._btn_frame.pack(fill="x", padx=20, pady=(0, 15)) @@ -159,6 +215,57 @@ def _build_ui(self) -> None: ) self._cancel_btn.pack(side="right", padx=5) + def _start_capture(self) -> None: + self._capturing = True + self._hotkey_entry.configure(state="normal") + self._hotkey_var.set("Press hotkey...") + self._hotkey_entry.configure(state="readonly") + self._capture_btn.configure(text="Cancel", command=self._cancel_capture) + self._hotkey_entry.focus_set() + self.bind("", self._on_key_capture) + + def _cancel_capture(self) -> None: + self._capturing = False + self.unbind("") + self._hotkey_entry.configure(state="normal") + self._hotkey_var.set(self._config.hotkey) + self._hotkey_entry.configure(state="readonly") + self._capture_btn.configure(text="Capture", command=self._start_capture) + + def _on_key_capture(self, event) -> str: + keysym = event.keysym.lower() + modifier_only = { + "control_l", "control_r", "alt_l", "alt_r", + "shift_l", "shift_r", "super_l", "super_r", + } + if keysym in modifier_only: + return "break" + if keysym == "escape": + self._cancel_capture() + return "break" + + parts = [] + if event.state & 0x4: # Ctrl + parts.append("ctrl") + if event.state & 0x1: # Shift + parts.append("shift") + if event.state & 0x20000: # Alt (Windows) + parts.append("alt") + + if not parts: + return "break" # require at least one modifier + + parts.append(keysym) + combo = "+".join(parts) + + self._capturing = False + self.unbind("") + self._hotkey_entry.configure(state="normal") + self._hotkey_var.set(combo) + self._hotkey_entry.configure(state="readonly") + self._capture_btn.configure(text="Capture", command=self._start_capture) + return "break" + def _on_model_changed(self, name: str) -> None: self._update_model_hint(name) @@ -196,10 +303,17 @@ def _apply_and_close(self) -> None: self._config.language = self._lang_var.get() self._config.vad_filter = self._vad_var.get() self._config.output_dir = self._dir_var.get() + self._config.hotkey = self._hotkey_var.get() + self._config.beep_on_start = self._beep_start_var.get() + self._config.beep_on_stop = self._beep_stop_var.get() + self._config.beep_on_save = self._beep_save_var.get() + self._config.copy_to_clipboard = self._clipboard_var.get() self._config_manager.save() log.info("Settings saved") self.grab_release() self.destroy() + if self._on_save: + self._on_save() def _start_download(self, model_name: str) -> None: """Expand window, show progress, and download + convert the model.""" From ca352c050b0e4070573796dcd6a42a046512c703 Mon Sep 17 00:00:00 2001 From: hoiyada7-maker Date: Mon, 1 Jun 2026 16:52:20 +0900 Subject: [PATCH 11/17] fix: add torch to requirements for HuggingFace model conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ct2-transformers-converter needs torch to load and convert HuggingFace Whisper models. torch is a one-time conversion dependency only — GPU inference continues to use ctranslate2 directly. Co-Authored-By: Claude Sonnet 4.6 --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index bacc068..41c4960 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,5 @@ Pillow>=10.0.0 nvidia-cublas-cu12>=12.0 nvidia-cuda-runtime-cu12>=12.0 transformers>=4.23.0 +torch>=2.0.0 keyboard>=0.13.5 From 7d85cd28ee0db4ff4b0d5a333497bd16518c6d5b Mon Sep 17 00:00:00 2001 From: hoiyada7-maker Date: Mon, 1 Jun 2026 17:36:55 +0900 Subject: [PATCH 12/17] fix: translate all Korean UI strings to English, fix exc scope bug, add transformers dep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - settings_window: translate beep/clipboard checkbox labels to English - app: translate CUDA error dialog title, body, and buttons to English - settings_window: fix NameError in _download_bg — capture str(exc) into lambda default arg before it goes out of scope (Python 3.12+ behavior) - requirements.txt: transformers was already listed; verified present Co-Authored-By: Claude Sonnet 4.6 --- src/hearsay/app.py | 14 +++++++------- src/hearsay/ui/settings_window.py | 10 +++++----- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/hearsay/app.py b/src/hearsay/app.py index c850692..0cb7dc9 100644 --- a/src/hearsay/app.py +++ b/src/hearsay/app.py @@ -385,7 +385,7 @@ def _handle_cuda_error(self, source: str) -> None: def _show_cuda_error_dialog(self, source: str) -> None: """Show a dialog offering CPU fallback or CUDA Toolkit install link.""" dialog = ctk.CTkToplevel(self._root) - dialog.title("GPU를 사용할 수 없습니다") + dialog.title("GPU Unavailable") dialog.resizable(False, False) dialog.grab_set() @@ -398,16 +398,16 @@ def _show_cuda_error_dialog(self, source: str) -> None: ctk.CTkLabel( dialog, - text="CUDA 런타임 라이브러리를 찾을 수 없습니다.", + text="CUDA runtime library not found.", font=ctk.CTkFont(size=14, weight="bold"), ).pack(pady=(20, 4)) ctk.CTkLabel( dialog, text=( - "GPU 설정이 선택되어 있지만 CUDA Toolkit 12.x가\n" - "설치되어 있지 않아 GPU로 실행할 수 없습니다.\n\n" - "계속하려면 CPU로 변경하거나 CUDA Toolkit을 설치하세요." + "GPU is selected but CUDA Toolkit 12.x is not installed,\n" + "so inference cannot run on GPU.\n\n" + "Switch to CPU or install CUDA Toolkit to continue." ), justify="center", ).pack(pady=(0, 16)) @@ -428,11 +428,11 @@ def open_cuda_download() -> None: webbrowser.open("https://developer.nvidia.com/cuda-downloads") ctk.CTkButton( - btn_frame, text="CPU로 변경", width=160, command=switch_to_cpu, + btn_frame, text="Switch to CPU", width=160, command=switch_to_cpu, ).pack(side="left", padx=8) ctk.CTkButton( - btn_frame, text="CUDA Toolkit 설치", width=160, + btn_frame, text="Install CUDA Toolkit", width=160, fg_color="transparent", border_width=1, command=open_cuda_download, ).pack(side="left", padx=8) diff --git a/src/hearsay/ui/settings_window.py b/src/hearsay/ui/settings_window.py index 2e6e090..cd0f5be 100644 --- a/src/hearsay/ui/settings_window.py +++ b/src/hearsay/ui/settings_window.py @@ -181,13 +181,13 @@ def _build_ui(self) -> None: self._beep_stop_var = ctk.BooleanVar(value=self._config.beep_on_stop) self._beep_save_var = ctk.BooleanVar(value=self._config.beep_on_save) ctk.CTkCheckBox( - scroll, text="녹음 시작 시 비프음", variable=self._beep_start_var + scroll, text="Beep on recording start", variable=self._beep_start_var ).pack(anchor="w", padx=15, pady=2) ctk.CTkCheckBox( - scroll, text="녹음 완료 시 비프음", variable=self._beep_stop_var + scroll, text="Beep on recording stop", variable=self._beep_stop_var ).pack(anchor="w", padx=15, pady=2) ctk.CTkCheckBox( - scroll, text="MD 파일 저장 완료 시 비프음", variable=self._beep_save_var + scroll, text="Beep on transcript save", variable=self._beep_save_var ).pack(anchor="w", padx=15, pady=2) # ── Clipboard ── @@ -197,7 +197,7 @@ def _build_ui(self) -> None: self._clipboard_var = ctk.BooleanVar(value=self._config.copy_to_clipboard) ctk.CTkCheckBox( scroll, - text="저장 완료 시 전체 텍스트를 클립보드에 복사", + text="Copy transcript to clipboard on save", variable=self._clipboard_var, ).pack(anchor="w", padx=15, pady=2) @@ -360,7 +360,7 @@ def set_status(text: str) -> None: self.after(0, self._download_complete) except Exception as exc: log.error("Model download/conversion failed", exc_info=True) - self.after(0, lambda: self._download_failed(str(exc))) + self.after(0, lambda msg=str(exc): self._download_failed(msg)) def _download_complete(self) -> None: self._dl_bar.stop() From 353a61bcda2132d3a60fc93232d66f54628aa67f Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 3 Jun 2026 16:52:39 +0900 Subject: [PATCH 13/17] Add GitHub Actions release workflow Automates build and release on v*.*.* tag push: - PyInstaller onedir build via build.bat - Inno Setup installer compiled via choco-installed ISCC - installer.iss AppVersion auto-updated from the pushed tag - HearsaySetup.exe attached to GitHub Release with auto-generated notes Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/release.yml | 50 +++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 .github/workflows/release.yml diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..4055f54 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,50 @@ +name: Release + +on: + push: + tags: + - 'v*.*.*' + +permissions: + contents: write + +jobs: + build-and-release: + runs-on: windows-latest + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: '3.11' + cache: pip + + - name: Install dependencies + run: pip install -r requirements.txt pyinstaller + + - name: Update installer version + shell: pwsh + run: | + $version = "${{ github.ref_name }}".TrimStart("v") + (Get-Content installer.iss) -replace 'AppVersion=.*', "AppVersion=$version" | Set-Content installer.iss + + - name: Build with PyInstaller + shell: cmd + run: build.bat + + - name: Install Inno Setup + shell: pwsh + run: choco install innosetup --yes --no-progress + + - name: Build installer + shell: pwsh + run: '& "C:\Program Files (x86)\Inno Setup 6\ISCC.exe" installer.iss' + + - name: Create GitHub Release + uses: softprops/action-gh-release@v2 + with: + files: installer_output/HearsaySetup.exe + generate_release_notes: true From 5c898c1ffaa56d961f7c7e90b6eaac88d28594eb Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 3 Jun 2026 21:37:05 +0900 Subject: [PATCH 14/17] fix: hotkey live reload, deferred model download, recording-locked warning, timestamped transcript - Hotkey re-registers immediately on settings save (was already wired, now also passes is_recording callback to settings window) - Korean HF model download deferred from settings save to recording start: settings save no longer blocks on download; download runs in ModelLoader thread with live-view progress updates; error dialog + state reset on failure - Settings window warns when recording-locked settings (Model, Device, Compute Type, Language, VAD Filter, Audio Source, Output Directory) are changed while a recording is active - MarkdownWriter now writes [M:SS] timestamped lines matching the live transcript view; post_process cleans filler/duplicates per line while preserving timestamps Co-Authored-By: Claude Sonnet 4.6 --- src/hearsay/app.py | 35 ++++++++++ src/hearsay/output/markdown_writer.py | 60 +++++++---------- src/hearsay/ui/settings_window.py | 93 +++++++-------------------- 3 files changed, 82 insertions(+), 106 deletions(-) diff --git a/src/hearsay/app.py b/src/hearsay/app.py index 0cb7dc9..2137ca1 100644 --- a/src/hearsay/app.py +++ b/src/hearsay/app.py @@ -143,6 +143,24 @@ def load_and_start() -> None: except queue.Empty: break + # Download HF model on-demand (deferred from settings save) + from hearsay.transcription.model_manager import ( + download_model, is_hf_custom_model, is_model_downloaded, + ) + if (is_hf_custom_model(self._engine.model_name) + and not is_model_downloaded(self._engine.model_name)): + safe_after(self._root, 0, lambda: self._ensure_live_view().set_status("Downloading model...")) + try: + def _dl_progress(msg: str) -> None: + safe_after(self._root, 0, + lambda m=msg: self._ensure_live_view().set_status(f"Downloading: {m}")) + download_model(self._engine.model_name, progress_callback=_dl_progress) + except Exception as exc: + log.error("Model download failed at recording start", exc_info=True) + safe_after(self._root, 0, lambda e=str(exc): self._on_model_download_failed(e)) + return + safe_after(self._root, 0, lambda: self._ensure_live_view().set_status("Loading model...")) + try: self._engine.load() except CudaUnavailableError: @@ -357,6 +375,7 @@ def _open_settings(self) -> None: self._root, self._config_manager, on_save=self._on_settings_saved, + is_recording=lambda: self._recording, ), ) @@ -372,6 +391,22 @@ def _open_about(self) -> None: lambda: AboutWindow(self._root), ) + def _on_model_download_failed(self, error: str) -> None: + """Called on main thread when model download fails at recording start.""" + self._recording = False + self._engine = None + if self._tray: + self._tray.set_recording(False) + if self._live_view: + self._live_view.set_status("Download failed") + from tkinter import messagebox + messagebox.showerror( + "Model Download Failed", + "Failed to download the selected model. Check your internet connection " + "or select a different model in Settings.\n\n" + error[:200], + parent=self._root, + ) + def _handle_cuda_error(self, source: str) -> None: """Called on main thread when CUDA runtime DLLs are missing.""" self._recording = False diff --git a/src/hearsay/output/markdown_writer.py b/src/hearsay/output/markdown_writer.py index 4f13afa..d5d4681 100644 --- a/src/hearsay/output/markdown_writer.py +++ b/src/hearsay/output/markdown_writer.py @@ -3,17 +3,17 @@ from __future__ import annotations import logging +import re from datetime import datetime from pathlib import Path -from hearsay.constants import PARAGRAPH_GAP_S -from hearsay.output.formatter import clean_transcript_text, make_title +from hearsay.output.formatter import clean_transcript_text, format_timestamp, make_title from hearsay.transcription.engine import TranscriptionResult log = logging.getLogger(__name__) -# Markers used to split header / body / footer for post-processing _FOOTER_MARKER = "\n---\n" +_TS_LINE_RE = re.compile(r"^(\[\d+:\d+(?::\d+)?\] )(.+)$") class MarkdownWriter: @@ -27,20 +27,16 @@ def __init__(self, output_dir: str | Path, title: str | None = None) -> None: timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") self.file_path = self.output_dir / f"transcript_{timestamp}.md" self._header_written = False - - # Track absolute timing across chunks for gap-based paragraph breaks - self._last_segment_end: float | None = None self._language: str = "en" def _write_header(self) -> None: - """Write the markdown header on first call.""" with open(self.file_path, "w", encoding="utf-8") as f: f.write(f"# {self.title}\n\n") self._header_written = True log.info("Transcript file created: %s", self.file_path) def append(self, result: TranscriptionResult) -> None: - """Append a transcription result using segment-level gap detection.""" + """Append a transcription result as timestamped lines matching the live view.""" if not self._header_written: self._write_header() @@ -50,30 +46,18 @@ def append(self, result: TranscriptionResult) -> None: self._append_fallback(result) return - chunk_offset = result.start_time # absolute seconds offset for this chunk - pieces: list[str] = [] - + chunk_offset = result.start_time + lines: list[str] = [] for seg in result.segments: - seg_start = chunk_offset + seg["start"] seg_text = seg["text"].strip() if not seg_text: continue + ts = format_timestamp(chunk_offset + seg["start"]) + lines.append(f"[{ts}] {seg_text}\n") - # Determine separator: paragraph break on long gap, space otherwise - if self._last_segment_end is not None: - gap = seg_start - self._last_segment_end - if gap >= PARAGRAPH_GAP_S: - pieces.append("\n\n") - else: - pieces.append(" ") - # else: very first segment, no separator needed - - pieces.append(seg_text) - self._last_segment_end = chunk_offset + seg["end"] - - if pieces: + if lines: with open(self.file_path, "a", encoding="utf-8") as f: - f.write("".join(pieces)) + f.write("".join(lines)) log.debug("Appended chunk %d to %s", result.chunk_index, self.file_path) @@ -82,10 +66,9 @@ def _append_fallback(self, result: TranscriptionResult) -> None: text = result.text.strip() if not text: return + ts = format_timestamp(result.start_time) with open(self.file_path, "a", encoding="utf-8") as f: - if self._last_segment_end is not None: - f.write(" ") - f.write(text) + f.write(f"[{ts}] {text}\n") def finalize(self, total_duration: float | None = None) -> Path: """Write a footer and return the file path.""" @@ -93,7 +76,7 @@ def finalize(self, total_duration: float | None = None) -> Path: self._write_header() with open(self.file_path, "a", encoding="utf-8") as f: - f.write("\n\n---\n\n") + f.write("\n---\n\n") f.write(f"*Generated by Hearsay on {datetime.now():%Y-%m-%d at %H:%M}*\n") if total_duration: from hearsay.output.formatter import format_duration @@ -103,28 +86,33 @@ def finalize(self, total_duration: float | None = None) -> Path: return self.file_path def post_process(self) -> None: - """Read the finalized transcript, clean up the body, and rewrite.""" + """Clean up the text portion of each timestamped line, preserving timestamps.""" if not self.file_path.exists(): return content = self.file_path.read_text(encoding="utf-8") - - # Split into header, body, footer using the --- marker footer_idx = content.rfind(_FOOTER_MARKER) if footer_idx == -1: log.warning("No footer marker found, skipping post-processing") return - # Header ends at first double newline after the title line header_end = content.index("\n\n") + 2 header = content[:header_end] body = content[header_end:footer_idx] footer = content[footer_idx:] - cleaned = clean_transcript_text(body, language=self._language) + cleaned_lines: list[str] = [] + for line in body.splitlines(keepends=True): + m = _TS_LINE_RE.match(line.rstrip("\n")) + if m: + ts_prefix, text = m.group(1), m.group(2) + text = clean_transcript_text(text, language=self._language) + cleaned_lines.append(f"{ts_prefix}{text}\n") + else: + cleaned_lines.append(line) self.file_path.write_text( - header + cleaned + footer, + header + "".join(cleaned_lines) + footer, encoding="utf-8", ) log.info("Post-processed transcript: %s", self.file_path) diff --git a/src/hearsay/ui/settings_window.py b/src/hearsay/ui/settings_window.py index cd0f5be..7486be6 100644 --- a/src/hearsay/ui/settings_window.py +++ b/src/hearsay/ui/settings_window.py @@ -3,7 +3,6 @@ from __future__ import annotations import logging -import threading from tkinter import filedialog import customtkinter as ctk @@ -17,7 +16,6 @@ MODEL_TABLE, ) from hearsay.transcription.model_manager import ( - download_model, is_hf_custom_model, is_model_downloaded, ) @@ -33,6 +31,7 @@ def __init__( master: ctk.CTk, config_manager: ConfigManager, on_save: "Callable | None" = None, + is_recording: "Callable[[], bool] | None" = None, ) -> None: super().__init__(master) self.title(f"{APP_NAME} Settings") @@ -41,8 +40,8 @@ def __init__( self._config_manager = config_manager self._config = config_manager.config - self._dl_frame: ctk.CTkFrame | None = None self._on_save = on_save + self._is_recording = is_recording or (lambda: False) self._capturing = False self._build_ui() @@ -275,7 +274,7 @@ def _update_model_hint(self, name: str) -> None: self._model_hint.configure(text="Korean model (converted, ready)", text_color="green") else: self._model_hint.configure( - text="Korean model — will download & convert on Save", text_color="#e07800" + text="Korean model — will download when recording starts", text_color="#e07800" ) else: self._model_hint.configure(text="") @@ -289,13 +288,29 @@ def _browse(self) -> None: self._dir_var.set(path) def _save(self) -> None: - new_model = self._model_var.get() - if is_hf_custom_model(new_model) and not is_model_downloaded(new_model): - self._start_download(new_model) - return self._apply_and_close() def _apply_and_close(self) -> None: + if self._is_recording(): + _LOCKED = [ + ("Model", self._model_var.get(), self._config.model_name), + ("Device", self._device_var.get(), self._config.device), + ("Compute Type", self._compute_var.get(), self._config.compute_type), + ("Language", self._lang_var.get().strip(), self._config.language), + ("VAD Filter", self._vad_var.get(), self._config.vad_filter), + ("Audio Source", self._source_var.get(), self._config.audio_source), + ("Output Directory", self._dir_var.get(), self._config.output_dir), + ] + changed = [name for name, new, old in _LOCKED if new != old] + if changed: + from tkinter import messagebox + messagebox.showinfo( + "Recording Active", + "Settings saved.\n\n" + "The following changes will take effect when you start the next recording:\n" + + "".join(f"\n - {c}" for c in changed), + parent=self, + ) self._config.audio_source = self._source_var.get() self._config.model_name = self._model_var.get() self._config.compute_type = self._compute_var.get() @@ -315,68 +330,6 @@ def _apply_and_close(self) -> None: if self._on_save: self._on_save() - def _start_download(self, model_name: str) -> None: - """Expand window, show progress, and download + convert the model.""" - self.geometry("550x640") - - self._save_btn.configure(state="disabled") - self._cancel_btn.configure(state="disabled") - - if self._dl_frame: - self._dl_frame.destroy() - - self._dl_frame = ctk.CTkFrame(self) - self._dl_frame.pack(fill="x", padx=20, pady=(0, 10)) - - ctk.CTkLabel( - self._dl_frame, - text=f"Downloading model '{model_name}'", - font=("Segoe UI", 13, "bold"), - ).pack(pady=(10, 2)) - - self._dl_status = ctk.CTkLabel( - self._dl_frame, - text="Starting...", - font=("Segoe UI", 11), - text_color="gray", - ) - self._dl_status.pack(pady=4) - - self._dl_bar = ctk.CTkProgressBar(self._dl_frame, width=460) - self._dl_bar.pack(pady=(4, 10)) - self._dl_bar.configure(mode="indeterminate") - self._dl_bar.start() - - threading.Thread( - target=self._download_bg, args=(model_name,), daemon=True - ).start() - - def _download_bg(self, model_name: str) -> None: - def set_status(text: str) -> None: - self.after(0, lambda: self._dl_status.configure(text=text)) - - try: - download_model(model_name, progress_callback=set_status) - self.after(0, self._download_complete) - except Exception as exc: - log.error("Model download/conversion failed", exc_info=True) - self.after(0, lambda msg=str(exc): self._download_failed(msg)) - - def _download_complete(self) -> None: - self._dl_bar.stop() - self._dl_bar.set(1) - self._dl_bar.configure(mode="determinate") - self._dl_status.configure(text="Done! Saving settings...", text_color="green") - self.after(600, self._apply_and_close) - - def _download_failed(self, error: str) -> None: - self._dl_bar.stop() - self._dl_bar.set(0) - short_error = error.splitlines()[0][:80] - self._dl_status.configure(text=f"Error: {short_error}", text_color="red") - self._save_btn.configure(state="normal") - self._cancel_btn.configure(state="normal") - def _cancel(self) -> None: self.grab_release() self.destroy() From dbf4460d4709e2332e61c0f460972e426ce07e2e Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 3 Jun 2026 22:12:39 +0900 Subject: [PATCH 15/17] fix: add markdown line breaks to timestamped transcript lines Each [M:SS] line now ends with two trailing spaces before the newline, which renders as a line break in markdown preview instead of running lines together as a single paragraph. Co-Authored-By: Claude Sonnet 4.6 --- src/hearsay/output/markdown_writer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hearsay/output/markdown_writer.py b/src/hearsay/output/markdown_writer.py index d5d4681..47a2484 100644 --- a/src/hearsay/output/markdown_writer.py +++ b/src/hearsay/output/markdown_writer.py @@ -13,7 +13,7 @@ log = logging.getLogger(__name__) _FOOTER_MARKER = "\n---\n" -_TS_LINE_RE = re.compile(r"^(\[\d+:\d+(?::\d+)?\] )(.+)$") +_TS_LINE_RE = re.compile(r"^(\[\d+:\d+(?::\d+)?\] )(.+?)\ *$") class MarkdownWriter: @@ -53,7 +53,7 @@ def append(self, result: TranscriptionResult) -> None: if not seg_text: continue ts = format_timestamp(chunk_offset + seg["start"]) - lines.append(f"[{ts}] {seg_text}\n") + lines.append(f"[{ts}] {seg_text} \n") if lines: with open(self.file_path, "a", encoding="utf-8") as f: @@ -68,7 +68,7 @@ def _append_fallback(self, result: TranscriptionResult) -> None: return ts = format_timestamp(result.start_time) with open(self.file_path, "a", encoding="utf-8") as f: - f.write(f"[{ts}] {text}\n") + f.write(f"[{ts}] {text} \n") def finalize(self, total_duration: float | None = None) -> Path: """Write a footer and return the file path.""" @@ -107,7 +107,7 @@ def post_process(self) -> None: if m: ts_prefix, text = m.group(1), m.group(2) text = clean_transcript_text(text, language=self._language) - cleaned_lines.append(f"{ts_prefix}{text}\n") + cleaned_lines.append(f"{ts_prefix}{text} \n") else: cleaned_lines.append(line) From 35fffeabd64f650539e7a4de69ec7c6bc8790c58 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 4 Jun 2026 00:02:43 +0900 Subject: [PATCH 16/17] feat: dual-layer realtime transcription via RealtimeSTT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the batch transcription pipeline with RealtimeSTT (KoljaB), running two whisper models concurrently: a fast model drives a tentative ("typing") layer revised as the user speaks, and the accurate main model produces the final text once VAD detects the end of an utterance. - New RealtimeEngine wraps AudioToTextRecorder (use_microphone=False); Hearsay's own AudioRecorder keeps capturing system loopback / mic / both and streams frames in via feed_audio (new on_frame streaming mode on the recorder). - Live view shows a gray in-progress line updated in place, then committed as the final timestamped line; markdown writes one finalized utterance per line. - Remove the old TranscriptionPipeline / TranscriptionEngine batch path. - Add multiprocessing.freeze_support() — RealtimeSTT spawns the main model in a child process. Critical fix: depend on silero-vad so its bundled ONNX is used directly. Without it RealtimeSTT falls back to torch.hub, which blocks forever on an interactive "trust repository (y/N)" prompt (hangs the GUI app). build.bat now collects silero_vad / onnxruntime / RealtimeSTT / torch data so frozen builds don't hang. Verified end-to-end by feeding a real WAV: 22 tentative updates + 2 finalized, punctuated utterances. Co-Authored-By: Claude Opus 4.8 --- build.bat | 15 ++ requirements.txt | 2 + src/hearsay/__main__.py | 5 + src/hearsay/app.py | 156 +++++++------------ src/hearsay/audio/recorder.py | 38 ++++- src/hearsay/config.py | 6 + src/hearsay/constants.py | 10 +- src/hearsay/output/markdown_writer.py | 49 ++---- src/hearsay/transcription/engine.py | 133 ---------------- src/hearsay/transcription/pipeline.py | 154 ------------------ src/hearsay/transcription/realtime_engine.py | 153 ++++++++++++++++++ src/hearsay/ui/live_view.py | 47 +++++- 12 files changed, 336 insertions(+), 432 deletions(-) delete mode 100644 src/hearsay/transcription/engine.py delete mode 100644 src/hearsay/transcription/pipeline.py create mode 100644 src/hearsay/transcription/realtime_engine.py diff --git a/build.bat b/build.bat index c85a855..5e6b1ca 100644 --- a/build.bat +++ b/build.bat @@ -14,9 +14,24 @@ pyinstaller --noconfirm --onedir --windowed ^ --hidden-import "sounddevice" ^ --hidden-import "customtkinter" ^ --hidden-import "pystray" ^ + --hidden-import "RealtimeSTT" ^ + --hidden-import "silero_vad" ^ + --hidden-import "webrtcvad" ^ + --hidden-import "onnxruntime" ^ + --hidden-import "scipy" ^ + --hidden-import "soundfile" ^ + --hidden-import "torch" ^ + --hidden-import "torchaudio" ^ --collect-all "customtkinter" ^ --collect-all "faster_whisper" ^ --collect-all "ctranslate2" ^ + --collect-all "RealtimeSTT" ^ + --collect-all "silero_vad" ^ + --collect-all "onnxruntime" ^ + --collect-all "scipy" ^ + --collect-all "soundfile" ^ + --collect-all "torch" ^ + --collect-all "torchaudio" ^ src\hearsay\__main__.py echo. diff --git a/requirements.txt b/requirements.txt index 41c4960..e02a8e2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ faster-whisper>=1.0.0 +RealtimeSTT>=1.0.0 +silero-vad>=5.1 PyAudioWPatch>=0.2.12 sounddevice>=0.4.6 numpy>=1.24.0 diff --git a/src/hearsay/__main__.py b/src/hearsay/__main__.py index eeede52..b29065f 100644 --- a/src/hearsay/__main__.py +++ b/src/hearsay/__main__.py @@ -1,9 +1,14 @@ """Entry point for Hearsay: python -m hearsay""" +import multiprocessing import sys def main() -> None: + # RealtimeSTT spawns a child process (spawn start method) for the main + # transcription model; freeze_support is required for frozen/PyInstaller builds. + multiprocessing.freeze_support() + from hearsay.utils.logging_setup import setup_logging setup_logging() diff --git a/src/hearsay/app.py b/src/hearsay/app.py index 2137ca1..e63f0a6 100644 --- a/src/hearsay/app.py +++ b/src/hearsay/app.py @@ -15,10 +15,9 @@ from hearsay.audio.recorder import AudioRecorder from hearsay.config import ConfigManager -from hearsay.constants import APP_NAME, DEFAULT_CPU_COMPUTE, LIVE_VIEW_POLL_MS +from hearsay.constants import APP_NAME, DEFAULT_CPU_COMPUTE from hearsay.output.markdown_writer import MarkdownWriter -from hearsay.transcription.engine import CudaUnavailableError, TranscriptionEngine -from hearsay.transcription.pipeline import TranscriptionPipeline +from hearsay.transcription.realtime_engine import CudaUnavailableError, RealtimeEngine from hearsay.ui.about_window import AboutWindow from hearsay.ui.live_view import LiveTranscriptWindow from hearsay.ui.settings_window import SettingsWindow @@ -37,20 +36,16 @@ def __init__(self) -> None: self._config_manager = ConfigManager() self._config = self._config_manager.config - # Queues - self._audio_queue: queue.Queue = queue.Queue(maxsize=10) - self._transcript_queue: queue.Queue = queue.Queue() - # Threads / components self._recorder: AudioRecorder | None = None - self._engine: TranscriptionEngine | None = None - self._pipeline: TranscriptionPipeline | None = None + self._engine: RealtimeEngine | None = None self._writer: MarkdownWriter | None = None self._tray: SystemTrayIcon | None = None # State self._recording = False self._recording_start_time: float | None = None + self._utterance_start_elapsed: float | None = None self._teardown_thread: threading.Thread | None = None self._hotkey_combo: str | None = None @@ -112,17 +107,24 @@ def _start_recording(self, source: str) -> None: log.info("Starting recording (source=%s)", source) self._recording = True self._recording_start_time = time.time() + self._utterance_start_elapsed = None # Set up markdown writer - self._writer = MarkdownWriter(self._config.output_dir) + self._writer = MarkdownWriter( + self._config.output_dir, language=self._config.language + ) - # Load transcription engine - self._engine = TranscriptionEngine( + # Dual-layer realtime engine (tentative + final) + self._engine = RealtimeEngine( model_name=self._config.model_name, + realtime_model_name=self._config.realtime_model_name, device=self._config.device, compute_type=self._config.compute_type, language=self._config.language, - vad_filter=self._config.vad_filter, + on_tentative=self._on_tentative, + on_final=self._on_final, + on_utterance_start=self._on_utterance_start, + post_speech_silence_duration=self._config.post_speech_silence_duration, ) def load_and_start() -> None: @@ -131,18 +133,6 @@ def load_and_start() -> None: self._teardown_thread.join(timeout=30) self._teardown_thread = None - # Now safe to clear queues (old teardown has finished draining them) - while not self._audio_queue.empty(): - try: - self._audio_queue.get_nowait() - except queue.Empty: - break - while not self._transcript_queue.empty(): - try: - self._transcript_queue.get_nowait() - except queue.Empty: - break - # Download HF model on-demand (deferred from settings save) from hearsay.transcription.model_manager import ( download_model, is_hf_custom_model, is_model_downloaded, @@ -159,26 +149,19 @@ def _dl_progress(msg: str) -> None: log.error("Model download failed at recording start", exc_info=True) safe_after(self._root, 0, lambda e=str(exc): self._on_model_download_failed(e)) return - safe_after(self._root, 0, lambda: self._ensure_live_view().set_status("Loading model...")) + safe_after(self._root, 0, lambda: self._ensure_live_view().set_status("Loading model...")) try: self._engine.load() except CudaUnavailableError: safe_after(self._root, 0, lambda: self._handle_cuda_error(source)) return - # Start pipeline - self._pipeline = TranscriptionPipeline( - audio_queue=self._audio_queue, - transcript_queue=self._transcript_queue, - engine=self._engine, - ) - self._pipeline.start() - - # Start recorder + # Start recorder in streaming mode — frames feed straight into the engine self._recorder = AudioRecorder( - audio_queue=self._audio_queue, + queue.Queue(), source=source, + on_frame=self._engine.feed, ) self._recorder.start() @@ -201,7 +184,36 @@ def _on_recording_started(self) -> None: self._live_view.set_status("Recording...") if self._config.beep_on_start: threading.Thread(target=self._play_beep, args=("start",), daemon=True).start() - self._poll_transcripts() + + # ── Transcription callbacks (from the engine threads) ─────────────────────── + + def _on_utterance_start(self) -> None: + """RealtimeSTT detected speech onset — stamp the utterance's start time.""" + if self._recording_start_time is not None: + self._utterance_start_elapsed = time.time() - self._recording_start_time + + def _on_tentative(self, text: str) -> None: + """Revised in-progress text from the fast realtime model (gray layer).""" + safe_after(self._root, 0, lambda t=text: ( + self._live_view.update_tentative(t) if self._live_view else None + )) + + def _on_final(self, text: str) -> None: + """Finalized, accurate text for a completed utterance (committed layer).""" + elapsed = self._utterance_start_elapsed + if elapsed is None and self._recording_start_time is not None: + elapsed = time.time() - self._recording_start_time + elapsed = elapsed or 0.0 + self._utterance_start_elapsed = None + + if self._writer: + self._writer.append_utterance(elapsed, text) + + from hearsay.output.formatter import format_timestamp + line = f"[{format_timestamp(elapsed)}] {text}" + safe_after(self._root, 0, lambda l=line: ( + self._live_view.commit_final(l) if self._live_view else None + )) def _stop_recording(self) -> None: """Stop the current recording session. @@ -230,20 +242,18 @@ def _stop_recording(self) -> None: # Capture references for the background thread recorder = self._recorder - pipeline = self._pipeline engine = self._engine writer = self._writer start_time = self._recording_start_time self._recorder = None - self._pipeline = None self._engine = None self._writer = None self._recording_start_time = None self._teardown_thread = threading.Thread( target=self._teardown_recording, - args=(recorder, pipeline, engine, writer, start_time), + args=(recorder, engine, writer, start_time), daemon=True, name="RecordingTeardown", ) @@ -252,48 +262,19 @@ def _stop_recording(self) -> None: def _teardown_recording( self, recorder: AudioRecorder | None, - pipeline: TranscriptionPipeline | None, - engine: TranscriptionEngine | None, + engine: RealtimeEngine | None, writer: MarkdownWriter | None, start_time: float | None, ) -> None: """Blocking recording teardown — runs on a background thread.""" - # 1. Stop recorder first so it flushes remaining audio to the queue. + # 1. Stop recorder first so it stops feeding audio into the engine. if recorder: recorder.stop() recorder.join(timeout=5) - # 2. Stop pipeline -- it will drain any remaining audio chunks before - # exiting. Use a generous timeout so CPU transcription can finish. - if pipeline: - pipeline.stop() - pipeline.join(timeout=60) - if pipeline.is_alive(): - log.warning("Pipeline thread still running after join timeout") - - # 3. Unload model only after pipeline is done. + # 2. Shut down the engine (stops both models and the child process). if engine: - engine.unload() - - # Drain any remaining transcript results that arrived after polling stopped - if writer: - try: - while True: - result = self._transcript_queue.get_nowait() - writer.append(result) - if self._live_view: - for seg in result.segments: - from hearsay.output.formatter import format_timestamp - ts = format_timestamp( - result.start_time + seg["start"] - ) - safe_after(self._root, 0, - lambda t=f"[{ts}] {seg['text']}": ( - self._live_view.append_text(t) - if self._live_view else None - )) - except queue.Empty: - pass + engine.shutdown() # Finalize transcript duration = None @@ -330,32 +311,6 @@ def _teardown_recording( self._live_view.set_status("Idle") if self._live_view else None )) - def _poll_transcripts(self) -> None: - """Poll the transcript queue and update live view + markdown writer.""" - if not self._recording: - return - - try: - while True: - result = self._transcript_queue.get_nowait() - # Write to markdown - if self._writer: - self._writer.append(result) - # Update live view - if self._live_view: - for seg in result.segments: - from hearsay.output.formatter import format_timestamp - ts = format_timestamp( - result.start_time + seg["start"] - ) - self._live_view.append_text(f"[{ts}] {seg['text']}") - except queue.Empty: - pass - - # Schedule next poll - if self._recording: - safe_after(self._root, LIVE_VIEW_POLL_MS, self._poll_transcripts) - def _ensure_live_view(self) -> LiveTranscriptWindow: """Create live view if needed, return it.""" if self._live_view is None: @@ -554,11 +509,10 @@ def _quit(self) -> None: if self._recording: self._recording = False self._teardown_recording( - self._recorder, self._pipeline, self._engine, + self._recorder, self._engine, self._writer, self._recording_start_time, ) self._recorder = None - self._pipeline = None self._engine = None self._writer = None self._recording_start_time = None diff --git a/src/hearsay/audio/recorder.py b/src/hearsay/audio/recorder.py index bdcc3a3..4d054df 100644 --- a/src/hearsay/audio/recorder.py +++ b/src/hearsay/audio/recorder.py @@ -4,6 +4,7 @@ import logging import queue +from typing import Callable import numpy as np @@ -126,9 +127,14 @@ class AudioRecorder(StoppableThread): where ``start_time_s`` is the chunk's absolute offset from the start of the recording. + When ``on_frame`` is provided, the recorder streams every mono 16 kHz + float32 frame to that callback instead of accumulating chunks into + ``audio_queue`` — used to feed RealtimeSTT continuously for low latency. + Args: - audio_queue: Queue to push chunks to. + audio_queue: Queue to push chunks to (ignored when ``on_frame`` is set). source: One of 'system', 'microphone', 'both'. + on_frame: Optional per-frame callback for streaming (RealtimeSTT) mode. loopback_device_index: PyAudioWPatch device index for loopback. mic_device_index: sounddevice device index for mic. """ @@ -137,6 +143,7 @@ def __init__( self, audio_queue: queue.Queue, source: str = AUDIO_SOURCE_SYSTEM, + on_frame: Callable[[np.ndarray], None] | None = None, loopback_device_index: int | None = None, mic_device_index: int | None = None, loopback_channels: int = 2, @@ -147,6 +154,7 @@ def __init__( super().__init__(name="AudioRecorder") self.audio_queue = audio_queue self.source = source + self.on_frame = on_frame self.loopback_device_index = loopback_device_index self.mic_device_index = mic_device_index self.loopback_channels = loopback_channels @@ -212,6 +220,9 @@ def _record_mic(self) -> None: def callback(indata: np.ndarray, frames: int, time_info: object, status: object) -> None: mono = resample(indata.copy(), self.mic_rate, self.mic_channels) + if self.on_frame is not None: + self.on_frame(mono) + return acc.add(mono) if acc.ready(): self.audio_queue.put(acc.pop()) @@ -226,6 +237,9 @@ def callback(indata: np.ndarray, frames: int, time_info: object, status: object) while not self.stopped(): self.wait(timeout=0.5) + if self.on_frame is not None: + return + final = acc.flush() if final is not None: self.audio_queue.put(final) @@ -331,6 +345,11 @@ def mix_with_mic(lb_chunk: np.ndarray) -> np.ndarray: audio = np.frombuffer(raw, dtype=np.int16) lb_mono = resample(audio, self.loopback_rate, self.loopback_channels) + if self.on_frame is not None: + self.on_frame(mix_with_mic(lb_mono)) + mic_buffer.clear() + continue + # Combined silence: silent only when both sources are quiet. # The latest mic frame approximates current mic activity. mic_silent = _rms(mic_buffer[-1]) < SILENCE_RMS_THRESHOLD if mic_buffer else True @@ -343,10 +362,11 @@ def mix_with_mic(lb_chunk: np.ndarray) -> np.ndarray: mic_buffer.clear() # --- Flush remaining audio --- - final = acc.flush() - if final is not None: - idx, start_time, lb_chunk = final - self.audio_queue.put((idx, start_time, mix_with_mic(lb_chunk))) + if self.on_frame is None: + final = acc.flush() + if final is not None: + idx, start_time, lb_chunk = final + self.audio_queue.put((idx, start_time, mix_with_mic(lb_chunk))) mic_stream.stop_stream() mic_stream.close() @@ -371,6 +391,11 @@ def _chunk_loop( break audio = np.frombuffer(raw, dtype=np.int16) mono = resample(audio, sr, channels) + + if self.on_frame is not None: + self.on_frame(mono) + continue + acc.add(mono) if acc.ready(): @@ -381,6 +406,9 @@ def _chunk_loop( idx, len(chunk), start_time, ) + if self.on_frame is not None: + return + final = acc.flush() if final is not None: idx, start_time, chunk = final diff --git a/src/hearsay/config.py b/src/hearsay/config.py index c0415a0..54a8497 100644 --- a/src/hearsay/config.py +++ b/src/hearsay/config.py @@ -11,6 +11,8 @@ AUDIO_SOURCE_SYSTEM, DEFAULT_CPU_COMPUTE, DEFAULT_CPU_MODEL, + DEFAULT_REALTIME_MODEL, + POST_SPEECH_SILENCE_S, ) from hearsay.utils.paths import get_config_path, get_default_output_dir @@ -36,6 +38,10 @@ class AppConfig: language: str = "en" vad_filter: bool = True + # Realtime dual-layer transcription (RealtimeSTT) + realtime_model_name: str = DEFAULT_REALTIME_MODEL + post_speech_silence_duration: float = POST_SPEECH_SILENCE_S + # Output output_dir: str = field(default_factory=lambda: str(get_default_output_dir())) diff --git a/src/hearsay/constants.py b/src/hearsay/constants.py index 2a6dfd0..8ee3f98 100644 --- a/src/hearsay/constants.py +++ b/src/hearsay/constants.py @@ -58,6 +58,13 @@ DEFAULT_GPU_COMPUTE = "float16" DEFAULT_CPU_COMPUTE = "int8" +# RealtimeSTT dual-layer transcription. +# The fast model drives the tentative ("typing") layer; the main model +# (model_name above) produces the accurate final text once VAD detects the +# end of an utterance. +DEFAULT_REALTIME_MODEL = "tiny" # small/fast model for the tentative layer +POST_SPEECH_SILENCE_S = 0.7 # trailing silence (s) that finalizes an utterance + # Audio source options AUDIO_SOURCE_SYSTEM = "system" AUDIO_SOURCE_MIC = "microphone" @@ -70,6 +77,3 @@ # Transcript formatting PARAGRAPH_GAP_S = 2.0 # Silence gap (seconds) that triggers a paragraph break - -# UI -LIVE_VIEW_POLL_MS = 250 # Poll transcript queue every 250ms diff --git a/src/hearsay/output/markdown_writer.py b/src/hearsay/output/markdown_writer.py index 47a2484..0a5684a 100644 --- a/src/hearsay/output/markdown_writer.py +++ b/src/hearsay/output/markdown_writer.py @@ -8,7 +8,6 @@ from pathlib import Path from hearsay.output.formatter import clean_transcript_text, format_timestamp, make_title -from hearsay.transcription.engine import TranscriptionResult log = logging.getLogger(__name__) @@ -17,9 +16,14 @@ class MarkdownWriter: - """Writes transcript results to a .md file, appending as chunks arrive.""" - - def __init__(self, output_dir: str | Path, title: str | None = None) -> None: + """Writes transcript results to a .md file, appending as utterances are finalized.""" + + def __init__( + self, + output_dir: str | Path, + title: str | None = None, + language: str = "en", + ) -> None: self.output_dir = Path(output_dir) self.output_dir.mkdir(parents=True, exist_ok=True) @@ -27,7 +31,7 @@ def __init__(self, output_dir: str | Path, title: str | None = None) -> None: timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") self.file_path = self.output_dir / f"transcript_{timestamp}.md" self._header_written = False - self._language: str = "en" + self._language: str = language or "en" def _write_header(self) -> None: with open(self.file_path, "w", encoding="utf-8") as f: @@ -35,38 +39,15 @@ def _write_header(self) -> None: self._header_written = True log.info("Transcript file created: %s", self.file_path) - def append(self, result: TranscriptionResult) -> None: - """Append a transcription result as timestamped lines matching the live view.""" + def append_utterance(self, elapsed_seconds: float, text: str) -> None: + """Append one finalized utterance as a timestamped line matching the live view.""" + text = text.strip() + if not text: + return if not self._header_written: self._write_header() - self._language = result.language or self._language - - if not result.segments: - self._append_fallback(result) - return - - chunk_offset = result.start_time - lines: list[str] = [] - for seg in result.segments: - seg_text = seg["text"].strip() - if not seg_text: - continue - ts = format_timestamp(chunk_offset + seg["start"]) - lines.append(f"[{ts}] {seg_text} \n") - - if lines: - with open(self.file_path, "a", encoding="utf-8") as f: - f.write("".join(lines)) - - log.debug("Appended chunk %d to %s", result.chunk_index, self.file_path) - - def _append_fallback(self, result: TranscriptionResult) -> None: - """Fallback for results with empty segments (e.g. after dedup).""" - text = result.text.strip() - if not text: - return - ts = format_timestamp(result.start_time) + ts = format_timestamp(elapsed_seconds) with open(self.file_path, "a", encoding="utf-8") as f: f.write(f"[{ts}] {text} \n") diff --git a/src/hearsay/transcription/engine.py b/src/hearsay/transcription/engine.py deleted file mode 100644 index 06c408e..0000000 --- a/src/hearsay/transcription/engine.py +++ /dev/null @@ -1,133 +0,0 @@ -"""TranscriptionEngine: wraps faster-whisper for inference.""" - -from __future__ import annotations - -import logging -from dataclasses import dataclass - -import numpy as np - -from hearsay.utils.paths import get_models_dir - -log = logging.getLogger(__name__) - - -class CudaUnavailableError(RuntimeError): - """Raised when GPU is configured but CUDA runtime DLLs are missing.""" - - -@dataclass -class TranscriptionResult: - """Result from transcribing one audio chunk.""" - - text: str - segments: list[dict] # [{start, end, text}, ...] - language: str - language_probability: float - chunk_index: int - start_time: float = 0.0 # absolute offset (s) of this chunk from recording start - - -class TranscriptionEngine: - """Wraps faster-whisper WhisperModel for inference.""" - - def __init__( - self, - model_name: str = "small.en", - device: str = "cpu", - compute_type: str = "int8", - language: str = "en", - vad_filter: bool = True, - ) -> None: - self.model_name = model_name - self.device = device - self.compute_type = compute_type - self.language = language - self.vad_filter = vad_filter - self._model = None - - def load(self) -> None: - """Load the Whisper model into memory.""" - from faster_whisper import WhisperModel - from hearsay.transcription.model_manager import resolve_model_path - - model_path = resolve_model_path(self.model_name) - log.info( - "Loading model '%s' (device=%s, compute=%s)", - self.model_name, - self.device, - self.compute_type, - ) - try: - self._model = WhisperModel( - model_path, - device=self.device, - compute_type=self.compute_type, - download_root=str(get_models_dir()), - ) - except RuntimeError as exc: - if self.device != "cpu" and "cannot be loaded" in str(exc): - raise CudaUnavailableError(str(exc)) from exc - raise - log.info("Model loaded successfully (device=%s)", self.device) - - def transcribe( - self, - audio: np.ndarray, - chunk_index: int = 0, - start_time: float = 0.0, - ) -> TranscriptionResult: - """Transcribe a float32 16kHz mono audio array. - - Args: - audio: Audio data as float32 numpy array at 16kHz. - chunk_index: Index of this chunk (for ordering). - start_time: Absolute offset (s) of this chunk from recording start. - - Returns: - TranscriptionResult with text and segment details. - """ - if self._model is None: - raise RuntimeError("Model not loaded. Call load() first.") - - segments_iter, info = self._model.transcribe( - audio, - beam_size=5, - language=self.language if self.language else None, - vad_filter=self.vad_filter, - vad_parameters={"min_silence_duration_ms": 500}, - ) - - segments = [] - texts = [] - for seg in segments_iter: - segments.append({ - "start": seg.start, - "end": seg.end, - "text": seg.text.strip(), - }) - texts.append(seg.text.strip()) - - full_text = " ".join(texts) - log.debug( - "Chunk %d: %d segments, lang=%s (%.2f), text=%s", - chunk_index, - len(segments), - info.language, - info.language_probability, - full_text[:100], - ) - - return TranscriptionResult( - text=full_text, - segments=segments, - language=info.language, - language_probability=info.language_probability, - chunk_index=chunk_index, - start_time=start_time, - ) - - def unload(self) -> None: - """Free model memory.""" - self._model = None - log.info("Model unloaded") diff --git a/src/hearsay/transcription/pipeline.py b/src/hearsay/transcription/pipeline.py deleted file mode 100644 index 06e6a2f..0000000 --- a/src/hearsay/transcription/pipeline.py +++ /dev/null @@ -1,154 +0,0 @@ -"""TranscriptionPipeline thread: consumes audio chunks, produces transcript text.""" - -from __future__ import annotations - -import logging -import queue -import string -import time - -from hearsay.transcription.engine import TranscriptionEngine, TranscriptionResult -from hearsay.utils.threading_utils import StoppableThread - -log = logging.getLogger(__name__) - - -class TranscriptionPipeline(StoppableThread): - """Daemon thread that reads audio chunks from audio_queue, - transcribes them, and pushes results to transcript_queue. - - Args: - audio_queue: Input queue of (chunk_index, np.ndarray) tuples. - transcript_queue: Output queue of TranscriptionResult objects. - engine: Configured TranscriptionEngine (model already loaded). - """ - - _TAIL_WORD_COUNT = 15 # words kept from previous chunk for overlap matching - _MIN_MATCH_WORDS = 2 # minimum overlap length to avoid false positives - - def __init__( - self, - audio_queue: queue.Queue, - transcript_queue: queue.Queue, - engine: TranscriptionEngine, - ) -> None: - super().__init__(name="TranscriptionPipeline") - self.audio_queue = audio_queue - self.transcript_queue = transcript_queue - self.engine = engine - self._prev_tail_words: list[str] = [] - - def run(self) -> None: - log.info("TranscriptionPipeline started") - while not self.stopped(): - try: - chunk_index, start_time, audio = self.audio_queue.get(timeout=1.0) - except queue.Empty: - continue - self._process_chunk(chunk_index, start_time, audio) - - # Drain any audio chunks still in the queue after stop signal. - # The recorder flushes its buffer before exiting, so these chunks - # must be transcribed to avoid losing the tail of the recording. - log.info("TranscriptionPipeline draining remaining audio chunks") - while True: - try: - chunk_index, start_time, audio = self.audio_queue.get_nowait() - except queue.Empty: - break - self._process_chunk(chunk_index, start_time, audio) - - log.info("TranscriptionPipeline stopped") - - def _process_chunk(self, chunk_index: int, start_time: float, audio) -> None: - """Transcribe a single audio chunk and enqueue the result.""" - try: - t0 = time.perf_counter() - result = self.engine.transcribe( - audio, chunk_index=chunk_index, start_time=start_time - ) - elapsed = time.perf_counter() - t0 - log.info( - "Chunk %d transcribed in %.1fs: %s", - chunk_index, - elapsed, - result.text[:80] if result.text else "(empty)", - ) - if result.text: - original_words = result.text.split() - if chunk_index > 0 and self._prev_tail_words: - result = self._deduplicate(result) - self._prev_tail_words = original_words[-self._TAIL_WORD_COUNT:] - if result.text: - self.transcript_queue.put(result) - except Exception: - log.error("Transcription failed for chunk %d", chunk_index, exc_info=True) - - @staticmethod - def _normalize(word: str) -> str: - """Strip leading/trailing punctuation for comparison.""" - return word.strip(string.punctuation) - - def _deduplicate(self, result: TranscriptionResult) -> TranscriptionResult: - """Remove overlapping prefix from *result* that duplicates the tail of the previous chunk.""" - new_words = result.text.split() - if len(new_words) < self._MIN_MATCH_WORDS: - return result - - # Find the longest prefix of new_words that matches a suffix of _prev_tail_words. - best = 0 - for length in range(self._MIN_MATCH_WORDS, min(len(self._prev_tail_words), len(new_words)) + 1): - suffix = self._prev_tail_words[-length:] - prefix = new_words[:length] - tail = [self._normalize(w).lower() for w in suffix] - head = [self._normalize(w).lower() for w in prefix] - # All words after the first must match exactly; the first word of the - # new chunk may be truncated (e.g. "replaced" -> "placed") so allow a - # suffix-of-word match when the fragment is at least 3 characters. - first_ok = tail[0] == head[0] or (len(head[0]) >= 3 and tail[0].endswith(head[0])) - if first_ok and tail[1:] == head[1:]: - best = length - - if best == 0: - return result - - stripped_words = new_words[best:] - log.info( - "Chunk %d: stripped %d overlapping words: %s", - result.chunk_index, - best, - " ".join(new_words[:best]), - ) - - if not stripped_words: - return TranscriptionResult( - text="", - segments=[], - language=result.language, - language_probability=result.language_probability, - chunk_index=result.chunk_index, - start_time=result.start_time, - ) - - # Rebuild text and trim leading segments that were fully covered by the overlap. - new_text = " ".join(stripped_words) - chars_removed = len(" ".join(new_words[:best])) + 1 # +1 for the space after - trimmed_segments = [] - for seg in result.segments: - seg_text = seg["text"] - if chars_removed >= len(seg_text): - chars_removed -= len(seg_text) + 1 # +1 for joining space - continue - if chars_removed > 0: - seg = {**seg, "text": seg_text[chars_removed:].lstrip()} - chars_removed = 0 - trimmed_segments.append(seg) - - return TranscriptionResult( - text=new_text, - segments=trimmed_segments if trimmed_segments else result.segments, - language=result.language, - language_probability=result.language_probability, - chunk_index=result.chunk_index, - start_time=result.start_time, - ) diff --git a/src/hearsay/transcription/realtime_engine.py b/src/hearsay/transcription/realtime_engine.py new file mode 100644 index 0000000..54d9a06 --- /dev/null +++ b/src/hearsay/transcription/realtime_engine.py @@ -0,0 +1,153 @@ +"""RealtimeEngine: dual-layer transcription via RealtimeSTT. + +Audio is captured by Hearsay's own AudioRecorder (system loopback / mic / both) +and fed into RealtimeSTT through ``feed_audio`` (``use_microphone=False``). Two +whisper models run concurrently: + + * a fast *realtime* model drives the tentative ("typing") layer, revised + continuously as the user speaks (``on_tentative``); + * the accurate *main* model produces the final text once VAD detects the end + of an utterance (``on_final``). +""" + +from __future__ import annotations + +import logging +import threading +from typing import Callable + +import numpy as np + +from hearsay.transcription.model_manager import resolve_model_path +from hearsay.utils.paths import get_models_dir + +log = logging.getLogger(__name__) + + +class CudaUnavailableError(RuntimeError): + """Raised when GPU is configured but CUDA is not available.""" + + +class RealtimeEngine: + """Drives RealtimeSTT with externally fed audio and two output layers.""" + + def __init__( + self, + model_name: str, + realtime_model_name: str, + device: str, + compute_type: str, + language: str, + on_tentative: Callable[[str], None], + on_final: Callable[[str], None], + on_utterance_start: Callable[[], None] | None = None, + post_speech_silence_duration: float = 0.7, + ) -> None: + self.model_name = model_name + self.realtime_model_name = realtime_model_name + self.device = device + self.compute_type = compute_type + self.language = language or "" + self._on_tentative = on_tentative + self._on_final = on_final + self._on_utterance_start = on_utterance_start + self._post_speech_silence_duration = post_speech_silence_duration + + self._recorder = None + self._final_thread: threading.Thread | None = None + self._stop = threading.Event() + + def load(self) -> None: + """Create the RealtimeSTT recorder (spawns the main-model process) and + start the final-text loop. Blocks until both models are ready.""" + if self.device == "cuda": + try: + import torch + if not torch.cuda.is_available(): + raise CudaUnavailableError("CUDA is not available") + except CudaUnavailableError: + raise + except Exception as exc: # torch import/init failure + raise CudaUnavailableError(str(exc)) from exc + + from RealtimeSTT import AudioToTextRecorder + + model = resolve_model_path(self.model_name) + log.info( + "Loading RealtimeSTT (main=%s, realtime=%s, device=%s, compute=%s)", + self.model_name, self.realtime_model_name, self.device, self.compute_type, + ) + self._recorder = AudioToTextRecorder( + model=model, + realtime_model_type=self.realtime_model_name, + language=self.language, + device=self.device, + compute_type=self.compute_type, + download_root=str(get_models_dir()), + use_microphone=False, + enable_realtime_transcription=True, + on_realtime_transcription_stabilized=self._handle_tentative, + on_recording_start=self._handle_utterance_start, + post_speech_silence_duration=self._post_speech_silence_duration, + spinner=False, + level=logging.WARNING, + no_log_file=True, + ) + log.info("RealtimeSTT ready") + + self._final_thread = threading.Thread( + target=self._final_loop, daemon=True, name="RealtimeFinal", + ) + self._final_thread.start() + + def feed(self, mono_float32: np.ndarray) -> None: + """Feed one mono 16 kHz float32 frame into RealtimeSTT. + + ``feed_audio`` casts directly to int16 without scaling, so float [-1, 1] + audio must be scaled into the int16 range first. + """ + rec = self._recorder + if rec is None or mono_float32 is None or len(mono_float32) == 0: + return + pcm16 = np.clip(mono_float32 * 32768.0, -32768, 32767).astype(np.int16) + try: + rec.feed_audio(pcm16, 16000) + except Exception: + log.error("feed_audio failed", exc_info=True) + + def _handle_tentative(self, text: str) -> None: + if text and text.strip() and not self._stop.is_set(): + self._on_tentative(text.strip()) + + def _handle_utterance_start(self) -> None: + if self._on_utterance_start is not None and not self._stop.is_set(): + self._on_utterance_start() + + def _final_loop(self) -> None: + """Block on recorder.text() and emit each finalized utterance.""" + while not self._stop.is_set(): + try: + text = self._recorder.text() + except Exception: + if self._stop.is_set(): + break + log.error("RealtimeSTT text() failed", exc_info=True) + break + if self._stop.is_set(): + break + if text and text.strip(): + self._on_final(text.strip()) + + def shutdown(self) -> None: + """Stop the final loop and tear down the recorder + child process.""" + self._stop.set() + rec = self._recorder + self._recorder = None + if rec is not None: + try: + rec.shutdown() + except Exception: + log.warning("RealtimeSTT shutdown error", exc_info=True) + if self._final_thread is not None: + self._final_thread.join(timeout=10) + self._final_thread = None diff --git a/src/hearsay/ui/live_view.py b/src/hearsay/ui/live_view.py index 8169357..82d7288 100644 --- a/src/hearsay/ui/live_view.py +++ b/src/hearsay/ui/live_view.py @@ -31,7 +31,7 @@ def __init__(self, master: ctk.CTk) -> None: # Delay disclaimer ctk.CTkLabel( self, - text="Transcript text appears with a delay of approximately 30\u201360 seconds depending on your hardware.", + text="Live text (gray) updates as you speak; it is replaced by the final, more accurate text after a brief pause.", font=("Segoe UI", 10, "italic"), text_color="gray", anchor="w", @@ -46,6 +46,11 @@ def __init__(self, master: ctk.CTk) -> None: ) self._textbox.pack(fill="both", expand=True, padx=10, pady=(10, 5)) + # The tentative (in-progress) line is rendered in gray and replaced in + # place each time RealtimeSTT revises it, then committed as a final line. + self._textbox.tag_config("tentative", foreground="#888888") + self._tent_start_index: str | None = None + # Bottom bar with status and controls bottom = ctk.CTkFrame(self) bottom.pack(fill="x", padx=10, pady=(0, 10)) @@ -96,15 +101,52 @@ def toggle(self) -> None: self.show() def append_text(self, text: str) -> None: - """Append text to the transcript view.""" + """Append a finished line to the transcript view.""" self._textbox.configure(state="normal") self._textbox.insert("end", text + "\n") self._textbox.configure(state="disabled") if self._autoscroll.get(): self._textbox.see("end") + def update_tentative(self, text: str) -> None: + """Show or revise the in-progress (gray) line at the bottom of the view.""" + tb = self._textbox + tb.configure(state="normal") + if self._tent_start_index is None: + self._tent_start_index = tb.index("end-1c") + else: + tb.delete(self._tent_start_index, "end-1c") + tb.insert(self._tent_start_index, text) + tb.tag_add("tentative", self._tent_start_index, "end-1c") + tb.configure(state="disabled") + if self._autoscroll.get(): + tb.see("end") + + def commit_final(self, line: str) -> None: + """Replace the tentative line (if any) with a committed final line.""" + tb = self._textbox + tb.configure(state="normal") + if self._tent_start_index is not None: + tb.delete(self._tent_start_index, "end-1c") + self._tent_start_index = None + tb.insert("end-1c", line + "\n") + tb.configure(state="disabled") + if self._autoscroll.get(): + tb.see("end") + + def drop_tentative(self) -> None: + """Discard the in-progress line without committing it.""" + if self._tent_start_index is None: + return + tb = self._textbox + tb.configure(state="normal") + tb.delete(self._tent_start_index, "end-1c") + self._tent_start_index = None + tb.configure(state="disabled") + def append_separator(self, timestamp: str) -> None: """Insert a visual divider marking the end of a recording session.""" + self.drop_tentative() self._textbox.configure(state="normal") self._textbox.insert("end", f"\n--- Recording ended at {timestamp} ---\n\n") self._textbox.configure(state="disabled") @@ -117,6 +159,7 @@ def set_status(self, text: str) -> None: def clear(self) -> None: """Clear all transcript text.""" + self._tent_start_index = None self._textbox.configure(state="normal") self._textbox.delete("1.0", "end") self._textbox.configure(state="disabled") From ad7a0abb1da18287d39fc6c8957db71a1ae3cff1 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 4 Jun 2026 07:52:32 +0900 Subject: [PATCH 17/17] fix: finalize in-progress utterance when recording stops Stopping mid-sentence dropped the last utterance: shutdown() called rec.shutdown() directly, so RealtimeSTT text() returned "" and the buffered audio (still showing as tentative text) was discarded before VAD ever queued a final. shutdown() now gracefully stops an active recording (rec.stop()) and waits for _final_loop to emit the final transcription before tearing down. _final_loop emits on_final before the _stop check so a final that completes during shutdown is not dropped. Skips the wait for sub-0.5s clips that RealtimeSTT's min_length guard would reject, avoiding a hang. Co-Authored-By: Claude Opus 4.8 --- src/hearsay/transcription/realtime_engine.py | 24 ++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/src/hearsay/transcription/realtime_engine.py b/src/hearsay/transcription/realtime_engine.py index 54d9a06..0f8b40e 100644 --- a/src/hearsay/transcription/realtime_engine.py +++ b/src/hearsay/transcription/realtime_engine.py @@ -14,6 +14,7 @@ import logging import threading +import time from typing import Callable import numpy as np @@ -56,6 +57,7 @@ def __init__( self._recorder = None self._final_thread: threading.Thread | None = None self._stop = threading.Event() + self._final_emitted = threading.Event() def load(self) -> None: """Create the RealtimeSTT recorder (spawns the main-model process) and @@ -133,15 +135,29 @@ def _final_loop(self) -> None: break log.error("RealtimeSTT text() failed", exc_info=True) break - if self._stop.is_set(): - break if text and text.strip(): self._on_final(text.strip()) + self._final_emitted.set() + if self._stop.is_set(): + break def shutdown(self) -> None: - """Stop the final loop and tear down the recorder + child process.""" - self._stop.set() + """Finalize any in-progress utterance, then tear down the recorder.""" rec = self._recorder + if rec is not None and getattr(rec, "is_recording", False): + # Stopped mid-utterance: gracefully stop the active recording so its + # buffered audio gets a final transcription instead of being dropped. + started = getattr(rec, "recording_start_time", 0) or 0 + min_len = getattr(rec, "min_length_of_recording", 0.5) + if not started or (time.time() - started) >= min_len: + try: + self._final_emitted.clear() + rec.stop() + self._final_emitted.wait(timeout=15) + except Exception: + log.warning("Error finalizing in-progress utterance", exc_info=True) + + self._stop.set() self._recorder = None if rec is not None: try: