# Release workflow: builds the Windows app with PyInstaller, wraps it in an
# Inno Setup installer and publishes it as a GitHub Release for every
# pushed tag that looks like vX.Y.Z.
name: Release

on:
  push:
    tags:
      - 'v*.*.*'

# Creating a release (and uploading assets to it) needs write access.
permissions:
  contents: write

jobs:
  build-and-release:
    runs-on: windows-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: pip

      - name: Install dependencies
        run: pip install -r requirements.txt pyinstaller

      - name: Update installer version
        shell: pwsh
        # Pass the tag name through the environment instead of interpolating
        # the expression into the script text: a ref name is attacker-
        # controllable input and must not be parsed as PowerShell source.
        env:
          REF_NAME: ${{ github.ref_name }}
        run: |
          $version = $env:REF_NAME.TrimStart("v")
          (Get-Content installer.iss) -replace 'AppVersion=.*', "AppVersion=$version" | Set-Content installer.iss

      - name: Build with PyInstaller
        shell: cmd
        run: build.bat

      - name: Install Inno Setup
        shell: pwsh
        run: choco install innosetup --yes --no-progress

      - name: Build installer
        shell: pwsh
        run: '& "C:\Program Files (x86)\Inno Setup 6\ISCC.exe" installer.iss'

      - name: Create GitHub Release
        uses: softprops/action-gh-release@v2
        with:
          files: installer_output/HearsaySetup.exe
          generate_release_notes: true
          # Fail the job rather than publish a release without the installer.
          fail_on_unmatched_files: true
"torch" ^ + --collect-all "torchaudio" ^ src\hearsay\__main__.py echo. diff --git a/requirements.txt b/requirements.txt index eb56c39..e02a8e2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,14 @@ faster-whisper>=1.0.0 +RealtimeSTT>=1.0.0 +silero-vad>=5.1 PyAudioWPatch>=0.2.12 sounddevice>=0.4.6 numpy>=1.24.0 customtkinter>=5.2.0 pystray>=0.19.5 Pillow>=10.0.0 +nvidia-cublas-cu12>=12.0 +nvidia-cuda-runtime-cu12>=12.0 +transformers>=4.23.0 +torch>=2.0.0 +keyboard>=0.13.5 diff --git a/src/hearsay/__main__.py b/src/hearsay/__main__.py index 1ae94cf..b29065f 100644 --- a/src/hearsay/__main__.py +++ b/src/hearsay/__main__.py @@ -1,13 +1,23 @@ """Entry point for Hearsay: python -m hearsay""" +import multiprocessing import sys def main() -> None: + # RealtimeSTT spawns a child process (spawn start method) for the main + # transcription model; freeze_support is required for frozen/PyInstaller builds. + multiprocessing.freeze_support() + from hearsay.utils.logging_setup import setup_logging setup_logging() + # Must run before any ctranslate2 / faster-whisper import on Windows + from hearsay.utils.cuda_dlls import register_nvidia_dlls + + register_nvidia_dlls() + from hearsay.app import HearsayApp app = HearsayApp() diff --git a/src/hearsay/app.py b/src/hearsay/app.py index a7b78ba..e63f0a6 100644 --- a/src/hearsay/app.py +++ b/src/hearsay/app.py @@ -9,14 +9,15 @@ import threading import time +import webbrowser + import customtkinter as ctk from hearsay.audio.recorder import AudioRecorder from hearsay.config import ConfigManager -from hearsay.constants import APP_NAME, LIVE_VIEW_POLL_MS +from hearsay.constants import APP_NAME, DEFAULT_CPU_COMPUTE from hearsay.output.markdown_writer import MarkdownWriter -from hearsay.transcription.engine import TranscriptionEngine -from hearsay.transcription.pipeline import TranscriptionPipeline +from hearsay.transcription.realtime_engine import CudaUnavailableError, RealtimeEngine from hearsay.ui.about_window import 
AboutWindow from hearsay.ui.live_view import LiveTranscriptWindow from hearsay.ui.settings_window import SettingsWindow @@ -35,21 +36,18 @@ def __init__(self) -> None: self._config_manager = ConfigManager() self._config = self._config_manager.config - # Queues - self._audio_queue: queue.Queue = queue.Queue(maxsize=10) - self._transcript_queue: queue.Queue = queue.Queue() - # Threads / components self._recorder: AudioRecorder | None = None - self._engine: TranscriptionEngine | None = None - self._pipeline: TranscriptionPipeline | None = None + self._engine: RealtimeEngine | None = None self._writer: MarkdownWriter | None = None self._tray: SystemTrayIcon | None = None # State self._recording = False self._recording_start_time: float | None = None + self._utterance_start_elapsed: float | None = None self._teardown_thread: threading.Thread | None = None + self._hotkey_combo: str | None = None # UI apply_theme() @@ -81,6 +79,7 @@ def run(self) -> None: self._root.after(500, self._show_wizard) else: log.info("Config loaded, ready to record") + self._register_hotkey() # Start tkinter event loop self._root.mainloop() @@ -97,6 +96,7 @@ def _on_wizard_complete(self) -> None: """Called when the setup wizard finishes.""" self._config = self._config_manager.config log.info("Wizard complete, app ready") + self._register_hotkey() def _start_recording(self, source: str) -> None: """Start recording from the given source.""" @@ -107,17 +107,24 @@ def _start_recording(self, source: str) -> None: log.info("Starting recording (source=%s)", source) self._recording = True self._recording_start_time = time.time() + self._utterance_start_elapsed = None # Set up markdown writer - self._writer = MarkdownWriter(self._config.output_dir) + self._writer = MarkdownWriter( + self._config.output_dir, language=self._config.language + ) - # Load transcription engine - self._engine = TranscriptionEngine( + # Dual-layer realtime engine (tentative + final) + self._engine = RealtimeEngine( 
model_name=self._config.model_name, + realtime_model_name=self._config.realtime_model_name, device=self._config.device, compute_type=self._config.compute_type, language=self._config.language, - vad_filter=self._config.vad_filter, + on_tentative=self._on_tentative, + on_final=self._on_final, + on_utterance_start=self._on_utterance_start, + post_speech_silence_duration=self._config.post_speech_silence_duration, ) def load_and_start() -> None: @@ -126,32 +133,35 @@ def load_and_start() -> None: self._teardown_thread.join(timeout=30) self._teardown_thread = None - # Now safe to clear queues (old teardown has finished draining them) - while not self._audio_queue.empty(): - try: - self._audio_queue.get_nowait() - except queue.Empty: - break - while not self._transcript_queue.empty(): - try: - self._transcript_queue.get_nowait() - except queue.Empty: - break - - self._engine.load() - - # Start pipeline - self._pipeline = TranscriptionPipeline( - audio_queue=self._audio_queue, - transcript_queue=self._transcript_queue, - engine=self._engine, + # Download HF model on-demand (deferred from settings save) + from hearsay.transcription.model_manager import ( + download_model, is_hf_custom_model, is_model_downloaded, ) - self._pipeline.start() + if (is_hf_custom_model(self._engine.model_name) + and not is_model_downloaded(self._engine.model_name)): + safe_after(self._root, 0, lambda: self._ensure_live_view().set_status("Downloading model...")) + try: + def _dl_progress(msg: str) -> None: + safe_after(self._root, 0, + lambda m=msg: self._ensure_live_view().set_status(f"Downloading: {m}")) + download_model(self._engine.model_name, progress_callback=_dl_progress) + except Exception as exc: + log.error("Model download failed at recording start", exc_info=True) + safe_after(self._root, 0, lambda e=str(exc): self._on_model_download_failed(e)) + return + + safe_after(self._root, 0, lambda: self._ensure_live_view().set_status("Loading model...")) + try: + self._engine.load() + except 
def _on_final(self, text: str) -> None:
    """Commit the finalized text of a completed utterance (committed layer).

    Invoked from an engine thread. Appends the utterance to the markdown
    writer, if one is attached, and schedules the timestamped line to be
    committed to the live view on the Tk main thread.

    Args:
        text: Final transcription of the utterance.
    """
    # Snapshot the shared attributes exactly once. _stop_recording (main
    # thread) resets _writer and _recording_start_time to None, so checking
    # an attribute and then reading it again can observe None in between.
    elapsed = self._utterance_start_elapsed
    started_at = self._recording_start_time
    writer = self._writer
    self._utterance_start_elapsed = None

    if elapsed is None:
        # No speech-onset stamp was recorded for this utterance: fall back
        # to "now" relative to the recording start, or 0.0 if that is gone.
        elapsed = time.time() - started_at if started_at is not None else 0.0

    if writer:
        writer.append_utterance(elapsed, text)

    from hearsay.output.formatter import format_timestamp

    line = f"[{format_timestamp(elapsed)}] {text}"

    def commit() -> None:
        # Runs on the main thread; the live view may have been closed since.
        view = self._live_view
        if view:
            view.commit_final(line)

    safe_after(self._root, 0, commit)
@@ -188,6 +228,9 @@ def _stop_recording(self) -> None: log.info("Stopping recording") self._recording = False + if self._config.beep_on_stop: + threading.Thread(target=self._play_beep, args=("stop",), daemon=True).start() + # Update tray immediately so the menu is responsive if self._tray: self._tray.set_recording(False) @@ -199,20 +242,18 @@ def _stop_recording(self) -> None: # Capture references for the background thread recorder = self._recorder - pipeline = self._pipeline engine = self._engine writer = self._writer start_time = self._recording_start_time self._recorder = None - self._pipeline = None self._engine = None self._writer = None self._recording_start_time = None self._teardown_thread = threading.Thread( target=self._teardown_recording, - args=(recorder, pipeline, engine, writer, start_time), + args=(recorder, engine, writer, start_time), daemon=True, name="RecordingTeardown", ) @@ -221,48 +262,19 @@ def _stop_recording(self) -> None: def _teardown_recording( self, recorder: AudioRecorder | None, - pipeline: TranscriptionPipeline | None, - engine: TranscriptionEngine | None, + engine: RealtimeEngine | None, writer: MarkdownWriter | None, start_time: float | None, ) -> None: """Blocking recording teardown — runs on a background thread.""" - # 1. Stop recorder first so it flushes remaining audio to the queue. + # 1. Stop recorder first so it stops feeding audio into the engine. if recorder: recorder.stop() recorder.join(timeout=5) - # 2. Stop pipeline -- it will drain any remaining audio chunks before - # exiting. Use a generous timeout so CPU transcription can finish. - if pipeline: - pipeline.stop() - pipeline.join(timeout=60) - if pipeline.is_alive(): - log.warning("Pipeline thread still running after join timeout") - - # 3. Unload model only after pipeline is done. + # 2. Shut down the engine (stops both models and the child process). 
if engine: - engine.unload() - - # Drain any remaining transcript results that arrived after polling stopped - if writer: - try: - while True: - result = self._transcript_queue.get_nowait() - writer.append(result) - if self._live_view: - for seg in result.segments: - from hearsay.output.formatter import format_timestamp - ts = format_timestamp( - result.chunk_index * 30 + seg["start"] - ) - safe_after(self._root, 0, - lambda t=f"[{ts}] {seg['text']}": ( - self._live_view.append_text(t) - if self._live_view else None - )) - except queue.Empty: - pass + engine.shutdown() # Finalize transcript duration = None @@ -280,6 +292,14 @@ def _teardown_recording( )) writer.post_process() + if self._config.beep_on_save: + self._play_beep("save") + + if self._config.copy_to_clipboard: + text = self._extract_clipboard_text(writer) + if text: + safe_after(self._root, 0, lambda t=text: self._copy_to_clipboard(t)) + # Insert session separator in live view end_time = time.strftime("%I:%M %p") safe_after(self._root, 0, lambda: ( @@ -291,32 +311,6 @@ def _teardown_recording( self._live_view.set_status("Idle") if self._live_view else None )) - def _poll_transcripts(self) -> None: - """Poll the transcript queue and update live view + markdown writer.""" - if not self._recording: - return - - try: - while True: - result = self._transcript_queue.get_nowait() - # Write to markdown - if self._writer: - self._writer.append(result) - # Update live view - if self._live_view: - for seg in result.segments: - from hearsay.output.formatter import format_timestamp - ts = format_timestamp( - result.chunk_index * 30 + seg["start"] - ) - self._live_view.append_text(f"[{ts}] {seg['text']}") - except queue.Empty: - pass - - # Schedule next poll - if self._recording: - safe_after(self._root, LIVE_VIEW_POLL_MS, self._poll_transcripts) - def _ensure_live_view(self) -> LiveTranscriptWindow: """Create live view if needed, return it.""" if self._live_view is None: @@ -332,9 +326,18 @@ def _open_settings(self) 
-> None: safe_after( self._root, 0, - lambda: SettingsWindow(self._root, self._config_manager), + lambda: SettingsWindow( + self._root, + self._config_manager, + on_save=self._on_settings_saved, + is_recording=lambda: self._recording, + ), ) + def _on_settings_saved(self) -> None: + self._config = self._config_manager.config + self._register_hotkey() + def _open_about(self) -> None: """Open the about window.""" safe_after( @@ -343,6 +346,153 @@ def _open_about(self) -> None: lambda: AboutWindow(self._root), ) + def _on_model_download_failed(self, error: str) -> None: + """Called on main thread when model download fails at recording start.""" + self._recording = False + self._engine = None + if self._tray: + self._tray.set_recording(False) + if self._live_view: + self._live_view.set_status("Download failed") + from tkinter import messagebox + messagebox.showerror( + "Model Download Failed", + "Failed to download the selected model. Check your internet connection " + "or select a different model in Settings.\n\n" + error[:200], + parent=self._root, + ) + + def _handle_cuda_error(self, source: str) -> None: + """Called on main thread when CUDA runtime DLLs are missing.""" + self._recording = False + self._engine = None + if self._tray: + self._tray.set_recording(False) + if self._live_view: + self._live_view.set_status("Idle") + self._show_cuda_error_dialog(source) + + def _show_cuda_error_dialog(self, source: str) -> None: + """Show a dialog offering CPU fallback or CUDA Toolkit install link.""" + dialog = ctk.CTkToplevel(self._root) + dialog.title("GPU Unavailable") + dialog.resizable(False, False) + dialog.grab_set() + + # Center on screen + dialog.update_idletasks() + w, h = 420, 220 + x = (dialog.winfo_screenwidth() - w) // 2 + y = (dialog.winfo_screenheight() - h) // 2 + dialog.geometry(f"{w}x{h}+{x}+{y}") + + ctk.CTkLabel( + dialog, + text="CUDA runtime library not found.", + font=ctk.CTkFont(size=14, weight="bold"), + ).pack(pady=(20, 4)) + + ctk.CTkLabel( + 
def _unregister_hotkey(self) -> None:
    """Remove the currently registered global hotkey, if there is one.

    Best-effort: never raises. After the call ``_hotkey_combo`` is always
    ``None``, whether or not the removal itself succeeded.
    """
    combo = self._hotkey_combo
    if not combo:
        return

    # Forget the combo before attempting removal so a failure below cannot
    # leave a stale value behind.
    self._hotkey_combo = None
    try:
        import keyboard as kb

        kb.remove_hotkey(combo)
    except Exception:
        # Deliberately non-fatal (this runs during re-registration and at
        # quit), but record the failure instead of hiding it.
        log.debug("Failed to unregister hotkey %r", combo, exc_info=True)
str) -> None: + try: + import winsound + if event == "start": + winsound.Beep(880, 120) + elif event == "stop": + winsound.Beep(520, 180) + elif event == "save": + winsound.Beep(660, 80) + winsound.Beep(880, 160) + except Exception: + pass + + # ── Clipboard ───────────────────────────────────────────────────────────── + + def _extract_clipboard_text(self, writer: MarkdownWriter) -> str: + try: + content = writer.file_path.read_text(encoding="utf-8") + header_end = content.index("\n\n") + 2 + footer_idx = content.rfind("\n---\n") + body = content[header_end:footer_idx] if footer_idx != -1 else content[header_end:] + return body.strip() + except Exception: + log.warning("Failed to extract clipboard text", exc_info=True) + return "" + + def _copy_to_clipboard(self, text: str) -> None: + try: + self._root.clipboard_clear() + self._root.clipboard_append(text) + log.info("Transcript copied to clipboard (%d chars)", len(text)) + except Exception: + log.warning("Failed to copy to clipboard", exc_info=True) + def _open_output_dir(self) -> None: """Open the output directory in file explorer.""" path = self._config.output_dir @@ -359,11 +509,10 @@ def _quit(self) -> None: if self._recording: self._recording = False self._teardown_recording( - self._recorder, self._pipeline, self._engine, + self._recorder, self._engine, self._writer, self._recording_start_time, ) self._recorder = None - self._pipeline = None self._engine = None self._writer = None self._recording_start_time = None @@ -371,6 +520,7 @@ def _quit(self) -> None: self._teardown_thread.join(timeout=30) self._teardown_thread = None + self._unregister_hotkey() if self._tray: self._tray.stop() safe_after(self._root, 100, self._root.quit) diff --git a/src/hearsay/audio/recorder.py b/src/hearsay/audio/recorder.py index 845b31c..4d054df 100644 --- a/src/hearsay/audio/recorder.py +++ b/src/hearsay/audio/recorder.py @@ -4,7 +4,7 @@ import logging import queue -import time +from typing import Callable import numpy as np @@ 
-14,21 +14,127 @@ AUDIO_SOURCE_BOTH, AUDIO_SOURCE_MIC, AUDIO_SOURCE_SYSTEM, - CHUNK_DURATION_S, + MAX_CHUNK_DURATION_S, + MIN_CHUNK_DURATION_S, OVERLAP_DURATION_S, SAMPLE_RATE, + SILENCE_DURATION_S, + SILENCE_RMS_THRESHOLD, ) from hearsay.utils.threading_utils import StoppableThread log = logging.getLogger(__name__) +class _ChunkAccumulator: + """Accumulates mono 16 kHz float32 audio and decides chunk boundaries. + + A chunk becomes ready when either: + * the buffer reaches ``MAX_CHUNK_DURATION_S`` (hard cap), or + * at least ``MIN_CHUNK_DURATION_S`` has accumulated AND the trailing + ``SILENCE_DURATION_S`` of audio is near-silent. + + Consecutive chunks share ``OVERLAP_DURATION_S`` of audio so the + transcription pipeline can stitch words across boundaries. Each emitted + chunk carries its absolute start time (seconds from the start of the + recording), so downstream timestamps stay correct despite variable lengths. + """ + + def __init__(self) -> None: + self._buffer: list[np.ndarray] = [] + self._total = 0 # samples currently buffered + self._silence_run = 0 # consecutive trailing near-silent samples + self._start_sample = 0 # absolute index of buffer[0] in the recording + self.chunk_index = 0 + + self._min = int(MIN_CHUNK_DURATION_S * SAMPLE_RATE) + self._max = int(MAX_CHUNK_DURATION_S * SAMPLE_RATE) + self._silence_needed = int(SILENCE_DURATION_S * SAMPLE_RATE) + self._overlap = int(OVERLAP_DURATION_S * SAMPLE_RATE) + + def add(self, mono: np.ndarray, silent: bool | None = None) -> None: + """Append a mono frame, updating the trailing-silence run. + + If *silent* is None, silence is computed from this frame's RMS. + Callers mixing multiple sources (Both mode) pass an explicit flag. 
+ """ + if mono is None or len(mono) == 0: + return + self._buffer.append(mono) + self._total += len(mono) + + if silent is None: + rms = float(np.sqrt(np.mean(mono ** 2))) + silent = rms < SILENCE_RMS_THRESHOLD + + if silent: + self._silence_run += len(mono) + else: + self._silence_run = 0 + + def ready(self) -> bool: + """True when the current buffer should be emitted as a chunk.""" + if self._total >= self._max: + return True + return self._total >= self._min and self._silence_run >= self._silence_needed + + def pop(self) -> tuple[int, float, np.ndarray]: + """Emit a chunk and retain the overlap tail. Returns (index, start_s, audio).""" + data = np.concatenate(self._buffer) + emitted_len = min(len(data), self._max) + chunk = data[:emitted_len] + start_time = self._start_sample / SAMPLE_RATE + idx = self.chunk_index + + # Advance by the unique (non-overlapping) audio we just consumed. + advance = max(0, emitted_len - self._overlap) + self._start_sample += advance + + if self._overlap > 0: + leftover = data[emitted_len - self._overlap:] + else: + leftover = data[emitted_len:] + self._buffer = [leftover] if len(leftover) else [] + self._total = int(len(leftover)) + self._silence_run = 0 + self.chunk_index += 1 + return idx, start_time, chunk + + def flush(self) -> tuple[int, float, np.ndarray] | None: + """Emit whatever remains (if > 1s) when recording stops.""" + if self._total <= SAMPLE_RATE: # less than 1 second — discard + return None + data = np.concatenate(self._buffer) + start_time = self._start_sample / SAMPLE_RATE + idx = self.chunk_index + self._buffer = [] + self._total = 0 + self.chunk_index += 1 + return idx, start_time, data + + +def _rms(mono: np.ndarray) -> float: + """Root-mean-square level of a mono float32 frame.""" + if mono is None or len(mono) == 0: + return 0.0 + return float(np.sqrt(np.mean(mono ** 2))) + + class AudioRecorder(StoppableThread): - """Record audio and push 30-second chunks to a queue. 
+ """Record audio and push variable-length chunks to a queue. + + Each queue item is a ``(chunk_index, start_time_s, np.ndarray)`` tuple, + where ``start_time_s`` is the chunk's absolute offset from the start of the + recording. + + When ``on_frame`` is provided, the recorder streams every mono 16 kHz + float32 frame to that callback instead of accumulating chunks into + ``audio_queue`` — used to feed RealtimeSTT continuously for low latency. Args: - audio_queue: Queue to push (chunk_index, np.ndarray) tuples. + audio_queue: Queue to push chunks to (ignored when ``on_frame`` is set). source: One of 'system', 'microphone', 'both'. + on_frame: Optional per-frame callback for streaming (RealtimeSTT) mode. loopback_device_index: PyAudioWPatch device index for loopback. mic_device_index: sounddevice device index for mic. """ @@ -37,6 +143,7 @@ def __init__( self, audio_queue: queue.Queue, source: str = AUDIO_SOURCE_SYSTEM, + on_frame: Callable[[np.ndarray], None] | None = None, loopback_device_index: int | None = None, mic_device_index: int | None = None, loopback_channels: int = 2, @@ -47,6 +154,7 @@ def __init__( super().__init__(name="AudioRecorder") self.audio_queue = audio_queue self.source = source + self.on_frame = on_frame self.loopback_device_index = loopback_device_index self.mic_device_index = mic_device_index self.loopback_channels = loopback_channels @@ -108,32 +216,19 @@ def _record_mic(self) -> None: """Record microphone via sounddevice.""" import sounddevice as sd - buffer: list[np.ndarray] = [] - chunk_samples = int(CHUNK_DURATION_S * SAMPLE_RATE) - overlap_samples = int(OVERLAP_DURATION_S * SAMPLE_RATE) - chunk_index = 0 + acc = _ChunkAccumulator() def callback(indata: np.ndarray, frames: int, time_info: object, status: object) -> None: - nonlocal chunk_index mono = resample(indata.copy(), self.mic_rate, self.mic_channels) - buffer.append(mono) - - total = sum(len(b) for b in buffer) - if total >= chunk_samples: - chunk = 
np.concatenate(buffer)[:chunk_samples] - self.audio_queue.put((chunk_index, chunk)) - chunk_index += 1 - # Keep overlap - if overlap_samples > 0: - leftover = np.concatenate(buffer)[chunk_samples - overlap_samples:] - buffer.clear() - buffer.append(leftover) - else: - buffer.clear() - - device = self.mic_device_index + if self.on_frame is not None: + self.on_frame(mono) + return + acc.add(mono) + if acc.ready(): + self.audio_queue.put(acc.pop()) + with sd.InputStream( - device=device, + device=self.mic_device_index, samplerate=self.mic_rate, channels=self.mic_channels, dtype="float32", @@ -142,11 +237,12 @@ def callback(indata: np.ndarray, frames: int, time_info: object, status: object) while not self.stopped(): self.wait(timeout=0.5) - # Flush remaining audio - if buffer: - chunk = np.concatenate(buffer) - if len(chunk) > SAMPLE_RATE: # Only if > 1 second - self.audio_queue.put((chunk_index, chunk)) + if self.on_frame is not None: + return + + final = acc.flush() + if final is not None: + self.audio_queue.put(final) def _record_both(self) -> None: """Record both loopback and mic, mix them. @@ -156,7 +252,8 @@ def _record_both(self) -> None: occurs when PyAudioWPatch and sounddevice run on the same thread. The mic stream uses PyAudio's callback mode so it accumulates data asynchronously while the main loop drives off blocking loopback - reads. + reads. Chunk boundaries are decided on the *combined* activity, so a + chunk is only cut when both sources fall silent. 
""" import pyaudiowpatch as pyaudio @@ -230,10 +327,15 @@ def mic_callback(in_data, frame_count, time_info, status_flags): mic_stream.start_stream() # --- Main loop (driven by blocking loopback reads) --- - chunk_samples = int(CHUNK_DURATION_S * SAMPLE_RATE) - overlap_samples = int(OVERLAP_DURATION_S * SAMPLE_RATE) - loopback_buf: list[np.ndarray] = [] - chunk_index = 0 + acc = _ChunkAccumulator() + + def mix_with_mic(lb_chunk: np.ndarray) -> np.ndarray: + if not mic_buffer: + return lb_chunk + mic_chunk = np.concatenate(mic_buffer)[:len(lb_chunk)] + if len(mic_chunk) < len(lb_chunk): + mic_chunk = np.pad(mic_chunk, (0, len(lb_chunk) - len(mic_chunk))) + return mix_streams(lb_chunk, mic_chunk) while not self.stopped(): try: @@ -241,49 +343,30 @@ def mic_callback(in_data, frame_count, time_info, status_flags): except Exception: break audio = np.frombuffer(raw, dtype=np.int16) - mono = resample(audio, self.loopback_rate, self.loopback_channels) - loopback_buf.append(mono) - - total = sum(len(b) for b in loopback_buf) - if total >= chunk_samples: - lb_chunk = np.concatenate(loopback_buf)[:chunk_samples] - mic_samples = sum(len(b) for b in mic_buffer) - log.debug( - "Mixing chunk %d: loopback=%d mic=%d samples", - chunk_index, len(lb_chunk), mic_samples, - ) - - if mic_buffer: - mic_chunk = np.concatenate(mic_buffer)[:chunk_samples] - if len(mic_chunk) < chunk_samples: - mic_chunk = np.pad(mic_chunk, (0, chunk_samples - len(mic_chunk))) - mixed = mix_streams(lb_chunk, mic_chunk) - else: - mixed = lb_chunk - - self.audio_queue.put((chunk_index, mixed)) - chunk_index += 1 - - if overlap_samples > 0: - leftover = np.concatenate(loopback_buf)[chunk_samples - overlap_samples:] - loopback_buf.clear() - loopback_buf.append(leftover) - else: - loopback_buf.clear() + lb_mono = resample(audio, self.loopback_rate, self.loopback_channels) + + if self.on_frame is not None: + self.on_frame(mix_with_mic(lb_mono)) + mic_buffer.clear() + continue + + # Combined silence: silent only 
when both sources are quiet. + # The latest mic frame approximates current mic activity. + mic_silent = _rms(mic_buffer[-1]) < SILENCE_RMS_THRESHOLD if mic_buffer else True + silent = (_rms(lb_mono) < SILENCE_RMS_THRESHOLD) and mic_silent + + acc.add(lb_mono, silent=silent) + if acc.ready(): + idx, start_time, lb_chunk = acc.pop() + self.audio_queue.put((idx, start_time, mix_with_mic(lb_chunk))) mic_buffer.clear() # --- Flush remaining audio --- - if loopback_buf: - lb_chunk = np.concatenate(loopback_buf) - if len(lb_chunk) > SAMPLE_RATE: # Only if > 1 second - if mic_buffer: - mic_chunk = np.concatenate(mic_buffer)[:len(lb_chunk)] - if len(mic_chunk) < len(lb_chunk): - mic_chunk = np.pad(mic_chunk, (0, len(lb_chunk) - len(mic_chunk))) - mixed = mix_streams(lb_chunk, mic_chunk) - else: - mixed = lb_chunk - self.audio_queue.put((chunk_index, mixed)) + if self.on_frame is None: + final = acc.flush() + if final is not None: + idx, start_time, lb_chunk = final + self.audio_queue.put((idx, start_time, mix_with_mic(lb_chunk))) mic_stream.stop_stream() mic_stream.close() @@ -298,11 +381,8 @@ def _chunk_loop( sr: int, channels: int, ) -> None: - """Generic chunking loop for loopback-style streams.""" - chunk_samples = int(CHUNK_DURATION_S * SAMPLE_RATE) - overlap_samples = int(OVERLAP_DURATION_S * SAMPLE_RATE) - buffer: list[np.ndarray] = [] - chunk_index = 0 + """Generic chunking loop for loopback-style (blocking-read) streams.""" + acc = _ChunkAccumulator() while not self.stopped(): try: @@ -311,25 +391,26 @@ def _chunk_loop( break audio = np.frombuffer(raw, dtype=np.int16) mono = resample(audio, sr, channels) - buffer.append(mono) - - total = sum(len(b) for b in buffer) - if total >= chunk_samples: - chunk = np.concatenate(buffer)[:chunk_samples] - self.audio_queue.put((chunk_index, chunk)) - chunk_index += 1 - log.debug("Audio chunk %d queued (%d samples)", chunk_index - 1, len(chunk)) - - if overlap_samples > 0: - leftover = np.concatenate(buffer)[chunk_samples - 
overlap_samples:] - buffer.clear() - buffer.append(leftover) - else: - buffer.clear() - - # Flush remaining audio - if buffer: - chunk = np.concatenate(buffer) - if len(chunk) > SAMPLE_RATE: # Only if > 1 second - self.audio_queue.put((chunk_index, chunk)) - log.debug("Final audio chunk %d queued (%d samples)", chunk_index, len(chunk)) + + if self.on_frame is not None: + self.on_frame(mono) + continue + + acc.add(mono) + + if acc.ready(): + idx, start_time, chunk = acc.pop() + self.audio_queue.put((idx, start_time, chunk)) + log.debug( + "Audio chunk %d queued (%d samples, t=%.1fs)", + idx, len(chunk), start_time, + ) + + if self.on_frame is not None: + return + + final = acc.flush() + if final is not None: + idx, start_time, chunk = final + self.audio_queue.put((idx, start_time, chunk)) + log.debug("Final audio chunk %d queued (%d samples)", idx, len(chunk)) diff --git a/src/hearsay/config.py b/src/hearsay/config.py index ea804c5..54a8497 100644 --- a/src/hearsay/config.py +++ b/src/hearsay/config.py @@ -11,6 +11,8 @@ AUDIO_SOURCE_SYSTEM, DEFAULT_CPU_COMPUTE, DEFAULT_CPU_MODEL, + DEFAULT_REALTIME_MODEL, + POST_SPEECH_SILENCE_S, ) from hearsay.utils.paths import get_config_path, get_default_output_dir @@ -36,12 +38,27 @@ class AppConfig: language: str = "en" vad_filter: bool = True + # Realtime dual-layer transcription (RealtimeSTT) + realtime_model_name: str = DEFAULT_REALTIME_MODEL + post_speech_silence_duration: float = POST_SPEECH_SILENCE_S + # Output output_dir: str = field(default_factory=lambda: str(get_default_output_dir())) # UI show_live_view_on_start: bool = False + # Hotkey + hotkey: str = "ctrl+alt+r" + + # Beep notifications + beep_on_start: bool = True + beep_on_stop: bool = True + beep_on_save: bool = True + + # Clipboard + copy_to_clipboard: bool = False + class ConfigManager: """Load and save AppConfig to JSON in %APPDATA%\\Hearsay.""" diff --git a/src/hearsay/constants.py b/src/hearsay/constants.py index 710dfca..8ee3f98 100644 --- 
a/src/hearsay/constants.py +++ b/src/hearsay/constants.py @@ -7,10 +7,34 @@ # Audio settings SAMPLE_RATE = 16000 # Whisper expects 16kHz CHANNELS = 1 # Whisper expects mono -CHUNK_DURATION_S = 30 # Whisper's native context window -OVERLAP_DURATION_S = 1 # Overlap between chunks to prevent word splitting +# Variable-length chunking driven by trailing-silence detection. +# A chunk is cut once at least MIN_CHUNK_DURATION_S has accumulated AND the +# trailing SILENCE_DURATION_S of audio is near-silent — or unconditionally once +# MAX_CHUNK_DURATION_S (Whisper's native context window) is reached. +MIN_CHUNK_DURATION_S = 5 # Minimum audio buffered before an early (silence) cut +MAX_CHUNK_DURATION_S = 30 # Hard cap — Whisper's native context window +SILENCE_DURATION_S = 1.0 # Trailing near-silence (seconds) that triggers a cut +SILENCE_RMS_THRESHOLD = 0.01 # RMS on [-1, 1] float audio below which ≈ silence +OVERLAP_DURATION_S = 1 # Overlap between chunks to prevent word splitting AUDIO_DTYPE = "float32" +# Custom HuggingFace models: short name -> {repo_id, parameters, vram_gb, english_only} +# These models are in Transformers format and must be converted to CTranslate2 on first use. 
+HF_CUSTOM_MODELS: dict[str, dict] = { + "small-ko": { + "repo_id": "SungBeom/whisper-small-ko", + "parameters": "244M", + "vram_gb": 2, + "english_only": False, + }, + "medium-ko-zeroth": { + "repo_id": "seastar105/whisper-medium-ko-zeroth", + "parameters": "769M", + "vram_gb": 5, + "english_only": False, + }, +} + # Model table: name -> (parameters, vram_gb, english_only) MODEL_TABLE = { "tiny": ("39M", 1, False), @@ -23,6 +47,9 @@ "medium.en": ("769M", 5, True), "large-v3": ("1550M", 10, False), "turbo": ("809M", 6, False), + # Korean fine-tuned models (HuggingFace, converted to CTranslate2 on first use) + "small-ko": ("244M", 2, False), + "medium-ko-zeroth": ("769M", 5, False), } # Default model recommendations @@ -31,6 +58,13 @@ DEFAULT_GPU_COMPUTE = "float16" DEFAULT_CPU_COMPUTE = "int8" +# RealtimeSTT dual-layer transcription. +# The fast model drives the tentative ("typing") layer; the main model +# (model_name above) produces the accurate final text once VAD detects the +# end of an utterance. 
+DEFAULT_REALTIME_MODEL = "tiny" # small/fast model for the tentative layer +POST_SPEECH_SILENCE_S = 0.7 # trailing silence (s) that finalizes an utterance + # Audio source options AUDIO_SOURCE_SYSTEM = "system" AUDIO_SOURCE_MIC = "microphone" @@ -43,6 +77,3 @@ # Transcript formatting PARAGRAPH_GAP_S = 2.0 # Silence gap (seconds) that triggers a paragraph break - -# UI -LIVE_VIEW_POLL_MS = 250 # Poll transcript queue every 250ms diff --git a/src/hearsay/output/markdown_writer.py b/src/hearsay/output/markdown_writer.py index 912585a..0a5684a 100644 --- a/src/hearsay/output/markdown_writer.py +++ b/src/hearsay/output/markdown_writer.py @@ -3,23 +3,27 @@ from __future__ import annotations import logging +import re from datetime import datetime from pathlib import Path -from hearsay.constants import PARAGRAPH_GAP_S -from hearsay.output.formatter import clean_transcript_text, make_title -from hearsay.transcription.engine import TranscriptionResult +from hearsay.output.formatter import clean_transcript_text, format_timestamp, make_title log = logging.getLogger(__name__) -# Markers used to split header / body / footer for post-processing _FOOTER_MARKER = "\n---\n" +_TS_LINE_RE = re.compile(r"^(\[\d+:\d+(?::\d+)?\] )(.+?)\ *$") class MarkdownWriter: - """Writes transcript results to a .md file, appending as chunks arrive.""" - - def __init__(self, output_dir: str | Path, title: str | None = None) -> None: + """Writes transcript results to a .md file, appending as utterances are finalized.""" + + def __init__( + self, + output_dir: str | Path, + title: str | None = None, + language: str = "en", + ) -> None: self.output_dir = Path(output_dir) self.output_dir.mkdir(parents=True, exist_ok=True) @@ -27,65 +31,25 @@ def __init__(self, output_dir: str | Path, title: str | None = None) -> None: timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") self.file_path = self.output_dir / f"transcript_{timestamp}.md" self._header_written = False - - # Track absolute timing across chunks 
for gap-based paragraph breaks - self._last_segment_end: float | None = None - self._language: str = "en" + self._language: str = language or "en" def _write_header(self) -> None: - """Write the markdown header on first call.""" with open(self.file_path, "w", encoding="utf-8") as f: f.write(f"# {self.title}\n\n") self._header_written = True log.info("Transcript file created: %s", self.file_path) - def append(self, result: TranscriptionResult) -> None: - """Append a transcription result using segment-level gap detection.""" + def append_utterance(self, elapsed_seconds: float, text: str) -> None: + """Append one finalized utterance as a timestamped line matching the live view.""" + text = text.strip() + if not text: + return if not self._header_written: self._write_header() - self._language = result.language or self._language - - if not result.segments: - self._append_fallback(result) - return - - chunk_offset = result.chunk_index * 30 # seconds offset for this chunk - pieces: list[str] = [] - - for seg in result.segments: - seg_start = chunk_offset + seg["start"] - seg_text = seg["text"].strip() - if not seg_text: - continue - - # Determine separator: paragraph break on long gap, space otherwise - if self._last_segment_end is not None: - gap = seg_start - self._last_segment_end - if gap >= PARAGRAPH_GAP_S: - pieces.append("\n\n") - else: - pieces.append(" ") - # else: very first segment, no separator needed - - pieces.append(seg_text) - self._last_segment_end = chunk_offset + seg["end"] - - if pieces: - with open(self.file_path, "a", encoding="utf-8") as f: - f.write("".join(pieces)) - - log.debug("Appended chunk %d to %s", result.chunk_index, self.file_path) - - def _append_fallback(self, result: TranscriptionResult) -> None: - """Fallback for results with empty segments (e.g. 
after dedup).""" - text = result.text.strip() - if not text: - return + ts = format_timestamp(elapsed_seconds) with open(self.file_path, "a", encoding="utf-8") as f: - if self._last_segment_end is not None: - f.write(" ") - f.write(text) + f.write(f"[{ts}] {text} \n") def finalize(self, total_duration: float | None = None) -> Path: """Write a footer and return the file path.""" @@ -93,7 +57,7 @@ def finalize(self, total_duration: float | None = None) -> Path: self._write_header() with open(self.file_path, "a", encoding="utf-8") as f: - f.write("\n\n---\n\n") + f.write("\n---\n\n") f.write(f"*Generated by Hearsay on {datetime.now():%Y-%m-%d at %H:%M}*\n") if total_duration: from hearsay.output.formatter import format_duration @@ -103,28 +67,33 @@ def finalize(self, total_duration: float | None = None) -> Path: return self.file_path def post_process(self) -> None: - """Read the finalized transcript, clean up the body, and rewrite.""" + """Clean up the text portion of each timestamped line, preserving timestamps.""" if not self.file_path.exists(): return content = self.file_path.read_text(encoding="utf-8") - - # Split into header, body, footer using the --- marker footer_idx = content.rfind(_FOOTER_MARKER) if footer_idx == -1: log.warning("No footer marker found, skipping post-processing") return - # Header ends at first double newline after the title line header_end = content.index("\n\n") + 2 header = content[:header_end] body = content[header_end:footer_idx] footer = content[footer_idx:] - cleaned = clean_transcript_text(body, language=self._language) + cleaned_lines: list[str] = [] + for line in body.splitlines(keepends=True): + m = _TS_LINE_RE.match(line.rstrip("\n")) + if m: + ts_prefix, text = m.group(1), m.group(2) + text = clean_transcript_text(text, language=self._language) + cleaned_lines.append(f"{ts_prefix}{text} \n") + else: + cleaned_lines.append(line) self.file_path.write_text( - header + cleaned + footer, + header + "".join(cleaned_lines) + footer, 
encoding="utf-8", ) log.info("Post-processed transcript: %s", self.file_path) diff --git a/src/hearsay/transcription/engine.py b/src/hearsay/transcription/engine.py deleted file mode 100644 index 8495de7..0000000 --- a/src/hearsay/transcription/engine.py +++ /dev/null @@ -1,118 +0,0 @@ -"""TranscriptionEngine: wraps faster-whisper for inference.""" - -from __future__ import annotations - -import logging -from dataclasses import dataclass - -import numpy as np - -from hearsay.utils.paths import get_models_dir - -log = logging.getLogger(__name__) - - -@dataclass -class TranscriptionResult: - """Result from transcribing one audio chunk.""" - - text: str - segments: list[dict] # [{start, end, text}, ...] - language: str - language_probability: float - chunk_index: int - - -class TranscriptionEngine: - """Wraps faster-whisper WhisperModel for inference.""" - - def __init__( - self, - model_name: str = "small.en", - device: str = "cpu", - compute_type: str = "int8", - language: str = "en", - vad_filter: bool = True, - ) -> None: - self.model_name = model_name - self.device = device - self.compute_type = compute_type - self.language = language - self.vad_filter = vad_filter - self._model = None - - def load(self) -> None: - """Load the Whisper model into memory.""" - from faster_whisper import WhisperModel - - log.info( - "Loading model '%s' (device=%s, compute=%s)", - self.model_name, - self.device, - self.compute_type, - ) - self._model = WhisperModel( - self.model_name, - device=self.device, - compute_type=self.compute_type, - download_root=str(get_models_dir()), - ) - log.info("Model loaded successfully") - - def transcribe( - self, - audio: np.ndarray, - chunk_index: int = 0, - ) -> TranscriptionResult: - """Transcribe a float32 16kHz mono audio array. - - Args: - audio: Audio data as float32 numpy array at 16kHz. - chunk_index: Index of this chunk (for ordering). - - Returns: - TranscriptionResult with text and segment details. 
- """ - if self._model is None: - raise RuntimeError("Model not loaded. Call load() first.") - - segments_iter, info = self._model.transcribe( - audio, - beam_size=5, - language=self.language if self.language else None, - vad_filter=self.vad_filter, - vad_parameters={"min_silence_duration_ms": 500}, - ) - - segments = [] - texts = [] - for seg in segments_iter: - segments.append({ - "start": seg.start, - "end": seg.end, - "text": seg.text.strip(), - }) - texts.append(seg.text.strip()) - - full_text = " ".join(texts) - log.debug( - "Chunk %d: %d segments, lang=%s (%.2f), text=%s", - chunk_index, - len(segments), - info.language, - info.language_probability, - full_text[:100], - ) - - return TranscriptionResult( - text=full_text, - segments=segments, - language=info.language, - language_probability=info.language_probability, - chunk_index=chunk_index, - ) - - def unload(self) -> None: - """Free model memory.""" - self._model = None - log.info("Model unloaded") diff --git a/src/hearsay/transcription/gpu_detect.py b/src/hearsay/transcription/gpu_detect.py index 1c33a68..acfb283 100644 --- a/src/hearsay/transcription/gpu_detect.py +++ b/src/hearsay/transcription/gpu_detect.py @@ -27,34 +27,146 @@ class GPUInfo: recommended_device: str +def _gpu_name_from_nvidia_smi() -> str: + """Query GPU name via nvidia-smi without requiring torch.""" + try: + import subprocess + result = subprocess.run( + ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], + capture_output=True, text=True, timeout=5, + ) + if result.returncode == 0: + return result.stdout.strip().splitlines()[0].strip() + except Exception: + pass + return "" + + +def _vram_gb_from_nvidia_smi() -> float: + """Query total VRAM in GB via nvidia-smi.""" + try: + import subprocess + result = subprocess.run( + ["nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits"], + capture_output=True, text=True, timeout=5, + ) + if result.returncode == 0: + mib = 
float(result.stdout.strip().splitlines()[0].strip()) + return round(mib / 1024, 1) + except Exception: + pass + return 0.0 + + +def _vram_gb_from_name(name: str) -> float: + """Estimate VRAM from GPU name when ctranslate2 doesn't expose memory info.""" + name_lower = name.lower() + # RTX 40xx series + if "4090" in name_lower: + return 24.0 + if "4080" in name_lower: + return 16.0 + if "4070 ti" in name_lower: + return 12.0 + if "4070" in name_lower: + return 12.0 + if "4060 ti" in name_lower: + return 8.0 + if "4060" in name_lower: + return 8.0 + # RTX 30xx series + if "3090" in name_lower: + return 24.0 + if "3080" in name_lower: + return 10.0 + if "3070" in name_lower: + return 8.0 + if "3060 ti" in name_lower: + return 8.0 + if "3060" in name_lower: + return 12.0 + if "3050" in name_lower: + return 8.0 + # RTX 20xx series + if "2080 ti" in name_lower: + return 11.0 + if "2080" in name_lower: + return 8.0 + if "2070" in name_lower: + return 8.0 + if "2060" in name_lower: + return 6.0 + return 4.0 # conservative default + + +def _cuda_runtime_usable() -> bool: + """Probe the CUDA runtime by allocating a tiny CTranslate2 storage object. + + ctranslate2.get_cuda_device_count() only checks the driver; the actual + runtime DLLs (cublas64_12.dll etc.) are loaded lazily on first use. + This call forces that load so we can detect a broken installation early. 
+ """ + try: + import ctranslate2 + ctranslate2.StorageView([1], ctranslate2.DataType.int8, ctranslate2.Device.cuda) + return True + except Exception as exc: + log.warning("CUDA runtime probe failed: %s", exc) + return False + + def detect_gpu() -> GPUInfo: - """Detect CUDA GPU and return recommendation.""" + """Detect CUDA GPU via ctranslate2 (same backend faster-whisper uses).""" try: - import torch - - if torch.cuda.is_available(): - name = torch.cuda.get_device_name(0) - vram_bytes = torch.cuda.get_device_properties(0).total_mem - vram_gb = vram_bytes / (1024**3) - log.info("CUDA GPU found: %s (%.1f GB VRAM)", name, vram_gb) - - if vram_gb >= 6: - model = DEFAULT_GPU_MODEL - elif vram_gb >= 2: - model = "small.en" + import ctranslate2 + + cuda_count = ctranslate2.get_cuda_device_count() + if cuda_count > 0: + if not _cuda_runtime_usable(): + log.warning( + "CUDA device found but runtime DLLs are missing " + "(install CUDA Toolkit 12.x). Falling back to CPU." + ) + # Fall through to CPU return below else: - model = "tiny.en" - - return GPUInfo( - cuda_available=True, - gpu_name=name, - vram_gb=round(vram_gb, 1), - recommended_model=model, - recommended_compute=DEFAULT_GPU_COMPUTE, - recommended_device="cuda", - ) + # Try to get GPU name via torch if available; otherwise fall back gracefully + gpu_name = "" + vram_gb = 0.0 + try: + import torch + if torch.cuda.is_available(): + gpu_name = torch.cuda.get_device_name(0) + vram_bytes = torch.cuda.get_device_properties(0).total_mem + vram_gb = round(vram_bytes / (1024**3), 1) + except Exception: + pass + + if not gpu_name: + gpu_name = _gpu_name_from_nvidia_smi() or "CUDA Device 0" + + if vram_gb == 0.0: + vram_gb = _vram_gb_from_nvidia_smi() or _vram_gb_from_name(gpu_name) + + log.info("CUDA GPU found: %s (%.1f GB VRAM)", gpu_name, vram_gb) + + if vram_gb >= 6: + model = DEFAULT_GPU_MODEL + elif vram_gb >= 2: + model = "small.en" + else: + model = "tiny.en" + + return GPUInfo( + cuda_available=True, + 
gpu_name=gpu_name, + vram_gb=vram_gb, + recommended_model=model, + recommended_compute=DEFAULT_GPU_COMPUTE, + recommended_device="cuda", + ) + log.info("No CUDA devices found via ctranslate2") except ImportError: - log.info("PyTorch not installed, assuming CPU-only") + log.info("ctranslate2 not installed, assuming CPU-only") except Exception: log.warning("GPU detection failed", exc_info=True) diff --git a/src/hearsay/transcription/model_manager.py b/src/hearsay/transcription/model_manager.py index ed6150c..3fee329 100644 --- a/src/hearsay/transcription/model_manager.py +++ b/src/hearsay/transcription/model_manager.py @@ -3,9 +3,12 @@ from __future__ import annotations import logging +import shutil +import subprocess +import sys from pathlib import Path -from hearsay.constants import MODEL_TABLE +from hearsay.constants import HF_CUSTOM_MODELS, MODEL_TABLE from hearsay.utils.paths import get_models_dir log = logging.getLogger(__name__) @@ -21,57 +24,162 @@ def get_model_info(name: str) -> tuple[str, int, bool] | None: return MODEL_TABLE.get(name) +def is_hf_custom_model(name: str) -> bool: + """Return True if this model requires HuggingFace download + CTranslate2 conversion.""" + return name in HF_CUSTOM_MODELS + + +def get_hf_model_local_path(name: str) -> Path: + """Return the local CTranslate2 directory path for a custom HF model.""" + return get_models_dir() / f"hf-ct2-{name}" + + +def resolve_model_path(name: str) -> str: + """Return the model name or local path string for WhisperModel(). + + For standard models, returns the name as-is (faster-whisper handles download). + For custom HF models, returns the local CTranslate2 directory path. 
+ """ + if is_hf_custom_model(name): + return str(get_hf_model_local_path(name)) + return name + + def is_model_downloaded(name: str) -> bool: """Check if a model is already cached locally.""" + if is_hf_custom_model(name): + local_path = get_hf_model_local_path(name) + return local_path.exists() and (local_path / "model.bin").exists() + model_dir = get_models_dir() - # faster-whisper stores models in subdirectories named after the model - # Check for the CTranslate2 model file model_path = model_dir / f"models--Systran--faster-whisper-{name}" if model_path.exists(): return True - # Also check for direct directory naming alt_path = model_dir / name return alt_path.exists() and any(alt_path.iterdir()) +def _get_converter_cmd() -> str: + """Find the ct2-transformers-converter executable.""" + converter = shutil.which("ct2-transformers-converter") + if converter: + return converter + + import site + candidate_dirs: list[Path] = [Path(sys.executable).parent] + + # pip --user installs scripts under {userbase}/PythonXY/Scripts on Windows + user_base = Path(site.getuserbase()) + for child in user_base.iterdir() if user_base.exists() else []: + if child.is_dir() and child.name.startswith("Python"): + candidate_dirs.append(child / "Scripts") + candidate_dirs.append(user_base / "Scripts") + candidate_dirs.append(user_base / "bin") + + for d in candidate_dirs: + for exe_name in ["ct2-transformers-converter", "ct2-transformers-converter.exe"]: + p = d / exe_name + if p.exists(): + return str(p) + + raise RuntimeError( + "ct2-transformers-converter not found.\n" + "Install required packages:\n" + " pip install ctranslate2 transformers torch" + ) + + +def _download_and_convert_hf_model( + name: str, + progress_callback: callable | None = None, +) -> None: + """Download a HuggingFace Whisper model and convert it to CTranslate2 format.""" + info = HF_CUSTOM_MODELS[name] + repo_id = info["repo_id"] + local_path = get_hf_model_local_path(name) + + log.info("Downloading and 
converting HF model '%s' -> %s", repo_id, local_path) + + try: + converter = _get_converter_cmd() + except RuntimeError as exc: + raise RuntimeError(str(exc)) from exc + + local_path.mkdir(parents=True, exist_ok=True) + + if progress_callback: + progress_callback(f"Downloading '{repo_id}' from HuggingFace...") + + result = subprocess.run( + [ + converter, + "--model", repo_id, + "--output_dir", str(local_path), + "--quantization", "int8", + "--force", + ], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + shutil.rmtree(local_path, ignore_errors=True) + stderr_tail = result.stderr[-600:] if result.stderr else "(no output)" + raise RuntimeError( + f"CTranslate2 conversion failed for '{repo_id}':\n{stderr_tail}\n\n" + "Make sure torch is installed: pip install torch" + ) + + log.info("HF model '%s' converted successfully to %s", repo_id, local_path) + + if progress_callback: + progress_callback(f"Model '{name}' ready!") + + def download_model( name: str, progress_callback: callable | None = None, ) -> str: - """Download a model if not cached. Returns the model size string for faster-whisper. + """Download (and convert if needed) a model. Returns model path/name for WhisperModel(). Args: - name: Model name (e.g., 'turbo', 'small.en'). + name: Model name from MODEL_TABLE. progress_callback: Optional callable(status_text) for progress updates. Returns: - The model name/path string to pass to WhisperModel(). + The model name or local path string to pass to WhisperModel(). 
""" if name not in MODEL_TABLE: raise ValueError(f"Unknown model: {name}") + if is_hf_custom_model(name): + if not is_model_downloaded(name): + if progress_callback: + progress_callback(f"Converting '{name}' to CTranslate2 format (this may take several minutes)...") + _download_and_convert_hf_model(name, progress_callback) + elif progress_callback: + progress_callback(f"Model '{name}' already converted.") + return str(get_hf_model_local_path(name)) + + # Standard faster-whisper model if progress_callback: progress_callback(f"Preparing model '{name}'...") model_dir = get_models_dir() log.info("Downloading/loading model '%s' to %s", name, model_dir) - # faster-whisper downloads models from Hugging Face on first use. - # We trigger this by importing and constructing the model. - # The download_root parameter controls where models are cached. from faster_whisper import WhisperModel if progress_callback: progress_callback(f"Downloading '{name}' (this may take a few minutes)...") - # This will download if not cached _model = WhisperModel( name, device="cpu", compute_type="int8", download_root=str(model_dir), ) - del _model # Free memory; the real model will be loaded by the engine + del _model if progress_callback: progress_callback(f"Model '{name}' ready!") diff --git a/src/hearsay/transcription/pipeline.py b/src/hearsay/transcription/pipeline.py deleted file mode 100644 index 7f96ced..0000000 --- a/src/hearsay/transcription/pipeline.py +++ /dev/null @@ -1,150 +0,0 @@ -"""TranscriptionPipeline thread: consumes audio chunks, produces transcript text.""" - -from __future__ import annotations - -import logging -import queue -import string -import time - -from hearsay.transcription.engine import TranscriptionEngine, TranscriptionResult -from hearsay.utils.threading_utils import StoppableThread - -log = logging.getLogger(__name__) - - -class TranscriptionPipeline(StoppableThread): - """Daemon thread that reads audio chunks from audio_queue, - transcribes them, and pushes 
results to transcript_queue. - - Args: - audio_queue: Input queue of (chunk_index, np.ndarray) tuples. - transcript_queue: Output queue of TranscriptionResult objects. - engine: Configured TranscriptionEngine (model already loaded). - """ - - _TAIL_WORD_COUNT = 15 # words kept from previous chunk for overlap matching - _MIN_MATCH_WORDS = 2 # minimum overlap length to avoid false positives - - def __init__( - self, - audio_queue: queue.Queue, - transcript_queue: queue.Queue, - engine: TranscriptionEngine, - ) -> None: - super().__init__(name="TranscriptionPipeline") - self.audio_queue = audio_queue - self.transcript_queue = transcript_queue - self.engine = engine - self._prev_tail_words: list[str] = [] - - def run(self) -> None: - log.info("TranscriptionPipeline started") - while not self.stopped(): - try: - chunk_index, audio = self.audio_queue.get(timeout=1.0) - except queue.Empty: - continue - self._process_chunk(chunk_index, audio) - - # Drain any audio chunks still in the queue after stop signal. - # The recorder flushes its buffer before exiting, so these chunks - # must be transcribed to avoid losing the tail of the recording. 
- log.info("TranscriptionPipeline draining remaining audio chunks") - while True: - try: - chunk_index, audio = self.audio_queue.get_nowait() - except queue.Empty: - break - self._process_chunk(chunk_index, audio) - - log.info("TranscriptionPipeline stopped") - - def _process_chunk(self, chunk_index: int, audio) -> None: - """Transcribe a single audio chunk and enqueue the result.""" - try: - t0 = time.perf_counter() - result = self.engine.transcribe(audio, chunk_index=chunk_index) - elapsed = time.perf_counter() - t0 - log.info( - "Chunk %d transcribed in %.1fs: %s", - chunk_index, - elapsed, - result.text[:80] if result.text else "(empty)", - ) - if result.text: - original_words = result.text.split() - if chunk_index > 0 and self._prev_tail_words: - result = self._deduplicate(result) - self._prev_tail_words = original_words[-self._TAIL_WORD_COUNT:] - if result.text: - self.transcript_queue.put(result) - except Exception: - log.error("Transcription failed for chunk %d", chunk_index, exc_info=True) - - @staticmethod - def _normalize(word: str) -> str: - """Strip leading/trailing punctuation for comparison.""" - return word.strip(string.punctuation) - - def _deduplicate(self, result: TranscriptionResult) -> TranscriptionResult: - """Remove overlapping prefix from *result* that duplicates the tail of the previous chunk.""" - new_words = result.text.split() - if len(new_words) < self._MIN_MATCH_WORDS: - return result - - # Find the longest prefix of new_words that matches a suffix of _prev_tail_words. - best = 0 - for length in range(self._MIN_MATCH_WORDS, min(len(self._prev_tail_words), len(new_words)) + 1): - suffix = self._prev_tail_words[-length:] - prefix = new_words[:length] - tail = [self._normalize(w).lower() for w in suffix] - head = [self._normalize(w).lower() for w in prefix] - # All words after the first must match exactly; the first word of the - # new chunk may be truncated (e.g. 
"replaced" -> "placed") so allow a - # suffix-of-word match when the fragment is at least 3 characters. - first_ok = tail[0] == head[0] or (len(head[0]) >= 3 and tail[0].endswith(head[0])) - if first_ok and tail[1:] == head[1:]: - best = length - - if best == 0: - return result - - stripped_words = new_words[best:] - log.info( - "Chunk %d: stripped %d overlapping words: %s", - result.chunk_index, - best, - " ".join(new_words[:best]), - ) - - if not stripped_words: - return TranscriptionResult( - text="", - segments=[], - language=result.language, - language_probability=result.language_probability, - chunk_index=result.chunk_index, - ) - - # Rebuild text and trim leading segments that were fully covered by the overlap. - new_text = " ".join(stripped_words) - chars_removed = len(" ".join(new_words[:best])) + 1 # +1 for the space after - trimmed_segments = [] - for seg in result.segments: - seg_text = seg["text"] - if chars_removed >= len(seg_text): - chars_removed -= len(seg_text) + 1 # +1 for joining space - continue - if chars_removed > 0: - seg = {**seg, "text": seg_text[chars_removed:].lstrip()} - chars_removed = 0 - trimmed_segments.append(seg) - - return TranscriptionResult( - text=new_text, - segments=trimmed_segments if trimmed_segments else result.segments, - language=result.language, - language_probability=result.language_probability, - chunk_index=result.chunk_index, - ) diff --git a/src/hearsay/transcription/realtime_engine.py b/src/hearsay/transcription/realtime_engine.py new file mode 100644 index 0000000..0f8b40e --- /dev/null +++ b/src/hearsay/transcription/realtime_engine.py @@ -0,0 +1,169 @@ +"""RealtimeEngine: dual-layer transcription via RealtimeSTT. + +Audio is captured by Hearsay's own AudioRecorder (system loopback / mic / both) +and fed into RealtimeSTT through ``feed_audio`` (``use_microphone=False``). 
Two +whisper models run concurrently: + + * a fast *realtime* model drives the tentative ("typing") layer, revised + continuously as the user speaks (``on_tentative``); + * the accurate *main* model produces the final text once VAD detects the end + of an utterance (``on_final``). +""" + +from __future__ import annotations + +import logging +import threading +import time +from typing import Callable + +import numpy as np + +from hearsay.transcription.model_manager import resolve_model_path +from hearsay.utils.paths import get_models_dir + +log = logging.getLogger(__name__) + + +class CudaUnavailableError(RuntimeError): + """Raised when GPU is configured but CUDA is not available.""" + + +class RealtimeEngine: + """Drives RealtimeSTT with externally fed audio and two output layers.""" + + def __init__( + self, + model_name: str, + realtime_model_name: str, + device: str, + compute_type: str, + language: str, + on_tentative: Callable[[str], None], + on_final: Callable[[str], None], + on_utterance_start: Callable[[], None] | None = None, + post_speech_silence_duration: float = 0.7, + ) -> None: + self.model_name = model_name + self.realtime_model_name = realtime_model_name + self.device = device + self.compute_type = compute_type + self.language = language or "" + self._on_tentative = on_tentative + self._on_final = on_final + self._on_utterance_start = on_utterance_start + self._post_speech_silence_duration = post_speech_silence_duration + + self._recorder = None + self._final_thread: threading.Thread | None = None + self._stop = threading.Event() + self._final_emitted = threading.Event() + + def load(self) -> None: + """Create the RealtimeSTT recorder (spawns the main-model process) and + start the final-text loop. 
Blocks until both models are ready.""" + if self.device == "cuda": + try: + import torch + if not torch.cuda.is_available(): + raise CudaUnavailableError("CUDA is not available") + except CudaUnavailableError: + raise + except Exception as exc: # torch import/init failure + raise CudaUnavailableError(str(exc)) from exc + + from RealtimeSTT import AudioToTextRecorder + + model = resolve_model_path(self.model_name) + log.info( + "Loading RealtimeSTT (main=%s, realtime=%s, device=%s, compute=%s)", + self.model_name, self.realtime_model_name, self.device, self.compute_type, + ) + self._recorder = AudioToTextRecorder( + model=model, + realtime_model_type=self.realtime_model_name, + language=self.language, + device=self.device, + compute_type=self.compute_type, + download_root=str(get_models_dir()), + use_microphone=False, + enable_realtime_transcription=True, + on_realtime_transcription_stabilized=self._handle_tentative, + on_recording_start=self._handle_utterance_start, + post_speech_silence_duration=self._post_speech_silence_duration, + spinner=False, + level=logging.WARNING, + no_log_file=True, + ) + log.info("RealtimeSTT ready") + + self._final_thread = threading.Thread( + target=self._final_loop, daemon=True, name="RealtimeFinal", + ) + self._final_thread.start() + + def feed(self, mono_float32: np.ndarray) -> None: + """Feed one mono 16 kHz float32 frame into RealtimeSTT. + + ``feed_audio`` casts directly to int16 without scaling, so float [-1, 1] + audio must be scaled into the int16 range first. 
+ """ + rec = self._recorder + if rec is None or mono_float32 is None or len(mono_float32) == 0: + return + pcm16 = np.clip(mono_float32 * 32768.0, -32768, 32767).astype(np.int16) + try: + rec.feed_audio(pcm16, 16000) + except Exception: + log.error("feed_audio failed", exc_info=True) + + def _handle_tentative(self, text: str) -> None: + if text and text.strip() and not self._stop.is_set(): + self._on_tentative(text.strip()) + + def _handle_utterance_start(self) -> None: + if self._on_utterance_start is not None and not self._stop.is_set(): + self._on_utterance_start() + + def _final_loop(self) -> None: + """Block on recorder.text() and emit each finalized utterance.""" + while not self._stop.is_set(): + try: + text = self._recorder.text() + except Exception: + if self._stop.is_set(): + break + log.error("RealtimeSTT text() failed", exc_info=True) + break + if text and text.strip(): + self._on_final(text.strip()) + self._final_emitted.set() + if self._stop.is_set(): + break + + def shutdown(self) -> None: + """Finalize any in-progress utterance, then tear down the recorder.""" + rec = self._recorder + if rec is not None and getattr(rec, "is_recording", False): + # Stopped mid-utterance: gracefully stop the active recording so its + # buffered audio gets a final transcription instead of being dropped. 
+ started = getattr(rec, "recording_start_time", 0) or 0 + min_len = getattr(rec, "min_length_of_recording", 0.5) + if not started or (time.time() - started) >= min_len: + try: + self._final_emitted.clear() + rec.stop() + self._final_emitted.wait(timeout=15) + except Exception: + log.warning("Error finalizing in-progress utterance", exc_info=True) + + self._stop.set() + self._recorder = None + if rec is not None: + try: + rec.shutdown() + except Exception: + log.warning("RealtimeSTT shutdown error", exc_info=True) + if self._final_thread is not None: + self._final_thread.join(timeout=10) + self._final_thread = None diff --git a/src/hearsay/ui/live_view.py b/src/hearsay/ui/live_view.py index 8169357..82d7288 100644 --- a/src/hearsay/ui/live_view.py +++ b/src/hearsay/ui/live_view.py @@ -31,7 +31,7 @@ def __init__(self, master: ctk.CTk) -> None: # Delay disclaimer ctk.CTkLabel( self, - text="Transcript text appears with a delay of approximately 30\u201360 seconds depending on your hardware.", + text="Live text (gray) updates as you speak; it is replaced by the final, more accurate text after a brief pause.", font=("Segoe UI", 10, "italic"), text_color="gray", anchor="w", @@ -46,6 +46,11 @@ def __init__(self, master: ctk.CTk) -> None: ) self._textbox.pack(fill="both", expand=True, padx=10, pady=(10, 5)) + # The tentative (in-progress) line is rendered in gray and replaced in + # place each time RealtimeSTT revises it, then committed as a final line. 
+ self._textbox.tag_config("tentative", foreground="#888888") + self._tent_start_index: str | None = None + # Bottom bar with status and controls bottom = ctk.CTkFrame(self) bottom.pack(fill="x", padx=10, pady=(0, 10)) @@ -96,15 +101,52 @@ def toggle(self) -> None: self.show() def append_text(self, text: str) -> None: - """Append text to the transcript view.""" + """Append a finished line to the transcript view.""" self._textbox.configure(state="normal") self._textbox.insert("end", text + "\n") self._textbox.configure(state="disabled") if self._autoscroll.get(): self._textbox.see("end") + def update_tentative(self, text: str) -> None: + """Show or revise the in-progress (gray) line at the bottom of the view.""" + tb = self._textbox + tb.configure(state="normal") + if self._tent_start_index is None: + self._tent_start_index = tb.index("end-1c") + else: + tb.delete(self._tent_start_index, "end-1c") + tb.insert(self._tent_start_index, text) + tb.tag_add("tentative", self._tent_start_index, "end-1c") + tb.configure(state="disabled") + if self._autoscroll.get(): + tb.see("end") + + def commit_final(self, line: str) -> None: + """Replace the tentative line (if any) with a committed final line.""" + tb = self._textbox + tb.configure(state="normal") + if self._tent_start_index is not None: + tb.delete(self._tent_start_index, "end-1c") + self._tent_start_index = None + tb.insert("end-1c", line + "\n") + tb.configure(state="disabled") + if self._autoscroll.get(): + tb.see("end") + + def drop_tentative(self) -> None: + """Discard the in-progress line without committing it.""" + if self._tent_start_index is None: + return + tb = self._textbox + tb.configure(state="normal") + tb.delete(self._tent_start_index, "end-1c") + self._tent_start_index = None + tb.configure(state="disabled") + def append_separator(self, timestamp: str) -> None: """Insert a visual divider marking the end of a recording session.""" + self.drop_tentative() self._textbox.configure(state="normal") 
self._textbox.insert("end", f"\n--- Recording ended at {timestamp} ---\n\n") self._textbox.configure(state="disabled") @@ -117,6 +159,7 @@ def set_status(self, text: str) -> None: def clear(self) -> None: """Clear all transcript text.""" + self._tent_start_index = None self._textbox.configure(state="normal") self._textbox.delete("1.0", "end") self._textbox.configure(state="disabled") diff --git a/src/hearsay/ui/settings_window.py b/src/hearsay/ui/settings_window.py index a7f386b..7486be6 100644 --- a/src/hearsay/ui/settings_window.py +++ b/src/hearsay/ui/settings_window.py @@ -15,6 +15,10 @@ AUDIO_SOURCE_SYSTEM, MODEL_TABLE, ) +from hearsay.transcription.model_manager import ( + is_hf_custom_model, + is_model_downloaded, +) log = logging.getLogger(__name__) @@ -22,14 +26,23 @@ class SettingsWindow(ctk.CTkToplevel): """Settings editor window.""" - def __init__(self, master: ctk.CTk, config_manager: ConfigManager) -> None: + def __init__( + self, + master: ctk.CTk, + config_manager: ConfigManager, + on_save: "Callable | None" = None, + is_recording: "Callable[[], bool] | None" = None, + ) -> None: super().__init__(master) self.title(f"{APP_NAME} Settings") - self.geometry("550x520") + self.geometry("550x620") self.resizable(False, False) self._config_manager = config_manager self._config = config_manager.config + self._on_save = on_save + self._is_recording = is_recording or (lambda: False) + self._capturing = False self._build_ui() self.grab_set() @@ -43,7 +56,7 @@ def _build_ui(self) -> None: ).pack(pady=(15, 10)) # Scrollable content - scroll = ctk.CTkScrollableFrame(self, width=490, height=360) + scroll = ctk.CTkScrollableFrame(self, width=490, height=460) scroll.pack(fill="both", expand=True, padx=20, pady=(0, 10)) # ── Audio Source ── @@ -70,9 +83,16 @@ def _build_ui(self) -> None: variable=self._model_var, values=list(MODEL_TABLE.keys()), width=200, + command=self._on_model_changed, ) self._model_menu.pack(anchor="w", padx=15) + self._model_hint = 
ctk.CTkLabel( + scroll, text="", font=("Segoe UI", 10), text_color="gray" + ) + self._model_hint.pack(anchor="w", padx=15) + self._update_model_hint(self._config.model_name) + # ── Compute Type ── ctk.CTkLabel(scroll, text="Compute Type", font=("Segoe UI", 14, "bold")).pack( anchor="w", pady=(15, 5) @@ -106,7 +126,7 @@ def _build_ui(self) -> None: self._lang_entry = ctk.CTkEntry(scroll, textvariable=self._lang_var, width=100) self._lang_entry.pack(anchor="w", padx=15) ctk.CTkLabel( - scroll, text="ISO 639-1 code (e.g., en, es, fr) or empty for auto-detect", + scroll, text="ISO 639-1 code (e.g., en, ko, fr) or empty for auto-detect", font=("Segoe UI", 10), text_color="gray" ).pack(anchor="w", padx=15) @@ -131,17 +151,133 @@ def _build_ui(self) -> None: dir_frame, text="Browse", width=70, command=self._browse ).pack(side="left") + # ── Hotkey ── + ctk.CTkLabel(scroll, text="Recording Hotkey", font=("Segoe UI", 14, "bold")).pack( + anchor="w", pady=(15, 5) + ) + hotkey_frame = ctk.CTkFrame(scroll, fg_color="transparent") + hotkey_frame.pack(anchor="w", padx=15, fill="x") + + self._hotkey_var = ctk.StringVar(value=self._config.hotkey) + self._hotkey_entry = ctk.CTkEntry( + hotkey_frame, textvariable=self._hotkey_var, width=200, state="readonly" + ) + self._hotkey_entry.pack(side="left", padx=(0, 8)) + self._capture_btn = ctk.CTkButton( + hotkey_frame, text="Capture", width=80, command=self._start_capture + ) + self._capture_btn.pack(side="left") + ctk.CTkLabel( + scroll, text="Press Ctrl+Alt+R or any modifier+key combo", + font=("Segoe UI", 10), text_color="gray" + ).pack(anchor="w", padx=15) + + # ── Beep Notifications ── + ctk.CTkLabel(scroll, text="Beep Notifications", font=("Segoe UI", 14, "bold")).pack( + anchor="w", pady=(15, 5) + ) + self._beep_start_var = ctk.BooleanVar(value=self._config.beep_on_start) + self._beep_stop_var = ctk.BooleanVar(value=self._config.beep_on_stop) + self._beep_save_var = ctk.BooleanVar(value=self._config.beep_on_save) + 
ctk.CTkCheckBox( + scroll, text="Beep on recording start", variable=self._beep_start_var + ).pack(anchor="w", padx=15, pady=2) + ctk.CTkCheckBox( + scroll, text="Beep on recording stop", variable=self._beep_stop_var + ).pack(anchor="w", padx=15, pady=2) + ctk.CTkCheckBox( + scroll, text="Beep on transcript save", variable=self._beep_save_var + ).pack(anchor="w", padx=15, pady=2) + + # ── Clipboard ── + ctk.CTkLabel(scroll, text="Clipboard", font=("Segoe UI", 14, "bold")).pack( + anchor="w", pady=(15, 5) + ) + self._clipboard_var = ctk.BooleanVar(value=self._config.copy_to_clipboard) + ctk.CTkCheckBox( + scroll, + text="Copy transcript to clipboard on save", + variable=self._clipboard_var, + ).pack(anchor="w", padx=15, pady=2) + # ── Buttons ── - btn_frame = ctk.CTkFrame(self) - btn_frame.pack(fill="x", padx=20, pady=(0, 15)) + self._btn_frame = ctk.CTkFrame(self) + self._btn_frame.pack(fill="x", padx=20, pady=(0, 15)) - ctk.CTkButton( - btn_frame, text="Save", width=100, command=self._save - ).pack(side="right", padx=5) - ctk.CTkButton( - btn_frame, text="Cancel", width=100, fg_color="gray", + self._save_btn = ctk.CTkButton( + self._btn_frame, text="Save", width=100, command=self._save + ) + self._save_btn.pack(side="right", padx=5) + self._cancel_btn = ctk.CTkButton( + self._btn_frame, text="Cancel", width=100, fg_color="gray", command=self._cancel - ).pack(side="right", padx=5) + ) + self._cancel_btn.pack(side="right", padx=5) + + def _start_capture(self) -> None: + self._capturing = True + self._hotkey_entry.configure(state="normal") + self._hotkey_var.set("Press hotkey...") + self._hotkey_entry.configure(state="readonly") + self._capture_btn.configure(text="Cancel", command=self._cancel_capture) + self._hotkey_entry.focus_set() + self.bind("<Key>", self._on_key_capture) + + def _cancel_capture(self) -> None: + self._capturing = False + self.unbind("<Key>") + self._hotkey_entry.configure(state="normal") + self._hotkey_var.set(self._config.hotkey) + 
self._hotkey_entry.configure(state="readonly") + self._capture_btn.configure(text="Capture", command=self._start_capture) + + def _on_key_capture(self, event) -> str: + keysym = event.keysym.lower() + modifier_only = { + "control_l", "control_r", "alt_l", "alt_r", + "shift_l", "shift_r", "super_l", "super_r", + } + if keysym in modifier_only: + return "break" + if keysym == "escape": + self._cancel_capture() + return "break" + + parts = [] + if event.state & 0x4: # Ctrl + parts.append("ctrl") + if event.state & 0x1: # Shift + parts.append("shift") + if event.state & 0x20000: # Alt (Windows) + parts.append("alt") + + if not parts: + return "break" # require at least one modifier + + parts.append(keysym) + combo = "+".join(parts) + + self._capturing = False + self.unbind("<Key>") + self._hotkey_entry.configure(state="normal") + self._hotkey_var.set(combo) + self._hotkey_entry.configure(state="readonly") + self._capture_btn.configure(text="Capture", command=self._start_capture) + return "break" + + def _on_model_changed(self, name: str) -> None: + self._update_model_hint(name) + + def _update_model_hint(self, name: str) -> None: + if is_hf_custom_model(name): + if is_model_downloaded(name): + self._model_hint.configure(text="Korean model (converted, ready)", text_color="green") + else: + self._model_hint.configure( + text="Korean model — will download when recording starts", text_color="#e07800" + ) + else: + self._model_hint.configure(text="") def _browse(self) -> None: path = filedialog.askdirectory( @@ -152,6 +288,29 @@ def _browse(self) -> None: self._dir_var.set(path) def _save(self) -> None: + self._apply_and_close() + + def _apply_and_close(self) -> None: + if self._is_recording(): + _LOCKED = [ + ("Model", self._model_var.get(), self._config.model_name), + ("Device", self._device_var.get(), self._config.device), + ("Compute Type", self._compute_var.get(), self._config.compute_type), + ("Language", self._lang_var.get().strip(), self._config.language), + ("VAD 
Filter", self._vad_var.get(), self._config.vad_filter), + ("Audio Source", self._source_var.get(), self._config.audio_source), + ("Output Directory", self._dir_var.get(), self._config.output_dir), + ] + changed = [name for name, new, old in _LOCKED if new != old] + if changed: + from tkinter import messagebox + messagebox.showinfo( + "Recording Active", + "Settings saved.\n\n" + "The following changes will take effect when you start the next recording:\n" + + "".join(f"\n - {c}" for c in changed), + parent=self, + ) self._config.audio_source = self._source_var.get() self._config.model_name = self._model_var.get() self._config.compute_type = self._compute_var.get() @@ -159,10 +318,17 @@ def _save(self) -> None: self._config.language = self._lang_var.get() self._config.vad_filter = self._vad_var.get() self._config.output_dir = self._dir_var.get() + self._config.hotkey = self._hotkey_var.get() + self._config.beep_on_start = self._beep_start_var.get() + self._config.beep_on_stop = self._beep_stop_var.get() + self._config.beep_on_save = self._beep_save_var.get() + self._config.copy_to_clipboard = self._clipboard_var.get() self._config_manager.save() log.info("Settings saved") self.grab_release() self.destroy() + if self._on_save: + self._on_save() def _cancel(self) -> None: self.grab_release() diff --git a/src/hearsay/utils/cuda_dlls.py b/src/hearsay/utils/cuda_dlls.py new file mode 100644 index 0000000..b29df87 --- /dev/null +++ b/src/hearsay/utils/cuda_dlls.py @@ -0,0 +1,79 @@ +"""Register NVIDIA pip-package DLL directories on Windows before ctranslate2 loads.""" + +from __future__ import annotations + +import logging +import os +import site +import sys +from pathlib import Path + +log = logging.getLogger(__name__) + + +def _nvidia_bin_dirs() -> list[Path]: + """Return every nvidia/*/bin directory found in any site-packages.""" + search_roots: list[Path] = [] + + # user site-packages (pip install --user) + try: + user_site = site.getusersitepackages() + if user_site: + 
search_roots.append(Path(user_site)) + except Exception: + pass + + # system / venv site-packages + for p in site.getsitepackages(): + search_roots.append(Path(p)) + + found: list[Path] = [] + seen: set[Path] = set() + for root in search_roots: + nvidia_root = root / "nvidia" + if not nvidia_root.is_dir(): + continue + for bin_dir in nvidia_root.glob("*/bin"): + if bin_dir.is_dir() and bin_dir not in seen: + seen.add(bin_dir) + found.append(bin_dir) + + return found + + +def register_nvidia_dlls() -> bool: + """Add NVIDIA pip-package bin dirs to the Windows DLL search path. + + Uses both os.add_dll_directory() (for Python extension modules) and + prepends to PATH (for ctranslate2's ctypes.CDLL calls, which only + respect PATH on Windows). + + Returns True if at least one directory was registered. + No-op on non-Windows platforms. + """ + if sys.platform != "win32": + return False + + dirs = _nvidia_bin_dirs() + if not dirs: + log.debug("No nvidia pip-package bin dirs found; skipping DLL registration") + return False + + registered = 0 + path_entries: list[str] = [] + for d in dirs: + try: + os.add_dll_directory(str(d)) + path_entries.append(str(d)) + log.debug("Registered DLL dir: %s", d) + registered += 1 + except Exception as exc: + log.warning("Could not register DLL dir %s: %s", d, exc) + + if path_entries: + os.environ["PATH"] = os.pathsep.join(path_entries) + os.pathsep + os.environ.get("PATH", "") + + if registered: + log.info("Registered %d NVIDIA DLL director%s from pip packages", + registered, "y" if registered == 1 else "ies") + return registered > 0