diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..4055f54
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,58 @@
+# Builds the Windows installer and publishes it as a GitHub Release whenever a
+# version tag (vMAJOR.MINOR.PATCH) is pushed.
+name: Release
+
+on:
+  push:
+    tags:
+      - 'v*.*.*'
+
+permissions:
+  contents: write  # required to create the release and upload its asset
+
+jobs:
+  build-and-release:
+    runs-on: windows-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+          cache: pip
+
+      - name: Install dependencies
+        run: pip install -r requirements.txt pyinstaller
+
+      - name: Update installer version
+        shell: pwsh
+        env:
+          # Hand the tag to the script as data; expanding it inside the script
+          # text would let characters in the tag name be parsed as PowerShell.
+          TAG_NAME: ${{ github.ref_name }}
+        run: |
+          $version = $env:TAG_NAME.TrimStart("v")
+          (Get-Content installer.iss) -replace 'AppVersion=.*', "AppVersion=$version" | Set-Content installer.iss
+
+      - name: Build with PyInstaller
+        shell: cmd
+        run: build.bat
+
+      - name: Install Inno Setup
+        shell: pwsh
+        run: choco install innosetup --yes --no-progress
+
+      - name: Build installer
+        shell: pwsh
+        run: '& "C:\Program Files (x86)\Inno Setup 6\ISCC.exe" installer.iss'
+
+      - name: Create GitHub Release
+        uses: softprops/action-gh-release@v2
+        with:
+          files: installer_output/HearsaySetup.exe
+          # Fail instead of publishing a release that is missing the installer.
+          fail_on_unmatched_files: true
+          generate_release_notes: true
diff --git a/build.bat b/build.bat
index c85a855..5e6b1ca 100644
--- a/build.bat
+++ b/build.bat
@@ -14,9 +14,24 @@ pyinstaller --noconfirm --onedir --windowed ^
     --hidden-import "sounddevice" ^
     --hidden-import "customtkinter" ^
     --hidden-import "pystray" ^
+    --hidden-import "RealtimeSTT" ^
+    --hidden-import "silero_vad" ^
+    --hidden-import "webrtcvad" ^
+    --hidden-import "onnxruntime" ^
+    --hidden-import "scipy" ^
+    --hidden-import "soundfile" ^
+    --hidden-import "torch" ^
+    --hidden-import "torchaudio" ^
     --collect-all "customtkinter" ^
     --collect-all "faster_whisper" ^
     --collect-all "ctranslate2" ^
+    --collect-all "RealtimeSTT" ^
+    --collect-all "silero_vad" ^
+    --collect-all "onnxruntime" ^
+    --collect-all "scipy" ^
+    --collect-all "soundfile" ^
+    --collect-all "torch" ^
+    --collect-all "torchaudio" ^
     src\hearsay\__main__.py
 
 echo.
diff --git a/requirements.txt b/requirements.txt
index eb56c39..e02a8e2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,14 @@
 faster-whisper>=1.0.0
+RealtimeSTT>=1.0.0
+silero-vad>=5.1
 PyAudioWPatch>=0.2.12
 sounddevice>=0.4.6
 numpy>=1.24.0
 customtkinter>=5.2.0
 pystray>=0.19.5
 Pillow>=10.0.0
+nvidia-cublas-cu12>=12.0
+nvidia-cuda-runtime-cu12>=12.0
+transformers>=4.23.0
+torch>=2.0.0
+keyboard>=0.13.5
diff --git a/src/hearsay/__main__.py b/src/hearsay/__main__.py
index 1ae94cf..b29065f 100644
--- a/src/hearsay/__main__.py
+++ b/src/hearsay/__main__.py
@@ -1,13 +1,23 @@
 """Entry point for Hearsay: python -m hearsay"""
 
+import multiprocessing
 import sys
 
 
 def main() -> None:
+    # RealtimeSTT spawns a child process (spawn start method) for the main
+    # transcription model; freeze_support is required for frozen/PyInstaller builds.
+    multiprocessing.freeze_support()
+
     from hearsay.utils.logging_setup import setup_logging
 
     setup_logging()
 
+    # Must run before any ctranslate2 / faster-whisper import on Windows
+    from hearsay.utils.cuda_dlls import register_nvidia_dlls
+
+    register_nvidia_dlls()
+
     from hearsay.app import HearsayApp
 
     app = HearsayApp()
diff --git a/src/hearsay/app.py b/src/hearsay/app.py
index a7b78ba..e63f0a6 100644
--- a/src/hearsay/app.py
+++ b/src/hearsay/app.py
@@ -9,14 +9,15 @@
 import threading
 import time
 
+import webbrowser
+
 import customtkinter as ctk
 
 from hearsay.audio.recorder import AudioRecorder
 from hearsay.config import ConfigManager
-from hearsay.constants import APP_NAME, LIVE_VIEW_POLL_MS
+from hearsay.constants import APP_NAME, DEFAULT_CPU_COMPUTE
 from hearsay.output.markdown_writer import MarkdownWriter
-from hearsay.transcription.engine import TranscriptionEngine
-from hearsay.transcription.pipeline import TranscriptionPipeline
+from hearsay.transcription.realtime_engine import CudaUnavailableError, RealtimeEngine
 from hearsay.ui.about_window import AboutWindow
 from hearsay.ui.live_view import LiveTranscriptWindow
 from hearsay.ui.settings_window import SettingsWindow
@@ -35,21 +36,18 @@ def __init__(self) -> None:
         self._config_manager = ConfigManager()
         self._config = self._config_manager.config
 
-        # Queues
-        self._audio_queue: queue.Queue = queue.Queue(maxsize=10)
-        self._transcript_queue: queue.Queue = queue.Queue()
-
         # Threads / components
         self._recorder: AudioRecorder | None = None
-        self._engine: TranscriptionEngine | None = None
-        self._pipeline: TranscriptionPipeline | None = None
+        self._engine: RealtimeEngine | None = None
         self._writer: MarkdownWriter | None = None
         self._tray: SystemTrayIcon | None = None
 
         # State
         self._recording = False
         self._recording_start_time: float | None = None
+        self._utterance_start_elapsed: float | None = None
         self._teardown_thread: threading.Thread | None = None
+        self._hotkey_combo: str | None = None
 
         # UI
         apply_theme()
@@ -81,6 +79,7 @@ def run(self) -> None:
             self._root.after(500, self._show_wizard)
         else:
             log.info("Config loaded, ready to record")
+            self._register_hotkey()
 
         # Start tkinter event loop
         self._root.mainloop()
@@ -97,6 +96,7 @@ def _on_wizard_complete(self) -> None:
         """Called when the setup wizard finishes."""
         self._config = self._config_manager.config
         log.info("Wizard complete, app ready")
+        self._register_hotkey()
 
     def _start_recording(self, source: str) -> None:
         """Start recording from the given source."""
@@ -107,17 +107,24 @@ def _start_recording(self, source: str) -> None:
         log.info("Starting recording (source=%s)", source)
         self._recording = True
         self._recording_start_time = time.time()
+        self._utterance_start_elapsed = None
 
         # Set up markdown writer
-        self._writer = MarkdownWriter(self._config.output_dir)
+        self._writer = MarkdownWriter(
+            self._config.output_dir, language=self._config.language
+        )
 
-        # Load transcription engine
-        self._engine = TranscriptionEngine(
+        # Dual-layer realtime engine (tentative + final)
+        self._engine = RealtimeEngine(
             model_name=self._config.model_name,
+            realtime_model_name=self._config.realtime_model_name,
             device=self._config.device,
             compute_type=self._config.compute_type,
             language=self._config.language,
-            vad_filter=self._config.vad_filter,
+            on_tentative=self._on_tentative,
+            on_final=self._on_final,
+            on_utterance_start=self._on_utterance_start,
+            post_speech_silence_duration=self._config.post_speech_silence_duration,
         )
 
         def load_and_start() -> None:
@@ -126,32 +133,35 @@ def load_and_start() -> None:
                 self._teardown_thread.join(timeout=30)
                 self._teardown_thread = None
 
-            # Now safe to clear queues (old teardown has finished draining them)
-            while not self._audio_queue.empty():
-                try:
-                    self._audio_queue.get_nowait()
-                except queue.Empty:
-                    break
-            while not self._transcript_queue.empty():
-                try:
-                    self._transcript_queue.get_nowait()
-                except queue.Empty:
-                    break
-
-            self._engine.load()
-
-            # Start pipeline
-            self._pipeline = TranscriptionPipeline(
-                audio_queue=self._audio_queue,
-                transcript_queue=self._transcript_queue,
-                engine=self._engine,
+            # Download HF model on-demand (deferred from settings save)
+            from hearsay.transcription.model_manager import (
+                download_model, is_hf_custom_model, is_model_downloaded,
             )
-            self._pipeline.start()
+            if (is_hf_custom_model(self._engine.model_name)
+                    and not is_model_downloaded(self._engine.model_name)):
+                safe_after(self._root, 0, lambda: self._ensure_live_view().set_status("Downloading model..."))
+                try:
+                    def _dl_progress(msg: str) -> None:
+                        safe_after(self._root, 0,
+                                   lambda m=msg: self._ensure_live_view().set_status(f"Downloading: {m}"))
+                    download_model(self._engine.model_name, progress_callback=_dl_progress)
+                except Exception as exc:
+                    log.error("Model download failed at recording start", exc_info=True)
+                    safe_after(self._root, 0, lambda e=str(exc): self._on_model_download_failed(e))
+                    return
+
+            safe_after(self._root, 0, lambda: self._ensure_live_view().set_status("Loading model..."))
+            try:
+                self._engine.load()
+            except CudaUnavailableError:
+                safe_after(self._root, 0, lambda: self._handle_cuda_error(source))
+                return
 
-            # Start recorder
+            # Start recorder in streaming mode — frames feed straight into the engine
             self._recorder = AudioRecorder(
-                audio_queue=self._audio_queue,
+                queue.Queue(),
                 source=source,
+                on_frame=self._engine.feed,
             )
             self._recorder.start()
 
@@ -172,8 +182,38 @@ def _on_recording_started(self) -> None:
             self._tray.set_recording(True)
         if self._live_view:
             self._live_view.set_status("Recording...")
-        # Start polling transcript queue
-        self._poll_transcripts()
+        if self._config.beep_on_start:
+            threading.Thread(target=self._play_beep, args=("start",), daemon=True).start()
+
+    # ── Transcription callbacks (from the engine threads) ───────────────────────
+
+    def _on_utterance_start(self) -> None:
+        """RealtimeSTT detected speech onset — stamp the utterance's start time."""
+        if (started := self._recording_start_time) is not None:  # single read: stop may clear it
+            self._utterance_start_elapsed = time.time() - started
+
+    def _on_tentative(self, text: str) -> None:
+        """Forward revised in-progress text from the fast realtime model to the live view (gray layer)."""
+        safe_after(self._root, 0, lambda t=text: (  # t=text binds the value when the lambda is created
+            self._live_view.update_tentative(t) if self._live_view else None
+        ))
+
+    def _on_final(self, text: str) -> None:
+        """Finalized, accurate text for a completed utterance (committed layer)."""
+        started, writer = self._recording_start_time, self._writer  # snapshot: _stop_recording() resets both
+        elapsed = self._utterance_start_elapsed
+        if elapsed is None and started is not None:
+            elapsed = time.time() - started  # no onset was stamped: fall back to "now"
+        elapsed = elapsed or 0.0
+        self._utterance_start_elapsed = None
+
+        if writer:
+            writer.append_utterance(elapsed, text)
+
+        from hearsay.output.formatter import format_timestamp
+        line = f"[{format_timestamp(elapsed)}] {text}"
+        safe_after(self._root, 0, lambda l=line: (
+            self._live_view.commit_final(l) if self._live_view else None))
 
     def _stop_recording(self) -> None:
         """Stop the current recording session.
@@ -188,6 +228,9 @@ def _stop_recording(self) -> None:
         log.info("Stopping recording")
         self._recording = False
 
+        if self._config.beep_on_stop:
+            threading.Thread(target=self._play_beep, args=("stop",), daemon=True).start()
+
         # Update tray immediately so the menu is responsive
         if self._tray:
             self._tray.set_recording(False)
@@ -199,20 +242,18 @@ def _stop_recording(self) -> None:
 
         # Capture references for the background thread
         recorder = self._recorder
-        pipeline = self._pipeline
         engine = self._engine
         writer = self._writer
         start_time = self._recording_start_time
 
         self._recorder = None
-        self._pipeline = None
         self._engine = None
         self._writer = None
         self._recording_start_time = None
 
         self._teardown_thread = threading.Thread(
             target=self._teardown_recording,
-            args=(recorder, pipeline, engine, writer, start_time),
+            args=(recorder, engine, writer, start_time),
             daemon=True,
             name="RecordingTeardown",
         )
@@ -221,48 +262,19 @@ def _stop_recording(self) -> None:
     def _teardown_recording(
         self,
         recorder: AudioRecorder | None,
-        pipeline: TranscriptionPipeline | None,
-        engine: TranscriptionEngine | None,
+        engine: RealtimeEngine | None,
         writer: MarkdownWriter | None,
         start_time: float | None,
     ) -> None:
         """Blocking recording teardown — runs on a background thread."""
-        # 1. Stop recorder first so it flushes remaining audio to the queue.
+        # 1. Stop recorder first so it stops feeding audio into the engine.
         if recorder:
             recorder.stop()
             recorder.join(timeout=5)
 
-        # 2. Stop pipeline -- it will drain any remaining audio chunks before
-        #    exiting.  Use a generous timeout so CPU transcription can finish.
-        if pipeline:
-            pipeline.stop()
-            pipeline.join(timeout=60)
-            if pipeline.is_alive():
-                log.warning("Pipeline thread still running after join timeout")
-
-        # 3. Unload model only after pipeline is done.
+        # 2. Shut down the engine (stops both models and the child process).
         if engine:
-            engine.unload()
-
-        # Drain any remaining transcript results that arrived after polling stopped
-        if writer:
-            try:
-                while True:
-                    result = self._transcript_queue.get_nowait()
-                    writer.append(result)
-                    if self._live_view:
-                        for seg in result.segments:
-                            from hearsay.output.formatter import format_timestamp
-                            ts = format_timestamp(
-                                result.chunk_index * 30 + seg["start"]
-                            )
-                            safe_after(self._root, 0,
-                                       lambda t=f"[{ts}] {seg['text']}": (
-                                           self._live_view.append_text(t)
-                                           if self._live_view else None
-                                       ))
-            except queue.Empty:
-                pass
+            engine.shutdown()
 
         # Finalize transcript
         duration = None
@@ -280,6 +292,14 @@ def _teardown_recording(
             ))
             writer.post_process()
 
+            if self._config.beep_on_save:
+                self._play_beep("save")
+
+            if self._config.copy_to_clipboard:
+                text = self._extract_clipboard_text(writer)
+                if text:
+                    safe_after(self._root, 0, lambda t=text: self._copy_to_clipboard(t))
+
         # Insert session separator in live view
         end_time = time.strftime("%I:%M %p")
         safe_after(self._root, 0, lambda: (
@@ -291,32 +311,6 @@ def _teardown_recording(
             self._live_view.set_status("Idle") if self._live_view else None
         ))
 
-    def _poll_transcripts(self) -> None:
-        """Poll the transcript queue and update live view + markdown writer."""
-        if not self._recording:
-            return
-
-        try:
-            while True:
-                result = self._transcript_queue.get_nowait()
-                # Write to markdown
-                if self._writer:
-                    self._writer.append(result)
-                # Update live view
-                if self._live_view:
-                    for seg in result.segments:
-                        from hearsay.output.formatter import format_timestamp
-                        ts = format_timestamp(
-                            result.chunk_index * 30 + seg["start"]
-                        )
-                        self._live_view.append_text(f"[{ts}] {seg['text']}")
-        except queue.Empty:
-            pass
-
-        # Schedule next poll
-        if self._recording:
-            safe_after(self._root, LIVE_VIEW_POLL_MS, self._poll_transcripts)
-
     def _ensure_live_view(self) -> LiveTranscriptWindow:
         """Create live view if needed, return it."""
         if self._live_view is None:
@@ -332,9 +326,18 @@ def _open_settings(self) -> None:
         safe_after(
             self._root,
             0,
-            lambda: SettingsWindow(self._root, self._config_manager),
+            lambda: SettingsWindow(
+                self._root,
+                self._config_manager,
+                on_save=self._on_settings_saved,
+                is_recording=lambda: self._recording,
+            ),
         )
 
+    def _on_settings_saved(self) -> None:
+        self._config = self._config_manager.config  # re-read the config after the settings window saved
+        self._register_hotkey()  # rebind in case the hotkey setting changed
+
     def _open_about(self) -> None:
         """Open the about window."""
         safe_after(
@@ -343,6 +346,153 @@ def _open_about(self) -> None:
             lambda: AboutWindow(self._root),
         )
 
+    def _on_model_download_failed(self, error: str) -> None:
+        """Reset recording state and report a failed model download (called on the main thread)."""
+        self._recording, self._engine = False, None
+        if self._tray:
+            self._tray.set_recording(False)
+        if self._live_view:
+            self._live_view.set_status("Download failed")
+        from tkinter import messagebox
+        detail = error[:200]  # only the first 200 characters of the error are shown
+        messagebox.showerror(
+            "Model Download Failed",
+            "Failed to download the selected model. Check your internet connection "
+            "or select a different model in Settings.\n\n" + detail,
+            parent=self._root,
+        )
+
+    def _handle_cuda_error(self, source: str) -> None:
+        """Reset recording state after a CUDA failure, then offer the fallback dialog (main thread)."""
+        self._recording, self._engine = False, None
+        tray, view = self._tray, self._live_view
+        if tray:
+            tray.set_recording(False)
+        if view:
+            view.set_status("Idle")
+        self._show_cuda_error_dialog(source)
+
+    def _show_cuda_error_dialog(self, source: str) -> None:
+        """Show a dialog offering a CPU fallback (restarts recording from *source*) or the CUDA download page."""
+        dialog = ctk.CTkToplevel(self._root)
+        dialog.title("GPU Unavailable")
+        dialog.resizable(False, False)
+        dialog.grab_set()  # direct input to this dialog while it is open
+
+        # Center a fixed-size window on the screen
+        dialog.update_idletasks()
+        w, h = 420, 220
+        x = (dialog.winfo_screenwidth() - w) // 2
+        y = (dialog.winfo_screenheight() - h) // 2
+        dialog.geometry(f"{w}x{h}+{x}+{y}")
+
+        ctk.CTkLabel(
+            dialog,
+            text="CUDA runtime library not found.",
+            font=ctk.CTkFont(size=14, weight="bold"),
+        ).pack(pady=(20, 4))
+
+        ctk.CTkLabel(
+            dialog,
+            text=(
+                "GPU is selected but CUDA Toolkit 12.x is not installed,\n"
+                "so inference cannot run on GPU.\n\n"
+                "Switch to CPU or install CUDA Toolkit to continue."
+            ),
+            justify="center",
+        ).pack(pady=(0, 16))
+
+        btn_frame = ctk.CTkFrame(dialog, fg_color="transparent")
+        btn_frame.pack()
+
+        def switch_to_cpu() -> None:
+            dialog.destroy()
+            self._config.device = "cpu"
+            self._config.compute_type = DEFAULT_CPU_COMPUTE
+            self._config_manager.save()  # presumably persists the CPU settings — confirm in ConfigManager
+            log.info("Switched to CPU per user request after CUDA error")
+            self._start_recording(source)  # retry with the same audio source
+
+        def open_cuda_download() -> None:
+            dialog.destroy()
+            webbrowser.open("https://developer.nvidia.com/cuda-downloads")
+
+        ctk.CTkButton(
+            btn_frame, text="Switch to CPU", width=160, command=switch_to_cpu,
+        ).pack(side="left", padx=8)
+
+        ctk.CTkButton(
+            btn_frame, text="Install CUDA Toolkit", width=160,
+            fg_color="transparent", border_width=1,
+            command=open_cuda_download,
+        ).pack(side="left", padx=8)
+
+    # ── Hotkey ────────────────────────────────────────────────────────────────
+
+    def _register_hotkey(self) -> None:
+        """Replace any registered hotkey with the configured one; failures are logged, not raised."""
+        try:
+            import keyboard as kb
+            self._unregister_hotkey()
+            if combo := self._config.hotkey:
+                kb.add_hotkey(combo, self._toggle_recording_hotkey)
+                self._hotkey_combo = combo
+                log.info("Hotkey registered: %s", combo)
+        except Exception:
+            log.warning("Failed to register hotkey", exc_info=True)
+
+    def _unregister_hotkey(self) -> None:
+        try:  # best effort: a failed removal is logged and never propagates to the caller
+            import keyboard as kb
+            if self._hotkey_combo:
+                kb.remove_hotkey(self._hotkey_combo)
+                self._hotkey_combo = None  # only forgotten once removal succeeded
+        except Exception:
+            log.debug("Failed to unregister hotkey %r", self._hotkey_combo, exc_info=True)
+
+    def _toggle_recording_hotkey(self) -> None:
+        """Called from the keyboard library thread — must dispatch to main thread."""
+        # Choose the action from the current state, then schedule it through safe_after.
+        action = self._stop_recording if self._recording else (
+            lambda: self._start_recording(self._config.audio_source))
+        safe_after(self._root, 0, action)
+
+    # ── Beep ──────────────────────────────────────────────────────────────────
+
+    def _play_beep(self, event: str) -> None:
+        """Play the tone(s) for *event* ("start", "stop" or "save"); any error is ignored."""
+        # (frequency, duration) pairs passed to winsound.Beep, played in order
+        tones = {
+            "start": [(880, 120)], "stop": [(520, 180)], "save": [(660, 80), (880, 160)],
+        }
+        try:
+            import winsound
+            for freq, ms in tones.get(event, ()):
+                winsound.Beep(freq, ms)
+        except Exception:
+            pass
+
+    # ── Clipboard ─────────────────────────────────────────────────────────────
+
+    def _extract_clipboard_text(self, writer: MarkdownWriter) -> str:
+        """Return the saved transcript's body (text between header and footer rule), or "" on error."""
+        try:
+            content = writer.file_path.read_text(encoding="utf-8")
+            header_end = content.index("\n\n") + 2  # header ends at the first blank line
+            footer_idx = content.rfind("\n---\n", header_end - 1)  # ignore any rule inside the header
+            return (content[header_end:footer_idx] if footer_idx != -1 else content[header_end:]).strip()
+        except Exception:
+            log.warning("Failed to extract clipboard text", exc_info=True)
+            return ""
+
+    def _copy_to_clipboard(self, text: str) -> None:
+        try:  # best effort: a clipboard failure is logged below, never raised
+            self._root.clipboard_clear()  # clear the clipboard before appending the transcript
+            self._root.clipboard_append(text)
+            log.info("Transcript copied to clipboard (%d chars)", len(text))
+        except Exception:
+            log.warning("Failed to copy to clipboard", exc_info=True)
+
     def _open_output_dir(self) -> None:
         """Open the output directory in file explorer."""
         path = self._config.output_dir
@@ -359,11 +509,10 @@ def _quit(self) -> None:
         if self._recording:
             self._recording = False
             self._teardown_recording(
-                self._recorder, self._pipeline, self._engine,
+                self._recorder, self._engine,
                 self._writer, self._recording_start_time,
             )
             self._recorder = None
-            self._pipeline = None
             self._engine = None
             self._writer = None
             self._recording_start_time = None
@@ -371,6 +520,7 @@ def _quit(self) -> None:
             self._teardown_thread.join(timeout=30)
             self._teardown_thread = None
 
+        self._unregister_hotkey()
         if self._tray:
             self._tray.stop()
         safe_after(self._root, 100, self._root.quit)
diff --git a/src/hearsay/audio/recorder.py b/src/hearsay/audio/recorder.py
index 845b31c..4d054df 100644
--- a/src/hearsay/audio/recorder.py
+++ b/src/hearsay/audio/recorder.py
@@ -4,7 +4,7 @@
 
 import logging
 import queue
-import time
+from typing import Callable
 
 import numpy as np
 
@@ -14,21 +14,127 @@
     AUDIO_SOURCE_BOTH,
     AUDIO_SOURCE_MIC,
     AUDIO_SOURCE_SYSTEM,
-    CHUNK_DURATION_S,
+    MAX_CHUNK_DURATION_S,
+    MIN_CHUNK_DURATION_S,
     OVERLAP_DURATION_S,
     SAMPLE_RATE,
+    SILENCE_DURATION_S,
+    SILENCE_RMS_THRESHOLD,
 )
 from hearsay.utils.threading_utils import StoppableThread
 
 log = logging.getLogger(__name__)
 
 
+class _ChunkAccumulator:
+    """Accumulates mono 16 kHz float32 audio and decides chunk boundaries.
+
+    A chunk becomes ready when either:
+      * the buffer reaches ``MAX_CHUNK_DURATION_S`` (hard cap), or
+      * at least ``MIN_CHUNK_DURATION_S`` has accumulated AND the trailing
+        ``SILENCE_DURATION_S`` of audio is near-silent.
+
+    Consecutive chunks share ``OVERLAP_DURATION_S`` of audio so the
+    transcription pipeline can stitch words across boundaries.  Each emitted
+    chunk carries its absolute start time (seconds from the start of the
+    recording), so downstream timestamps stay correct despite variable lengths.
+    """
+
+    def __init__(self) -> None:
+        self._buffer: list[np.ndarray] = []
+        self._total = 0          # samples currently buffered
+        self._silence_run = 0    # consecutive trailing near-silent samples
+        self._start_sample = 0   # absolute index of buffer[0] in the recording
+        self.chunk_index = 0     # index the next emitted chunk will carry
+
+        self._min = int(MIN_CHUNK_DURATION_S * SAMPLE_RATE)
+        self._max = int(MAX_CHUNK_DURATION_S * SAMPLE_RATE)
+        self._silence_needed = int(SILENCE_DURATION_S * SAMPLE_RATE)
+        self._overlap = int(OVERLAP_DURATION_S * SAMPLE_RATE)
+
+    def add(self, mono: np.ndarray, silent: bool | None = None) -> None:
+        """Append a mono frame, updating the trailing-silence run.
+
+        If *silent* is None, silence is computed from this frame's RMS.
+        Callers mixing multiple sources (Both mode) pass an explicit flag.
+        """
+        if mono is None or len(mono) == 0:
+            return
+        self._buffer.append(mono)
+        self._total += len(mono)
+
+        if silent is None:
+            silent = _rms(mono) < SILENCE_RMS_THRESHOLD
+
+        if silent:
+            self._silence_run += len(mono)
+        else:
+            self._silence_run = 0
+
+    def ready(self) -> bool:
+        """True when the current buffer should be emitted as a chunk."""
+        if self._total >= self._max:
+            return True
+        return self._total >= self._min and self._silence_run >= self._silence_needed
+
+    def pop(self) -> tuple[int, float, np.ndarray]:
+        """Emit a chunk and retain the overlap tail. Returns (index, start_s, audio)."""
+        data = np.concatenate(self._buffer)
+        emitted_len = min(len(data), self._max)
+        chunk = data[:emitted_len]
+        start_time = self._start_sample / SAMPLE_RATE
+        idx = self.chunk_index
+
+        # Retain at most what was emitted as overlap.  Slicing from a negative
+        # offset (chunk shorter than the overlap) would wrap around and drop
+        # samples while the start offset stays put, skewing later timestamps.
+        keep = min(self._overlap, emitted_len)
+        self._start_sample += emitted_len - keep  # unique audio just consumed
+        leftover = data[emitted_len - keep:]
+        self._buffer = [leftover] if len(leftover) else []
+        self._total = int(len(leftover))
+        self._silence_run = 0
+        self.chunk_index += 1
+        return idx, start_time, chunk
+
+    def flush(self) -> tuple[int, float, np.ndarray] | None:
+        """Emit whatever remains (if > 1s) when recording stops."""
+        if self._total <= SAMPLE_RATE:  # less than 1 second — discard
+            return None
+        data = np.concatenate(self._buffer)
+        start_time = self._start_sample / SAMPLE_RATE
+        idx = self.chunk_index
+        # Nothing is retained, so the next frame added starts right after this chunk.
+        self._start_sample += len(data)
+        self._buffer = []
+        self._total = 0
+        self._silence_run = 0
+        self.chunk_index += 1
+        return idx, start_time, data
+
+
+def _rms(mono: np.ndarray) -> float:
+    """Return the root-mean-square level of *mono*, or 0.0 for None or an empty frame."""
+    if mono is not None and len(mono) > 0:
+        return float(np.sqrt(np.mean(mono ** 2)))
+    return 0.0
+
+
 class AudioRecorder(StoppableThread):
-    """Record audio and push 30-second chunks to a queue.
+    """Record audio and push variable-length chunks to a queue.
+
+    Each queue item is a ``(chunk_index, start_time_s, np.ndarray)`` tuple,
+    where ``start_time_s`` is the chunk's absolute offset from the start of the
+    recording.
+
+    When ``on_frame`` is provided, the recorder streams every mono 16 kHz
+    float32 frame to that callback instead of accumulating chunks into
+    ``audio_queue`` — used to feed RealtimeSTT continuously for low latency.
 
     Args:
-        audio_queue: Queue to push (chunk_index, np.ndarray) tuples.
+        audio_queue: Queue to push chunks to (ignored when ``on_frame`` is set).
         source: One of 'system', 'microphone', 'both'.
+        on_frame: Optional per-frame callback for streaming (RealtimeSTT) mode.
         loopback_device_index: PyAudioWPatch device index for loopback.
         mic_device_index: sounddevice device index for mic.
     """
@@ -37,6 +143,7 @@ def __init__(
         self,
         audio_queue: queue.Queue,
         source: str = AUDIO_SOURCE_SYSTEM,
+        on_frame: Callable[[np.ndarray], None] | None = None,
         loopback_device_index: int | None = None,
         mic_device_index: int | None = None,
         loopback_channels: int = 2,
@@ -47,6 +154,7 @@ def __init__(
         super().__init__(name="AudioRecorder")
         self.audio_queue = audio_queue
         self.source = source
+        self.on_frame = on_frame
         self.loopback_device_index = loopback_device_index
         self.mic_device_index = mic_device_index
         self.loopback_channels = loopback_channels
@@ -108,32 +216,19 @@ def _record_mic(self) -> None:
         """Record microphone via sounddevice."""
         import sounddevice as sd
 
-        buffer: list[np.ndarray] = []
-        chunk_samples = int(CHUNK_DURATION_S * SAMPLE_RATE)
-        overlap_samples = int(OVERLAP_DURATION_S * SAMPLE_RATE)
-        chunk_index = 0
+        acc = _ChunkAccumulator()
 
         def callback(indata: np.ndarray, frames: int, time_info: object, status: object) -> None:
-            nonlocal chunk_index
             mono = resample(indata.copy(), self.mic_rate, self.mic_channels)
-            buffer.append(mono)
-
-            total = sum(len(b) for b in buffer)
-            if total >= chunk_samples:
-                chunk = np.concatenate(buffer)[:chunk_samples]
-                self.audio_queue.put((chunk_index, chunk))
-                chunk_index += 1
-                # Keep overlap
-                if overlap_samples > 0:
-                    leftover = np.concatenate(buffer)[chunk_samples - overlap_samples:]
-                    buffer.clear()
-                    buffer.append(leftover)
-                else:
-                    buffer.clear()
-
-        device = self.mic_device_index
+            if self.on_frame is not None:
+                self.on_frame(mono)
+                return
+            acc.add(mono)
+            if acc.ready():
+                self.audio_queue.put(acc.pop())
+
         with sd.InputStream(
-            device=device,
+            device=self.mic_device_index,
             samplerate=self.mic_rate,
             channels=self.mic_channels,
             dtype="float32",
@@ -142,11 +237,12 @@ def callback(indata: np.ndarray, frames: int, time_info: object, status: object)
             while not self.stopped():
                 self.wait(timeout=0.5)
 
-        # Flush remaining audio
-        if buffer:
-            chunk = np.concatenate(buffer)
-            if len(chunk) > SAMPLE_RATE:  # Only if > 1 second
-                self.audio_queue.put((chunk_index, chunk))
+        if self.on_frame is not None:
+            return
+
+        final = acc.flush()
+        if final is not None:
+            self.audio_queue.put(final)
 
     def _record_both(self) -> None:
         """Record both loopback and mic, mix them.
@@ -156,7 +252,8 @@ def _record_both(self) -> None:
         occurs when PyAudioWPatch and sounddevice run on the same thread.
         The mic stream uses PyAudio's callback mode so it accumulates data
         asynchronously while the main loop drives off blocking loopback
-        reads.
+        reads.  Chunk boundaries are decided on the *combined* activity, so a
+        chunk is only cut when both sources fall silent.
         """
         import pyaudiowpatch as pyaudio
 
@@ -230,10 +327,15 @@ def mic_callback(in_data, frame_count, time_info, status_flags):
             mic_stream.start_stream()
 
             # --- Main loop (driven by blocking loopback reads) ---
-            chunk_samples = int(CHUNK_DURATION_S * SAMPLE_RATE)
-            overlap_samples = int(OVERLAP_DURATION_S * SAMPLE_RATE)
-            loopback_buf: list[np.ndarray] = []
-            chunk_index = 0
+            acc = _ChunkAccumulator()
+
+            def mix_with_mic(lb_chunk: np.ndarray) -> np.ndarray:
+                if not mic_buffer:
+                    return lb_chunk
+                mic_chunk = np.concatenate(mic_buffer)[:len(lb_chunk)]
+                if len(mic_chunk) < len(lb_chunk):
+                    mic_chunk = np.pad(mic_chunk, (0, len(lb_chunk) - len(mic_chunk)))
+                return mix_streams(lb_chunk, mic_chunk)
 
             while not self.stopped():
                 try:
@@ -241,49 +343,30 @@ def mic_callback(in_data, frame_count, time_info, status_flags):
                 except Exception:
                     break
                 audio = np.frombuffer(raw, dtype=np.int16)
-                mono = resample(audio, self.loopback_rate, self.loopback_channels)
-                loopback_buf.append(mono)
-
-                total = sum(len(b) for b in loopback_buf)
-                if total >= chunk_samples:
-                    lb_chunk = np.concatenate(loopback_buf)[:chunk_samples]
-                    mic_samples = sum(len(b) for b in mic_buffer)
-                    log.debug(
-                        "Mixing chunk %d: loopback=%d mic=%d samples",
-                        chunk_index, len(lb_chunk), mic_samples,
-                    )
-
-                    if mic_buffer:
-                        mic_chunk = np.concatenate(mic_buffer)[:chunk_samples]
-                        if len(mic_chunk) < chunk_samples:
-                            mic_chunk = np.pad(mic_chunk, (0, chunk_samples - len(mic_chunk)))
-                        mixed = mix_streams(lb_chunk, mic_chunk)
-                    else:
-                        mixed = lb_chunk
-
-                    self.audio_queue.put((chunk_index, mixed))
-                    chunk_index += 1
-
-                    if overlap_samples > 0:
-                        leftover = np.concatenate(loopback_buf)[chunk_samples - overlap_samples:]
-                        loopback_buf.clear()
-                        loopback_buf.append(leftover)
-                    else:
-                        loopback_buf.clear()
+                lb_mono = resample(audio, self.loopback_rate, self.loopback_channels)
+
+                if self.on_frame is not None:
+                    self.on_frame(mix_with_mic(lb_mono))
+                    mic_buffer.clear()
+                    continue
+
+                # Combined silence: silent only when both sources are quiet.
+                # The latest mic frame approximates current mic activity.
+                mic_silent = _rms(mic_buffer[-1]) < SILENCE_RMS_THRESHOLD if mic_buffer else True
+                silent = (_rms(lb_mono) < SILENCE_RMS_THRESHOLD) and mic_silent
+
+                acc.add(lb_mono, silent=silent)
+                if acc.ready():
+                    idx, start_time, lb_chunk = acc.pop()
+                    self.audio_queue.put((idx, start_time, mix_with_mic(lb_chunk)))
                     mic_buffer.clear()
 
             # --- Flush remaining audio ---
-            if loopback_buf:
-                lb_chunk = np.concatenate(loopback_buf)
-                if len(lb_chunk) > SAMPLE_RATE:  # Only if > 1 second
-                    if mic_buffer:
-                        mic_chunk = np.concatenate(mic_buffer)[:len(lb_chunk)]
-                        if len(mic_chunk) < len(lb_chunk):
-                            mic_chunk = np.pad(mic_chunk, (0, len(lb_chunk) - len(mic_chunk)))
-                        mixed = mix_streams(lb_chunk, mic_chunk)
-                    else:
-                        mixed = lb_chunk
-                    self.audio_queue.put((chunk_index, mixed))
+            if self.on_frame is None:
+                final = acc.flush()
+                if final is not None:
+                    idx, start_time, lb_chunk = final
+                    self.audio_queue.put((idx, start_time, mix_with_mic(lb_chunk)))
 
             mic_stream.stop_stream()
             mic_stream.close()
@@ -298,11 +381,8 @@ def _chunk_loop(
         sr: int,
         channels: int,
     ) -> None:
-        """Generic chunking loop for loopback-style streams."""
-        chunk_samples = int(CHUNK_DURATION_S * SAMPLE_RATE)
-        overlap_samples = int(OVERLAP_DURATION_S * SAMPLE_RATE)
-        buffer: list[np.ndarray] = []
-        chunk_index = 0
+        """Generic chunking loop for loopback-style (blocking-read) streams."""
+        acc = _ChunkAccumulator()
 
         while not self.stopped():
             try:
@@ -311,25 +391,26 @@ def _chunk_loop(
                 break
             audio = np.frombuffer(raw, dtype=np.int16)
             mono = resample(audio, sr, channels)
-            buffer.append(mono)
-
-            total = sum(len(b) for b in buffer)
-            if total >= chunk_samples:
-                chunk = np.concatenate(buffer)[:chunk_samples]
-                self.audio_queue.put((chunk_index, chunk))
-                chunk_index += 1
-                log.debug("Audio chunk %d queued (%d samples)", chunk_index - 1, len(chunk))
-
-                if overlap_samples > 0:
-                    leftover = np.concatenate(buffer)[chunk_samples - overlap_samples:]
-                    buffer.clear()
-                    buffer.append(leftover)
-                else:
-                    buffer.clear()
-
-        # Flush remaining audio
-        if buffer:
-            chunk = np.concatenate(buffer)
-            if len(chunk) > SAMPLE_RATE:  # Only if > 1 second
-                self.audio_queue.put((chunk_index, chunk))
-                log.debug("Final audio chunk %d queued (%d samples)", chunk_index, len(chunk))
+
+            if self.on_frame is not None:
+                self.on_frame(mono)
+                continue
+
+            acc.add(mono)
+
+            if acc.ready():
+                idx, start_time, chunk = acc.pop()
+                self.audio_queue.put((idx, start_time, chunk))
+                log.debug(
+                    "Audio chunk %d queued (%d samples, t=%.1fs)",
+                    idx, len(chunk), start_time,
+                )
+
+        if self.on_frame is not None:
+            return
+
+        final = acc.flush()
+        if final is not None:
+            idx, start_time, chunk = final
+            self.audio_queue.put((idx, start_time, chunk))
+            log.debug("Final audio chunk %d queued (%d samples)", idx, len(chunk))
diff --git a/src/hearsay/config.py b/src/hearsay/config.py
index ea804c5..54a8497 100644
--- a/src/hearsay/config.py
+++ b/src/hearsay/config.py
@@ -11,6 +11,8 @@
     AUDIO_SOURCE_SYSTEM,
     DEFAULT_CPU_COMPUTE,
     DEFAULT_CPU_MODEL,
+    DEFAULT_REALTIME_MODEL,
+    POST_SPEECH_SILENCE_S,
 )
 from hearsay.utils.paths import get_config_path, get_default_output_dir
 
@@ -36,12 +38,27 @@ class AppConfig:
     language: str = "en"
     vad_filter: bool = True
 
+    # Realtime dual-layer transcription (RealtimeSTT)
+    realtime_model_name: str = DEFAULT_REALTIME_MODEL
+    post_speech_silence_duration: float = POST_SPEECH_SILENCE_S
+
     # Output
     output_dir: str = field(default_factory=lambda: str(get_default_output_dir()))
 
     # UI
     show_live_view_on_start: bool = False
 
+    # Hotkey
+    hotkey: str = "ctrl+alt+r"
+
+    # Beep notifications
+    beep_on_start: bool = True
+    beep_on_stop: bool = True
+    beep_on_save: bool = True
+
+    # Clipboard
+    copy_to_clipboard: bool = False
+
 
 class ConfigManager:
     """Load and save AppConfig to JSON in %APPDATA%\\Hearsay."""
diff --git a/src/hearsay/constants.py b/src/hearsay/constants.py
index 710dfca..8ee3f98 100644
--- a/src/hearsay/constants.py
+++ b/src/hearsay/constants.py
@@ -7,10 +7,34 @@
 # Audio settings
 SAMPLE_RATE = 16000  # Whisper expects 16kHz
 CHANNELS = 1  # Whisper expects mono
-CHUNK_DURATION_S = 30  # Whisper's native context window
-OVERLAP_DURATION_S = 1  # Overlap between chunks to prevent word splitting
+# Variable-length chunking driven by trailing-silence detection.
+# A chunk is cut once at least MIN_CHUNK_DURATION_S has accumulated AND the
+# trailing SILENCE_DURATION_S of audio is near-silent — or unconditionally once
+# MAX_CHUNK_DURATION_S (Whisper's native context window) is reached.
+MIN_CHUNK_DURATION_S = 5     # Minimum audio buffered before an early (silence) cut
+MAX_CHUNK_DURATION_S = 30    # Hard cap — Whisper's native context window
+SILENCE_DURATION_S = 1.0     # Trailing near-silence (seconds) that triggers a cut
+SILENCE_RMS_THRESHOLD = 0.01  # RMS on [-1, 1] float audio below which ≈ silence
+OVERLAP_DURATION_S = 1       # Overlap between chunks to prevent word splitting
 AUDIO_DTYPE = "float32"
 
+# Custom HuggingFace models: short name -> {repo_id, parameters, vram_gb, english_only}
+# These models are in Transformers format and must be converted to CTranslate2 on first use.
+HF_CUSTOM_MODELS: dict[str, dict] = {
+    "small-ko": {
+        "repo_id": "SungBeom/whisper-small-ko",
+        "parameters": "244M",
+        "vram_gb": 2,
+        "english_only": False,
+    },
+    "medium-ko-zeroth": {
+        "repo_id": "seastar105/whisper-medium-ko-zeroth",
+        "parameters": "769M",
+        "vram_gb": 5,
+        "english_only": False,
+    },
+}
+
 # Model table: name -> (parameters, vram_gb, english_only)
 MODEL_TABLE = {
     "tiny": ("39M", 1, False),
@@ -23,6 +47,9 @@
     "medium.en": ("769M", 5, True),
     "large-v3": ("1550M", 10, False),
     "turbo": ("809M", 6, False),
+    # Korean fine-tuned models (HuggingFace, converted to CTranslate2 on first use)
+    "small-ko": ("244M", 2, False),
+    "medium-ko-zeroth": ("769M", 5, False),
 }
 
 # Default model recommendations
@@ -31,6 +58,13 @@
 DEFAULT_GPU_COMPUTE = "float16"
 DEFAULT_CPU_COMPUTE = "int8"
 
+# RealtimeSTT dual-layer transcription.
+# The fast model drives the tentative ("typing") layer; the main model
+# (model_name above) produces the accurate final text once VAD detects the
+# end of an utterance.
+DEFAULT_REALTIME_MODEL = "tiny"   # small/fast model for the tentative layer
+POST_SPEECH_SILENCE_S = 0.7       # trailing silence (s) that finalizes an utterance
+
 # Audio source options
 AUDIO_SOURCE_SYSTEM = "system"
 AUDIO_SOURCE_MIC = "microphone"
@@ -43,6 +77,3 @@
 
 # Transcript formatting
 PARAGRAPH_GAP_S = 2.0  # Silence gap (seconds) that triggers a paragraph break
-
-# UI
-LIVE_VIEW_POLL_MS = 250  # Poll transcript queue every 250ms
diff --git a/src/hearsay/output/markdown_writer.py b/src/hearsay/output/markdown_writer.py
index 912585a..0a5684a 100644
--- a/src/hearsay/output/markdown_writer.py
+++ b/src/hearsay/output/markdown_writer.py
@@ -3,23 +3,27 @@
 from __future__ import annotations
 
 import logging
+import re
 from datetime import datetime
 from pathlib import Path
 
-from hearsay.constants import PARAGRAPH_GAP_S
-from hearsay.output.formatter import clean_transcript_text, make_title
-from hearsay.transcription.engine import TranscriptionResult
+from hearsay.output.formatter import clean_transcript_text, format_timestamp, make_title
 
 log = logging.getLogger(__name__)
 
-# Markers used to split header / body / footer for post-processing
 _FOOTER_MARKER = "\n---\n"
+_TS_LINE_RE = re.compile(r"^(\[\d+:\d+(?::\d+)?\] )(.+?)\ *$")
 
 
 class MarkdownWriter:
-    """Writes transcript results to a .md file, appending as chunks arrive."""
-
-    def __init__(self, output_dir: str | Path, title: str | None = None) -> None:
+    """Writes transcript results to a .md file, appending as utterances are finalized."""
+
+    def __init__(
+        self,
+        output_dir: str | Path,
+        title: str | None = None,
+        language: str = "en",
+    ) -> None:
         self.output_dir = Path(output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)
 
@@ -27,65 +31,25 @@ def __init__(self, output_dir: str | Path, title: str | None = None) -> None:
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         self.file_path = self.output_dir / f"transcript_{timestamp}.md"
         self._header_written = False
-
-        # Track absolute timing across chunks for gap-based paragraph breaks
-        self._last_segment_end: float | None = None
-        self._language: str = "en"
+        self._language: str = language or "en"
 
     def _write_header(self) -> None:
-        """Write the markdown header on first call."""
         with open(self.file_path, "w", encoding="utf-8") as f:
             f.write(f"# {self.title}\n\n")
         self._header_written = True
         log.info("Transcript file created: %s", self.file_path)
 
-    def append(self, result: TranscriptionResult) -> None:
-        """Append a transcription result using segment-level gap detection."""
+    def append_utterance(self, elapsed_seconds: float, text: str) -> None:
+        """Append one finalized utterance as a timestamped line matching the live view."""
+        text = text.strip()
+        if not text:
+            return
         if not self._header_written:
             self._write_header()
 
-        self._language = result.language or self._language
-
-        if not result.segments:
-            self._append_fallback(result)
-            return
-
-        chunk_offset = result.chunk_index * 30  # seconds offset for this chunk
-        pieces: list[str] = []
-
-        for seg in result.segments:
-            seg_start = chunk_offset + seg["start"]
-            seg_text = seg["text"].strip()
-            if not seg_text:
-                continue
-
-            # Determine separator: paragraph break on long gap, space otherwise
-            if self._last_segment_end is not None:
-                gap = seg_start - self._last_segment_end
-                if gap >= PARAGRAPH_GAP_S:
-                    pieces.append("\n\n")
-                else:
-                    pieces.append(" ")
-            # else: very first segment, no separator needed
-
-            pieces.append(seg_text)
-            self._last_segment_end = chunk_offset + seg["end"]
-
-        if pieces:
-            with open(self.file_path, "a", encoding="utf-8") as f:
-                f.write("".join(pieces))
-
-        log.debug("Appended chunk %d to %s", result.chunk_index, self.file_path)
-
-    def _append_fallback(self, result: TranscriptionResult) -> None:
-        """Fallback for results with empty segments (e.g. after dedup)."""
-        text = result.text.strip()
-        if not text:
-            return
+        ts = format_timestamp(elapsed_seconds)
         with open(self.file_path, "a", encoding="utf-8") as f:
-            if self._last_segment_end is not None:
-                f.write(" ")
-            f.write(text)
+            f.write(f"[{ts}] {text}  \n")
 
     def finalize(self, total_duration: float | None = None) -> Path:
         """Write a footer and return the file path."""
@@ -93,7 +57,7 @@ def finalize(self, total_duration: float | None = None) -> Path:
             self._write_header()
 
         with open(self.file_path, "a", encoding="utf-8") as f:
-            f.write("\n\n---\n\n")
+            f.write("\n---\n\n")
             f.write(f"*Generated by Hearsay on {datetime.now():%Y-%m-%d at %H:%M}*\n")
             if total_duration:
                 from hearsay.output.formatter import format_duration
@@ -103,28 +67,33 @@ def finalize(self, total_duration: float | None = None) -> Path:
         return self.file_path
 
     def post_process(self) -> None:
-        """Read the finalized transcript, clean up the body, and rewrite."""
+        """Rewrite the file with each timestamped line's text cleaned; timestamps are kept."""
         if not self.file_path.exists():
             return
 
         content = self.file_path.read_text(encoding="utf-8")
-
-        # Split into header, body, footer using the --- marker
         footer_idx = content.rfind(_FOOTER_MARKER)
         if footer_idx == -1:
             log.warning("No footer marker found, skipping post-processing")
             return
 
-        # Header ends at first double newline after the title line
         header_end = content.index("\n\n") + 2
         header = content[:header_end]
         body = content[header_end:footer_idx]
         footer = content[footer_idx:]
 
-        cleaned = clean_transcript_text(body, language=self._language)
+        cleaned_lines: list[str] = []
+        for raw in body.splitlines(keepends=True):
+            hit = _TS_LINE_RE.match(raw.rstrip("\n"))
+            if hit is None:  # not a "[timestamp] text" line: keep verbatim
+                cleaned_lines.append(raw)
+                continue
+            stamp, spoken = hit.groups()
+            tidy = clean_transcript_text(spoken, language=self._language)
+            cleaned_lines.append(f"{stamp}{tidy}  \n")
 
         self.file_path.write_text(
-            header + cleaned + footer,
+            header + "".join(cleaned_lines) + footer,
             encoding="utf-8",
         )
         log.info("Post-processed transcript: %s", self.file_path)
diff --git a/src/hearsay/transcription/engine.py b/src/hearsay/transcription/engine.py
deleted file mode 100644
index 8495de7..0000000
--- a/src/hearsay/transcription/engine.py
+++ /dev/null
@@ -1,118 +0,0 @@
-"""TranscriptionEngine: wraps faster-whisper for inference."""
-
-from __future__ import annotations
-
-import logging
-from dataclasses import dataclass
-
-import numpy as np
-
-from hearsay.utils.paths import get_models_dir
-
-log = logging.getLogger(__name__)
-
-
-@dataclass
-class TranscriptionResult:
-    """Result from transcribing one audio chunk."""
-
-    text: str
-    segments: list[dict]  # [{start, end, text}, ...]
-    language: str
-    language_probability: float
-    chunk_index: int
-
-
-class TranscriptionEngine:
-    """Wraps faster-whisper WhisperModel for inference."""
-
-    def __init__(
-        self,
-        model_name: str = "small.en",
-        device: str = "cpu",
-        compute_type: str = "int8",
-        language: str = "en",
-        vad_filter: bool = True,
-    ) -> None:
-        self.model_name = model_name
-        self.device = device
-        self.compute_type = compute_type
-        self.language = language
-        self.vad_filter = vad_filter
-        self._model = None
-
-    def load(self) -> None:
-        """Load the Whisper model into memory."""
-        from faster_whisper import WhisperModel
-
-        log.info(
-            "Loading model '%s' (device=%s, compute=%s)",
-            self.model_name,
-            self.device,
-            self.compute_type,
-        )
-        self._model = WhisperModel(
-            self.model_name,
-            device=self.device,
-            compute_type=self.compute_type,
-            download_root=str(get_models_dir()),
-        )
-        log.info("Model loaded successfully")
-
-    def transcribe(
-        self,
-        audio: np.ndarray,
-        chunk_index: int = 0,
-    ) -> TranscriptionResult:
-        """Transcribe a float32 16kHz mono audio array.
-
-        Args:
-            audio: Audio data as float32 numpy array at 16kHz.
-            chunk_index: Index of this chunk (for ordering).
-
-        Returns:
-            TranscriptionResult with text and segment details.
-        """
-        if self._model is None:
-            raise RuntimeError("Model not loaded. Call load() first.")
-
-        segments_iter, info = self._model.transcribe(
-            audio,
-            beam_size=5,
-            language=self.language if self.language else None,
-            vad_filter=self.vad_filter,
-            vad_parameters={"min_silence_duration_ms": 500},
-        )
-
-        segments = []
-        texts = []
-        for seg in segments_iter:
-            segments.append({
-                "start": seg.start,
-                "end": seg.end,
-                "text": seg.text.strip(),
-            })
-            texts.append(seg.text.strip())
-
-        full_text = " ".join(texts)
-        log.debug(
-            "Chunk %d: %d segments, lang=%s (%.2f), text=%s",
-            chunk_index,
-            len(segments),
-            info.language,
-            info.language_probability,
-            full_text[:100],
-        )
-
-        return TranscriptionResult(
-            text=full_text,
-            segments=segments,
-            language=info.language,
-            language_probability=info.language_probability,
-            chunk_index=chunk_index,
-        )
-
-    def unload(self) -> None:
-        """Free model memory."""
-        self._model = None
-        log.info("Model unloaded")
diff --git a/src/hearsay/transcription/gpu_detect.py b/src/hearsay/transcription/gpu_detect.py
index 1c33a68..acfb283 100644
--- a/src/hearsay/transcription/gpu_detect.py
+++ b/src/hearsay/transcription/gpu_detect.py
@@ -27,34 +27,146 @@ class GPUInfo:
     recommended_device: str
 
 
+def _gpu_name_from_nvidia_smi() -> str:
+    """Return the first GPU name printed by nvidia-smi (no torch needed), or ""."""
+    query = ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"]
+    try:
+        import subprocess
+
+        proc = subprocess.run(query, capture_output=True, text=True, timeout=5)
+        if proc.returncode != 0:
+            return ""
+        # Only the first output line is used, even if several are printed.
+        return proc.stdout.strip().splitlines()[0].strip()
+    except Exception:
+        return ""
+
+
+def _vram_gb_from_nvidia_smi() -> float:
+    """Return the first GPU's total VRAM in GB via nvidia-smi, or 0.0 on failure."""
+    query = ["nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits"]
+    try:
+        import subprocess
+
+        proc = subprocess.run(query, capture_output=True, text=True, timeout=5)
+        if proc.returncode != 0:
+            return 0.0
+        first_line = proc.stdout.strip().splitlines()[0].strip()
+        # The reported figure is treated as MiB, hence the division by 1024.
+        return round(float(first_line) / 1024, 1)
+    except Exception:
+        return 0.0
+
+
+def _vram_gb_from_name(name: str) -> float:
+    """Estimate VRAM (GB) from a GPU name when no measured value is available.
+
+    The lower-cased name is scanned for known RTX model numbers and the first
+    table entry found in it wins, so each "ti" spelling is listed ahead of
+    its base model.
+
+    Args:
+        name: GPU name to inspect (matched case-insensitively).
+
+    Returns:
+        Estimated VRAM in GB; 4.0 when no known model number is found.
+    """
+    known_cards = (
+        # RTX 40xx series
+        ("4090", 24.0),
+        ("4080", 16.0),
+        ("4070 ti", 12.0),
+        ("4070", 12.0),
+        ("4060 ti", 8.0),
+        ("4060", 8.0),
+        # RTX 30xx series
+        ("3090", 24.0),
+        ("3080", 10.0),
+        ("3070", 8.0),
+        ("3060 ti", 8.0),
+        ("3060", 12.0),
+        ("3050", 8.0),
+        # RTX 20xx series
+        ("2080 ti", 11.0),
+        ("2080", 8.0),
+        ("2070", 8.0),
+        ("2060", 6.0),
+    )
+    lowered = name.lower()
+    for needle, gigabytes in known_cards:
+        if needle in lowered:
+            return gigabytes
+    return 4.0  # conservative default
+
+
+def _cuda_runtime_usable() -> bool:
+    """Return True if a tiny CTranslate2 CUDA allocation succeeds, else False.
+
+    get_cuda_device_count() only checks the driver; this forces the lazily
+    loaded runtime DLLs (cublas64_12.dll etc.) to load.  NOTE(review): confirm
+    ctranslate2 exposes this StorageView constructor and Device enum.
+    """
+    try:
+        import ctranslate2
+        ctranslate2.StorageView([1], ctranslate2.DataType.int8, ctranslate2.Device.cuda)
+        return True
+    except Exception as exc:
+        log.warning("CUDA runtime probe failed: %s", exc)
+        return False
+
+
 def detect_gpu() -> GPUInfo:
-    """Detect CUDA GPU and return recommendation."""
+    """Detect a CUDA GPU via ctranslate2 (faster-whisper's backend) and recommend settings."""
     try:
-        import torch
-
-        if torch.cuda.is_available():
-            name = torch.cuda.get_device_name(0)
-            vram_bytes = torch.cuda.get_device_properties(0).total_mem
-            vram_gb = vram_bytes / (1024**3)
-            log.info("CUDA GPU found: %s (%.1f GB VRAM)", name, vram_gb)
-
-            if vram_gb >= 6:
-                model = DEFAULT_GPU_MODEL
-            elif vram_gb >= 2:
-                model = "small.en"
+        import ctranslate2
+
+        cuda_count = ctranslate2.get_cuda_device_count()
+        if cuda_count > 0:
+            if not _cuda_runtime_usable():
+                log.warning(
+                    "CUDA device found but runtime DLLs are missing "
+                    "(install CUDA Toolkit 12.x). Falling back to CPU."
+                )
+                # No return here: control continues past this if/else block.
             else:
-                model = "tiny.en"
-
-            return GPUInfo(
-                cuda_available=True,
-                gpu_name=name,
-                vram_gb=round(vram_gb, 1),
-                recommended_model=model,
-                recommended_compute=DEFAULT_GPU_COMPUTE,
-                recommended_device="cuda",
-            )
+                # Prefer torch for name/VRAM when it is installed; fallbacks follow.
+                gpu_name = ""
+                vram_gb = 0.0
+                try:
+                    import torch
+                    if torch.cuda.is_available():
+                        gpu_name = torch.cuda.get_device_name(0)
+                        vram_bytes = torch.cuda.get_device_properties(0).total_memory
+                        vram_gb = round(vram_bytes / (1024**3), 1)
+                except Exception:
+                    log.debug("torch GPU query failed", exc_info=True)
+
+                if not gpu_name:
+                    gpu_name = _gpu_name_from_nvidia_smi() or "CUDA Device 0"
+
+                if vram_gb == 0.0:
+                    vram_gb = _vram_gb_from_nvidia_smi() or _vram_gb_from_name(gpu_name)
+
+                log.info("CUDA GPU found: %s (%.1f GB VRAM)", gpu_name, vram_gb)
+
+                if vram_gb >= 6:
+                    model = DEFAULT_GPU_MODEL
+                elif vram_gb >= 2:
+                    model = "small.en"
+                else:
+                    model = "tiny.en"
+
+                return GPUInfo(
+                    cuda_available=True,
+                    gpu_name=gpu_name,
+                    vram_gb=vram_gb,
+                    recommended_model=model,
+                    recommended_compute=DEFAULT_GPU_COMPUTE,
+                    recommended_device="cuda",
+                )
+        log.info("No usable CUDA device found via ctranslate2")
     except ImportError:
-        log.info("PyTorch not installed, assuming CPU-only")
+        log.info("ctranslate2 not installed, assuming CPU-only")
     except Exception:
         log.warning("GPU detection failed", exc_info=True)
 
diff --git a/src/hearsay/transcription/model_manager.py b/src/hearsay/transcription/model_manager.py
index ed6150c..3fee329 100644
--- a/src/hearsay/transcription/model_manager.py
+++ b/src/hearsay/transcription/model_manager.py
@@ -3,9 +3,12 @@
 from __future__ import annotations
 
 import logging
+import shutil
+import subprocess
+import sys
 from pathlib import Path
 
-from hearsay.constants import MODEL_TABLE
+from hearsay.constants import HF_CUSTOM_MODELS, MODEL_TABLE
 from hearsay.utils.paths import get_models_dir
 
 log = logging.getLogger(__name__)
@@ -21,57 +24,162 @@ def get_model_info(name: str) -> tuple[str, int, bool] | None:
     return MODEL_TABLE.get(name)
 
 
+def is_hf_custom_model(name: str) -> bool:
+    """Return True if `name` is a key of HF_CUSTOM_MODELS (HF download + CT2 conversion)."""
+    return name in HF_CUSTOM_MODELS
+
+
+def get_hf_model_local_path(name: str) -> Path:
+    """Return the local CTranslate2 directory for a custom HF model: <models dir>/hf-ct2-<name>."""
+    return get_models_dir() / f"hf-ct2-{name}"
+
+
+def resolve_model_path(name: str) -> str:
+    """Map a model name to the string that should be handed to WhisperModel().
+
+    Custom HF models resolve to their local CTranslate2 directory; every other
+    name is returned unchanged (faster-whisper handles the download itself).
+    """
+    if not is_hf_custom_model(name):
+        return name
+    return str(get_hf_model_local_path(name))
+
+
 def is_model_downloaded(name: str) -> bool:
     """Check if a model is already cached locally."""
+    if is_hf_custom_model(name):
+        # Converted models count as present only once their model.bin exists.
+        return (get_hf_model_local_path(name) / "model.bin").exists()
+
     model_dir = get_models_dir()
-    # faster-whisper stores models in subdirectories named after the model
-    # Check for the CTranslate2 model file
     model_path = model_dir / f"models--Systran--faster-whisper-{name}"
     if model_path.exists():
         return True
-    # Also check for direct directory naming
     alt_path = model_dir / name
     return alt_path.exists() and any(alt_path.iterdir())
 
 
+def _get_converter_cmd() -> str:
+    """Locate ct2-transformers-converter; raise RuntimeError if it cannot be found."""
+    # Fast path: the script is already on PATH.
+    on_path = shutil.which("ct2-transformers-converter")
+    if on_path:
+        return on_path
+
+    import site
+
+    # pip --user installs scripts under {userbase}/PythonXY/Scripts on Windows
+    user_base = Path(site.getuserbase())
+    children = list(user_base.iterdir()) if user_base.exists() else []
+    versioned = [c / "Scripts" for c in children if c.is_dir() and c.name.startswith("Python")]
+    # Search order: interpreter folder, versioned Scripts, then generic Scripts/bin.
+    search_dirs = [Path(sys.executable).parent, *versioned]
+    search_dirs += [user_base / "Scripts", user_base / "bin"]
+
+    for folder in search_dirs:
+        for filename in ("ct2-transformers-converter", "ct2-transformers-converter.exe"):
+            candidate = folder / filename
+            if candidate.exists():
+                return str(candidate)
+
+    raise RuntimeError(
+        "ct2-transformers-converter not found.\n"
+        "Install required packages:\n"
+        "  pip install ctranslate2 transformers torch"
+    )
+
+
+def _download_and_convert_hf_model(
+    name: str,
+    progress_callback: callable | None = None,
+) -> None:
+    """Download a HuggingFace Whisper model and convert it to CTranslate2 format.
+
+    Args:
+        name: Key into HF_CUSTOM_MODELS.
+        progress_callback: Optional callable(status_text) for progress updates.
+
+    Raises:
+        RuntimeError: If the converter is missing or exits with an error.
+    """
+    repo_id = HF_CUSTOM_MODELS[name]["repo_id"]
+    local_path = get_hf_model_local_path(name)
+
+    log.info("Downloading and converting HF model '%s' -> %s", repo_id, local_path)
+
+    # Raises RuntimeError (with install instructions) when the tool is absent.
+    converter = _get_converter_cmd()
+    local_path.mkdir(parents=True, exist_ok=True)
+
+    if progress_callback:
+        progress_callback(f"Downloading '{repo_id}' from HuggingFace...")
+
+    result = subprocess.run(
+        [converter, "--model", repo_id, "--output_dir", str(local_path),
+         "--quantization", "int8", "--force"],
+        capture_output=True,
+        text=True,
+    )
+
+    if result.returncode != 0:
+        # Remove the partial output directory before reporting the failure.
+        shutil.rmtree(local_path, ignore_errors=True)
+        stderr_tail = result.stderr[-600:] if result.stderr else "(no output)"
+        raise RuntimeError(
+            f"CTranslate2 conversion failed for '{repo_id}':\n{stderr_tail}\n\n"
+            "Make sure torch is installed: pip install torch"
+        )
+
+    log.info("HF model '%s' converted successfully to %s", repo_id, local_path)
+
+    if progress_callback:
+        progress_callback(f"Model '{name}' ready!")
+
+
 def download_model(
     name: str,
     progress_callback: callable | None = None,
 ) -> str:
-    """Download a model if not cached. Returns the model size string for faster-whisper.
+    """Download (and convert if needed) a model. Returns model path/name for WhisperModel().
 
     Args:
-        name: Model name (e.g., 'turbo', 'small.en').
+        name: Model name from MODEL_TABLE.
         progress_callback: Optional callable(status_text) for progress updates.
 
     Returns:
-        The model name/path string to pass to WhisperModel().
+        The model name or local path string to pass to WhisperModel().
     """
     if name not in MODEL_TABLE:
         raise ValueError(f"Unknown model: {name}")
 
+    if is_hf_custom_model(name):
+        if not is_model_downloaded(name):
+            if progress_callback:
+                progress_callback(f"Converting '{name}' to CTranslate2 format (this may take several minutes)...")
+            _download_and_convert_hf_model(name, progress_callback)
+        elif progress_callback:
+            progress_callback(f"Model '{name}' already converted.")
+        return str(get_hf_model_local_path(name))
+
+    # Standard faster-whisper model
     if progress_callback:
         progress_callback(f"Preparing model '{name}'...")
 
     model_dir = get_models_dir()
     log.info("Downloading/loading model '%s' to %s", name, model_dir)
 
-    # faster-whisper downloads models from Hugging Face on first use.
-    # We trigger this by importing and constructing the model.
-    # The download_root parameter controls where models are cached.
     from faster_whisper import WhisperModel
 
     if progress_callback:
         progress_callback(f"Downloading '{name}' (this may take a few minutes)...")
 
-    # This will download if not cached
     _model = WhisperModel(
         name,
         device="cpu",
         compute_type="int8",
         download_root=str(model_dir),
     )
-    del _model  # Free memory; the real model will be loaded by the engine
+    del _model
 
     if progress_callback:
         progress_callback(f"Model '{name}' ready!")
diff --git a/src/hearsay/transcription/pipeline.py b/src/hearsay/transcription/pipeline.py
deleted file mode 100644
index 7f96ced..0000000
--- a/src/hearsay/transcription/pipeline.py
+++ /dev/null
@@ -1,150 +0,0 @@
-"""TranscriptionPipeline thread: consumes audio chunks, produces transcript text."""
-
-from __future__ import annotations
-
-import logging
-import queue
-import string
-import time
-
-from hearsay.transcription.engine import TranscriptionEngine, TranscriptionResult
-from hearsay.utils.threading_utils import StoppableThread
-
-log = logging.getLogger(__name__)
-
-
-class TranscriptionPipeline(StoppableThread):
-    """Daemon thread that reads audio chunks from audio_queue,
-    transcribes them, and pushes results to transcript_queue.
-
-    Args:
-        audio_queue: Input queue of (chunk_index, np.ndarray) tuples.
-        transcript_queue: Output queue of TranscriptionResult objects.
-        engine: Configured TranscriptionEngine (model already loaded).
-    """
-
-    _TAIL_WORD_COUNT = 15  # words kept from previous chunk for overlap matching
-    _MIN_MATCH_WORDS = 2   # minimum overlap length to avoid false positives
-
-    def __init__(
-        self,
-        audio_queue: queue.Queue,
-        transcript_queue: queue.Queue,
-        engine: TranscriptionEngine,
-    ) -> None:
-        super().__init__(name="TranscriptionPipeline")
-        self.audio_queue = audio_queue
-        self.transcript_queue = transcript_queue
-        self.engine = engine
-        self._prev_tail_words: list[str] = []
-
-    def run(self) -> None:
-        log.info("TranscriptionPipeline started")
-        while not self.stopped():
-            try:
-                chunk_index, audio = self.audio_queue.get(timeout=1.0)
-            except queue.Empty:
-                continue
-            self._process_chunk(chunk_index, audio)
-
-        # Drain any audio chunks still in the queue after stop signal.
-        # The recorder flushes its buffer before exiting, so these chunks
-        # must be transcribed to avoid losing the tail of the recording.
-        log.info("TranscriptionPipeline draining remaining audio chunks")
-        while True:
-            try:
-                chunk_index, audio = self.audio_queue.get_nowait()
-            except queue.Empty:
-                break
-            self._process_chunk(chunk_index, audio)
-
-        log.info("TranscriptionPipeline stopped")
-
-    def _process_chunk(self, chunk_index: int, audio) -> None:
-        """Transcribe a single audio chunk and enqueue the result."""
-        try:
-            t0 = time.perf_counter()
-            result = self.engine.transcribe(audio, chunk_index=chunk_index)
-            elapsed = time.perf_counter() - t0
-            log.info(
-                "Chunk %d transcribed in %.1fs: %s",
-                chunk_index,
-                elapsed,
-                result.text[:80] if result.text else "(empty)",
-            )
-            if result.text:
-                original_words = result.text.split()
-                if chunk_index > 0 and self._prev_tail_words:
-                    result = self._deduplicate(result)
-                self._prev_tail_words = original_words[-self._TAIL_WORD_COUNT:]
-                if result.text:
-                    self.transcript_queue.put(result)
-        except Exception:
-            log.error("Transcription failed for chunk %d", chunk_index, exc_info=True)
-
-    @staticmethod
-    def _normalize(word: str) -> str:
-        """Strip leading/trailing punctuation for comparison."""
-        return word.strip(string.punctuation)
-
-    def _deduplicate(self, result: TranscriptionResult) -> TranscriptionResult:
-        """Remove overlapping prefix from *result* that duplicates the tail of the previous chunk."""
-        new_words = result.text.split()
-        if len(new_words) < self._MIN_MATCH_WORDS:
-            return result
-
-        # Find the longest prefix of new_words that matches a suffix of _prev_tail_words.
-        best = 0
-        for length in range(self._MIN_MATCH_WORDS, min(len(self._prev_tail_words), len(new_words)) + 1):
-            suffix = self._prev_tail_words[-length:]
-            prefix = new_words[:length]
-            tail = [self._normalize(w).lower() for w in suffix]
-            head = [self._normalize(w).lower() for w in prefix]
-            # All words after the first must match exactly; the first word of the
-            # new chunk may be truncated (e.g. "replaced" -> "placed") so allow a
-            # suffix-of-word match when the fragment is at least 3 characters.
-            first_ok = tail[0] == head[0] or (len(head[0]) >= 3 and tail[0].endswith(head[0]))
-            if first_ok and tail[1:] == head[1:]:
-                best = length
-
-        if best == 0:
-            return result
-
-        stripped_words = new_words[best:]
-        log.info(
-            "Chunk %d: stripped %d overlapping words: %s",
-            result.chunk_index,
-            best,
-            " ".join(new_words[:best]),
-        )
-
-        if not stripped_words:
-            return TranscriptionResult(
-                text="",
-                segments=[],
-                language=result.language,
-                language_probability=result.language_probability,
-                chunk_index=result.chunk_index,
-            )
-
-        # Rebuild text and trim leading segments that were fully covered by the overlap.
-        new_text = " ".join(stripped_words)
-        chars_removed = len(" ".join(new_words[:best])) + 1  # +1 for the space after
-        trimmed_segments = []
-        for seg in result.segments:
-            seg_text = seg["text"]
-            if chars_removed >= len(seg_text):
-                chars_removed -= len(seg_text) + 1  # +1 for joining space
-                continue
-            if chars_removed > 0:
-                seg = {**seg, "text": seg_text[chars_removed:].lstrip()}
-                chars_removed = 0
-            trimmed_segments.append(seg)
-
-        return TranscriptionResult(
-            text=new_text,
-            segments=trimmed_segments if trimmed_segments else result.segments,
-            language=result.language,
-            language_probability=result.language_probability,
-            chunk_index=result.chunk_index,
-        )
diff --git a/src/hearsay/transcription/realtime_engine.py b/src/hearsay/transcription/realtime_engine.py
new file mode 100644
index 0000000..0f8b40e
--- /dev/null
+++ b/src/hearsay/transcription/realtime_engine.py
@@ -0,0 +1,169 @@
+"""RealtimeEngine: dual-layer transcription via RealtimeSTT.
+
+Audio is captured by Hearsay's own AudioRecorder (system loopback / mic / both)
+and fed into RealtimeSTT through ``feed_audio`` (``use_microphone=False``).  Two
+whisper models run concurrently:
+
+  * a fast *realtime* model drives the tentative ("typing") layer, revised
+    continuously as the user speaks (``on_tentative``);
+  * the accurate *main* model produces the final text once VAD detects the end
+    of an utterance (``on_final``).
+"""
+
+from __future__ import annotations
+
+import logging
+import threading
+import time
+from typing import Callable
+
+import numpy as np
+
+from hearsay.transcription.model_manager import resolve_model_path
+from hearsay.utils.paths import get_models_dir
+
+log = logging.getLogger(__name__)
+
+
+class CudaUnavailableError(RuntimeError):
+    """Raised by RealtimeEngine.load() when device is "cuda" but torch cannot provide CUDA."""
+
+
+class RealtimeEngine:
+    """Feeds caller-supplied audio to RealtimeSTT; reports tentative and final text via callbacks."""
+
+    def __init__(
+        self,
+        model_name: str,
+        realtime_model_name: str,
+        device: str,
+        compute_type: str,
+        language: str,
+        on_tentative: Callable[[str], None],
+        on_final: Callable[[str], None],
+        on_utterance_start: Callable[[], None] | None = None,
+        post_speech_silence_duration: float = 0.7,
+    ) -> None:
+        self.model_name = model_name
+        self.realtime_model_name = realtime_model_name
+        self.device = device
+        self.compute_type = compute_type
+        self.language = language or ""  # None/empty is normalized to ""
+        self._on_tentative = on_tentative
+        self._on_final = on_final
+        self._on_utterance_start = on_utterance_start
+        self._post_speech_silence_duration = post_speech_silence_duration
+
+        self._recorder = None  # AudioToTextRecorder; created by load(), cleared by shutdown()
+        self._final_thread: threading.Thread | None = None  # runs _final_loop
+        self._stop = threading.Event()  # set by shutdown(): mutes callbacks, ends _final_loop
+        self._final_emitted = threading.Event()  # set by _final_loop; shutdown() waits on it
+
+    def load(self) -> None:
+        """Create the RealtimeSTT recorder (spawns the main-model process) and
+        start the final-text loop. Blocks until both models are ready."""
+        if self.device == "cuda":  # fail with CudaUnavailableError before building the recorder
+            try:
+                import torch
+                if not torch.cuda.is_available():
+                    raise CudaUnavailableError("CUDA is not available")
+            except CudaUnavailableError:
+                raise
+            except Exception as exc:  # torch import/init failure
+                raise CudaUnavailableError(str(exc)) from exc
+
+        from RealtimeSTT import AudioToTextRecorder
+
+        model = resolve_model_path(self.model_name)
+        log.info(
+            "Loading RealtimeSTT (main=%s, realtime=%s, device=%s, compute=%s)",
+            self.model_name, self.realtime_model_name, self.device, self.compute_type,
+        )
+        self._recorder = AudioToTextRecorder(
+            model=model,
+            realtime_model_type=self.realtime_model_name,
+            language=self.language,
+            device=self.device,
+            compute_type=self.compute_type,
+            download_root=str(get_models_dir()),
+            use_microphone=False,  # audio arrives through feed() instead
+            enable_realtime_transcription=True,
+            on_realtime_transcription_stabilized=self._handle_tentative,
+            on_recording_start=self._handle_utterance_start,
+            post_speech_silence_duration=self._post_speech_silence_duration,
+            spinner=False,
+            level=logging.WARNING,
+            no_log_file=True,
+        )
+        log.info("RealtimeSTT ready")
+
+        self._final_thread = threading.Thread(
+            target=self._final_loop, daemon=True, name="RealtimeFinal",
+        )
+        self._final_thread.start()
+
+    def feed(self, mono_float32: np.ndarray) -> None:
+        """Feed one mono 16 kHz float32 frame into RealtimeSTT.
+
+        ``feed_audio`` casts directly to int16 without scaling, so float [-1, 1]
+        audio must be scaled into the int16 range first.
+        """
+        rec = self._recorder
+        if rec is None or mono_float32 is None or len(mono_float32) == 0:
+            return  # not loaded / already shut down, or nothing to feed
+        pcm16 = np.clip(mono_float32 * 32768.0, -32768, 32767).astype(np.int16)
+        try:
+            rec.feed_audio(pcm16, 16000)
+        except Exception:
+            log.error("feed_audio failed", exc_info=True)
+
+    def _handle_tentative(self, text: str) -> None:
+        if text and text.strip() and not self._stop.is_set():  # drop blanks and post-stop updates
+            self._on_tentative(text.strip())
+
+    def _handle_utterance_start(self) -> None:
+        if self._on_utterance_start is not None and not self._stop.is_set():
+            self._on_utterance_start()
+
+    def _final_loop(self) -> None:
+        """Block on recorder.text() and emit each finalized utterance (runs on _final_thread)."""
+        try:
+            while not self._stop.is_set():
+                try:
+                    text = self._recorder.text()
+                except Exception:
+                    if not self._stop.is_set():  # errors caused by shutdown() are expected
+                        log.error("RealtimeSTT text() failed", exc_info=True)
+                    break
+                if text and text.strip():
+                    self._on_final(text.strip())
+                self._final_emitted.set()
+        finally:
+            self._final_emitted.set()  # wake shutdown() even if this loop dies without a result
+
+    def shutdown(self) -> None:
+        """Finalize any in-progress utterance, then tear down the recorder."""
+        rec = self._recorder
+        loop_alive = self._final_thread is not None and self._final_thread.is_alive()
+        if rec is not None and loop_alive and getattr(rec, "is_recording", False):
+            # Stopped mid-utterance: stop gracefully so buffered audio gets a final text (needs a live loop).
+            started = getattr(rec, "recording_start_time", 0) or 0
+            min_len = getattr(rec, "min_length_of_recording", 0.5)
+            if not started or (time.time() - started) >= min_len:
+                try:
+                    self._final_emitted.clear()
+                    rec.stop()
+                    self._final_emitted.wait(timeout=15)  # returns early once _final_loop emits or exits
+                except Exception:
+                    log.warning("Error finalizing in-progress utterance", exc_info=True)
+
+        self._stop.set()
+        self._recorder = None
+        if rec is not None:
+            try:
+                rec.shutdown()
+            except Exception:
+                log.warning("RealtimeSTT shutdown error", exc_info=True)
+        if self._final_thread is not None:
+            self._final_thread.join(timeout=10)
+            self._final_thread = None
diff --git a/src/hearsay/ui/live_view.py b/src/hearsay/ui/live_view.py
index 8169357..82d7288 100644
--- a/src/hearsay/ui/live_view.py
+++ b/src/hearsay/ui/live_view.py
@@ -31,7 +31,7 @@ def __init__(self, master: ctk.CTk) -> None:
         # Delay disclaimer
         ctk.CTkLabel(
             self,
-            text="Transcript text appears with a delay of approximately 30\u201360 seconds depending on your hardware.",
+            text="Live text (gray) updates as you speak; it is replaced by the final, more accurate text after a brief pause.",
             font=("Segoe UI", 10, "italic"),
             text_color="gray",
             anchor="w",
@@ -46,6 +46,11 @@ def __init__(self, master: ctk.CTk) -> None:
         )
         self._textbox.pack(fill="both", expand=True, padx=10, pady=(10, 5))
 
+        # The tentative (in-progress) line is rendered in gray and replaced in
+        # place each time RealtimeSTT revises it, then committed as a final line.
+        self._textbox.tag_config("tentative", foreground="#888888")
+        self._tent_start_index: str | None = None
+
         # Bottom bar with status and controls
         bottom = ctk.CTkFrame(self)
         bottom.pack(fill="x", padx=10, pady=(0, 10))
@@ -96,15 +101,52 @@ def toggle(self) -> None:
             self.show()
 
     def append_text(self, text: str) -> None:
-        """Append text to the transcript view."""
+        """Append a finished line to the transcript view."""
         self._textbox.configure(state="normal")
         self._textbox.insert("end", text + "\n")
         self._textbox.configure(state="disabled")
         if self._autoscroll.get():
             self._textbox.see("end")
 
+    def update_tentative(self, text: str) -> None:
+        """Draw *text* as the gray in-progress line, replacing the previous revision if any."""
+        box = self._textbox
+        box.configure(state="normal")  # unlocked only for the duration of this edit
+        if self._tent_start_index is not None:
+            box.delete(self._tent_start_index, "end-1c")  # erase the revision being replaced
+        else:
+            self._tent_start_index = box.index("end-1c")  # first revision: anchor at current end
+        box.insert(self._tent_start_index, text)
+        box.tag_add("tentative", self._tent_start_index, "end-1c")
+        box.configure(state="disabled")
+        if self._autoscroll.get():
+            box.see("end")
+
+    def commit_final(self, line: str) -> None:
+        """Append *line* as permanent text, first erasing the tentative line when one is shown."""
+        box = self._textbox
+        box.configure(state="normal")
+        if (anchor := self._tent_start_index) is not None:
+            box.delete(anchor, "end-1c")
+            self._tent_start_index = None  # the next update_tentative() starts a fresh line
+        box.insert("end-1c", line + "\n")
+        box.configure(state="disabled")
+        if self._autoscroll.get():
+            box.see("end")
+
+    def drop_tentative(self) -> None:
+        """Erase the gray in-progress line, if one is showing, without committing it."""
+        # With no tentative line active there is nothing to erase and the textbox is not touched.
+        if self._tent_start_index is not None:
+            box = self._textbox
+            box.configure(state="normal")
+            box.delete(self._tent_start_index, "end-1c")
+            self._tent_start_index = None
+            box.configure(state="disabled")
+
     def append_separator(self, timestamp: str) -> None:
         """Insert a visual divider marking the end of a recording session."""
+        self.drop_tentative()
         self._textbox.configure(state="normal")
         self._textbox.insert("end", f"\n--- Recording ended at {timestamp} ---\n\n")
         self._textbox.configure(state="disabled")
@@ -117,6 +159,7 @@ def set_status(self, text: str) -> None:
 
     def clear(self) -> None:
         """Clear all transcript text."""
+        self._tent_start_index = None
         self._textbox.configure(state="normal")
         self._textbox.delete("1.0", "end")
         self._textbox.configure(state="disabled")
diff --git a/src/hearsay/ui/settings_window.py b/src/hearsay/ui/settings_window.py
index a7f386b..7486be6 100644
--- a/src/hearsay/ui/settings_window.py
+++ b/src/hearsay/ui/settings_window.py
@@ -15,6 +15,10 @@
     AUDIO_SOURCE_SYSTEM,
     MODEL_TABLE,
 )
+from hearsay.transcription.model_manager import (
+    is_hf_custom_model,
+    is_model_downloaded,
+)
 
 log = logging.getLogger(__name__)
 
@@ -22,14 +26,23 @@
 class SettingsWindow(ctk.CTkToplevel):
     """Settings editor window."""
 
-    def __init__(self, master: ctk.CTk, config_manager: ConfigManager) -> None:
+    def __init__(
+        self,
+        master: ctk.CTk,
+        config_manager: ConfigManager,
+        on_save: "Callable | None" = None,
+        is_recording: "Callable[[], bool] | None" = None,
+    ) -> None:
         super().__init__(master)
         self.title(f"{APP_NAME} Settings")
-        self.geometry("550x520")
+        self.geometry("550x620")
         self.resizable(False, False)
 
         self._config_manager = config_manager
         self._config = config_manager.config
+        self._on_save = on_save
+        self._is_recording = is_recording or (lambda: False)
+        self._capturing = False
 
         self._build_ui()
         self.grab_set()
@@ -43,7 +56,7 @@ def _build_ui(self) -> None:
         ).pack(pady=(15, 10))
 
         # Scrollable content
-        scroll = ctk.CTkScrollableFrame(self, width=490, height=360)
+        scroll = ctk.CTkScrollableFrame(self, width=490, height=460)
         scroll.pack(fill="both", expand=True, padx=20, pady=(0, 10))
 
         # ── Audio Source ──
@@ -70,9 +83,16 @@ def _build_ui(self) -> None:
             variable=self._model_var,
             values=list(MODEL_TABLE.keys()),
             width=200,
+            command=self._on_model_changed,
         )
         self._model_menu.pack(anchor="w", padx=15)
 
+        self._model_hint = ctk.CTkLabel(
+            scroll, text="", font=("Segoe UI", 10), text_color="gray"
+        )
+        self._model_hint.pack(anchor="w", padx=15)
+        self._update_model_hint(self._config.model_name)
+
         # ── Compute Type ──
         ctk.CTkLabel(scroll, text="Compute Type", font=("Segoe UI", 14, "bold")).pack(
             anchor="w", pady=(15, 5)
@@ -106,7 +126,7 @@ def _build_ui(self) -> None:
         self._lang_entry = ctk.CTkEntry(scroll, textvariable=self._lang_var, width=100)
         self._lang_entry.pack(anchor="w", padx=15)
         ctk.CTkLabel(
-            scroll, text="ISO 639-1 code (e.g., en, es, fr) or empty for auto-detect",
+            scroll, text="ISO 639-1 code (e.g., en, ko, fr) or empty for auto-detect",
             font=("Segoe UI", 10), text_color="gray"
         ).pack(anchor="w", padx=15)
 
@@ -131,17 +151,133 @@ def _build_ui(self) -> None:
             dir_frame, text="Browse", width=70, command=self._browse
         ).pack(side="left")
 
+        # ── Hotkey ──
+        ctk.CTkLabel(scroll, text="Recording Hotkey", font=("Segoe UI", 14, "bold")).pack(
+            anchor="w", pady=(15, 5)
+        )
+        hotkey_frame = ctk.CTkFrame(scroll, fg_color="transparent")
+        hotkey_frame.pack(anchor="w", padx=15, fill="x")
+
+        self._hotkey_var = ctk.StringVar(value=self._config.hotkey)
+        self._hotkey_entry = ctk.CTkEntry(
+            hotkey_frame, textvariable=self._hotkey_var, width=200, state="readonly"
+        )
+        self._hotkey_entry.pack(side="left", padx=(0, 8))
+        self._capture_btn = ctk.CTkButton(
+            hotkey_frame, text="Capture", width=80, command=self._start_capture
+        )
+        self._capture_btn.pack(side="left")
+        ctk.CTkLabel(
+            scroll, text="Press Ctrl+Alt+R or any modifier+key combo",
+            font=("Segoe UI", 10), text_color="gray"
+        ).pack(anchor="w", padx=15)
+
+        # ── Beep Notifications ──
+        ctk.CTkLabel(scroll, text="Beep Notifications", font=("Segoe UI", 14, "bold")).pack(
+            anchor="w", pady=(15, 5)
+        )
+        self._beep_start_var = ctk.BooleanVar(value=self._config.beep_on_start)
+        self._beep_stop_var = ctk.BooleanVar(value=self._config.beep_on_stop)
+        self._beep_save_var = ctk.BooleanVar(value=self._config.beep_on_save)
+        ctk.CTkCheckBox(
+            scroll, text="Beep on recording start", variable=self._beep_start_var
+        ).pack(anchor="w", padx=15, pady=2)
+        ctk.CTkCheckBox(
+            scroll, text="Beep on recording stop", variable=self._beep_stop_var
+        ).pack(anchor="w", padx=15, pady=2)
+        ctk.CTkCheckBox(
+            scroll, text="Beep on transcript save", variable=self._beep_save_var
+        ).pack(anchor="w", padx=15, pady=2)
+
+        # ── Clipboard ──
+        ctk.CTkLabel(scroll, text="Clipboard", font=("Segoe UI", 14, "bold")).pack(
+            anchor="w", pady=(15, 5)
+        )
+        self._clipboard_var = ctk.BooleanVar(value=self._config.copy_to_clipboard)
+        ctk.CTkCheckBox(
+            scroll,
+            text="Copy transcript to clipboard on save",
+            variable=self._clipboard_var,
+        ).pack(anchor="w", padx=15, pady=2)
+
         # ── Buttons ──
-        btn_frame = ctk.CTkFrame(self)
-        btn_frame.pack(fill="x", padx=20, pady=(0, 15))
+        self._btn_frame = ctk.CTkFrame(self)
+        self._btn_frame.pack(fill="x", padx=20, pady=(0, 15))
 
-        ctk.CTkButton(
-            btn_frame, text="Save", width=100, command=self._save
-        ).pack(side="right", padx=5)
-        ctk.CTkButton(
-            btn_frame, text="Cancel", width=100, fg_color="gray",
+        self._save_btn = ctk.CTkButton(
+            self._btn_frame, text="Save", width=100, command=self._save
+        )
+        self._save_btn.pack(side="right", padx=5)
+        self._cancel_btn = ctk.CTkButton(
+            self._btn_frame, text="Cancel", width=100, fg_color="gray",
             command=self._cancel
-        ).pack(side="right", padx=5)
+        )
+        self._cancel_btn.pack(side="right", padx=5)
+
+    def _start_capture(self) -> None:  # "Capture" button: prompt and listen for a key combo
+        # Remember the combo on display (it may be captured but unsaved) so a cancel restores it.
+        self._capturing, self._hotkey_before_capture = True, self._hotkey_var.get()
+        self._hotkey_entry.configure(state="normal")
+        self._hotkey_var.set("Press hotkey...")
+        self._hotkey_entry.configure(state="readonly")
+        self._capture_btn.configure(text="Cancel", command=self._cancel_capture)
+        self._hotkey_entry.focus_set(); self.bind("<KeyPress>", self._on_key_capture)
+
+    def _cancel_capture(self) -> None:  # abort the capture and put the previous combo back
+        self._capturing = False
+        self.unbind("<KeyPress>")
+        self._hotkey_entry.configure(state="normal")
+        self._hotkey_var.set(getattr(self, "_hotkey_before_capture", self._config.hotkey))
+        self._hotkey_entry.configure(state="readonly")
+        self._capture_btn.configure(text="Capture", command=self._start_capture)
+
+    def _on_key_capture(self, event) -> str:
+        """Turn a key press made during capture into a "mod+key" combo; always returns "break"."""
+        key = event.keysym.lower()
+        if key in {
+            "control_l", "control_r", "alt_l", "alt_r",
+            "shift_l", "shift_r", "super_l", "super_r",
+        }:
+            return "break"          # a lone modifier: keep waiting for the actual key
+        # Escape aborts the capture instead of being recorded.
+        if key == "escape":
+            self._cancel_capture()
+            return "break"
+
+        # event.state bits to test, in the order the combo is spelled out.
+        modifier_bits = (
+            (0x4, "ctrl"),          # Ctrl
+            (0x1, "shift"),         # Shift
+            (0x20000, "alt"),       # Alt (Windows)
+        )
+        held = [label for bit, label in modifier_bits if event.state & bit]
+        if not held:
+            return "break"          # require at least one modifier
+
+        combo = "+".join(held + [key])
+
+        # Capture finished: stop listening and show the combo in the read-only entry.
+        self._capturing = False
+        self.unbind("<KeyPress>")
+        self._hotkey_entry.configure(state="normal")
+        self._hotkey_var.set(combo)
+        self._hotkey_entry.configure(state="readonly")
+        self._capture_btn.configure(text="Capture", command=self._start_capture)
+        return "break"
+
+    def _on_model_changed(self, name: str) -> None:  # wired as the model menu's `command`
+        self._update_model_hint(name)  # refresh the hint label for the given model name
+
+    def _update_model_hint(self, name: str) -> None:
+        """Set the hint label's text for model *name*; it is blanked for non-HF-custom models."""
+        if not is_hf_custom_model(name):
+            self._model_hint.configure(text="")
+            return
+        if is_model_downloaded(name):
+            hint, color = "Korean model (converted, ready)", "green"
+        else:
+            hint, color = "Korean model — will download when recording starts", "#e07800"
+        self._model_hint.configure(text=hint, text_color=color)
 
     def _browse(self) -> None:
         path = filedialog.askdirectory(
@@ -152,6 +288,29 @@ def _browse(self) -> None:
             self._dir_var.set(path)
 
     def _save(self) -> None:
+        self._apply_and_close()
+
+    def _apply_and_close(self) -> None:
+        """Write the form to config, save and close; a pending hotkey capture is cancelled first."""
+        if self._capturing:
+            self._cancel_capture()  # otherwise the "Press hotkey..." placeholder would be saved
+        if self._is_recording():
+            _LOCKED = [
+                ("Model",            self._model_var.get(),     self._config.model_name),
+                ("Device",           self._device_var.get(),    self._config.device),
+                ("Compute Type",     self._compute_var.get(),   self._config.compute_type),
+                ("Language",         self._lang_var.get().strip(), self._config.language),
+                ("VAD Filter",       self._vad_var.get(),       self._config.vad_filter),
+                ("Audio Source",     self._source_var.get(),    self._config.audio_source),
+                ("Output Directory", self._dir_var.get(),       self._config.output_dir),
+            ]
+            changed = [name for name, new, old in _LOCKED if new != old]
+            if changed:  # tell the user which edits only apply to the next recording
+                from tkinter import messagebox
+                notice = ("Settings saved.\n\nThe following changes will take effect "
+                          "when you start the next recording:\n")
+                notice += "".join(f"\n  - {c}" for c in changed)
+                messagebox.showinfo("Recording Active", notice, parent=self)
         self._config.audio_source = self._source_var.get()
         self._config.model_name = self._model_var.get()
         self._config.compute_type = self._compute_var.get()
@@ -159,10 +318,17 @@ def _save(self) -> None:
         self._config.language = self._lang_var.get()
         self._config.vad_filter = self._vad_var.get()
         self._config.output_dir = self._dir_var.get()
+        self._config.hotkey = self._hotkey_var.get()
+        self._config.beep_on_start = self._beep_start_var.get()
+        self._config.beep_on_stop = self._beep_stop_var.get()
+        self._config.beep_on_save = self._beep_save_var.get()
+        self._config.copy_to_clipboard = self._clipboard_var.get()
         self._config_manager.save()
         log.info("Settings saved")
         self.grab_release()
         self.destroy()
+        if self._on_save:
+            self._on_save()
 
     def _cancel(self) -> None:
         self.grab_release()
diff --git a/src/hearsay/utils/cuda_dlls.py b/src/hearsay/utils/cuda_dlls.py
new file mode 100644
index 0000000..b29df87
--- /dev/null
+++ b/src/hearsay/utils/cuda_dlls.py
@@ -0,0 +1,79 @@
+"""Register NVIDIA pip-package DLL directories on Windows before ctranslate2 loads."""
+
+from __future__ import annotations
+
+import logging
+import os
+import site
+import sys
+from pathlib import Path
+
+log = logging.getLogger(__name__)
+
+
+def _nvidia_bin_dirs() -> list[Path]:
+    """Return every nvidia/<pkg>/bin directory found in any site-packages, without duplicates."""
+    search_roots: list[Path] = []
+
+    # user site-packages (pip install --user)
+    try:
+        user_site = site.getusersitepackages()
+        if user_site:
+            search_roots.append(Path(user_site))
+    except Exception:
+        pass
+
+    # system / venv site-packages; some legacy virtualenv site.py files lack getsitepackages()
+    for p in getattr(site, "getsitepackages", list)():
+        search_roots.append(Path(p))
+
+    found: list[Path] = []
+    seen: set[Path] = set()
+    for root in search_roots:
+        nvidia_root = root / "nvidia"
+        if not nvidia_root.is_dir():
+            continue
+        for bin_dir in nvidia_root.glob("*/bin"):
+            if bin_dir.is_dir() and bin_dir not in seen:
+                seen.add(bin_dir)
+                found.append(bin_dir)
+
+    return found
+
+
+def register_nvidia_dlls() -> bool:
+    """Make the DLLs shipped in NVIDIA pip packages loadable on Windows.
+
+    Every directory from _nvidia_bin_dirs() goes through os.add_dll_directory()
+    (for Python extension modules) and, if accepted, onto the front of PATH
+    (for ctranslate2's ctypes.CDLL calls, which only respect PATH on Windows).
+
+    Returns True if at least one directory was registered.
+    No-op on non-Windows platforms.
+    """
+    if sys.platform != "win32":
+        return False
+
+    candidates = _nvidia_bin_dirs()
+    if not candidates:
+        log.debug("No nvidia pip-package bin dirs found; skipping DLL registration")
+        return False
+
+    accepted: list[str] = []
+    for bin_dir in candidates:
+        entry = str(bin_dir)
+        try:
+            os.add_dll_directory(entry)
+        except Exception as exc:
+            log.warning("Could not register DLL dir %s: %s", bin_dir, exc)
+            continue
+        accepted.append(entry)
+        log.debug("Registered DLL dir: %s", bin_dir)
+
+    count = len(accepted)
+    if count:
+        # Registered directories go in front of the existing PATH entries.
+        os.environ["PATH"] = os.pathsep.join(accepted + [os.environ.get("PATH", "")])
+        log.info("Registered %d NVIDIA DLL director%s from pip packages",
+                 count, "y" if count == 1 else "ies")
+    return count > 0