Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ numpy>=1.24.0
customtkinter>=5.2.0
pystray>=0.19.5
Pillow>=10.0.0
transformers>=4.23.0
4 changes: 2 additions & 2 deletions src/hearsay/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ def _teardown_recording(
for seg in result.segments:
from hearsay.output.formatter import format_timestamp
ts = format_timestamp(
result.chunk_index * 30 + seg["start"]
result.start_time + seg["start"]
)
safe_after(self._root, 0,
lambda t=f"[{ts}] {seg['text']}": (
Expand Down Expand Up @@ -307,7 +307,7 @@ def _poll_transcripts(self) -> None:
for seg in result.segments:
from hearsay.output.formatter import format_timestamp
ts = format_timestamp(
result.chunk_index * 30 + seg["start"]
result.start_time + seg["start"]
)
self._live_view.append_text(f"[{ts}] {seg['text']}")
except queue.Empty:
Expand Down
259 changes: 156 additions & 103 deletions src/hearsay/audio/recorder.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import logging
import queue
import time

import numpy as np

Expand All @@ -14,20 +13,121 @@
AUDIO_SOURCE_BOTH,
AUDIO_SOURCE_MIC,
AUDIO_SOURCE_SYSTEM,
CHUNK_DURATION_S,
MAX_CHUNK_DURATION_S,
MIN_CHUNK_DURATION_S,
OVERLAP_DURATION_S,
SAMPLE_RATE,
SILENCE_DURATION_S,
SILENCE_RMS_THRESHOLD,
)
from hearsay.utils.threading_utils import StoppableThread

log = logging.getLogger(__name__)


class _ChunkAccumulator:
"""Accumulates mono 16 kHz float32 audio and decides chunk boundaries.

A chunk becomes ready when either:
* the buffer reaches ``MAX_CHUNK_DURATION_S`` (hard cap), or
* at least ``MIN_CHUNK_DURATION_S`` has accumulated AND the trailing
``SILENCE_DURATION_S`` of audio is near-silent.

Consecutive chunks share ``OVERLAP_DURATION_S`` of audio so the
transcription pipeline can stitch words across boundaries. Each emitted
chunk carries its absolute start time (seconds from the start of the
recording), so downstream timestamps stay correct despite variable lengths.
"""

def __init__(self) -> None:
self._buffer: list[np.ndarray] = []
self._total = 0 # samples currently buffered
self._silence_run = 0 # consecutive trailing near-silent samples
self._start_sample = 0 # absolute index of buffer[0] in the recording
self.chunk_index = 0

self._min = int(MIN_CHUNK_DURATION_S * SAMPLE_RATE)
self._max = int(MAX_CHUNK_DURATION_S * SAMPLE_RATE)
self._silence_needed = int(SILENCE_DURATION_S * SAMPLE_RATE)
self._overlap = int(OVERLAP_DURATION_S * SAMPLE_RATE)

def add(self, mono: np.ndarray, silent: bool | None = None) -> None:
"""Append a mono frame, updating the trailing-silence run.

If *silent* is None, silence is computed from this frame's RMS.
Callers mixing multiple sources (Both mode) pass an explicit flag.
"""
if mono is None or len(mono) == 0:
return
self._buffer.append(mono)
self._total += len(mono)

if silent is None:
rms = float(np.sqrt(np.mean(mono ** 2)))
silent = rms < SILENCE_RMS_THRESHOLD

if silent:
self._silence_run += len(mono)
else:
self._silence_run = 0

def ready(self) -> bool:
"""True when the current buffer should be emitted as a chunk."""
if self._total >= self._max:
return True
return self._total >= self._min and self._silence_run >= self._silence_needed

def pop(self) -> tuple[int, float, np.ndarray]:
"""Emit a chunk and retain the overlap tail. Returns (index, start_s, audio)."""
data = np.concatenate(self._buffer)
emitted_len = min(len(data), self._max)
chunk = data[:emitted_len]
start_time = self._start_sample / SAMPLE_RATE
idx = self.chunk_index

# Advance by the unique (non-overlapping) audio we just consumed.
advance = max(0, emitted_len - self._overlap)
self._start_sample += advance

if self._overlap > 0:
leftover = data[emitted_len - self._overlap:]
else:
leftover = data[emitted_len:]
self._buffer = [leftover] if len(leftover) else []
self._total = int(len(leftover))
self._silence_run = 0
self.chunk_index += 1
return idx, start_time, chunk

def flush(self) -> tuple[int, float, np.ndarray] | None:
"""Emit whatever remains (if > 1s) when recording stops."""
if self._total <= SAMPLE_RATE: # less than 1 second — discard
return None
data = np.concatenate(self._buffer)
start_time = self._start_sample / SAMPLE_RATE
idx = self.chunk_index
self._buffer = []
self._total = 0
self.chunk_index += 1
return idx, start_time, data


def _rms(mono: np.ndarray) -> float:
    """Return the root-mean-square level of a mono frame.

    Args:
        mono: One-dimensional audio frame. ``None`` or an empty frame yields
            ``0.0``.

    Returns:
        The RMS level as a Python float.
    """
    if mono is None or len(mono) == 0:
        return 0.0
    # Square in float64: squaring in the input dtype overflows for integer
    # frames (e.g. int16 256 ** 2 wraps to 0) and would under-report loudness.
    samples = np.asarray(mono, dtype=np.float64)
    return float(np.sqrt(np.mean(samples * samples)))


class AudioRecorder(StoppableThread):
"""Record audio and push 30-second chunks to a queue.
"""Record audio and push variable-length chunks to a queue.

Each queue item is a ``(chunk_index, start_time_s, np.ndarray)`` tuple,
where ``start_time_s`` is the chunk's absolute offset from the start of the
recording.

Args:
audio_queue: Queue to push (chunk_index, np.ndarray) tuples.
audio_queue: Queue to push chunks to.
source: One of 'system', 'microphone', 'both'.
loopback_device_index: PyAudioWPatch device index for loopback.
mic_device_index: sounddevice device index for mic.
Expand Down Expand Up @@ -108,32 +208,16 @@ def _record_mic(self) -> None:
"""Record microphone via sounddevice."""
import sounddevice as sd

buffer: list[np.ndarray] = []
chunk_samples = int(CHUNK_DURATION_S * SAMPLE_RATE)
overlap_samples = int(OVERLAP_DURATION_S * SAMPLE_RATE)
chunk_index = 0
acc = _ChunkAccumulator()

def callback(indata: np.ndarray, frames: int, time_info: object, status: object) -> None:
nonlocal chunk_index
mono = resample(indata.copy(), self.mic_rate, self.mic_channels)
buffer.append(mono)

total = sum(len(b) for b in buffer)
if total >= chunk_samples:
chunk = np.concatenate(buffer)[:chunk_samples]
self.audio_queue.put((chunk_index, chunk))
chunk_index += 1
# Keep overlap
if overlap_samples > 0:
leftover = np.concatenate(buffer)[chunk_samples - overlap_samples:]
buffer.clear()
buffer.append(leftover)
else:
buffer.clear()

device = self.mic_device_index
acc.add(mono)
if acc.ready():
self.audio_queue.put(acc.pop())

with sd.InputStream(
device=device,
device=self.mic_device_index,
samplerate=self.mic_rate,
channels=self.mic_channels,
dtype="float32",
Expand All @@ -142,11 +226,9 @@ def callback(indata: np.ndarray, frames: int, time_info: object, status: object)
while not self.stopped():
self.wait(timeout=0.5)

# Flush remaining audio
if buffer:
chunk = np.concatenate(buffer)
if len(chunk) > SAMPLE_RATE: # Only if > 1 second
self.audio_queue.put((chunk_index, chunk))
final = acc.flush()
if final is not None:
self.audio_queue.put(final)

def _record_both(self) -> None:
"""Record both loopback and mic, mix them.
Expand All @@ -156,7 +238,8 @@ def _record_both(self) -> None:
occurs when PyAudioWPatch and sounddevice run on the same thread.
The mic stream uses PyAudio's callback mode so it accumulates data
asynchronously while the main loop drives off blocking loopback
reads.
reads. Chunk boundaries are decided on the *combined* activity, so a
chunk is only cut when both sources fall silent.
"""
import pyaudiowpatch as pyaudio

Expand Down Expand Up @@ -230,60 +313,40 @@ def mic_callback(in_data, frame_count, time_info, status_flags):
mic_stream.start_stream()

# --- Main loop (driven by blocking loopback reads) ---
chunk_samples = int(CHUNK_DURATION_S * SAMPLE_RATE)
overlap_samples = int(OVERLAP_DURATION_S * SAMPLE_RATE)
loopback_buf: list[np.ndarray] = []
chunk_index = 0
acc = _ChunkAccumulator()

def mix_with_mic(lb_chunk: np.ndarray) -> np.ndarray:
if not mic_buffer:
return lb_chunk
mic_chunk = np.concatenate(mic_buffer)[:len(lb_chunk)]
if len(mic_chunk) < len(lb_chunk):
mic_chunk = np.pad(mic_chunk, (0, len(lb_chunk) - len(mic_chunk)))
return mix_streams(lb_chunk, mic_chunk)

while not self.stopped():
try:
raw = loopback_stream.read(frames_per_buffer, exception_on_overflow=False)
except Exception:
break
audio = np.frombuffer(raw, dtype=np.int16)
mono = resample(audio, self.loopback_rate, self.loopback_channels)
loopback_buf.append(mono)

total = sum(len(b) for b in loopback_buf)
if total >= chunk_samples:
lb_chunk = np.concatenate(loopback_buf)[:chunk_samples]
mic_samples = sum(len(b) for b in mic_buffer)
log.debug(
"Mixing chunk %d: loopback=%d mic=%d samples",
chunk_index, len(lb_chunk), mic_samples,
)

if mic_buffer:
mic_chunk = np.concatenate(mic_buffer)[:chunk_samples]
if len(mic_chunk) < chunk_samples:
mic_chunk = np.pad(mic_chunk, (0, chunk_samples - len(mic_chunk)))
mixed = mix_streams(lb_chunk, mic_chunk)
else:
mixed = lb_chunk

self.audio_queue.put((chunk_index, mixed))
chunk_index += 1

if overlap_samples > 0:
leftover = np.concatenate(loopback_buf)[chunk_samples - overlap_samples:]
loopback_buf.clear()
loopback_buf.append(leftover)
else:
loopback_buf.clear()
lb_mono = resample(audio, self.loopback_rate, self.loopback_channels)

# Combined silence: silent only when both sources are quiet.
# The latest mic frame approximates current mic activity.
mic_silent = _rms(mic_buffer[-1]) < SILENCE_RMS_THRESHOLD if mic_buffer else True
silent = (_rms(lb_mono) < SILENCE_RMS_THRESHOLD) and mic_silent

acc.add(lb_mono, silent=silent)
if acc.ready():
idx, start_time, lb_chunk = acc.pop()
self.audio_queue.put((idx, start_time, mix_with_mic(lb_chunk)))
mic_buffer.clear()

# --- Flush remaining audio ---
if loopback_buf:
lb_chunk = np.concatenate(loopback_buf)
if len(lb_chunk) > SAMPLE_RATE: # Only if > 1 second
if mic_buffer:
mic_chunk = np.concatenate(mic_buffer)[:len(lb_chunk)]
if len(mic_chunk) < len(lb_chunk):
mic_chunk = np.pad(mic_chunk, (0, len(lb_chunk) - len(mic_chunk)))
mixed = mix_streams(lb_chunk, mic_chunk)
else:
mixed = lb_chunk
self.audio_queue.put((chunk_index, mixed))
final = acc.flush()
if final is not None:
idx, start_time, lb_chunk = final
self.audio_queue.put((idx, start_time, mix_with_mic(lb_chunk)))

mic_stream.stop_stream()
mic_stream.close()
Expand All @@ -298,11 +361,8 @@ def _chunk_loop(
sr: int,
channels: int,
) -> None:
"""Generic chunking loop for loopback-style streams."""
chunk_samples = int(CHUNK_DURATION_S * SAMPLE_RATE)
overlap_samples = int(OVERLAP_DURATION_S * SAMPLE_RATE)
buffer: list[np.ndarray] = []
chunk_index = 0
"""Generic chunking loop for loopback-style (blocking-read) streams."""
acc = _ChunkAccumulator()

while not self.stopped():
try:
Expand All @@ -311,25 +371,18 @@ def _chunk_loop(
break
audio = np.frombuffer(raw, dtype=np.int16)
mono = resample(audio, sr, channels)
buffer.append(mono)

total = sum(len(b) for b in buffer)
if total >= chunk_samples:
chunk = np.concatenate(buffer)[:chunk_samples]
self.audio_queue.put((chunk_index, chunk))
chunk_index += 1
log.debug("Audio chunk %d queued (%d samples)", chunk_index - 1, len(chunk))

if overlap_samples > 0:
leftover = np.concatenate(buffer)[chunk_samples - overlap_samples:]
buffer.clear()
buffer.append(leftover)
else:
buffer.clear()

# Flush remaining audio
if buffer:
chunk = np.concatenate(buffer)
if len(chunk) > SAMPLE_RATE: # Only if > 1 second
self.audio_queue.put((chunk_index, chunk))
log.debug("Final audio chunk %d queued (%d samples)", chunk_index, len(chunk))
acc.add(mono)

if acc.ready():
idx, start_time, chunk = acc.pop()
self.audio_queue.put((idx, start_time, chunk))
log.debug(
"Audio chunk %d queued (%d samples, t=%.1fs)",
idx, len(chunk), start_time,
)

final = acc.flush()
if final is not None:
idx, start_time, chunk = final
self.audio_queue.put((idx, start_time, chunk))
log.debug("Final audio chunk %d queued (%d samples)", idx, len(chunk))
Loading