From b6ade7b984cde716996700da1e5b41b525838e22 Mon Sep 17 00:00:00 2001 From: manishEMS47 Date: Mon, 8 Jun 2026 16:16:42 +0530 Subject: [PATCH] Added 60dB integration --- .env.example | 6 + CLAUDE.md | 30 +- _internal/toolkit-registry.json | 50 ++- brands/default/voice.json | 9 + tools/config.py | 24 ++ tools/redub.py | 181 ++++++++-- tools/requirements.txt | 5 + tools/sixtydb_tts.py | 589 ++++++++++++++++++++++++++++++++ tools/voiceover.py | 166 ++++++++- 9 files changed, 1018 insertions(+), 42 deletions(-) create mode 100644 tools/sixtydb_tts.py diff --git a/.env.example b/.env.example index e1269f6..e48a1ce 100644 --- a/.env.example +++ b/.env.example @@ -52,3 +52,9 @@ # ElevenLabs: Premium cloud TTS (pay-per-character, optional) # ELEVENLABS_API_KEY=your_api_key_here # ELEVENLABS_VOICE_ID=your_voice_id_here +# +# 60db: Premium cloud TTS (https://60db.ai), used via --provider 60db +# Get a key at https://60db.ai. VOICE_ID is optional (falls back to the +# 60db default voice; list yours with: python tools/sixtydb_tts.py --list-voices) +# SIXTYDB_API_KEY=sk_live_your_key +# SIXTYDB_VOICE_ID=your_voice_id_here diff --git a/CLAUDE.md b/CLAUDE.md index b0ab055..4bc2c13 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -129,17 +129,45 @@ Utility tools work on any video file without requiring a project structure. ### Voiceover Generation +Three TTS providers share one `voiceover.py` interface (`--provider`): +**elevenlabs** (default), **qwen3** (self-hosted, free), and **60db** (premium cloud). +Voice settings (`--stability`, `--similarity`, `--speed`) use a **unified 0-1 scale** +across all providers — for 60db they are auto-converted to its native 0-100 scale. 
+ ```bash -# Per-scene generation (recommended) +# Per-scene generation (recommended, ElevenLabs default) python tools/voiceover.py --scene-dir public/audio/scenes --json # Using Qwen3-TTS (self-hosted, free alternative to ElevenLabs) python tools/voiceover.py --provider qwen3 --tone warm --scene-dir public/audio/scenes --json +# Using 60db (premium cloud TTS — needs SIXTYDB_API_KEY) +python tools/voiceover.py --provider 60db --scene-dir public/audio/scenes --json +python tools/voiceover.py --provider 60db --voice-id YOUR_VOICE_ID_HERE --stability 0.6 --script SCRIPT.md --output out.mp3 + # Single file (legacy) python tools/voiceover.py --script SCRIPT.md --output out.mp3 ``` +#### 60db (standalone) + +`tools/sixtydb_tts.py` is the dedicated 60db tool (counterpart to `qwen3_tts.py`). +It exposes a `generate_audio()` used by `voiceover.py` and `redub.py`, plus a CLI. +Three transports all produce a finished audio file: `synthesize` (REST, default), +`stream` (NDJSON), and `websocket` (realtime; needs `pip install websocket-client`). + +```bash +python tools/sixtydb_tts.py --text "Hello world" --output hello.mp3 +python tools/sixtydb_tts.py --text "Hello" --transport stream --output hello.mp3 +python tools/sixtydb_tts.py --list-voices # GET /myvoices +``` + +Config: `SIXTYDB_API_KEY` (required), `SIXTYDB_VOICE_ID` (optional — falls back to +60db's default voice). Brands carry a `sixtydb` block in `voice.json` (voiceId + +settings). `redub.py --tts-provider 60db` uses 60db for the new voice while +transcription stays on ElevenLabs Scribe (60db has no STT); in `--sync` mode the +60db output is run back through Scribe to recover word timestamps. 
+ ### Timing Sync (after voiceover) ```bash diff --git a/_internal/toolkit-registry.json b/_internal/toolkit-registry.json index 9cc21da..528af74 100644 --- a/_internal/toolkit-registry.json +++ b/_internal/toolkit-registry.json @@ -181,11 +181,18 @@ "tools": { "voiceover": { "path": "tools/voiceover.py", - "description": "Generate TTS voiceovers using ElevenLabs or Qwen3-TTS", + "description": "Generate TTS voiceovers using ElevenLabs, Qwen3-TTS, or 60db", "usage": "python tools/voiceover.py --script SCRIPT.md --output out.mp3", + "options": { + "provider": "TTS provider: elevenlabs (default), qwen3, 60db", + "transport": "60db API transport: synthesize (default), stream, websocket", + "no-enhance": "Disable 60db audio enhancement (on by default)", + "stability": "Voice stability 0-1 (unified scale; auto-converted to 0-100 for 60db)", + "similarity": "Similarity 0-1 (unified scale; auto-converted to 0-100 for 60db)" + }, "status": "stable", "created": "2025-12-08", - "updated": "2026-02-19" + "updated": "2026-06-08" }, "music": { "path": "tools/music.py", @@ -487,6 +494,44 @@ "created": "2026-02-19", "updated": "2026-02-19" }, + "sixtydb_tts": { + "path": "tools/sixtydb_tts.py", + "description": "Generate speech using 60db cloud TTS - REST, streaming, and websocket transports", + "usage": "python tools/sixtydb_tts.py --text \"Hello\" --output hello.mp3", + "status": "beta", + "category": "audio-generation", + "backend": "60db", + "requires": "60db API key", + "options": { + "voice-id": "60db voice UUID (defaults to SIXTYDB_VOICE_ID or 60db default voice)", + "stability": "Voice stability 0-1 (auto-converted to 0-100)", + "similarity": "Similarity 0-1 (auto-converted to 0-100)", + "speed": "Speech speed 0.5-2.0", + "no-enhance": "Disable 60db audio enhancement (on by default)", + "output-format": "Audio format: mp3 (default), wav, ogg, flac (synthesize transport)", + "transport": "API transport: synthesize (default), stream, websocket", + "sample-rate": "Websocket 
sample rate: 8000, 16000, 24000 (default), 48000", + "list-voices": "List your 60db voices via GET /myvoices" + }, + "transports": [ + "synthesize", + "stream", + "websocket" + ], + "endpoints": { + "synthesize": "https://api.60db.ai/tts-synthesize", + "stream": "https://api.60db.ai/tts-stream", + "voices": "https://api.60db.ai/myvoices", + "websocket": "wss://api.60db.ai/ws/tts" + }, + "envVars": [ + "SIXTYDB_API_KEY", + "SIXTYDB_VOICE_ID" + ], + "estimatedCost": "$0.00002 per character ($0.01 minimum per request)", + "created": "2026-06-08", + "updated": "2026-06-08" + }, "sync_timing": { "path": "tools/sync_timing.py", "description": "Sync scene durationSeconds in Remotion config with actual audio durations", @@ -1025,6 +1070,7 @@ }, "config": { "voiceId": "YOUR_VOICE_ID_HERE", + "sixtydbVoiceId": null, "defaultFps": 30, "defaultResolution": { "width": 1920, diff --git a/brands/default/voice.json b/brands/default/voice.json index a0a9396..9139ad5 100644 --- a/brands/default/voice.json +++ b/brands/default/voice.json @@ -14,5 +14,14 @@ "tone": "", "instruct": "", "clone": null + }, + "sixtydb": { + "voiceId": "YOUR_VOICE_ID_HERE", + "settings": { + "stability": 0.85, + "similarity": 0.95, + "speed": 1.0, + "enhance": true + } } } diff --git a/tools/config.py b/tools/config.py index 5a3ec5b..ad8b49c 100644 --- a/tools/config.py +++ b/tools/config.py @@ -51,6 +51,30 @@ def get_elevenlabs_api_key() -> str | None: return os.getenv("ELEVENLABS_API_KEY") +def get_sixtydb_api_key() -> str | None: + """Get 60db API key from environment.""" + from dotenv import load_dotenv + load_dotenv() + return os.getenv("SIXTYDB_API_KEY") + + +def get_sixtydb_voice_id() -> str | None: + """Get the 60db voice ID from env var, falling back to the registry. + + Returns None if neither is set — callers fall back to the documented + 60db default voice. 
+ """ + from dotenv import load_dotenv + load_dotenv() + + voice_id = os.getenv("SIXTYDB_VOICE_ID") + if voice_id and voice_id != "your_voice_id_here": + return voice_id + + registry = load_registry() + return registry.get("config", {}).get("sixtydbVoiceId") + + def get_default_output_dir(project_path: str | None = None) -> Path: """Get default audio output directory for a project.""" if project_path: diff --git a/tools/redub.py b/tools/redub.py index b8154ee..d53b66f 100644 --- a/tools/redub.py +++ b/tools/redub.py @@ -54,7 +54,12 @@ # Add parent to path for local imports sys.path.insert(0, str(Path(__file__).parent)) -from config import get_elevenlabs_api_key, get_voice_id +from config import ( + get_elevenlabs_api_key, + get_sixtydb_api_key, + get_sixtydb_voice_id, + get_voice_id, +) def parse_args(): @@ -86,7 +91,15 @@ def parse_args(): "--voice-id", "-v", type=str, - help="Target ElevenLabs voice ID for the new voice", + help="Target voice ID for the new voice (ElevenLabs or 60db, depending on --tts-provider)", + ) + parser.add_argument( + "--tts-provider", + type=str, + default="elevenlabs", + choices=["elevenlabs", "60db"], + help="TTS engine for the new voice (default: elevenlabs). 
" + "Transcription (STT) always uses ElevenLabs Scribe.", ) parser.add_argument( "--transcript", @@ -280,6 +293,40 @@ def generate_tts( return False +def generate_tts_60db( + text: str, + voice_id: str, + output_path: str, + api_key: str, + stability: float, + similarity: float, + speed: float, + verbose: bool = True, +) -> bool: + """Generate TTS audio using 60db (settings on the unified 0-1 scale).""" + if verbose: + print(f"Generating TTS with 60db voice {voice_id}...", file=sys.stderr) + + from sixtydb_tts import generate_audio + + result = generate_audio( + text=text, + output_path=output_path, + voice_id=voice_id, + stability=stability, + similarity=similarity, + speed=speed, + enhance=True, + transport="synthesize", + api_key=api_key, + verbose=verbose, + ) + if not result.get("success"): + print(f"60db TTS error: {result.get('error')}", file=sys.stderr) + return False + return True + + def replace_audio(video_path: str, audio_path: str, output_path: str, verbose: bool = True) -> bool: """Replace audio track in video using FFmpeg.""" if verbose: @@ -539,14 +586,34 @@ def main(): ) sys.exit(1) - # Get voice ID - voice_id = args.voice_id or get_voice_id() - if not voice_id: - print( - "Error: No voice ID provided. Use --voice-id or set ELEVENLABS_VOICE_ID", - file=sys.stderr, - ) - sys.exit(1) + # 60db TTS target: resolve its key + voice. STT still uses ElevenLabs Scribe + # (the api_key above), so the ElevenLabs key remains required even here. 
+ sixtydb_api_key = None + if args.tts_provider == "60db": + sixtydb_api_key = get_sixtydb_api_key() + if not sixtydb_api_key: + print( + "Error: --tts-provider 60db requires a 60db API key.\n" + " echo \"SIXTYDB_API_KEY=sk_live_your_key\" >> .env\n" + "\n" + "Note: transcription still uses ElevenLabs Scribe, so an " + "ELEVENLABS_API_KEY is also required.", + file=sys.stderr, + ) + sys.exit(1) + + # Get voice ID (resolution depends on the TTS provider) + if args.tts_provider == "60db": + from sixtydb_tts import DEFAULT_VOICE_ID as _SIXTYDB_DEFAULT_VOICE + voice_id = args.voice_id or get_sixtydb_voice_id() or _SIXTYDB_DEFAULT_VOICE + else: + voice_id = args.voice_id or get_voice_id() + if not voice_id: + print( + "Error: No voice ID provided. Use --voice-id or set ELEVENLABS_VOICE_ID", + file=sys.stderr, + ) + sys.exit(1) # Prepare output directory output_path = Path(args.output) @@ -564,6 +631,7 @@ def main(): "input": args.input, "output": str(output_path), "voice_id": voice_id, + "tts_provider": args.tts_provider, "tts_model": args.model, "stt_model": args.stt_model, "language": args.language, @@ -639,25 +707,54 @@ def main(): if verbose: print(f"Transcript saved to {args.save_transcript}", file=sys.stderr) - # Step 3a: Generate TTS with timestamps - tts_result = generate_tts_with_timestamps( - client, - transcript_text, - voice_id, - str(generated_audio), - args.model, - verbose=verbose, - ) - if not tts_result: - print("Error: Failed to generate TTS audio", file=sys.stderr) - sys.exit(1) + # Step 3a: Generate TTS with word timestamps. + # ElevenLabs: native character-timestamp endpoint. + # 60db: no timestamp API, so generate audio then run it back + # through ElevenLabs Scribe to recover word timestamps. 
+ if args.tts_provider == "60db": + if not generate_tts_60db( + transcript_text, + voice_id, + str(generated_audio), + sixtydb_api_key, + args.stability, + args.similarity, + args.speed, + verbose=verbose, + ): + print("Error: Failed to generate TTS audio", file=sys.stderr) + sys.exit(1) + + tts_transcription = transcribe_with_timestamps( + client, + str(generated_audio), + args.stt_model, + args.language, + verbose=verbose, + ) + if not tts_transcription: + print("Error: Failed to transcribe generated 60db audio for sync", file=sys.stderr) + sys.exit(1) + tts_result = {"words": tts_transcription["words"]} + else: + tts_result = generate_tts_with_timestamps( + client, + transcript_text, + voice_id, + str(generated_audio), + args.model, + verbose=verbose, + ) + if not tts_result: + print("Error: Failed to generate TTS audio", file=sys.stderr) + sys.exit(1) tts_words = tts_result["words"] # Use actual audio file duration (more accurate than timestamp data) tts_duration = get_media_duration(str(generated_audio)) if not tts_duration: - tts_duration = tts_result["duration"] # Fallback to timestamp data + tts_duration = tts_result.get("duration") # Fallback to timestamp data (ElevenLabs only) if verbose: print(f"TTS: {len(tts_words)} words, {tts_duration:.1f}s duration", file=sys.stderr) @@ -726,18 +823,31 @@ def main(): print(f"Transcript: {len(transcript_text)} characters", file=sys.stderr) # Step 3: Generate TTS with new voice - if not generate_tts( - client, - transcript_text, - voice_id, - str(generated_audio), - args.model, - args.stability, - args.similarity, - args.style, - args.speed, - verbose=verbose, - ): + if args.tts_provider == "60db": + tts_ok = generate_tts_60db( + transcript_text, + voice_id, + str(generated_audio), + sixtydb_api_key, + args.stability, + args.similarity, + args.speed, + verbose=verbose, + ) + else: + tts_ok = generate_tts( + client, + transcript_text, + voice_id, + str(generated_audio), + args.model, + args.stability, + args.similarity, 
+ args.style, + args.speed, + verbose=verbose, + ) + if not tts_ok: print("Error: Failed to generate TTS audio", file=sys.stderr) sys.exit(1) @@ -756,6 +866,7 @@ def main(): "input": args.input, "output": str(output_path), "voice_id": voice_id, + "tts_provider": args.tts_provider, "tts_model": args.model, "transcript_chars": len(transcript_text), "sync_mode": args.sync, diff --git a/tools/requirements.txt b/tools/requirements.txt index 092324a..3b925f9 100644 --- a/tools/requirements.txt +++ b/tools/requirements.txt @@ -4,6 +4,11 @@ python-dotenv>=1.0.0 requests>=2.28.0 boto3>=1.28.0 # For Cloudflare R2 (S3-compatible) +# 60db TTS websocket transport (tools/sixtydb_tts.py --transport websocket). +# Optional: only needed for the realtime websocket transport; the default +# REST 'synthesize' transport uses `requests` above. +websocket-client>=1.6.0 + # Image processing — used by tools/flux2.py, tools/image_edit.py, and the # moviepy examples for PIL-based text rendering Pillow>=10.0 diff --git a/tools/sixtydb_tts.py b/tools/sixtydb_tts.py new file mode 100644 index 0000000..ebdbe95 --- /dev/null +++ b/tools/sixtydb_tts.py @@ -0,0 +1,589 @@ +#!/usr/bin/env python3 +""" +Generate speech using 60db (https://60db.ai) — a premium cloud TTS provider. + +This is the 60db counterpart to ElevenLabs (`voiceover.py`) and Qwen3-TTS +(`qwen3_tts.py`). It exposes a `generate_audio()` function used by +`voiceover.py` (as `--provider 60db`) and `redub.py`, plus a standalone CLI. + +Three transports are supported (all produce a finished audio file): + - synthesize (default): POST /tts-synthesize -> JSON {audio_base64} + - stream: POST /tts-stream -> NDJSON audio chunks + - websocket: wss://api.60db.ai/ws/tts -> context protocol + +Voice settings use a UNIFIED 0-1 scale (same as ElevenLabs in this toolkit). +They are converted to 60db's native 0-100 scale internally. 
+ +Usage: + # Quick generation (REST, default voice) + python tools/sixtydb_tts.py --text "Hello world" --output hello.mp3 + + # Pick a voice and tune settings (0-1 scale) + python tools/sixtydb_tts.py --text "Hello" --voice-id \ + --stability 0.6 --similarity 0.9 --speed 1.0 --output hello.mp3 + + # Streaming transport + python tools/sixtydb_tts.py --text "Hello" --transport stream --output hello.mp3 + + # Realtime websocket transport (writes a WAV, transcoded to --output format) + python tools/sixtydb_tts.py --text "Hello" --transport websocket --output hello.mp3 + + # List your voices (GET /myvoices) + python tools/sixtydb_tts.py --list-voices + +Setup: + echo "SIXTYDB_API_KEY=sk_live_your_key" >> .env + # Optional default voice: + echo "SIXTYDB_VOICE_ID=" >> .env +""" +from __future__ import annotations + +import argparse +import base64 +import json +import subprocess +import sys +import tempfile +from pathlib import Path + +import requests + +sys.path.insert(0, str(Path(__file__).parent)) +from config import get_sixtydb_api_key, get_sixtydb_voice_id + +# --- 60db endpoints --- +SIXTYDB_BASE_URL = "https://api.60db.ai" +SIXTYDB_SYNTHESIZE_URL = f"{SIXTYDB_BASE_URL}/tts-synthesize" +SIXTYDB_STREAM_URL = f"{SIXTYDB_BASE_URL}/tts-stream" +SIXTYDB_VOICES_URL = f"{SIXTYDB_BASE_URL}/myvoices" +SIXTYDB_WS_URL = "wss://api.60db.ai/ws/tts" + +# Documented default voice (used when no voice id is configured anywhere). +DEFAULT_VOICE_ID = "fbb75ed2-975a-40c7-9e06-38e30524a9a1" + +SUPPORTED_FORMATS = ["mp3", "wav", "ogg", "flac"] +SUPPORTED_TRANSPORTS = ["synthesize", "stream", "websocket"] + +# Hard limits from the 60db docs. 
+MAX_TEXT_CHARS = 5000 # per REST request +MAX_WS_BUFFER_CHARS = 50000 # cumulative per websocket context + + +def get_audio_duration(file_path: str) -> float | None: + """Get audio duration in seconds using ffprobe (if available).""" + try: + result = subprocess.run( + [ + "ffprobe", "-v", "error", + "-show_entries", "format=duration", + "-of", "csv=p=0", + file_path, + ], + capture_output=True, + text=True, + ) + if result.returncode == 0: + return float(result.stdout.strip()) + except (FileNotFoundError, ValueError): + pass # ffprobe not installed or invalid output + return None + + +def _unit_to_100(value: float | None, default_100: int) -> int: + """Convert a unified 0-1 setting to 60db's native 0-100 scale. + + Values are clamped to [0, 100]. A None value yields the 60db default. + Values already > 1 are assumed to be on the native 0-100 scale and are + passed through (clamped) — this keeps the tool forgiving if a caller + hands us 75 instead of 0.75. + """ + if value is None: + return default_100 + scaled = value * 100 if value <= 1 else value + return max(0, min(100, int(round(scaled)))) + + +def list_voices(api_key: str, timeout: int = 30) -> dict: + """Fetch the caller's voices from GET /myvoices. + + Returns {"success": True, "voices": [...]} or {"success": False, "error": ...}. + Each voice dict has at least: voice_id, name, category, model, labels. 
+ """ + try: + resp = requests.get( + SIXTYDB_VOICES_URL, + headers={"Authorization": f"Bearer {api_key}"}, + timeout=timeout, + ) + except requests.RequestException as e: + return {"success": False, "error": f"Request failed: {e}"} + + if resp.status_code != 200: + return {"success": False, "error": f"HTTP {resp.status_code}: {resp.text[:300]}"} + + try: + data = resp.json() + except ValueError: + return {"success": False, "error": f"Invalid JSON response: {resp.text[:300]}"} + + if not data.get("success", True): + return {"success": False, "error": data.get("message", "Request unsuccessful")} + + return {"success": True, "voices": data.get("data", [])} + + +def _write_pcm_as_audio(pcm_bytes: bytes, sample_rate: int, output_path: str) -> bool: + """Write raw 16-bit mono PCM to a WAV, transcoding to the target format. + + The websocket transport returns LINEAR16 PCM chunks. We wrap them in a WAV + header. If the requested output is not .wav, we transcode with ffmpeg. + """ + import wave + + out = Path(output_path) + out.parent.mkdir(parents=True, exist_ok=True) + + if out.suffix.lower() == ".wav": + wav_path = str(out) + else: + tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) + tmp.close() + wav_path = tmp.name + + with wave.open(wav_path, "wb") as wf: + wf.setnchannels(1) + wf.setsampwidth(2) # 16-bit + wf.setframerate(sample_rate) + wf.writeframes(pcm_bytes) + + if wav_path == str(out): + return True + + # Transcode WAV -> requested format + result = subprocess.run( + ["ffmpeg", "-y", "-i", wav_path, str(out)], + capture_output=True, + text=True, + ) + Path(wav_path).unlink(missing_ok=True) + if result.returncode != 0: + print(f"ffmpeg transcode error: {result.stderr[-400:]}", file=sys.stderr) + return False + return True + + +def _synthesize_rest( + text: str, voice_id: str, stability: int, similarity: int, + speed: float, enhance: bool, output_format: str, api_key: str, + output_path: str, timeout: int, verbose: bool, +) -> dict: + """POST 
/tts-synthesize — single JSON response with base64 audio.""" + body = { + "text": text, + "voice_id": voice_id, + "enhance": enhance, + "speed": speed, + "stability": stability, + "similarity": similarity, + "output_format": output_format, + } + try: + resp = requests.post( + SIXTYDB_SYNTHESIZE_URL, + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + }, + json=body, + timeout=timeout, + ) + except requests.RequestException as e: + return {"success": False, "error": f"Request failed: {e}"} + + if resp.status_code != 200: + return {"success": False, "error": f"HTTP {resp.status_code}: {resp.text[:300]}"} + + try: + data = resp.json() + except ValueError: + return {"success": False, "error": f"Invalid JSON response: {resp.text[:300]}"} + + if not data.get("success", True): + return {"success": False, "error": data.get("message", "Synthesis unsuccessful")} + + audio_b64 = data.get("audio_base64") + if not audio_b64: + return {"success": False, "error": f"No audio_base64 in response: {list(data.keys())}"} + + out = Path(output_path) + out.parent.mkdir(parents=True, exist_ok=True) + out.write_bytes(base64.b64decode(audio_b64)) + return {"success": True} + + +def _synthesize_stream( + text: str, voice_id: str, stability: int, similarity: int, + speed: float, enhance: bool, api_key: str, + output_path: str, timeout: int, verbose: bool, +) -> dict: + """POST /tts-stream — NDJSON chunks, each carrying a base64 audio slice.""" + body = { + "text": text, + "voice_id": voice_id, + "enhance": enhance, + "speed": speed, + "stability": stability, + "similarity": similarity, + } + try: + resp = requests.post( + SIXTYDB_STREAM_URL, + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + }, + json=body, + timeout=timeout, + stream=True, + ) + except requests.RequestException as e: + return {"success": False, "error": f"Request failed: {e}"} + + if resp.status_code != 200: + return {"success": False, "error": 
f"HTTP {resp.status_code}: {resp.text[:300]}"} + + out = Path(output_path) + out.parent.mkdir(parents=True, exist_ok=True) + audio = bytearray() + chunk_count = 0 + + for line in resp.iter_lines(): + if not line: + continue + try: + msg = json.loads(line) + except ValueError: + continue # skip malformed line + mtype = msg.get("type") + if mtype == "error": + return {"success": False, "error": msg.get("message", "stream error")} + if mtype in ("chunk", "complete"): + b64 = (msg.get("result") or {}).get("audioContent") + if b64: + audio.extend(base64.b64decode(b64)) + chunk_count += 1 + if mtype == "complete": + break + + if not audio: + return {"success": False, "error": "No audio received from stream"} + + out.write_bytes(bytes(audio)) + if verbose: + print(f" Received {chunk_count} audio chunk(s)", file=sys.stderr) + return {"success": True} + + +def _synthesize_websocket( + text: str, voice_id: str, stability: int, similarity: int, + speed: float, sample_rate: int, api_key: str, + output_path: str, timeout: int, verbose: bool, +) -> dict: + """Realtime websocket transport. Collects LINEAR16 PCM, writes an audio file. + + Uses the `websocket-client` package (lazy import). + """ + try: + from websocket import create_connection # websocket-client + except ImportError: + return { + "success": False, + "error": ( + "websocket transport requires the 'websocket-client' package.\n" + " Install it with: pip install websocket-client\n" + " Or use --transport synthesize (no extra dependency)." 
+ ), + } + + context_id = "voiceover" + url = f"{SIXTYDB_WS_URL}?apiKey={api_key}" + try: + ws = create_connection(url, timeout=timeout) + except Exception as e: + return {"success": False, "error": f"WebSocket connect failed: {e}"} + + pcm = bytearray() + try: + ws.send(json.dumps({ + "create_context": { + "context_id": context_id, + "voice_id": voice_id, + "audio_config": { + "audio_encoding": "LINEAR16", + "sample_rate_hertz": sample_rate, + }, + "speed": speed, + "stability": stability, + "similarity": similarity, + } + })) + ws.send(json.dumps({"send_text": {"context_id": context_id, "text": text}})) + ws.send(json.dumps({"close_context": {"context_id": context_id}})) + + while True: + raw = ws.recv() + if not raw: + break + try: + msg = json.loads(raw) + except ValueError: + continue + if "audio_chunk" in msg: + b64 = msg["audio_chunk"].get("audioContent") + if b64: + pcm.extend(base64.b64decode(b64)) + elif "error" in msg: + return {"success": False, "error": msg["error"].get("message", "ws error")} + elif "context_closed" in msg: + break + except Exception as e: + return {"success": False, "error": f"WebSocket error: {e}"} + finally: + try: + ws.close() + except Exception: + pass + + if not pcm: + return {"success": False, "error": "No audio received from websocket"} + + if not _write_pcm_as_audio(bytes(pcm), sample_rate, output_path): + return {"success": False, "error": "Failed to write/transcode websocket audio"} + return {"success": True} + + +def generate_audio( + text: str, + output_path: str, + voice_id: str | None = None, + stability: float | None = 0.85, + similarity: float | None = 0.95, + speed: float = 1.0, + enhance: bool = True, + output_format: str = "mp3", + transport: str = "synthesize", + sample_rate: int = 24000, + api_key: str | None = None, + timeout: int = 120, + verbose: bool = True, +) -> dict: + """Generate a single audio file with 60db. 
+ + `stability` / `similarity` are on a UNIFIED 0-1 scale and converted to + 60db's native 0-100 internally. `speed` (0.5-2.0) is the same on both. + + Returns dict with the same shape the other providers use: + {success, output, duration_seconds, duration_frames_30fps, script_chars} + or {success: False, error: ...}. + """ + if not text: + return {"success": False, "error": "text must be a non-empty string"} + + api_key = api_key or get_sixtydb_api_key() + if not api_key: + return {"success": False, "error": "No 60db API key (set SIXTYDB_API_KEY)"} + + voice_id = voice_id or get_sixtydb_voice_id() or DEFAULT_VOICE_ID + + if transport not in SUPPORTED_TRANSPORTS: + return {"success": False, "error": f"Unknown transport: {transport}"} + + limit = MAX_WS_BUFFER_CHARS if transport == "websocket" else MAX_TEXT_CHARS + if len(text) > limit: + return { + "success": False, + "error": f"text exceeds {limit} character limit for transport '{transport}' " + f"({len(text)} chars). Split into smaller scenes.", + } + + stab = _unit_to_100(stability, 50) + sim = _unit_to_100(similarity, 75) + + if verbose: + print( + f"60db: voice={voice_id} transport={transport} " + f"stability={stab} similarity={sim} speed={speed}", + file=sys.stderr, + ) + + if transport == "synthesize": + result = _synthesize_rest( + text, voice_id, stab, sim, speed, enhance, output_format, + api_key, output_path, timeout, verbose, + ) + elif transport == "stream": + result = _synthesize_stream( + text, voice_id, stab, sim, speed, enhance, + api_key, output_path, timeout, verbose, + ) + else: # websocket + result = _synthesize_websocket( + text, voice_id, stab, sim, speed, sample_rate, + api_key, output_path, timeout, verbose, + ) + + if not result.get("success"): + return result + + duration = get_audio_duration(output_path) + out = { + "success": True, + "output": output_path, + "script_chars": len(text), + "voice_id": voice_id, + "transport": transport, + } + if duration: + out["duration_seconds"] = 
round(duration, 2) + out["duration_frames_30fps"] = int(duration * 30) + return out + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Generate speech using 60db (cloud TTS)", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python tools/sixtydb_tts.py --text "Hello world" --output hello.mp3 + python tools/sixtydb_tts.py --text "Hello" --voice-id --stability 0.6 --output hello.mp3 + python tools/sixtydb_tts.py --text "Hello" --transport stream --output hello.mp3 + python tools/sixtydb_tts.py --list-voices + """, + ) + parser.add_argument("--text", "-t", type=str, help="Text to synthesize (max 5000 chars for REST)") + parser.add_argument("--output", "-o", type=str, help="Output audio file path") + parser.add_argument("--voice-id", "-v", type=str, help="60db voice ID (defaults to SIXTYDB_VOICE_ID or the 60db default voice)") + parser.add_argument("--stability", type=float, default=0.85, help="Voice stability 0-1 (default: 0.85). Lower = more expressive.") + parser.add_argument("--similarity", type=float, default=0.95, help="Similarity 0-1 (default: 0.95). Higher = closer to source voice.") + parser.add_argument("--speed", type=float, default=1.0, help="Speech speed 0.5-2.0 (default: 1.0)") + parser.add_argument("--no-enhance", dest="enhance", action="store_false", help="Disable 60db audio enhancement (on by default)") + parser.set_defaults(enhance=True) + parser.add_argument("--output-format", type=str, default="mp3", choices=SUPPORTED_FORMATS, help="Audio format (default: mp3). 
REST/synthesize only.") + parser.add_argument("--transport", type=str, default="synthesize", choices=SUPPORTED_TRANSPORTS, help="API transport (default: synthesize)") + parser.add_argument("--sample-rate", type=int, default=24000, choices=[8000, 16000, 24000, 48000], help="Sample rate for websocket transport (default: 24000)") + parser.add_argument("--timeout", type=int, default=120, help="Request timeout in seconds (default: 120)") + parser.add_argument("--list-voices", action="store_true", help="List your 60db voices and exit") + parser.add_argument("--json", action="store_true", help="Output result as JSON") + parser.add_argument("--dry-run", action="store_true", help="Show what would be done without calling the API") + return parser.parse_args() + + +def main(): + args = parse_args() + verbose = not args.json + + api_key = get_sixtydb_api_key() + + if args.list_voices: + if not api_key: + print("Error: No 60db API key found. Add SIXTYDB_API_KEY to .env", file=sys.stderr) + sys.exit(1) + res = list_voices(api_key) + if not res.get("success"): + print(f"Error: {res['error']}", file=sys.stderr) + sys.exit(1) + if args.json: + print(json.dumps(res, indent=2)) + else: + voices = res["voices"] + print(f"Your 60db voices ({len(voices)}):\n") + print(f" {'voice_id':<38} {'name':<22} {'category':<12} {'lang'}") + print(f" {'-'*38} {'-'*22} {'-'*12} {'-'*6}") + for v in voices: + labels = v.get("labels") or {} + lang = labels.get("language", "") + print(f" {v.get('voice_id',''):<38} {(v.get('name') or '')[:22]:<22} " + f"{(v.get('category') or '')[:12]:<12} {lang}") + sys.exit(0) + + if not args.text: + print("Error: --text is required", file=sys.stderr) + sys.exit(1) + if not args.output: + print("Error: --output is required", file=sys.stderr) + sys.exit(1) + + voice_id = args.voice_id or get_sixtydb_voice_id() or DEFAULT_VOICE_ID + + if args.dry_run: + result = { + "dry_run": True, + "provider": "60db", + "text_chars": len(args.text), + "output": args.output, + 
"voice_id": voice_id, + "transport": args.transport, + "settings": { + "stability": args.stability, + "similarity": args.similarity, + "speed": args.speed, + "enhance": args.enhance, + "output_format": args.output_format, + }, + } + if args.json: + print(json.dumps(result, indent=2)) + else: + print("Would generate speech with 60db:") + print(f" Voice ID: {voice_id}") + print(f" Transport: {args.transport}") + print(f" Text: {len(args.text)} characters") + print(f" Output: {args.output}") + sys.exit(0) + + if not api_key: + print( + "Error: No 60db API key found.\n" + " echo \"SIXTYDB_API_KEY=sk_live_your_key\" >> .env", + file=sys.stderr, + ) + sys.exit(1) + + if verbose: + print(f"Generating speech with 60db ({len(args.text)} chars)...", file=sys.stderr) + + result = generate_audio( + text=args.text, + output_path=args.output, + voice_id=voice_id, + stability=args.stability, + similarity=args.similarity, + speed=args.speed, + enhance=args.enhance, + output_format=args.output_format, + transport=args.transport, + sample_rate=args.sample_rate, + api_key=api_key, + timeout=args.timeout, + verbose=verbose, + ) + + if not result.get("success"): + if args.json: + print(json.dumps(result, indent=2)) + else: + print(f"Error: {result.get('error', 'Unknown error')}", file=sys.stderr) + sys.exit(1) + + if args.json: + print(json.dumps(result, indent=2)) + else: + print(f"Generated: {result['output']}", file=sys.stderr) + duration = result.get("duration_seconds") + if duration: + print(f" Duration: {duration:.1f}s ({int(duration * 30)} frames @ 30fps)", file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/tools/voiceover.py b/tools/voiceover.py index b3106c2..1a584b3 100644 --- a/tools/voiceover.py +++ b/tools/voiceover.py @@ -38,7 +38,14 @@ # Add parent to path for local imports sys.path.insert(0, str(Path(__file__).parent)) -from config import get_brand_dir, get_elevenlabs_api_key, get_voice_id, load_brand_voice_config +from config import ( + 
def generate_single_audio_60db(
    script: str,
    output_path: Path,
    voice_id: str | None,
    stability: float,
    similarity: float,
    speed: float,
    enhance: bool,
    transport: str,
    api_key: str | None,
) -> dict:
    """Synthesize ``script`` into ``output_path`` with 60db and return the result dict.

    ``stability`` and ``similarity`` are given on the unified 0-1 scale; the
    conversion to 60db's native 0-100 scale happens inside
    ``sixtydb_tts.generate_audio()``, not here.

    Args:
        script: Text to speak.
        output_path: Destination audio file; missing parent directories are created.
        voice_id: 60db voice to use, or ``None``.
        stability: Voice stability (0-1).
        similarity: Voice similarity (0-1).
        speed: Speech speed multiplier.
        enhance: Whether to request 60db audio enhancement.
        transport: 60db API transport name.
        api_key: 60db API key, or ``None``.

    Returns:
        Whatever ``sixtydb_tts.generate_audio()`` returns, unmodified.
    """
    # Function-scope import: sixtydb_tts is only loaded when this provider is used.
    from sixtydb_tts import generate_audio as synthesize

    parent_dir = output_path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)

    request = {
        "text": script,
        "output_path": str(output_path),
        "voice_id": voice_id,
        "stability": stability,
        "similarity": similarity,
        "speed": speed,
        "enhance": enhance,
        "transport": transport,
        "api_key": api_key,
        # Always passed as False from this wrapper.
        "verbose": False,
    }
    return synthesize(**request)
+ if "stability" in sixtydb_settings and args.stability == 0.85: + args.stability = sixtydb_settings["stability"] + if "similarity" in sixtydb_settings and args.similarity == 0.95: + args.similarity = sixtydb_settings["similarity"] + if "speed" in sixtydb_settings and args.speed == 1.0: + args.speed = sixtydb_settings["speed"] + if "enhance" in sixtydb_settings and args.enhance is True: + args.enhance = sixtydb_settings["enhance"] # Resolve tone preset → instruct text for Qwen3 if provider == "qwen3" and (args.tone or args.instruct): @@ -723,6 +811,31 @@ def main(): # Provider-specific setup client = None voice_id = None + sixtydb_api_key = None + + if provider == "60db": + sixtydb_api_key = get_sixtydb_api_key() + if not sixtydb_api_key: + print( + "Error: No 60db API key found.\n" + "\n" + "You have 3 options:\n" + "\n" + " 1. Add a 60db key:\n" + " echo \"SIXTYDB_API_KEY=sk_live_your_key\" >> .env\n" + "\n" + " 2. Use ElevenLabs or Qwen3-TTS instead:\n" + " python3 tools/voiceover.py --provider elevenlabs --scene-dir public/audio/scenes --json\n" + " python3 tools/voiceover.py --provider qwen3 --scene-dir public/audio/scenes --json\n" + "\n" + " 3. Skip voiceover entirely:\n" + " Videos render fine without audio. 
Add voiceover later when ready.", + file=sys.stderr, + ) + sys.exit(1) + + from sixtydb_tts import DEFAULT_VOICE_ID as _SIXTYDB_DEFAULT_VOICE + voice_id = args.voice_id or get_sixtydb_voice_id() or _SIXTYDB_DEFAULT_VOICE if provider == "elevenlabs": api_key = get_elevenlabs_api_key() @@ -765,7 +878,7 @@ def main(): if not args.json: txt_count = len(list(scene_dir.glob("*.txt"))) - provider_label = "Qwen3-TTS" if provider == "qwen3" else "ElevenLabs" + provider_label = {"qwen3": "Qwen3-TTS", "60db": "60db"}.get(provider, "ElevenLabs") print(f"Processing {txt_count} scene scripts in {scene_dir} ({provider_label})...", file=sys.stderr) if args.dry_run: @@ -783,6 +896,9 @@ def main(): similarity=args.similarity, style=args.style, speed=args.speed, + enhance=args.enhance, + transport=args.transport, + sixtydb_api_key=sixtydb_api_key, speaker=args.speaker, language=args.language, instruct=args.instruct, @@ -809,6 +925,15 @@ def main(): "style": args.style, "speed": args.speed, } + elif provider == "60db": + result["voice_id"] = voice_id + result["transport"] = args.transport + result["settings"] = { + "stability": args.stability, + "similarity": args.similarity, + "speed": args.speed, + "enhance": args.enhance, + } else: result["speaker"] = args.speaker result["language"] = args.language @@ -837,6 +962,9 @@ def main(): similarity=args.similarity, style=args.style, speed=args.speed, + enhance=args.enhance, + transport=args.transport, + sixtydb_api_key=sixtydb_api_key, speaker=args.speaker, language=args.language, instruct=args.instruct, @@ -861,6 +989,9 @@ def main(): if provider == "elevenlabs": result["voice_id"] = voice_id result["model"] = args.model + elif provider == "60db": + result["voice_id"] = voice_id + result["transport"] = args.transport # Concat if requested if args.concat: @@ -907,6 +1038,15 @@ def main(): "style": args.style, "speed": args.speed, } + elif provider == "60db": + result["voice_id"] = voice_id + result["transport"] = args.transport + 
result["settings"] = { + "stability": args.stability, + "similarity": args.similarity, + "speed": args.speed, + "enhance": args.enhance, + } else: result["speaker"] = args.speaker result["language"] = args.language @@ -923,6 +1063,9 @@ def main(): if provider == "elevenlabs": print(f" Voice ID: {voice_id}") print(f" Model: {args.model}") + elif provider == "60db": + print(f" Voice ID: {voice_id}") + print(f" Transport: {args.transport}") else: print(f" Speaker: {args.speaker}") print(f" Language: {args.language}") @@ -932,7 +1075,7 @@ def main(): # Generate voiceover if not args.json: - provider_label = "Qwen3-TTS" if provider == "qwen3" else "ElevenLabs" + provider_label = {"qwen3": "Qwen3-TTS", "60db": "60db"}.get(provider, "ElevenLabs") print(f"Generating voiceover ({len(script)} chars, {provider_label})...", file=sys.stderr) if provider == "qwen3": @@ -948,6 +1091,18 @@ def main(): top_p=args.top_p, cloud=args.cloud, ) + elif provider == "60db": + result = generate_single_audio_60db( + script=script, + output_path=output_path, + voice_id=voice_id, + stability=args.stability, + similarity=args.similarity, + speed=args.speed, + enhance=args.enhance, + transport=args.transport, + api_key=sixtydb_api_key, + ) else: result = generate_single_audio( client=client, @@ -966,6 +1121,9 @@ def main(): if provider == "elevenlabs": result["voice_id"] = voice_id result["model"] = args.model + elif provider == "60db": + result["voice_id"] = voice_id + result["transport"] = args.transport if args.json: print(json.dumps(result, indent=2))