Skip to content

Commit e0104a1

Browse files
committed
Merge branch 'main' of https://github.com/AssemblyAI/assemblyai-api-spec into adinoto/async-turn-detection-cookbook
2 parents 743de92 + 5b0cf0f commit e0104a1

31 files changed

+1177
-400
lines changed

cookbook-master/streaming-stt/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ AssemblyAI's Streaming Speech-to-Text (STT) allows you to transcribe live audio
1919

2020
### Streaming with LeMUR
2121

22-
[Use LeMUR with Streaming STT](real_time_lemur.ipynb)\
22+
[Use LLM Gateway with Streaming STT](real_time_llm_gateway.ipynb)\
2323
🆕 [Use LeMUR for Real-Time Translation](real_time_translation.ipynb)
2424

2525
### Use Case Specific Streaming Workflows

cookbook-master/streaming-stt/real_time_lemur.ipynb

Lines changed: 0 additions & 101 deletions
This file was deleted.
cookbook-master/streaming-stt/real_time_llm_gateway.ipynb

Lines changed: 284 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,284 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {
6+
"id": "mlzJgtaDIycN"
7+
},
8+
"source": [
9+
"# Using LLM Gateway with Streaming Speech-to-Text (STT)\n",
10+
"\n",
11+
"This script maintains a global variable `conversation_data` that accumulates the formatted transcript text in the `on_message` handler. Once the transcription session is closed, `run()` sends `conversation_data` to LLM Gateway for analysis."
12+
]
13+
},
14+
{
15+
"cell_type": "markdown",
16+
"metadata": {},
17+
"source": [
18+
"```python\n",
19+
"import pyaudio\n",
20+
"import websocket\n",
21+
"import json\n",
22+
"import threading\n",
23+
"import time\n",
24+
"import requests\n",
25+
"from urllib.parse import urlencode\n",
26+
"from datetime import datetime\n",
27+
"\n",
28+
# --- Configuration ---
YOUR_API_KEY = "YOUR_API_KEY"  # Replace with your actual API key

CONNECTION_PARAMS = {
    "sample_rate": 16000,
    "format_turns": True,  # Request formatted final transcripts
}
API_ENDPOINT_BASE_URL = "wss://streaming.assemblyai.com/v3/ws"
API_ENDPOINT = API_ENDPOINT_BASE_URL + "?" + urlencode(CONNECTION_PARAMS)

# Audio configuration: 800 frames = 50 ms of 16 kHz mono audio per read.
FRAMES_PER_BUFFER = 800
SAMPLE_RATE = CONNECTION_PARAMS["sample_rate"]
CHANNELS = 1
FORMAT = pyaudio.paInt16

# Shared state for the audio stream and WebSocket lifecycle.
audio = None
stream = None
ws_app = None
audio_thread = None
stop_event = threading.Event()  # Signals the audio thread to stop
conversation_data = ""  # Accumulates formatted transcripts for later analysis

# WAV recording state.
recorded_frames = []  # Raw audio chunks captured for the WAV file
recording_lock = threading.Lock()  # Guards concurrent access to recorded_frames
56+
# --- Function to Analyze Text with LLM Gateway ---

def analyze_with_llm_gateway(text):
    """Send the accumulated transcript to LLM Gateway and return the analysis.

    Called from ``run()`` after the streaming session has ended (the original
    docstring incorrectly said this runs while the WebSocket is closing).

    Args:
        text: Full transcript text accumulated during the session.

    Returns:
        The assistant's analysis string from the LLM Gateway response.

    Raises:
        requests.HTTPError: If LLM Gateway returns an error status code.
        requests.Timeout: If the request exceeds the timeout.
    """
    headers = {
        "authorization": YOUR_API_KEY,
        "content-type": "application/json"
    }

    prompt = "You are a helpful coach. Provide an analysis of the transcript and offer areas to improve with exact quotes. Include no preamble. Start with an overall summary then get into the examples with feedback."

    llm_gateway_data = {
        "model": "claude-sonnet-4-20250514",
        "messages": [
            {"role": "user", "content": f"{prompt}\n\nTranscript: {text}"}
        ],
        "max_tokens": 4000
    }

    result = requests.post(
        "https://llm-gateway.assemblyai.com/v1/chat/completions",
        headers=headers,
        json=llm_gateway_data,
        timeout=60,  # Don't hang forever if the gateway is unreachable
    )
    # Fail loudly on an error status instead of raising an opaque KeyError below.
    result.raise_for_status()
    return result.json()["choices"][0]["message"]["content"]
82+
# --- WebSocket Event Handlers ---

def on_open(ws):
    """Called when the WebSocket connection is established."""
    print("WebSocket connection opened.")
    print(f"Connected to: {API_ENDPOINT}")

    def pump_microphone():
        """Read microphone chunks and forward them to the server until stopped."""
        global stream
        print("Starting audio streaming...")
        while not stop_event.is_set():
            try:
                chunk = stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False)

                # Keep a copy of every chunk for the WAV recording.
                with recording_lock:
                    recorded_frames.append(chunk)

                # Forward the raw PCM bytes as a binary WebSocket frame.
                ws.send(chunk, websocket.ABNF.OPCODE_BINARY)
            except Exception as exc:
                # A failed read usually means the stream was closed; stop looping.
                print(f"Error streaming audio: {exc}")
                break
        print("Audio streaming stopped.")

    # Capture on a daemon thread so the main thread can still exit freely.
    global audio_thread
    audio_thread = threading.Thread(target=pump_microphone, daemon=True)
    audio_thread.start()
116+
def on_message(ws, message):
    """Handle an incoming JSON message from the streaming API.

    Formatted final transcripts are echoed to the console and appended to the
    global ``conversation_data`` buffer for analysis after the session ends.
    """
    global conversation_data

    try:
        payload = json.loads(message)
        event_type = payload.get("type")

        if event_type == "Begin":
            session_id = payload.get("id")
            expiry = payload.get("expires_at")
            print(f"\nSession began: ID={session_id}, ExpiresAt={datetime.fromtimestamp(expiry)}")
        elif event_type == "Turn":
            turn_text = payload.get("transcript", "")
            is_formatted = payload.get("turn_is_formatted", False)

            if is_formatted:
                # Erase any partial line, then print the final transcript.
                print("\r" + " " * 80 + "\r", end="")
                print(turn_text)
                conversation_data += f"{turn_text}\n"
        elif event_type == "Termination":
            audio_secs = payload.get("audio_duration_seconds", 0)
            session_secs = payload.get("session_duration_seconds", 0)
            print(f"\nSession Terminated: Audio Duration={audio_secs}s, Session Duration={session_secs}s")
    except json.JSONDecodeError as decode_err:
        print(f"Error decoding message: {decode_err}")
    except Exception as handler_err:
        print(f"Error handling message: {handler_err}")
146+
def on_error(ws, error):
    """Called when a WebSocket error occurs."""
    print("\nWebSocket Error: {}".format(error))
    # An error usually means the session is over; tell the audio thread to stop.
    stop_event.set()
152+
def on_close(ws, close_status_code, close_msg):
    """Called when the WebSocket connection is closed."""
    print("\nWebSocket Disconnected: Status={}, Msg={}".format(close_status_code, close_msg))

    # Release audio resources; signal the streaming thread first in case it
    # is still running.
    global stream, audio
    stop_event.set()

    if stream:
        if stream.is_active():
            stream.stop_stream()
        stream.close()
        stream = None
    if audio:
        audio.terminate()
        audio = None

    # Give the audio thread a short window to finish cleanly.
    if audio_thread and audio_thread.is_alive():
        audio_thread.join(timeout=1.0)
172+
# --- Main Execution ---

def run():
    """Open the microphone, stream audio to the API, then analyze the transcript."""
    global audio, stream, ws_app

    # Set up PyAudio and the microphone input stream.
    audio = pyaudio.PyAudio()

    try:
        stream = audio.open(
            input=True,
            frames_per_buffer=FRAMES_PER_BUFFER,
            channels=CHANNELS,
            format=FORMAT,
            rate=SAMPLE_RATE,
        )
        print("Microphone stream opened successfully.")
        print("Speak into your microphone. Press Ctrl+C to stop.")
        print("Audio will be saved to a WAV file when the session ends.")
    except Exception as exc:
        print(f"Error opening microphone stream: {exc}")
        if audio:
            audio.terminate()
        return  # No microphone, nothing to do.

    # Wire the event handlers into a WebSocketApp.
    ws_app = websocket.WebSocketApp(
        API_ENDPOINT,
        header={"Authorization": YOUR_API_KEY},
        on_open=on_open,
        on_message=on_message,
        on_error=on_error,
        on_close=on_close,
    )

    # run_forever blocks, so give it its own daemon thread and keep the main
    # thread free to catch KeyboardInterrupt.
    socket_thread = threading.Thread(target=ws_app.run_forever, daemon=True)
    socket_thread.start()

    try:
        # Idle until the socket thread exits or the user interrupts.
        while socket_thread.is_alive():
            time.sleep(0.1)
    except KeyboardInterrupt:
        print("\nCtrl+C received. Stopping...")
        stop_event.set()  # Stop the audio-capture thread

        # Ask the server to end the session gracefully before closing.
        if ws_app and ws_app.sock and ws_app.sock.connected:
            try:
                shutdown_msg = {"type": "Terminate"}
                print(f"Sending termination message: {json.dumps(shutdown_msg)}")
                ws_app.send(json.dumps(shutdown_msg))
                # Leave time for the final messages to arrive before closing.
                time.sleep(5)
            except Exception as exc:
                print(f"Error sending termination message: {exc}")

        # Closing the socket triggers on_close, which releases audio resources.
        if ws_app:
            ws_app.close()

        socket_thread.join(timeout=2.0)

        # With the session over, hand the accumulated transcript to LLM Gateway.
        if conversation_data.strip():
            print("Analyzing conversation with LLM Gateway...")
            print(analyze_with_llm_gateway(conversation_data))
        else:
            print("No conversation data to analyze.")

    except Exception as exc:
        print(f"\nAn unexpected error occurred: {exc}")
        stop_event.set()
        if ws_app:
            ws_app.close()
        socket_thread.join(timeout=2.0)

    finally:
        # on_close normally releases these; repeated here as a fallback.
        if stream and stream.is_active():
            stream.stop_stream()
        if stream:
            stream.close()
        if audio:
            audio.terminate()
        print("Cleanup complete. Exiting.")


if __name__ == "__main__":
    run()
266+
"```"
267+
]
268+
}
269+
],
270+
"metadata": {
271+
"colab": {
272+
"provenance": []
273+
},
274+
"kernelspec": {
275+
"display_name": "Python 3",
276+
"name": "python3"
277+
},
278+
"language_info": {
279+
"name": "python"
280+
}
281+
},
282+
"nbformat": 4,
283+
"nbformat_minor": 0
284+
}

0 commit comments

Comments
 (0)