85 changes: 85 additions & 0 deletions genai/live/live_audio_with_txt.py
@@ -0,0 +1,85 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav
# Install helpers for converting files: pip install librosa soundfile simpleaudio

import asyncio


async def generate_content() -> list:
# [START googlegenaisdk_live_audio_with_txt]
from google import genai
from google.genai.types import (
Content, LiveConnectConfig, Modality, Part,
PrebuiltVoiceConfig, SpeechConfig, VoiceConfig
)
import numpy as np
import soundfile as sf
import simpleaudio as sa

def play_audio(audio_array: np.ndarray, sample_rate: int = 24000) -> None:
sf.write("output.wav", audio_array, sample_rate)
wave_obj = sa.WaveObject.from_wave_file("output.wav")
play_obj = wave_obj.play()
play_obj.wait_done()

client = genai.Client()
voice_name = "Aoede"
model = "gemini-2.0-flash-live-preview-04-09"

config = LiveConnectConfig(
response_modalities=[Modality.AUDIO],
speech_config=SpeechConfig(
voice_config=VoiceConfig(
prebuilt_voice_config=PrebuiltVoiceConfig(
voice_name=voice_name,
)
),
),
)

async with client.aio.live.connect(
model=model,
config=config,
) as session:
text_input = "Hello? Gemini are you there?"
print("> ", text_input, "\n")

await session.send_client_content(
turns=Content(role="user", parts=[Part(text=text_input)])
)

audio_data = []
async for message in session.receive():
if (
message.server_content.model_turn
and message.server_content.model_turn.parts
):
for part in message.server_content.model_turn.parts:
if part.inline_data:
audio_data.append(
np.frombuffer(part.inline_data.data, dtype=np.int16)
)

if audio_data:
print("Received audio answer: ")
play_audio(np.concatenate(audio_data), sample_rate=24000)

# [END googlegenaisdk_live_audio_with_txt]
return []
Contributor comment (medium): Following the suggestion to change the function's return type to None, this return statement should be updated to return no value.

Suggested change:
-    return []
+    return


if __name__ == "__main__":
asyncio.run(generate_content())
133 changes: 133 additions & 0 deletions genai/live/live_conversation_audio_with_audio.py
@@ -0,0 +1,133 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START googlegenaisdk_live_conversation_audio_with_audio]

import asyncio
import base64

from google import genai
from google.genai.types import (
AudioTranscriptionConfig,
Blob,
HttpOptions,
LiveConnectConfig,
Modality,
)
import numpy as np

from scipy.io import wavfile

# The number of audio frames to send in each chunk.
CHUNK = 4200
CHANNELS = 1
MODEL = "gemini-live-2.5-flash-preview-native-audio-09-2025"

# The audio sample rate expected by the model.
INPUT_RATE = 16000
# The audio sample rate of the audio generated by the model.
OUTPUT_RATE = 24000

# The sample width for 16-bit audio, which is standard for this type of audio data.
SAMPLE_WIDTH = 2
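# Note: CHUNK, CHANNELS, and SAMPLE_WIDTH are defined for chunked streaming, but
# send() below pushes the whole file as a single Blob. A chunked variant could
# look like this (hypothetical sketch, not part of this PR):
#
#     step = CHUNK * SAMPLE_WIDTH
#     for i in range(0, len(audio_bytes), step):
#         await session.send_realtime_input(
#             media=Blob(data=audio_bytes[i:i + step], mime_type=mime_type)
#         )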

client = genai.Client(http_options=HttpOptions(api_version="v1beta1"))


def read_wavefile(filepath: str) -> tuple[str, str]:
# Read the .wav file using scipy.io.wavfile.read
rate, data = wavfile.read(filepath)
# Convert the NumPy array of audio samples back to raw bytes
raw_audio_bytes = data.tobytes()
# Encode the raw bytes to a base64 string.
# The result needs to be decoded from bytes to a UTF-8 string
base64_encoded_data = base64.b64encode(raw_audio_bytes).decode("ascii")
mime_type = f"audio/pcm;rate={rate}"
return base64_encoded_data, mime_type
Comment on lines +48 to +57
Contributor comment (medium): The read_wavefile function encodes the audio data to a base64 string, but the send function immediately decodes it back to bytes. This encode/decode cycle is inefficient and unnecessary. The function should be simplified to return the raw bytes directly. The function signature should also be updated to -> tuple[bytes, str].

Suggested change:
-def read_wavefile(filepath: str) -> tuple[str, str]:
-    # Read the .wav file using scipy.io.wavfile.read
-    rate, data = wavfile.read(filepath)
-    # Convert the NumPy array of audio samples back to raw bytes
-    raw_audio_bytes = data.tobytes()
-    # Encode the raw bytes to a base64 string.
-    # The result needs to be decoded from bytes to a UTF-8 string
-    base64_encoded_data = base64.b64encode(raw_audio_bytes).decode("ascii")
-    mime_type = f"audio/pcm;rate={rate}"
-    return base64_encoded_data, mime_type
+def read_wavefile(filepath: str) -> tuple[bytes, str]:
+    # Read the .wav file using scipy.io.wavfile.read
+    rate, data = wavfile.read(filepath)
+    # Convert the NumPy array of audio samples back to raw bytes
+    raw_audio_bytes = data.tobytes()
+    mime_type = f"audio/pcm;rate={rate}"
+    return raw_audio_bytes, mime_type



def write_wavefile(filepath: str, audio_frames: list[bytes], rate: int) -> None:
"""Writes a list of audio byte frames to a WAV file using scipy."""
# Combine the list of byte frames into a single byte string
raw_audio_bytes = b"".join(audio_frames)

# Convert the raw bytes to a NumPy array.
# The sample width is 2 bytes (16-bit), so we use np.int16
audio_data = np.frombuffer(raw_audio_bytes, dtype=np.int16)

# Write the NumPy array to a .wav file
wavfile.write(filepath, rate, audio_data)
print(f"Model response saved to {filepath}")


async def main() -> bool:
print("Starting the code")

async with client.aio.live.connect(
model=MODEL,
config=LiveConnectConfig(
# Set Model responses to be in Audio
response_modalities=[Modality.AUDIO],
# To generate transcript for input audio
input_audio_transcription=AudioTranscriptionConfig(),
# To generate transcript for output audio
output_audio_transcription=AudioTranscriptionConfig(),
),
) as session:

async def send() -> None:
# using local file as an example for live audio input
wav_file_path = "hello_gemini_are_you_there.wav"
base64_data, mime_type = read_wavefile(wav_file_path)
audio_bytes = base64.b64decode(base64_data)
await session.send_realtime_input(media=Blob(data=audio_bytes, mime_type=mime_type))
Comment on lines +92 to +94
Contributor comment (medium): In line with the suggested change to read_wavefile to avoid unnecessary base64 encoding, this part should be updated to work with raw bytes directly.

Suggested change:
-            base64_data, mime_type = read_wavefile(wav_file_path)
-            audio_bytes = base64.b64decode(base64_data)
-            await session.send_realtime_input(media=Blob(data=audio_bytes, mime_type=mime_type))
+            audio_bytes, mime_type = read_wavefile(wav_file_path)
+            await session.send_realtime_input(media=Blob(data=audio_bytes, mime_type=mime_type))


async def receive() -> None:
audio_frames = []

async for message in session.receive():
if message.server_content.input_transcription:
print(message.server_content.model_dump(mode="json", exclude_none=True))
if message.server_content.output_transcription:
print(message.server_content.model_dump(mode="json", exclude_none=True))
                if message.server_content.model_turn:
                    for part in message.server_content.model_turn.parts:
                        # Skip parts that carry no inline audio payload.
                        if part.inline_data and part.inline_data.data:
                            audio_frames.append(part.inline_data.data)

if audio_frames:
write_wavefile(
"example_model_response.wav",
audio_frames,
OUTPUT_RATE,
)

send_task = asyncio.create_task(send())
receive_task = asyncio.create_task(receive())
await asyncio.gather(send_task, receive_task)
# Example response:
# {'input_transcription': {'text': 'Hello.'}}
# {'output_transcription': {}}
# {'output_transcription': {'text': 'Hi'}}
# {'output_transcription': {'text': ' there. What can I do for you today?'}}
# {'output_transcription': {'finished': True}}
# Model response saved to example_model_response.wav

# [END googlegenaisdk_live_conversation_audio_with_audio]
return True

if __name__ == "__main__":
asyncio.run(main())
65 changes: 65 additions & 0 deletions genai/live/live_ground_ragengine_with_txt.py
@@ -0,0 +1,65 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio

_memory_corpus = "projects/cloud-ai-devrel-softserve/locations/us-central1/ragCorpora/2305843009213693952"
Contributor comment (medium): The _memory_corpus is hardcoded to a specific resource that may not be publicly accessible. While the generate_content function correctly takes this as a parameter, the main execution block uses this hardcoded value. Please add a comment to clarify that users need to replace this with their own RAG corpus resource path to run the sample.
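One way to address this (a hypothetical sketch; the comment wording and placeholder path are not from the PR):

Suggested change:
-_memory_corpus = "projects/cloud-ai-devrel-softserve/locations/us-central1/ragCorpora/2305843009213693952"
+# TODO(developer): Replace this with the resource path of your own RAG corpus,
+# e.g. "projects/<PROJECT_ID>/locations/<LOCATION>/ragCorpora/<CORPUS_ID>".
+_memory_corpus = "projects/cloud-ai-devrel-softserve/locations/us-central1/ragCorpora/2305843009213693952"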



async def generate_content(memory_corpus: str) -> list[str]:
# [START googlegenaisdk_live_ground_ragengine_with_txt]
from google import genai
from google.genai.types import (Content, LiveConnectConfig, Modality, Part,
Retrieval, Tool, VertexRagStore,
VertexRagStoreRagResource)

client = genai.Client()
model_id = "gemini-2.0-flash-live-preview-04-09"
rag_store = VertexRagStore(
rag_resources=[
VertexRagStoreRagResource(
rag_corpus=memory_corpus # Use memory corpus if you want to store context.
)
],
        # Set `store_context` to True to allow the Live API to sink context into your memory corpus.
store_context=True,
)
config = LiveConnectConfig(
response_modalities=[Modality.TEXT],
tools=[Tool(retrieval=Retrieval(vertex_rag_store=rag_store))],
)

async with client.aio.live.connect(model=model_id, config=config) as session:
        text_input = "What are the newest Gemini models?"
print("> ", text_input, "\n")

await session.send_client_content(
turns=Content(role="user", parts=[Part(text=text_input)])
)

response = []

async for message in session.receive():
if message.text:
response.append(message.text)

print("".join(response))
# Example output:
    # > What are the newest Gemini models?
# In December 2023, Google launched Gemini, their "most capable and general model". It's multimodal, meaning it understands and combines different types of information like text, code, audio, images, and video.
# [END googlegenaisdk_live_ground_ragengine_with_txt]
return response


if __name__ == "__main__":
asyncio.run(generate_content(_memory_corpus))
@@ -24,7 +24,7 @@ class CalendarEvent(BaseModel):


def generate_content() -> CalendarEvent:
-    # [START googlegenaisdk_live_structured_ouput_with_txt]
+    # [START googlegenaisdk_live_structured_output_with_txt]
import os

import google.auth.transport.requests
@@ -78,8 +78,8 @@ def generate_content() -> CalendarEvent:
# System message: Extract the event information.
# User message: Alice and Bob are going to a science fair on Friday.
# Output message: name='science fair' date='Friday' participants=['Alice', 'Bob']
-    # [END googlegenaisdk_live_structured_ouput_with_txt]
-    return True
+    # [END googlegenaisdk_live_structured_output_with_txt]
+    return response


if __name__ == "__main__":
72 changes: 72 additions & 0 deletions genai/live/live_txt_with_audio.py
@@ -0,0 +1,72 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav
# Install helpers for converting files: pip install librosa soundfile

import asyncio


async def generate_content() -> list[str]:
# [START googlegenaisdk_live_txt_with_audio]
import io

import librosa
import requests
import soundfile as sf
from google import genai
from google.genai.types import Blob, LiveConnectConfig, Modality

client = genai.Client()
model = "gemini-2.0-flash-live-preview-04-09"
config = LiveConnectConfig(response_modalities=[Modality.TEXT])

async with client.aio.live.connect(model=model, config=config) as session:
audio_url = (
"https://storage.googleapis.com/generativeai-downloads/data/16000.wav"
)
        # Download the sample WAV file.
        http_response = requests.get(audio_url)
        http_response.raise_for_status()
        # Decode the WAV, then write raw 16-bit PCM into a fresh buffer;
        # reusing the download buffer would leave stale WAV bytes behind the PCM.
        y, sr = librosa.load(io.BytesIO(http_response.content), sr=16000)
        pcm_buffer = io.BytesIO()
        sf.write(pcm_buffer, y, sr, format="RAW", subtype="PCM_16")
        audio_bytes = pcm_buffer.getvalue()

# If you've pre-converted to sample.pcm using ffmpeg, use this instead:
# audio_bytes = Path("sample.pcm").read_bytes()
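        # A typical conversion command (hypothetical example; adjust file names):
        #   ffmpeg -i 16000.wav -f s16le -acodec pcm_s16le -ac 1 -ar 16000 sample.pcm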

print("> Answer to this audio url", audio_url, "\n")

await session.send_realtime_input(
media=Blob(data=audio_bytes, mime_type="audio/pcm;rate=16000")
)

response = []

async for message in session.receive():
if message.text is not None:
response.append(message.text)

print("".join(response))
# Example output:
# > Answer to this audio url https://storage.googleapis.com/generativeai-downloads/data/16000.wav
# Yes, I can hear you. How can I help you today?
# [END googlegenaisdk_live_txt_with_audio]
return response


if __name__ == "__main__":
asyncio.run(generate_content())
1 change: 1 addition & 0 deletions genai/live/requirements-test.txt
@@ -2,3 +2,4 @@ backoff==2.2.1
google-api-core==2.25.1
pytest==8.4.1
pytest-asyncio==1.1.0
+pytest-mock==3.14.0
5 changes: 4 additions & 1 deletion genai/live/requirements.txt
@@ -4,4 +4,7 @@ websockets==15.0.1
numpy==1.26.4
soundfile==0.12.1
openai==1.99.1
-setuptools==80.9.0
+setuptools==80.9.0
+pyaudio==0.2.14
+librosa==0.11.0
+simpleaudio==1.0.0