Skip to content

Commit 635b4f7

Browse files
committed
Switch to Gemma 3 270M and add performance stats
- Replace Qwen 2.5 0.5B with Gemma 3 270M for faster inference
- Add stats display showing average STT, LLM, TTS times in ms
- Create useStats composable for timing metrics
- Update chat template to Gemma format
- Reduce total model download from ~380MB to ~210MB
1 parent 0ff9ec2 commit 635b4f7

11 files changed

Lines changed: 177 additions & 658 deletions

File tree

CHANGELOG.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,19 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [1.1.0] - 2025-12-22
9+
10+
### Added
11+
12+
- Performance stats display at bottom-left showing average STT, LLM, and TTS processing times in milliseconds
13+
- New `useStats` composable for tracking timing metrics with rolling averages
14+
15+
### Changed
16+
17+
- Switched LLM from Qwen 2.5 0.5B (~350MB) to Gemma 3 270M (~180MB) for faster inference and smaller download
18+
- Updated chat template to use Gemma's `<start_of_turn>/<end_of_turn>` format
19+
- Total model download reduced from ~380MB to ~210MB
20+
821
## [1.0.0] - 2024-12-22
922

1023
### Added

README.html

Lines changed: 0 additions & 634 deletions
This file was deleted.

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ Ava uses a pipeline architecture with three WebAssembly-powered stages:
1414

1515
1. **Speech Recognition** — Audio from the microphone is captured and processed by Whisper (tiny-en model) running in WASM. The `useWhisper` composable handles audio chunking and streams transcriptions every 2 seconds.
1616

17-
2. **Language Model** — Transcribed text is passed to Qwen 0.5B via Wllama (llama.cpp WASM port). The `useConversation` composable orchestrates the flow, triggering inference when speech ends and streaming tokens back as they're generated.
17+
2. **Language Model** — Transcribed text is passed to Gemma 3 270M via Wllama (llama.cpp WASM port). The `useConversation` composable orchestrates the flow, triggering inference when speech ends and streaming tokens back as they're generated.
1818

1919
3. **Speech Synthesis** — Generated text is split at sentence boundaries (`. ! ? ,`) and queued to the browser's native SpeechSynthesis API. This enables low-latency voice output that starts speaking before the full response is complete.
2020

@@ -29,7 +29,7 @@ All processing happens client-side with zero network requests after initial mode
2929
| Component | Technology | Size |
3030
|-----------|------------|------|
3131
| Speech-to-Text | Whisper (whisper-web-transcriber) | ~31MB |
32-
| LLM | Qwen 2.5 0.5B Instruct (Wllama) | ~350MB |
32+
| LLM | Gemma 3 270M Instruct (Wllama) | ~180MB |
3333
| Text-to-Speech | Web Speech Synthesis API | Native |
3434
| Audio Visualization | Web Audio API | Native |
3535
| Frontend | Vue 3 + TypeScript ||
@@ -46,7 +46,7 @@ src/
4646
├── composables/
4747
│ ├── useConversation.ts # Orchestrates conversation flow
4848
│ ├── useWhisper.ts # Whisper WASM speech recognition
49-
│ ├── useWllama.ts # Qwen LLM inference
49+
│ ├── useWllama.ts # Gemma LLM inference
5050
│ ├── useSpeechSynthesis.ts # Browser TTS wrapper
5151
│ └── useAudioVisualizer.ts # Web Audio frequency analysis
5252
├── styles/
@@ -124,7 +124,7 @@ npm run preview # Preview production build
124124

125125
## Performance Notes
126126

127-
- **First load**: Downloads ~380MB of models (cached by browser)
127+
- **First load**: Downloads ~210MB of models (cached by browser)
128128
- **Inference**: ~0.3-0.5s for Whisper, ~1-2s for LLM response
129129
- **Memory**: ~500MB-1GB RAM usage during operation
130130
- **WebGPU**: Not yet supported; runs on CPU via WASM SIMD

docs/architecture.dot

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ digraph Ava {
5656
color=black
5757

5858
whisperModel [label="Whisper tiny-en\n(~31MB)"]
59-
qwenModel [label="Qwen 0.5B\n(~350MB)"]
59+
gemmaModel [label="Gemma 270M\n(~180MB)"]
6060
}
6161

6262
// Browser API
@@ -76,8 +76,8 @@ digraph Ava {
7676
whisperModel -> conversation [label="transcript"]
7777

7878
conversation -> wllama [label="prompt"]
79-
wllama -> qwenModel [label="WASM"]
80-
qwenModel -> conversation [label="response"]
79+
wllama -> gemmaModel [label="WASM"]
80+
gemmaModel -> conversation [label="response"]
8181

8282
conversation -> tts [label="sentences"]
8383
tts -> speechSynth [label="speak"]

src/App.vue

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ const {
2525
isSpeaking,
2626
ttsError,
2727
frequencyData,
28+
avgSTT,
29+
avgLLM,
30+
avgTTS,
2831
toggleConversation,
2932
initialize
3033
} = useConversation()
@@ -63,7 +66,7 @@ onMounted(() => {
6366
<span class="loading-text">{{ whisperLoadProgress }}%</span>
6467
</div>
6568
<div v-if="isLLMLoading" class="loading-container">
66-
<div class="loading-label">CORTEX (Qwen 0.5B)</div>
69+
<div class="loading-label">CORTEX (Gemma 270M)</div>
6770
<div class="loading-bar">
6871
<div class="loading-fill" :style="{ width: llmLoadProgress + '%' }"></div>
6972
</div>
@@ -123,6 +126,13 @@ onMounted(() => {
123126
<!-- About Button -->
124127
<button class="about-button" @click="showAbout = true">About</button>
125128

129+
<!-- Stats Display -->
130+
<div v-if="avgSTT > 0 || avgLLM > 0 || avgTTS > 0" class="stats-display">
131+
<span v-if="avgSTT > 0" class="stat-item">STT: {{ avgSTT }}ms</span>
132+
<span v-if="avgLLM > 0" class="stat-item">LLM: {{ avgLLM }}ms</span>
133+
<span v-if="avgTTS > 0" class="stat-item">TTS: {{ avgTTS }}ms</span>
134+
</div>
135+
126136
<!-- About Popup -->
127137
<AboutPopup :visible="showAbout" @close="showAbout = false" />
128138
</div>

src/components/AboutPopup.vue

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ const emit = defineEmits<{
1717
<p class="about-desc">Meet Ava, your private AI assistant running entirely in your browser. No servers, no data leaves your device.</p>
1818
<div class="about-tech">
1919
<span>Whisper</span>
20-
<span>Qwen 0.5B</span>
20+
<span>Gemma 270M</span>
2121
<span>WebAssembly</span>
2222
</div>
2323
<div class="author-footer">

src/composables/useConversation.ts

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import { useWhisper } from './useWhisper'
33
import { useSpeechSynthesis } from './useSpeechSynthesis'
44
import { useWllama } from './useWllama'
55
import { useAudioVisualizer } from './useAudioVisualizer'
6+
import { getStats } from './useStats'
67

78
export type ConversationState = 'idle' | 'ready' | 'listening' | 'processing' | 'thinking' | 'speaking' | 'loading'
89

@@ -31,6 +32,9 @@ export function useConversation() {
3132
const isConversationActive = ref(false)
3233
const hasPlayedIntro = ref(false)
3334

35+
// Stats
36+
const stats = getStats()
37+
3438
// Whisper (Speech-to-Text)
3539
const {
3640
isSupported,
@@ -112,13 +116,28 @@ export function useConversation() {
112116
stopVisualizer()
113117

114118
// Stream LLM response and queue sentences for TTS
119+
const llmStartTime = performance.now()
120+
let ttsStartTime = 0
121+
let firstSentence = true
122+
115123
if (isTTSSupported.value) {
116124
await generateStreaming(text, (sentence: string) => {
125+
if (firstSentence) {
126+
ttsStartTime = performance.now()
127+
firstSentence = false
128+
}
117129
queueSentence(sentence)
118130
})
131+
const llmEndTime = performance.now()
132+
stats.addLLMTime(llmEndTime - llmStartTime)
133+
119134
await waitForQueue()
135+
if (ttsStartTime > 0) {
136+
stats.addTTSTime(performance.now() - ttsStartTime)
137+
}
120138
} else {
121139
await generateStreaming(text, () => {})
140+
stats.addLLMTime(performance.now() - llmStartTime)
122141
}
123142

124143
isProcessing.value = false
@@ -187,6 +206,11 @@ export function useConversation() {
187206
startVisualizer,
188207
stopVisualizer,
189208

209+
// Stats
210+
avgSTT: stats.avgSTT,
211+
avgLLM: stats.avgLLM,
212+
avgTTS: stats.avgTTS,
213+
190214
// Actions
191215
toggleConversation,
192216
initialize

src/composables/useStats.ts

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
import { ref, computed } from 'vue'
2+
3+
// Keep last N samples for averaging
4+
const MAX_SAMPLES = 10
5+
6+
export function useStats() {
7+
const sttTimes = ref<number[]>([])
8+
const llmTimes = ref<number[]>([])
9+
const ttsTimes = ref<number[]>([])
10+
11+
const avgSTT = computed(() => {
12+
if (sttTimes.value.length === 0) return 0
13+
const sum = sttTimes.value.reduce((a, b) => a + b, 0)
14+
return Math.round(sum / sttTimes.value.length)
15+
})
16+
17+
const avgLLM = computed(() => {
18+
if (llmTimes.value.length === 0) return 0
19+
const sum = llmTimes.value.reduce((a, b) => a + b, 0)
20+
return Math.round(sum / llmTimes.value.length)
21+
})
22+
23+
const avgTTS = computed(() => {
24+
if (ttsTimes.value.length === 0) return 0
25+
const sum = ttsTimes.value.reduce((a, b) => a + b, 0)
26+
return Math.round(sum / ttsTimes.value.length)
27+
})
28+
29+
function addSTTTime(ms: number) {
30+
sttTimes.value.push(ms)
31+
if (sttTimes.value.length > MAX_SAMPLES) {
32+
sttTimes.value.shift()
33+
}
34+
}
35+
36+
function addLLMTime(ms: number) {
37+
llmTimes.value.push(ms)
38+
if (llmTimes.value.length > MAX_SAMPLES) {
39+
llmTimes.value.shift()
40+
}
41+
}
42+
43+
function addTTSTime(ms: number) {
44+
ttsTimes.value.push(ms)
45+
if (ttsTimes.value.length > MAX_SAMPLES) {
46+
ttsTimes.value.shift()
47+
}
48+
}
49+
50+
function reset() {
51+
sttTimes.value = []
52+
llmTimes.value = []
53+
ttsTimes.value = []
54+
}
55+
56+
return {
57+
avgSTT,
58+
avgLLM,
59+
avgTTS,
60+
addSTTTime,
61+
addLLMTime,
62+
addTTSTime,
63+
reset
64+
}
65+
}
66+
67+
// Singleton instance for shared state
68+
let statsInstance: ReturnType<typeof useStats> | null = null
69+
70+
export function getStats() {
71+
if (!statsInstance) {
72+
statsInstance = useStats()
73+
}
74+
return statsInstance
75+
}

src/composables/useWhisper.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { ref, onUnmounted } from 'vue'
22
import { WhisperTranscriber } from 'whisper-web-transcriber'
3+
import { getStats } from './useStats'
34

45
export function useWhisper() {
56
const isSupported = ref(true)
@@ -11,6 +12,7 @@ export function useWhisper() {
1112
const error = ref<string | null>(null)
1213

1314
let transcriber: WhisperTranscriber | null = null
15+
let sttStartTime = 0
1416

1517
async function loadModel() {
1618
if (isModelLoaded.value || isLoading.value) return
@@ -24,8 +26,15 @@ export function useWhisper() {
2426
modelSize: 'tiny-en-q5_1', // Smallest and fastest (~31MB)
2527
onTranscription: (text: string) => {
2628
if (text.trim()) {
29+
// Record STT processing time
30+
if (sttStartTime > 0) {
31+
const sttTime = performance.now() - sttStartTime
32+
getStats().addSTTTime(sttTime)
33+
}
2734
transcript.value = text.trim()
2835
}
36+
// Reset timer for next transcription
37+
sttStartTime = performance.now()
2938
},
3039
onProgress: (progress: number) => {
3140
loadProgress.value = Math.round(progress)
@@ -58,6 +67,7 @@ export function useWhisper() {
5867
await navigator.mediaDevices.getUserMedia({ audio: true })
5968
await transcriber.startRecording()
6069
isListening.value = true
70+
sttStartTime = performance.now() // Start timing for first transcription
6171
} catch (e) {
6272
const err = e as Error
6373
if (err.name === 'NotAllowedError' || err.message?.includes('not-allowed')) {

src/composables/useWllama.ts

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@ const CONFIG_PATHS = {
66
'multi-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.7/esm/multi-thread/wllama.wasm',
77
}
88

9-
// Qwen2.5-0.5B-Instruct - Good balance of size (~350MB) and quality
9+
// Gemma 3 270M - Ultra fast, ~180MB
1010
const DEFAULT_MODEL = {
11-
repo: 'Qwen/Qwen2.5-0.5B-Instruct-GGUF',
12-
file: 'qwen2.5-0.5b-instruct-q4_k_m.gguf'
11+
repo: 'unsloth/gemma-3-270m-it-GGUF',
12+
file: 'gemma-3-270m-it-Q4_K_M.gguf'
1313
}
1414

1515
// Sentence boundary pattern - matches . ! ? , followed by space or end
@@ -79,8 +79,8 @@ export function useWllama() {
7979
top_p: 0.9,
8080
},
8181
onNewToken: (_token, _piece, currentText) => {
82-
// Strip any trailing special tokens from display
83-
const cleanText = currentText.replace(/<\|im_end\|>.*$/s, '').trim()
82+
// Strip any trailing special tokens from display (Gemma uses <end_of_turn>)
83+
const cleanText = currentText.replace(/<end_of_turn>.*$/s, '').trim()
8484
response.value = cleanText
8585

8686
// Get new content since last check
@@ -103,8 +103,8 @@ export function useWllama() {
103103
}
104104
})
105105

106-
// Clean final result
107-
const cleanResult = result.replace(/<\|im_end\|>.*$/s, '').trim()
106+
// Clean final result (Gemma uses <end_of_turn>)
107+
const cleanResult = result.replace(/<end_of_turn>.*$/s, '').trim()
108108
response.value = cleanResult
109109

110110
// Emit any remaining text in buffer
@@ -142,15 +142,15 @@ export function useWllama() {
142142
top_p: 0.9,
143143
},
144144
onNewToken: (_token, _piece, currentText) => {
145-
const cleanText = currentText.replace(/<\|im_end\|>.*$/s, '').trim()
145+
const cleanText = currentText.replace(/<end_of_turn>.*$/s, '').trim()
146146
response.value = cleanText
147147
if (onToken) {
148148
onToken(cleanText)
149149
}
150150
}
151151
})
152152

153-
const cleanResult = result.replace(/<\|im_end\|>.*$/s, '').trim()
153+
const cleanResult = result.replace(/<end_of_turn>.*$/s, '').trim()
154154
response.value = cleanResult
155155
return cleanResult
156156
} catch (e) {
@@ -162,12 +162,12 @@ export function useWllama() {
162162
}
163163

164164
function formatChatPrompt(userMessage: string): string {
165-
// Qwen2.5-Instruct chat template (ChatML format)
166-
return `<|im_start|>system
167-
You are Ava. Reply in 1-2 short sentences only.<|im_end|>
168-
<|im_start|>user
169-
${userMessage}<|im_end|>
170-
<|im_start|>assistant
165+
// Gemma 3 chat template
166+
return `<start_of_turn>user
167+
You are Ava. Reply in 1-2 short sentences only.
168+
169+
${userMessage}<end_of_turn>
170+
<start_of_turn>model
171171
`
172172
}
173173

0 commit comments

Comments (0)