Skip to content

Commit 635b4f7

Browse files
committed
Switch to Gemma 3 270M and add performance stats
- Replace Qwen 2.5 0.5B with Gemma 3 270M for faster inference
- Add stats display showing average STT, LLM, TTS times in ms
- Create useStats composable for timing metrics
- Update chat template to Gemma format
- Reduce total model download from ~380MB to ~210MB
1 parent 0ff9ec2 commit 635b4f7

11 files changed

Lines changed: 177 additions & 658 deletions

File tree

CHANGELOG.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,19 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [1.1.0] - 2025-12-22
9+
10+
### Added
11+
12+
- Performance stats display at bottom-left showing average STT, LLM, and TTS processing times in milliseconds
13+
- New `useStats` composable for tracking timing metrics with rolling averages
14+
15+
### Changed
16+
17+
- Switched LLM from Qwen 2.5 0.5B (~350MB) to Gemma 3 270M (~180MB) for faster inference and smaller download
18+
- Updated chat template to use Gemma's `<start_of_turn>/<end_of_turn>` format
19+
- Total model download reduced from ~380MB to ~210MB
20+
821
## [1.0.0] - 2024-12-22
922

1023
### Added

README.html

Lines changed: 0 additions & 634 deletions
This file was deleted.

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ Ava uses a pipeline architecture with three WebAssembly-powered stages:
1414

1515
1. **Speech Recognition** — Audio from the microphone is captured and processed by Whisper (tiny-en model) running in WASM. The `useWhisper` composable handles audio chunking and streams transcriptions every 2 seconds.
1616

17-
2. **Language Model** — Transcribed text is passed to Qwen 0.5B via Wllama (llama.cpp WASM port). The `useConversation` composable orchestrates the flow, triggering inference when speech ends and streaming tokens back as they're generated.
17+
2. **Language Model** — Transcribed text is passed to Gemma 3 270M via Wllama (llama.cpp WASM port). The `useConversation` composable orchestrates the flow, triggering inference when speech ends and streaming tokens back as they're generated.
1818

1919
3. **Speech Synthesis** — Generated text is split at sentence boundaries (`. ! ? ,`) and queued to the browser's native SpeechSynthesis API. This enables low-latency voice output that starts speaking before the full response is complete.
2020

@@ -29,7 +29,7 @@ All processing happens client-side with zero network requests after initial mode
2929
| Component | Technology | Size |
3030
|-----------|------------|------|
3131
| Speech-to-Text | Whisper (whisper-web-transcriber) | ~31MB |
32-
| LLM | Qwen 2.5 0.5B Instruct (Wllama) | ~350MB |
32+
| LLM | Gemma 3 270M Instruct (Wllama) | ~180MB |
3333
| Text-to-Speech | Web Speech Synthesis API | Native |
3434
| Audio Visualization | Web Audio API | Native |
3535
| Frontend | Vue 3 + TypeScript ||
@@ -46,7 +46,7 @@ src/
4646
├── composables/
4747
│ ├── useConversation.ts # Orchestrates conversation flow
4848
│ ├── useWhisper.ts # Whisper WASM speech recognition
49-
│ ├── useWllama.ts # Qwen LLM inference
49+
│ ├── useWllama.ts # Gemma LLM inference
5050
│ ├── useSpeechSynthesis.ts # Browser TTS wrapper
5151
│ └── useAudioVisualizer.ts # Web Audio frequency analysis
5252
├── styles/
@@ -124,7 +124,7 @@ npm run preview # Preview production build
124124

125125
## Performance Notes
126126

127-
- **First load**: Downloads ~380MB of models (cached by browser)
127+
- **First load**: Downloads ~210MB of models (cached by browser)
128128
- **Inference**: ~0.3-0.5s for Whisper, ~1-2s for LLM response
129129
- **Memory**: ~500MB-1GB RAM usage during operation
130130
- **WebGPU**: Not yet supported; runs on CPU via WASM SIMD

docs/architecture.dot

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ digraph Ava {
5656
color=black
5757

5858
whisperModel [label="Whisper tiny-en\n(~31MB)"]
59-
qwenModel [label="Qwen 0.5B\n(~350MB)"]
59+
gemmaModel [label="Gemma 270M\n(~180MB)"]
6060
}
6161

6262
// Browser API
@@ -76,8 +76,8 @@ digraph Ava {
7676
whisperModel -> conversation [label="transcript"]
7777

7878
conversation -> wllama [label="prompt"]
79-
wllama -> qwenModel [label="WASM"]
80-
qwenModel -> conversation [label="response"]
79+
wllama -> gemmaModel [label="WASM"]
80+
gemmaModel -> conversation [label="response"]
8181

8282
conversation -> tts [label="sentences"]
8383
tts -> speechSynth [label="speak"]

src/App.vue

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ const {
2525
isSpeaking,
2626
ttsError,
2727
frequencyData,
28+
avgSTT,
29+
avgLLM,
30+
avgTTS,
2831
toggleConversation,
2932
initialize
3033
} = useConversation()
@@ -63,7 +66,7 @@ onMounted(() => {
6366
<span class="loading-text">{{ whisperLoadProgress }}%</span>
6467
</div>
6568
<div v-if="isLLMLoading" class="loading-container">
66-
<div class="loading-label">CORTEX (Qwen 0.5B)</div>
69+
<div class="loading-label">CORTEX (Gemma 270M)</div>
6770
<div class="loading-bar">
6871
<div class="loading-fill" :style="{ width: llmLoadProgress + '%' }"></div>
6972
</div>
@@ -123,6 +126,13 @@ onMounted(() => {
123126
<!-- About Button -->
124127
<button class="about-button" @click="showAbout = true">About</button>
125128

129+
<!-- Stats Display -->
130+
<div v-if="avgSTT > 0 || avgLLM > 0 || avgTTS > 0" class="stats-display">
131+
<span v-if="avgSTT > 0" class="stat-item">STT: {{ avgSTT }}ms</span>
132+
<span v-if="avgLLM > 0" class="stat-item">LLM: {{ avgLLM }}ms</span>
133+
<span v-if="avgTTS > 0" class="stat-item">TTS: {{ avgTTS }}ms</span>
134+
</div>
135+
126136
<!-- About Popup -->
127137
<AboutPopup :visible="showAbout" @close="showAbout = false" />
128138
</div>

src/components/AboutPopup.vue

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ const emit = defineEmits<{
1717
<p class="about-desc">Meet Ava, your private AI assistant running entirely in your browser. No servers, no data leaves your device.</p>
1818
<div class="about-tech">
1919
<span>Whisper</span>
20-
<span>Qwen 0.5B</span>
20+
<span>Gemma 270M</span>
2121
<span>WebAssembly</span>
2222
</div>
2323
<div class="author-footer">

src/composables/useConversation.ts

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import { useWhisper } from './useWhisper'
33
import { useSpeechSynthesis } from './useSpeechSynthesis'
44
import { useWllama } from './useWllama'
55
import { useAudioVisualizer } from './useAudioVisualizer'
6+
import { getStats } from './useStats'
67

78
export type ConversationState = 'idle' | 'ready' | 'listening' | 'processing' | 'thinking' | 'speaking' | 'loading'
89

@@ -31,6 +32,9 @@ export function useConversation() {
3132
const isConversationActive = ref(false)
3233
const hasPlayedIntro = ref(false)
3334

35+
// Stats
36+
const stats = getStats()
37+
3438
// Whisper (Speech-to-Text)
3539
const {
3640
isSupported,
@@ -112,13 +116,28 @@ export function useConversation() {
112116
stopVisualizer()
113117

114118
// Stream LLM response and queue sentences for TTS
119+
const llmStartTime = performance.now()
120+
let ttsStartTime = 0
121+
let firstSentence = true
122+
115123
if (isTTSSupported.value) {
116124
await generateStreaming(text, (sentence: string) => {
125+
if (firstSentence) {
126+
ttsStartTime = performance.now()
127+
firstSentence = false
128+
}
117129
queueSentence(sentence)
118130
})
131+
const llmEndTime = performance.now()
132+
stats.addLLMTime(llmEndTime - llmStartTime)
133+
119134
await waitForQueue()
135+
if (ttsStartTime > 0) {
136+
stats.addTTSTime(performance.now() - ttsStartTime)
137+
}
120138
} else {
121139
await generateStreaming(text, () => {})
140+
stats.addLLMTime(performance.now() - llmStartTime)
122141
}
123142

124143
isProcessing.value = false
@@ -187,6 +206,11 @@ export function useConversation() {
187206
startVisualizer,
188207
stopVisualizer,
189208

209+
// Stats
210+
avgSTT: stats.avgSTT,
211+
avgLLM: stats.avgLLM,
212+
avgTTS: stats.avgTTS,
213+
190214
// Actions
191215
toggleConversation,
192216
initialize

src/composables/useStats.ts

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
import { ref, computed } from 'vue'
2+
3+
// Keep last N samples for averaging
4+
const MAX_SAMPLES = 10
5+
6+
export function useStats() {
7+
const sttTimes = ref<number[]>([])
8+
const llmTimes = ref<number[]>([])
9+
const ttsTimes = ref<number[]>([])
10+
11+
const avgSTT = computed(() => {
12+
if (sttTimes.value.length === 0) return 0
13+
const sum = sttTimes.value.reduce((a, b) => a + b, 0)
14+
return Math.round(sum / sttTimes.value.length)
15+
})
16+
17+
const avgLLM = computed(() => {
18+
if (llmTimes.value.length === 0) return 0
19+
const sum = llmTimes.value.reduce((a, b) => a + b, 0)
20+
return Math.round(sum / llmTimes.value.length)
21+
})
22+
23+
const avgTTS = computed(() => {
24+
if (ttsTimes.value.length === 0) return 0
25+
const sum = ttsTimes.value.reduce((a, b) => a + b, 0)
26+
return Math.round(sum / ttsTimes.value.length)
27+
})
28+
29+
function addSTTTime(ms: number) {
30+
sttTimes.value.push(ms)
31+
if (sttTimes.value.length > MAX_SAMPLES) {
32+
sttTimes.value.shift()
33+
}
34+
}
35+
36+
function addLLMTime(ms: number) {
37+
llmTimes.value.push(ms)
38+
if (llmTimes.value.length > MAX_SAMPLES) {
39+
llmTimes.value.shift()
40+
}
41+
}
42+
43+
function addTTSTime(ms: number) {
44+
ttsTimes.value.push(ms)
45+
if (ttsTimes.value.length > MAX_SAMPLES) {
46+
ttsTimes.value.shift()
47+
}
48+
}
49+
50+
function reset() {
51+
sttTimes.value = []
52+
llmTimes.value = []
53+
ttsTimes.value = []
54+
}
55+
56+
return {
57+
avgSTT,
58+
avgLLM,
59+
avgTTS,
60+
addSTTTime,
61+
addLLMTime,
62+
addTTSTime,
63+
reset
64+
}
65+
}
66+
67+
// Singleton instance for shared state
68+
let statsInstance: ReturnType<typeof useStats> | null = null
69+
70+
export function getStats() {
71+
if (!statsInstance) {
72+
statsInstance = useStats()
73+
}
74+
return statsInstance
75+
}

src/composables/useWhisper.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { ref, onUnmounted } from 'vue'
22
import { WhisperTranscriber } from 'whisper-web-transcriber'
3+
import { getStats } from './useStats'
34

45
export function useWhisper() {
56
const isSupported = ref(true)
@@ -11,6 +12,7 @@ export function useWhisper() {
1112
const error = ref<string | null>(null)
1213

1314
let transcriber: WhisperTranscriber | null = null
15+
let sttStartTime = 0
1416

1517
async function loadModel() {
1618
if (isModelLoaded.value || isLoading.value) return
@@ -24,8 +26,15 @@ export function useWhisper() {
2426
modelSize: 'tiny-en-q5_1', // Smallest and fastest (~31MB)
2527
onTranscription: (text: string) => {
2628
if (text.trim()) {
29+
// Record STT processing time
30+
if (sttStartTime > 0) {
31+
const sttTime = performance.now() - sttStartTime
32+
getStats().addSTTTime(sttTime)
33+
}
2734
transcript.value = text.trim()
2835
}
36+
// Reset timer for next transcription
37+
sttStartTime = performance.now()
2938
},
3039
onProgress: (progress: number) => {
3140
loadProgress.value = Math.round(progress)
@@ -58,6 +67,7 @@ export function useWhisper() {
5867
await navigator.mediaDevices.getUserMedia({ audio: true })
5968
await transcriber.startRecording()
6069
isListening.value = true
70+
sttStartTime = performance.now() // Start timing for first transcription
6171
} catch (e) {
6272
const err = e as Error
6373
if (err.name === 'NotAllowedError' || err.message?.includes('not-allowed')) {

src/composables/useWllama.ts

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@ const CONFIG_PATHS = {
66
'multi-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.7/esm/multi-thread/wllama.wasm',
77
}
88

9-
// Qwen2.5-0.5B-Instruct - Good balance of size (~350MB) and quality
9+
// Gemma 3 270M - Ultra fast, ~180MB
1010
const DEFAULT_MODEL = {
11-
repo: 'Qwen/Qwen2.5-0.5B-Instruct-GGUF',
12-
file: 'qwen2.5-0.5b-instruct-q4_k_m.gguf'
11+
repo: 'unsloth/gemma-3-270m-it-GGUF',
12+
file: 'gemma-3-270m-it-Q4_K_M.gguf'
1313
}
1414

1515
// Sentence boundary pattern - matches . ! ? , followed by space or end
@@ -79,8 +79,8 @@ export function useWllama() {
7979
top_p: 0.9,
8080
},
8181
onNewToken: (_token, _piece, currentText) => {
82-
// Strip any trailing special tokens from display
83-
const cleanText = currentText.replace(/<\|im_end\|>.*$/s, '').trim()
82+
// Strip any trailing special tokens from display (Gemma uses <end_of_turn>)
83+
const cleanText = currentText.replace(/<end_of_turn>.*$/s, '').trim()
8484
response.value = cleanText
8585

8686
// Get new content since last check
@@ -103,8 +103,8 @@ export function useWllama() {
103103
}
104104
})
105105

106-
// Clean final result
107-
const cleanResult = result.replace(/<\|im_end\|>.*$/s, '').trim()
106+
// Clean final result (Gemma uses <end_of_turn>)
107+
const cleanResult = result.replace(/<end_of_turn>.*$/s, '').trim()
108108
response.value = cleanResult
109109

110110
// Emit any remaining text in buffer
@@ -142,15 +142,15 @@ export function useWllama() {
142142
top_p: 0.9,
143143
},
144144
onNewToken: (_token, _piece, currentText) => {
145-
const cleanText = currentText.replace(/<\|im_end\|>.*$/s, '').trim()
145+
const cleanText = currentText.replace(/<end_of_turn>.*$/s, '').trim()
146146
response.value = cleanText
147147
if (onToken) {
148148
onToken(cleanText)
149149
}
150150
}
151151
})
152152

153-
const cleanResult = result.replace(/<\|im_end\|>.*$/s, '').trim()
153+
const cleanResult = result.replace(/<end_of_turn>.*$/s, '').trim()
154154
response.value = cleanResult
155155
return cleanResult
156156
} catch (e) {
@@ -162,12 +162,12 @@ export function useWllama() {
162162
}
163163

164164
function formatChatPrompt(userMessage: string): string {
165-
// Qwen2.5-Instruct chat template (ChatML format)
166-
return `<|im_start|>system
167-
You are Ava. Reply in 1-2 short sentences only.<|im_end|>
168-
<|im_start|>user
169-
${userMessage}<|im_end|>
170-
<|im_start|>assistant
165+
// Gemma 3 chat template
166+
return `<start_of_turn>user
167+
You are Ava. Reply in 1-2 short sentences only.
168+
169+
${userMessage}<end_of_turn>
170+
<start_of_turn>model
171171
`
172172
}
173173

0 commit comments

Comments (0)