From 8e42946b20d29a4a2ab94682c2e872d587e8bfa2 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Mon, 8 Dec 2025 07:56:44 +0000
Subject: [PATCH 1/2] feat: Add pHash generation from MFCC for chord segments

- Implement `app/database.py` for storing pHash and chord symbol pairs in SQLite.
- Update `e2e_base_ready_task` in `app/tasks/chord_tasks.py` to compute an MFCC and pHash for each detected chord segment using `librosa` and `ImageHash`.
- Add `ImageHash` and `Pillow` to `requirements.txt`.
- Save each pHash-chord pair to the database.
---
 app/database.py          | 55 ++++++++++++++++++++++++++++++++++++++++
 app/tasks/chord_tasks.py | 51 ++++++++++++++++++++++++++++++++++-----
 requirements.txt         |  2 ++
 3 files changed, 102 insertions(+), 6 deletions(-)
 create mode 100644 app/database.py

diff --git a/app/database.py b/app/database.py
new file mode 100644
index 0000000..dd3e831
--- /dev/null
+++ b/app/database.py
@@ -0,0 +1,55 @@
+import sqlite3
+import os
+from datetime import datetime
+from pathlib import Path
+from contextlib import contextmanager
+
+# Resolve the DB path from this file's location so it is absolute at runtime.
+# BASE_DIR is the project root, so the DB lives in '<project_root>/data/'.
+BASE_DIR = Path(__file__).resolve().parent.parent
+DB_FILE = BASE_DIR / "data" / "chord_fingerprints.db"
+
+# Ensure data directory exists
+DB_FILE.parent.mkdir(parents=True, exist_ok=True)
+
+def get_db_connection():
+    conn = sqlite3.connect(str(DB_FILE))
+    conn.row_factory = sqlite3.Row
+    return conn
+
+@contextmanager
+def db_cursor():
+    conn = get_db_connection()
+    try:
+        yield conn, conn.cursor()
+        conn.commit()
+    finally:
+        conn.close()
+
+def init_db():
+    with db_cursor() as (conn, c):
+        c.execute('''
+            CREATE TABLE IF NOT EXISTS chord_fingerprints (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                phash TEXT NOT NULL,
+                chord_symbol TEXT NOT NULL,
+                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+            )
+        ''')
+
+def save_phash_chord_pair(phash: str, chord_symbol: str, cursor=None):
+    """
+    Save pHash and chord symbol.
+    If cursor is provided, use it (for batch operations).
+    Otherwise, open a new connection.
+    """
+    if cursor:
+        cursor.execute('INSERT INTO chord_fingerprints (phash, chord_symbol) VALUES (?, ?)',
+                       (phash, chord_symbol))
+    else:
+        # Fallback path for single inserts.
+        # The DB is expected to be initialized elsewhere; calling
+        # init_db() on every insert would hurt performance in loops.
+        with db_cursor() as (conn, c):
+            c.execute('INSERT INTO chord_fingerprints (phash, chord_symbol) VALUES (?, ?)',
+                      (phash, chord_symbol))
diff --git a/app/tasks/chord_tasks.py b/app/tasks/chord_tasks.py
index b7c004a..843513c 100644
--- a/app/tasks/chord_tasks.py
+++ b/app/tasks/chord_tasks.py
@@ -284,8 +284,13 @@ def e2e_base_ready_task(self, audio_file_path: str, instrument: str):
     from demucs.apply import apply_model
     import torch
     import torchaudio
+    import librosa
+    import numpy as np
+    from PIL import Image
+    import imagehash
     from halmoni import MIDIAnalyzer, ChordDetector, KeyDetector, ChordProgression
     import json
+    from app.database import save_phash_chord_pair, db_cursor, init_db
 
     self.update_progress(0, 100, "Starting E2E pipeline")
 
@@ -293,6 +298,9 @@ def e2e_base_ready_task(self, audio_file_path: str, instrument: str):
     output_dir = Path(f"./outputs/{job_id}")
     output_dir.mkdir(parents=True, exist_ok=True)
 
+    # Initialize DB once
+    init_db()
+
     # Step 1: Audio separation
     self.update_progress(10, 100, "Separating audio")
 
@@ -347,12 +355,43 @@ def e2e_base_ready_task(self, audio_file_path: str, instrument: str):
     time_windows = analyzer.get_time_windows(all_notes_flat, window_size=1.0)
 
     chords = []
-    for window_start, window_notes in time_windows:
-        notes = analyzer.group_simultaneous_notes(window_notes)
-        if notes:
-            chord = detector.detect_chord_from_midi_notes(notes[0])
-            if chord:
-                chords.append(chord)
+
+    # Load audio for MFCC extraction
+    y, sr = librosa.load(str(separated_audio_path), sr=sample_rate)
+
+    # Use a single connection for all inserts
+    with db_cursor() as (conn, cursor):
+        for window_start, window_notes in time_windows:
+            notes = analyzer.group_simultaneous_notes(window_notes)
+            if notes:
+                chord = detector.detect_chord_from_midi_notes(notes[0])
+                if chord:
+                    chords.append(chord)
+
+                    # Extract MFCC and pHash
+                    try:
+                        # Extract audio segment (window_size=1.0)
+                        start_sample = int(window_start * sr)
+                        end_sample = int((window_start + 1.0) * sr)
+
+                        if start_sample < len(y):
+                            segment = y[start_sample:min(end_sample, len(y))]
+
+                            if len(segment) > 0:
+                                # Compute MFCC
+                                mfcc = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=20)
+
+                                # Normalize MFCC to 0-255 for image conversion
+                                mfcc_norm = (mfcc - mfcc.min()) / (mfcc.max() - mfcc.min() + 1e-6) * 255
+                                mfcc_img = Image.fromarray(mfcc_norm.astype(np.uint8))
+
+                                # Compute pHash
+                                phash = str(imagehash.phash(mfcc_img))
+
+                                # Save to DB
+                                save_phash_chord_pair(phash, str(chord), cursor=cursor)
+                    except Exception as e:
+                        print(f"Warning: Failed to generate pHash for chord {chord}: {e}")
 
     key_detector = KeyDetector()
     all_notes = all_notes_flat
diff --git a/requirements.txt b/requirements.txt
index 1a62543..50e38f1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -42,3 +42,5 @@ jams
 python-dotenv
 deprecated
 onnx>=1.19.0
+ImageHash
+Pillow
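
Note on using the fingerprints: the stored `phash` values are perceptual hashes, so the table is most useful when queried by Hamming distance rather than exact string equality. A minimal lookup sketch under that assumption; `find_similar_chords` is a hypothetical helper and not part of this patch:

    import sqlite3
    import imagehash

    def find_similar_chords(query_phash: str,
                            db_path: str = "data/chord_fingerprints.db",
                            max_distance: int = 8):
        # Hypothetical helper: scan stored fingerprints and keep those whose
        # pHash is within max_distance bits of the query hash.
        query = imagehash.hex_to_hash(query_phash)
        conn = sqlite3.connect(db_path)
        try:
            rows = conn.execute(
                "SELECT phash, chord_symbol FROM chord_fingerprints").fetchall()
        finally:
            conn.close()
        matches = []
        for phash_hex, chord_symbol in rows:
            distance = query - imagehash.hex_to_hash(phash_hex)  # Hamming distance in bits
            if distance <= max_distance:
                matches.append((chord_symbol, distance))
        return sorted(matches, key=lambda m: m[1])

A full table scan is fine at this scale; a BK-tree or similar index over the hashes would be the natural next step if the table grows large.
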
+ """ + if cursor: + cursor.execute('INSERT INTO chord_fingerprints (phash, chord_symbol) VALUES (?, ?)', + (phash, chord_symbol)) + else: + # Fallback for single inserts + # Ideally, we should ensure DB is init somewhere else, but for safety: + # init_db() # Moving this out for performance in loops + with db_cursor() as (conn, c): + c.execute('INSERT INTO chord_fingerprints (phash, chord_symbol) VALUES (?, ?)', + (phash, chord_symbol)) diff --git a/app/tasks/chord_tasks.py b/app/tasks/chord_tasks.py index b7c004a..843513c 100644 --- a/app/tasks/chord_tasks.py +++ b/app/tasks/chord_tasks.py @@ -284,8 +284,13 @@ def e2e_base_ready_task(self, audio_file_path: str, instrument: str): from demucs.apply import apply_model import torch import torchaudio + import librosa + import numpy as np + from PIL import Image + import imagehash from halmoni import MIDIAnalyzer, ChordDetector, KeyDetector, ChordProgression import json + from app.database import save_phash_chord_pair, db_cursor, init_db self.update_progress(0, 100, "Starting E2E pipeline") @@ -293,6 +298,9 @@ def e2e_base_ready_task(self, audio_file_path: str, instrument: str): output_dir = Path(f"./outputs/{job_id}") output_dir.mkdir(parents=True, exist_ok=True) + # Initialize DB once + init_db() + # Step 1: Audio separation self.update_progress(10, 100, "Separating audio") @@ -347,12 +355,43 @@ def e2e_base_ready_task(self, audio_file_path: str, instrument: str): time_windows = analyzer.get_time_windows(all_notes_flat, window_size=1.0) chords = [] - for window_start, window_notes in time_windows: - notes = analyzer.group_simultaneous_notes(window_notes) - if notes: - chord = detector.detect_chord_from_midi_notes(notes[0]) - if chord: - chords.append(chord) + + # Load audio for MFCC extraction + y, sr = librosa.load(str(separated_audio_path), sr=sample_rate) + + # Use a single connection for all inserts + with db_cursor() as (conn, cursor): + for window_start, window_notes in time_windows: + notes = analyzer.group_simultaneous_notes(window_notes) + if notes: + chord = detector.detect_chord_from_midi_notes(notes[0]) + if chord: + chords.append(chord) + + # Extract MFCC and pHash + try: + # Extract audio segment (window_size=1.0) + start_sample = int(window_start * sr) + end_sample = int((window_start + 1.0) * sr) + + if start_sample < len(y): + segment = y[start_sample:min(end_sample, len(y))] + + if len(segment) > 0: + # Compute MFCC + mfcc = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=20) + + # Normalize MFCC to 0-255 for image conversion + mfcc_norm = (mfcc - mfcc.min()) / (mfcc.max() - mfcc.min() + 1e-6) * 255 + mfcc_img = Image.fromarray(mfcc_norm.astype(np.uint8)) + + # Compute pHash + phash = str(imagehash.phash(mfcc_img)) + + # Save to DB + save_phash_chord_pair(phash, str(chord), cursor=cursor) + except Exception as e: + print(f"Warning: Failed to generate pHash for chord {chord}: {e}") key_detector = KeyDetector() all_notes = all_notes_flat diff --git a/requirements.txt b/requirements.txt index 1a62543..50e38f1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -42,3 +42,5 @@ jams python-dotenv deprecated onnx>=1.19.0 +ImageHash +Pillow From 7f86a446ce690c42c4cb5656489f610e6ff12b86 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Mon, 8 Dec 2025 08:42:36 +0000 Subject: [PATCH 2/2] feat: Add caching and chord fingerprinting using MFCC pHash - Introduce `app/database.py` to manage SQLite database for caching and fingerprints. 
diff --git a/app/schemas.py b/app/schemas.py
index cfcdac4..b4c0b07 100644
--- a/app/schemas.py
+++ b/app/schemas.py
@@ -86,8 +86,8 @@ class E2EBaseRequest(BaseModel):
 
 class E2EBaseResult(BaseModel):
     jobId: str
-    transcriptionUrl: str
-    separatedAudioUrl: str
+    transcriptionUrl: Optional[str] = None
+    separatedAudioUrl: Optional[str] = None
     chordProgressionUrl: str
     format: ChartFormat = ChartFormat.JSON
 
diff --git a/app/tasks/chord_tasks.py b/app/tasks/chord_tasks.py
index 843513c..efc91f1 100644
--- a/app/tasks/chord_tasks.py
+++ b/app/tasks/chord_tasks.py
@@ -290,7 +290,10 @@ def e2e_base_ready_task(self, audio_file_path: str, instrument: str):
     import imagehash
     from halmoni import MIDIAnalyzer, ChordDetector, KeyDetector, ChordProgression
     import json
-    from app.database import save_phash_chord_pair, db_cursor, init_db
+    from app.database import (
+        save_phash_chord_pair, db_cursor, init_db,
+        get_cached_progression, save_cached_progression
+    )
 
     self.update_progress(0, 100, "Starting E2E pipeline")
 
@@ -301,6 +304,36 @@ def e2e_base_ready_task(self, audio_file_path: str, instrument: str):
     # Initialize DB once
     init_db()
 
+    # Check cache
+    self.update_progress(5, 100, "Checking cache")
+    file_phash = None  # stays None if hashing fails or the input is empty
+    try:
+        y, sr = librosa.load(audio_file_path, sr=22050)  # Use 22050Hz for pHash consistency
+        if len(y) > 0:
+            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
+            mfcc_norm = (mfcc - mfcc.min()) / (mfcc.max() - mfcc.min() + 1e-6) * 255
+            mfcc_img = Image.fromarray(mfcc_norm.astype(np.uint8))
+            file_phash = str(imagehash.phash(mfcc_img))
+
+            cached_data = get_cached_progression(file_phash)
+            if cached_data:
+                self.update_progress(100, 100, "Found cached result")
+                # Save cached data to output file
+                chord_output_path = output_dir / "chord_progression.json"
+                with open(chord_output_path, 'w') as f:
+                    f.write(cached_data)
+
+                return {
+                    'jobId': job_id,
+                    'transcriptionUrl': None,
+                    'separatedAudioUrl': None,
+                    'chordProgressionUrl': f'/outputs/{job_id}/chord_progression.json',
+                    'format': 'json'
+                }
+    except Exception as e:
+        print(f"Warning: Cache check failed: {e}")
+        file_phash = None
+
     # Step 1: Audio separation
     self.update_progress(10, 100, "Separating audio")
 
@@ -404,8 +437,16 @@ def e2e_base_ready_task(self, audio_file_path: str, instrument: str):
         'key': str(key) if key else None,
         'chords': [{'symbol': str(chord), 'duration': 1.0} for chord in chords]
     }
+    json_str = json.dumps(progression_data, indent=2)
     with open(chord_output_path, 'w') as f:
-        json.dump(progression_data, f, indent=2)
+        f.write(json_str)
+
+    # Save to cache if we have a file pHash
+    if file_phash:
+        try:
+            save_cached_progression(file_phash, json_str)
+        except Exception as e:
+            print(f"Warning: Failed to save to cache: {e}")
 
     chord_progression_url = f'/outputs/{job_id}/chord_progression.json'
 
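
Both patches compute the MFCC-to-pHash conversion inline, once for the whole file in the cache check and once per chord segment in the detection loop. A follow-up could factor the duplicated block into a shared helper; a minimal sketch using the same parameters as the inline code (`n_mfcc=20`, min-max normalization to 8-bit); the helper name is hypothetical:

    from typing import Optional

    import numpy as np
    import librosa
    import imagehash
    from PIL import Image

    def mfcc_phash(samples: np.ndarray, sr: int, n_mfcc: int = 20) -> Optional[str]:
        # Perceptual hash of an audio buffer via its MFCC matrix rendered
        # as an 8-bit grayscale image; returns None for empty input.
        if len(samples) == 0:
            return None
        mfcc = librosa.feature.mfcc(y=samples, sr=sr, n_mfcc=n_mfcc)
        # Min-max normalize to 0-255 so the matrix can be treated as an image
        norm = (mfcc - mfcc.min()) / (mfcc.max() - mfcc.min() + 1e-6) * 255
        return str(imagehash.phash(Image.fromarray(norm.astype(np.uint8))))

Both call sites would then reduce to `file_phash = mfcc_phash(y, sr)` and `phash = mfcc_phash(segment, sr)`.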