diff --git a/understand-anything-plugin/packages/core/src/__tests__/embedding-search.test.ts b/understand-anything-plugin/packages/core/src/__tests__/embedding-search.test.ts index 4952c7945..2c8870f3b 100644 --- a/understand-anything-plugin/packages/core/src/__tests__/embedding-search.test.ts +++ b/understand-anything-plugin/packages/core/src/__tests__/embedding-search.test.ts @@ -33,6 +33,13 @@ describe("embedding-search", () => { it("handles zero vectors", () => { expect(cosineSimilarity([0, 0, 0], [1, 0, 0])).toBe(0); }); + + it("returns 0 (not NaN) for mismatched vector lengths", () => { + // a longer than b + expect(cosineSimilarity([1, 0, 0, 0.5], [1, 0, 0])).toBe(0); + // a shorter than b — before the length guard this silently returned 1, overstating similarity + expect(cosineSimilarity([1, 0, 0], [1, 0, 0, 5])).toBe(0); + }); }); describe("SemanticSearchEngine", () => { @@ -88,5 +95,55 @@ describe("embedding-search", () => { engine.addEmbedding("n1", [1, 0, 0, 0]); expect(engine.hasEmbeddings()).toBe(true); }); + + it("tolerates a mis-sized stored embedding without throwing", () => { + // n2 simulates a stale/corrupt index entry: it was persisted by a + // different model and has the wrong dimension relative to the query. 
+ const mixedEmbeddings: Record<string, number[]> = { + n1: [1, 0, 0, 0], + n2: [0, 1, 0], // wrong length (3 vs 4) + n3: [0.9, 0, 0.1, 0], + }; + const engine = new SemanticSearchEngine(nodes, mixedEmbeddings); + const queryEmbedding = [1, 0, 0, 0]; + + expect(() => engine.search(queryEmbedding)).not.toThrow(); + }); + + it("ranks a mis-sized stored embedding last at the default threshold", () => { + const mixedEmbeddings: Record<string, number[]> = { + n1: [1, 0, 0, 0], // identical to query -> similarity 1, score 0 (best) + n2: [0, 1, 0], // wrong length -> similarity 0, score 1 (worst) + n3: [0.9, 0, 0.1, 0], // similar -> high similarity, low score + }; + const engine = new SemanticSearchEngine(nodes, mixedEmbeddings); + + const results = engine.search([1, 0, 0, 0]); + const ids = results.map((r) => r.nodeId); + + // Correctly-sized neighbours come back in ascending-score (descending + // similarity) order; the mismatched node is included but ranked last + // because similarity 0 satisfies `0 >= 0` and scores `1 - 0 = 1`. + expect(ids).toEqual(["n1", "n3", "n2"]); + expect(results[results.length - 1].nodeId).toBe("n2"); + expect(results[results.length - 1].score).toBe(1); + }); + + it("drops a mis-sized stored embedding under a positive threshold", () => { + const mixedEmbeddings: Record<string, number[]> = { + n1: [1, 0, 0, 0], + n2: [0, 1, 0], // wrong length -> similarity 0, filtered by threshold + n3: [0.9, 0, 0.1, 0], + }; + const engine = new SemanticSearchEngine(nodes, mixedEmbeddings); + + const results = engine.search([1, 0, 0, 0], { threshold: 0.5 }); + const ids = results.map((r) => r.nodeId); + + // similarity 0 fails `0 >= 0.5`, so the mismatched node is excluded while + // the correctly-sized neighbours remain in score order. 
+ expect(ids).not.toContain("n2"); + expect(ids).toEqual(["n1", "n3"]); + }); }); }); diff --git a/understand-anything-plugin/packages/core/src/embedding-search.ts b/understand-anything-plugin/packages/core/src/embedding-search.ts index 71192ca2a..bedfa1c30 100644 --- a/understand-anything-plugin/packages/core/src/embedding-search.ts +++ b/understand-anything-plugin/packages/core/src/embedding-search.ts @@ -9,9 +9,22 @@ export interface SemanticSearchOptions { /** * Compute cosine similarity between two vectors. - * Returns 0 if either vector has zero magnitude. + * Returns 0 if either vector has zero magnitude or if the two vectors have + * different lengths. */ export function cosineSimilarity(a: number[], b: number[]): number { + // A length mismatch means the two vectors came from different embedding + // models/dimensions — i.e. a stale or corrupt index, not a meaningful data + // point. We swallow it as 0 (treated as "completely dissimilar") so callers + // never see NaN or an overstated similarity. This is intentionally silent + // here because cosineSimilarity is a pure helper invoked once per node per + // query in search()'s hot loop and has no state to dedupe a warning. + // TODO(follow-up): surface mismatches at the engine level (record the + // expected dimension and warn-once / count skipped stored embeddings) so a + // user re-running search after a model upgrade gets a signal instead of + // quietly degraded recall. + if (a.length !== b.length) return 0; + let dot = 0; let magA = 0; let magB = 0; @@ -67,6 +80,11 @@ export class SemanticSearchEngine { const embedding = this.embeddings.get(node.id); if (!embedding) continue; + // If a stored embedding's length differs from queryEmbedding (e.g. a + // persisted index from a prior model/dimension loaded alongside a fresh + // query), cosineSimilarity returns 0, so the node is treated as fully + // dissimilar rather than throwing or scoring spuriously high. 
See the + // TODO in cosineSimilarity about surfacing these mismatches engine-side. const similarity = cosineSimilarity(queryEmbedding, embedding); if (similarity >= threshold) { scored.push({ nodeId: node.id, score: 1 - similarity });