Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@ describe("embedding-search", () => {
it("handles zero vectors", () => {
expect(cosineSimilarity([0, 0, 0], [1, 0, 0])).toBe(0);
});

it("returns 0 (not NaN) for mismatched vector lengths", () => {
  // Case 1: the first vector carries one more component than the second.
  const longerFirst = cosineSimilarity([1, 0, 0, 0.5], [1, 0, 0]);
  expect(longerFirst).toBe(0);

  // Case 2: the first vector is the shorter one. The overlapping components
  // are identical, so a comparison that ignored the extra trailing component
  // would report a perfect match; the expected result is 0 instead.
  const shorterFirst = cosineSimilarity([1, 0, 0], [1, 0, 0, 5]);
  expect(shorterFirst).toBe(0);
});
});

describe("SemanticSearchEngine", () => {
Expand Down Expand Up @@ -88,5 +95,55 @@ describe("embedding-search", () => {
engine.addEmbedding("n1", [1, 0, 0, 0]);
expect(engine.hasEmbeddings()).toBe(true);
});

it("tolerates a mis-sized stored embedding without throwing", () => {
  const queryEmbedding = [1, 0, 0, 0];

  // Entry n2 stands in for a stale or corrupt index record: it has three
  // components while the query (and the other entries) have four, as would
  // happen if it had been persisted by a different embedding model.
  const mixedEmbeddings: Record<string, number[]> = {
    n1: [1, 0, 0, 0],
    n2: [0, 1, 0], // dimension 3 instead of 4
    n3: [0.9, 0, 0.1, 0],
  };
  const engine = new SemanticSearchEngine(nodes, mixedEmbeddings);

  // The search itself must complete; result contents are checked elsewhere.
  const runSearch = () => engine.search(queryEmbedding);
  expect(runSearch).not.toThrow();
});

it("ranks a mis-sized stored embedding last at the default threshold", () => {
  const query = [1, 0, 0, 0];
  const storedVectors: Record<string, number[]> = {
    // Same vector as the query: similarity 1, so score 0 (the best score).
    n1: [1, 0, 0, 0],
    // Only three components: similarity is reported as 0, so score 1 (worst).
    n2: [0, 1, 0],
    // Close to the query: high similarity, hence a low score.
    n3: [0.9, 0, 0.1, 0],
  };
  const engine = new SemanticSearchEngine(nodes, storedVectors);

  const results = engine.search(query);
  const ids = results.map(({ nodeId }) => nodeId);

  // Expected ordering is by ascending score (i.e. descending similarity).
  // The mismatched node is still returned, because a similarity of 0 passes
  // a `similarity >= threshold` check when the threshold is 0, and its score
  // is `1 - 0 = 1`, which places it after the correctly-sized neighbours.
  expect(ids).toEqual(["n1", "n3", "n2"]);

  const last = results[results.length - 1];
  expect(last.nodeId).toBe("n2");
  expect(last.score).toBe(1);
});

it("drops a mis-sized stored embedding under a positive threshold", () => {
  const query = [1, 0, 0, 0];
  const storedVectors: Record<string, number[]> = {
    n1: [1, 0, 0, 0],
    // Only three components: similarity is reported as 0, which is below the
    // 0.5 threshold used in this test.
    n2: [0, 1, 0],
    n3: [0.9, 0, 0.1, 0],
  };
  const engine = new SemanticSearchEngine(nodes, storedVectors);

  const ids = engine
    .search(query, { threshold: 0.5 })
    .map(({ nodeId }) => nodeId);

  // A similarity of 0 does not satisfy `0 >= 0.5`, so n2 must be absent,
  // and the two correctly-sized neighbours must remain in score order.
  expect(ids).not.toContain("n2");
  expect(ids).toEqual(["n1", "n3"]);
});
});
});
20 changes: 19 additions & 1 deletion understand-anything-plugin/packages/core/src/embedding-search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,22 @@ export interface SemanticSearchOptions {

/**
* Compute cosine similarity between two vectors.
* Returns 0 if either vector has zero magnitude.
* Returns 0 if either vector has zero magnitude or if the two vectors have
* different lengths.
*/
export function cosineSimilarity(a: number[], b: number[]): number {
// A length mismatch means the two vectors came from different embedding
// models/dimensions — i.e. a stale or corrupt index, not a meaningful data
// point. We swallow it as 0 (treated as "completely dissimilar") so callers
// never see NaN or an overstated similarity. This is intentionally silent
// here because cosineSimilarity is a pure helper invoked once per node per
// query in search()'s hot loop and has no state to dedupe a warning.
// TODO(follow-up): surface mismatches at the engine level (record the
// expected dimension and warn-once / count skipped stored embeddings) so a
// user re-running search after a model upgrade gets a signal instead of
// quietly degraded recall.
if (a.length !== b.length) return 0;

let dot = 0;
let magA = 0;
let magB = 0;
Expand Down Expand Up @@ -67,6 +80,11 @@ export class SemanticSearchEngine {
const embedding = this.embeddings.get(node.id);
if (!embedding) continue;

// If a stored embedding's length differs from queryEmbedding (e.g. a
// persisted index from a prior model/dimension loaded alongside a fresh
// query), cosineSimilarity returns 0, so the node is treated as fully
// dissimilar rather than throwing or scoring spuriously high. See the
// TODO in cosineSimilarity about surfacing these mismatches engine-side.
const similarity = cosineSimilarity(queryEmbedding, embedding);
if (similarity >= threshold) {
scored.push({ nodeId: node.id, score: 1 - similarity });
Expand Down
Loading