From 8e2b89072c8c9026fc7d52125fbab02616bf08df Mon Sep 17 00:00:00 2001 From: Alan Shurafa Date: Sat, 13 Jun 2026 15:06:14 -0400 Subject: [PATCH 1/3] [schemas] enhanced-thoughts: sync RPCs with current behavior Existing installs ran older upsert/search RPCs than the reference brain. This brings them current without breaking the v1 contracts. Co-Authored-By: Claude Opus 4.8 (1M context) --- schemas/enhanced-thoughts/README.md | 31 ++- schemas/enhanced-thoughts/metadata.json | 8 +- schemas/enhanced-thoughts/schema.sql | 337 +++++++++++++++++++++--- 3 files changed, 332 insertions(+), 44 deletions(-) diff --git a/schemas/enhanced-thoughts/README.md b/schemas/enhanced-thoughts/README.md index f7314114f..8cd3f3de1 100644 --- a/schemas/enhanced-thoughts/README.md +++ b/schemas/enhanced-thoughts/README.md @@ -19,10 +19,16 @@ This schema extension adds six new columns to the `thoughts` table (`type`, `sen - **`get_thought_connections`** -- Finds thoughts that share metadata topics or people with a given thought. - **`backfill_thought_types(p_allowed_types TEXT[])`** -- Populates the new top-level `type` column from `metadata->>'type'`. The default allowlist covers the canonical eight values (`idea`, `task`, `person_note`, `reference`, `decision`, `lesson`, `meeting`, `journal`). Pass a custom array to accept additional values, or pass `NULL` to backfill whatever `metadata->>'type'` contains. +It also overrides the base `upsert_thought` so structured columns (`type`, `source_type`, `importance`, `quality_score`, `sensitivity_tier`, `status`) stay in sync on capture, and (when `schemas/typed-reasoning-edges` is installed) installs one optional opt-in RPC: + +- **`match_thoughts_superseded_aware`** -- The same nearest-neighbor search as the core `match_thoughts`, plus a `superseded_by` column. Thoughts that have been replaced (the target of a `supersedes` edge in `thought_edges`) get a 0.8x ranking penalty so fresh thoughts surface above their stale predecessors. 
The core `match_thoughts` is left untouched; callers opt in by name. + ## Prerequisites - Working Open Brain setup (see the getting-started guide in `docs/01-getting-started.md`) - Supabase project with the `thoughts` table, `match_thoughts` function, and `upsert_thought` function already created +- Apply `schemas/workflow-status/` first if it is not already applied. The `upsert_thought` override here writes to the `status` and `status_updated_at` columns that `workflow-status` creates. Both files use `ADD COLUMN IF NOT EXISTS`, so applying either order is safe, but `workflow-status` must be present before the first `upsert_thought` call runs. +- Optional: `schemas/typed-reasoning-edges/` (creates `public.thought_edges`). Required only for `match_thoughts_superseded_aware`. If it is absent, the rest of this migration still applies and that one function is skipped with a `NOTICE`; re-run this `schema.sql` after installing typed-reasoning-edges to add it. ## Credential Tracker @@ -45,7 +51,7 @@ SUPABASE (from your Open Brain setup) 2. Create a new query and paste the full contents of `schema.sql` 3. Click **Run** to execute the migration 4. Open **Table Editor** and select the `thoughts` table to confirm the new columns appear: `type`, `sensitivity_tier`, `importance`, `quality_score`, `source_type`, `enriched` -5. Navigate to **Database > Functions** and verify the new functions exist: `search_thoughts_text`, `brain_stats_aggregate`, `get_thought_connections`, `backfill_thought_types` +5. Navigate to **Database > Functions** and verify the functions exist: `search_thoughts_text`, `brain_stats_aggregate`, `get_thought_connections`, `backfill_thought_types`, and `upsert_thought`. If you have applied `schemas/typed-reasoning-edges/`, `match_thoughts_superseded_aware` is present too; if not, that one function is skipped (see Prerequisites) 6. 
If you have existing thoughts with `type` or `source` values stored in the metadata JSONB, the script automatically calls `backfill_thought_types()` with the default canonical allowlist. If your brain uses non-canonical `type` values, re-run `SELECT backfill_thought_types(ARRAY['your','custom','types']);` or `SELECT backfill_thought_types(NULL);` to accept any value ## Expected Outcome @@ -54,7 +60,7 @@ After running the migration: - The `thoughts` table has six new columns with sensible defaults: - `sensitivity_tier TEXT DEFAULT 'standard'` (canonical values: `'standard'`, `'personal'`, `'restricted'`) - - `importance SMALLINT DEFAULT 3` (scale: 1-5, where 3 is the default) + - `importance SMALLINT DEFAULT 3` (the column default is 3; the `upsert_thought` override accepts and clamps payload values to 0-100. See "Changes from v1" for why this is NOT the ExoCortex 0-6 scale.) - `quality_score NUMERIC(5,2) DEFAULT 50` (scale: 0-100, where 50 is the default) - `enriched BOOLEAN DEFAULT false` - `type TEXT` (nullable; populated by backfill or writers) @@ -63,13 +69,32 @@ After running the migration: - Four new RPC functions callable via the Supabase client or REST API (`search_thoughts_text`, `brain_stats_aggregate`, `get_thought_connections`, `backfill_thought_types`). - Any existing thoughts with `type` or `source` in their metadata JSONB will have those values copied into the new top-level columns (via `backfill_thought_types()` for `type` with the canonical allowlist, plus an inline `UPDATE` for `source_type`). +## Changes from v1 + +v1.1 brings the RPCs in line with how the reference Open Brain install runs them today. Everything is additive and idempotent — re-running `schema.sql` on a v1 install is safe. The `upsert_thought` return contract (`{id, fingerprint}`) and the `status` / `status_updated_at` handling are unchanged, so existing callers and `schemas/workflow-status/` are not affected. 
+ +What changed for an existing install: + +- **`search_thoughts_text` now reads three control keys out of `p_filter`.** `start_date` and `end_date` (ISO 8601 timestamps) filter `created_at` to that range; `exclude_restricted` (boolean) drops `sensitivity_tier = 'restricted'` rows. These keys are stripped from the metadata-containment predicate, so they no longer require a literal metadata key of the same name. Any other `p_filter` key keeps its original `metadata @> filter` behavior. If you were (accidentally) relying on a metadata key literally named `start_date`/`end_date`/`exclude_restricted`, it is now interpreted as a control key instead. + +- **`upsert_thought` gained two dedup/merge guards.** + - *Original-fingerprint fallback*: when a thought's content is corrected, its fingerprint changes. If your update path appends the pre-edit fingerprint to an append-only array `metadata.original_fingerprints[]`, a later reimport of the original source text now lands on the corrected row as a dedup hit instead of inserting a stale sibling that "outvotes" the correction. Exact-fingerprint match still wins first. If you never write `original_fingerprints`, behavior is identical to v1 (an extra indexed lookup on miss, no semantic change). + - *User-edit guard*: keys listed in `metadata.user_edits` are treated as human-owned. On the merge path they (and the system-managed `user_edits` / `original_fingerprints` maps) are stripped from the incoming patch so an automated reimport cannot resurrect stale values over a human correction. This guards **metadata keys only** — the top-level structured columns (`type`, `importance`, etc.) keep v1's overwrite-on-merge behavior. If you never write `user_edits`, behavior is identical to v1. + - To make the fallback possible the function now does an explicit fingerprint lookup and branches INSERT vs UPDATE instead of using `ON CONFLICT`. The visible result is the same `{id, fingerprint}` payload. 
+ +- **Importance stays on the 0-100 scale (deliberate deviation).** ExoCortex widened its own importance to a 0-6 scale. Open Brain's `upsert_thought` already accepts a wider 0-100 range, so it does not clip 0-6 values — adopting 0-6 here would retroactively rescale every existing row's importance, which is a breaking data change, not an additive one. The column default remains 3; payload values are clamped to 0-100. Treat 0-6 as a subset if you want cross-system parity. + +- **New opt-in RPC `match_thoughts_superseded_aware`** (installed only when `schemas/typed-reasoning-edges/` is present). It returns the same columns as the core `match_thoughts` plus `superseded_by UUID`, and applies a 0.8x penalty to thoughts that are the target of a `supersedes` edge so stale predecessors rank below their replacements without being excluded. The core `match_thoughts` is not modified. If `public.thought_edges` is missing, this function is skipped with a `NOTICE` and the rest of the migration still applies. + ## Security This schema follows stock Open Brain's "service_role only" posture: - `brain_stats_aggregate` and `get_thought_connections` are `SECURITY DEFINER` with `SET search_path = public` (defense in depth against search-path hijacks). They can read the full `thoughts` table regardless of RLS. - `search_thoughts_text` is `SECURITY INVOKER` and respects RLS. -- **None of the three RPCs are granted to `anon`.** Execute privilege is limited to `authenticated` and `service_role`. The publishable anon key cannot call them. +- `match_thoughts_superseded_aware` is `SECURITY INVOKER` and granted to `service_role` only, matching the access posture of `public.thought_edges` (service-role only). +- `upsert_thought` is granted to `service_role` only. The `exclude_restricted` control key on `search_thoughts_text` lets a caller drop restricted rows, but the default is `false` (restricted rows are returned), so set it explicitly when building any lower-trust surface. 
+- **None of the read RPCs are granted to `anon`.** Execute privilege is limited to `authenticated` and `service_role` (or `service_role` only, per function above). The publishable anon key cannot call them. If you want to expose any of these to `anon` (for example, a public-read dashboard), add your own `GRANT EXECUTE ... TO anon;` in a follow-up migration and confirm that `p_exclude_restricted := true` (the default) plus your sensitivity-tier hygiene gives you the exposure surface you actually want. This is an explicit opt-in: the default stance is private. diff --git a/schemas/enhanced-thoughts/metadata.json b/schemas/enhanced-thoughts/metadata.json index 757a341b3..7391be811 100644 --- a/schemas/enhanced-thoughts/metadata.json +++ b/schemas/enhanced-thoughts/metadata.json @@ -1,18 +1,18 @@ { "name": "Enhanced Thoughts Columns and Utility RPCs", - "description": "Adds structured columns (type, importance, quality_score, sensitivity_tier, source_type, enriched) to the thoughts table and installs utility RPCs for full-text search, aggregate statistics, and thought connections.", + "description": "Adds structured columns (type, importance, quality_score, sensitivity_tier, source_type, enriched) to the thoughts table and installs utility RPCs for full-text search (with date and restricted-tier filters), aggregate statistics, thought connections, a dedup-guarded upsert, and an optional superseded-aware semantic search.", "category": "schemas", "author": { "name": "Alan Shurafa", "github": "alanshurafa" }, - "version": "1.0.0", + "version": "1.1.0", "requires": { "open_brain": true }, - "tags": ["schema", "metadata", "search", "statistics", "enrichment"], + "tags": ["schema", "metadata", "search", "statistics", "enrichment", "deduplication", "semantic-search"], "difficulty": "beginner", "estimated_time": "15 minutes", "created": "2026-04-06", - "updated": "2026-04-17" + "updated": "2026-06-13" } diff --git a/schemas/enhanced-thoughts/schema.sql 
b/schemas/enhanced-thoughts/schema.sql index 2e1297bfd..de3f703d6 100644 --- a/schemas/enhanced-thoughts/schema.sql +++ b/schemas/enhanced-thoughts/schema.sql @@ -33,6 +33,18 @@ CREATE INDEX IF NOT EXISTS idx_thoughts_content_tsvector -- Supports boolean operators via websearch_to_tsquery -- ("quoted phrases", AND, OR, -NOT) with ILIKE fallback, -- pagination, and result count. +-- +-- v1.1: p_filter now recognizes three reserved control keys that +-- are applied at the data layer instead of as metadata containment: +-- start_date / end_date — ISO 8601 timestamps; filter created_at +-- to the [start_date, end_date] range. +-- exclude_restricted — boolean; when true, drop rows whose +-- sensitivity_tier is 'restricted'. +-- These keys are stripped from the containment predicate so they do +-- not accidentally require a literal metadata key of the same name. +-- All other p_filter keys keep their original `metadata @> filter` +-- containment behavior. Ported from ExoCortex search-text date +-- filters; UUID id contract preserved. -- ============================================================ CREATE OR REPLACE FUNCTION search_thoughts_text( @@ -58,6 +70,21 @@ LANGUAGE plpgsql STABLE SET statement_timeout = '25s' AS $$ +DECLARE + -- Reserved control keys, peeled off p_filter so they are not treated + -- as metadata containment requirements. + v_exclude_restricted BOOLEAN := + coalesce((p_filter->>'exclude_restricted')::boolean, false); + v_start_date TIMESTAMPTZ := + CASE WHEN nullif(p_filter->>'start_date', '') IS NOT NULL + THEN (p_filter->>'start_date')::timestamptz ELSE NULL END; + v_end_date TIMESTAMPTZ := + CASE WHEN nullif(p_filter->>'end_date', '') IS NOT NULL + THEN (p_filter->>'end_date')::timestamptz ELSE NULL END; + -- Containment filter with the reserved keys removed. 
+ v_meta_filter JSONB := + coalesce(p_filter, '{}'::jsonb) + - 'start_date' - 'end_date' - 'exclude_restricted'; BEGIN RETURN QUERY WITH query_input AS ( @@ -72,7 +99,11 @@ BEGIN CROSS JOIN query_input q WHERE q.raw_query <> '' AND to_tsvector('simple', coalesce(t.content, '')) @@ q.ts_query - AND t.metadata @> coalesce(p_filter, '{}'::jsonb) + AND t.metadata @> v_meta_filter + AND (NOT v_exclude_restricted + OR coalesce(t.sensitivity_tier, 'standard') <> 'restricted') + AND (v_start_date IS NULL OR t.created_at >= v_start_date) + AND (v_end_date IS NULL OR t.created_at <= v_end_date) LIMIT 2000 ), -- Phase 2: ILIKE fallback when tsvector finds fewer than needed @@ -83,7 +114,11 @@ BEGIN WHERE q.raw_query <> '' AND (SELECT count(*) FROM tsvector_hits) < (p_limit + p_offset) AND t.content ILIKE '%' || q.raw_query || '%' - AND t.metadata @> coalesce(p_filter, '{}'::jsonb) + AND t.metadata @> v_meta_filter + AND (NOT v_exclude_restricted + OR coalesce(t.sensitivity_tier, 'standard') <> 'restricted') + AND (v_start_date IS NULL OR t.created_at >= v_start_date) + AND (v_end_date IS NULL OR t.created_at <= v_end_date) AND NOT EXISTS (SELECT 1 FROM tsvector_hits th WHERE th.hit_id = t.id) LIMIT 500 ), @@ -355,6 +390,34 @@ WHERE source_type IS NULL AND metadata->>'source' IS NOT NULL; -- 6. ENHANCED UPSERT RPC -- Keeps structured dashboard columns in sync when callers use -- the base upsert_thought RPC with metadata payloads. +-- +-- v1.1 behavior deltas (ported from ExoCortex, UUID-adapted; the +-- {id, fingerprint} return contract and status handling from v1 are +-- preserved unchanged so existing callers and schemas/workflow-status +-- are not affected): +-- +-- a) Original-fingerprint fallback dedup. When a thought's content is +-- later corrected, its content_fingerprint changes. A reimport of +-- the ORIGINAL source text would previously insert a stale sibling +-- row that "outvotes" the correction. 
Update paths (REST/MCP) may +-- append the pre-edit fingerprint to an append-only array +-- metadata.original_fingerprints[]. This RPC now treats an incoming +-- fingerprint that matches that array as a dedup hit on the +-- corrected row (merge metadata; never insert; never touch content). +-- Exact content_fingerprint match still wins over the fallback. +-- +-- b) User-edit guard. Keys listed in metadata.user_edits are owned by +-- the human. On the merge path they are stripped from the incoming +-- patch so a later automated import cannot resurrect stale values +-- over a correction. original_fingerprints and user_edits are +-- system-managed: the merge never lets an incoming payload rewrite +-- them, and inserts drop them unless well-formed. +-- +-- The dedup fallback path cannot be expressed in ON CONFLICT, so the +-- function now does an explicit lookup (exact fingerprint, then +-- original-fingerprint fallback) and branches into INSERT vs UPDATE. +-- Importance keeps v1's 0-100 clamp (NOT ExoCortex's 0-6) so existing +-- rows are not retroactively rescaled — see README "Changes from v1". 
-- ============================================================ CREATE OR REPLACE FUNCTION public.upsert_thought(p_content TEXT, p_payload JSONB DEFAULT '{}') @@ -370,6 +433,10 @@ DECLARE v_quality_score NUMERIC(5,2); v_sensitivity_tier TEXT; v_status TEXT; + v_existing_metadata JSONB; + v_user_edits JSONB := '{}'::jsonb; + v_protected_keys TEXT[] := ARRAY[]::text[]; + v_inserted BOOLEAN := false; BEGIN v_metadata := COALESCE(p_payload->'metadata', '{}'::jsonb); v_type := COALESCE(NULLIF(v_metadata->>'type', ''), 'observation'); @@ -395,44 +462,123 @@ BEGIN 'UTF8' )), 'hex'); - INSERT INTO public.thoughts ( - content, - content_fingerprint, - metadata, - type, - source_type, - importance, - quality_score, - sensitivity_tier, - status, - status_updated_at - ) - VALUES ( - p_content, - v_fingerprint, - v_metadata, - v_type, - v_source_type, - v_importance, - v_quality_score, - v_sensitivity_tier, - v_status, - CASE WHEN v_status IS NULL THEN NULL ELSE now() END - ) - ON CONFLICT (content_fingerprint) WHERE content_fingerprint IS NOT NULL DO UPDATE - SET updated_at = now(), - metadata = public.thoughts.metadata || COALESCE(EXCLUDED.metadata, '{}'::jsonb), - type = COALESCE(EXCLUDED.type, public.thoughts.type), - source_type = COALESCE(EXCLUDED.source_type, public.thoughts.source_type), - importance = COALESCE(EXCLUDED.importance, public.thoughts.importance), - quality_score = COALESCE(EXCLUDED.quality_score, public.thoughts.quality_score), - sensitivity_tier = COALESCE(EXCLUDED.sensitivity_tier, public.thoughts.sensitivity_tier), - status = COALESCE(EXCLUDED.status, public.thoughts.status), + -- (a) Exact-fingerprint lookup first (the v1 ON CONFLICT key). + SELECT t.id, t.metadata + INTO v_id, v_existing_metadata + FROM public.thoughts t + WHERE t.content_fingerprint = v_fingerprint + FOR UPDATE; + + -- (a) Original-fingerprint fallback: land on the corrected row instead + -- of inserting a stale sibling that outvotes the correction. 
+ IF v_id IS NULL THEN + SELECT t.id, t.metadata + INTO v_id, v_existing_metadata + FROM public.thoughts t + WHERE jsonb_typeof(t.metadata->'original_fingerprints') = 'array' + AND t.metadata->'original_fingerprints' ? v_fingerprint + ORDER BY t.created_at ASC, t.id ASC + LIMIT 1 + FOR UPDATE; + END IF; + + IF v_id IS NULL THEN + -- INSERT path. Imports cannot mint malformed protections: drop + -- user_edits / original_fingerprints from the inserted metadata + -- unless they are well-formed (a round-tripped export keeps valid + -- stamps). + IF v_metadata ? 'user_edits' + AND jsonb_typeof(v_metadata->'user_edits') <> 'object' THEN + v_metadata := v_metadata - 'user_edits'; + END IF; + IF v_metadata ? 'original_fingerprints' + AND jsonb_typeof(v_metadata->'original_fingerprints') <> 'array' THEN + v_metadata := v_metadata - 'original_fingerprints'; + END IF; + + -- Race guard: the explicit lookup above is not atomic with this INSERT, + -- so a concurrent call with the same content_fingerprint can slip in + -- between. v1 got this for free from ON CONFLICT; here we catch the + -- unique_violation, re-read the row the other txn inserted, and fall + -- through to the UPDATE/merge path so the contract (always return an + -- existing-or-new {id, fingerprint}) holds. + BEGIN + INSERT INTO public.thoughts ( + content, + content_fingerprint, + metadata, + type, + source_type, + importance, + quality_score, + sensitivity_tier, + status, + status_updated_at + ) + VALUES ( + p_content, + v_fingerprint, + v_metadata, + v_type, + v_source_type, + v_importance, + v_quality_score, + v_sensitivity_tier, + v_status, + CASE WHEN v_status IS NULL THEN NULL ELSE now() END + ) + RETURNING id INTO v_id; + v_inserted := true; + EXCEPTION WHEN unique_violation THEN + -- Another transaction inserted this fingerprint first. Adopt its row + -- and continue into the merge branch below. 
+ SELECT t.id, t.metadata + INTO v_id, v_existing_metadata + FROM public.thoughts t + WHERE t.content_fingerprint = v_fingerprint + FOR UPDATE; + -- Restore the caller's incoming metadata for the merge step (the + -- INSERT-path malformed-protection stripping above does not apply on + -- the merge path, which has its own user-edit guard). + v_metadata := COALESCE(p_payload->'metadata', '{}'::jsonb); + END; + END IF; + + IF NOT v_inserted THEN + -- (b) User-edit guard: strip human-owned keys (and the system-managed + -- user_edits / original_fingerprints maps) from the incoming patch so + -- the merge can never resurrect stale values over a correction. + v_user_edits := COALESCE(v_existing_metadata->'user_edits', '{}'::jsonb); + IF jsonb_typeof(v_user_edits) <> 'object' THEN + v_user_edits := '{}'::jsonb; + END IF; + IF v_user_edits <> '{}'::jsonb THEN + SELECT COALESCE(array_agg(k), ARRAY[]::text[]) + INTO v_protected_keys + FROM jsonb_object_keys(v_user_edits) k; + v_metadata := v_metadata - v_protected_keys; + END IF; + v_metadata := v_metadata - 'user_edits'; + v_metadata := v_metadata - 'original_fingerprints'; + + -- Recompute status the same way the v1 ON CONFLICT branch did, but + -- against the existing row's values fetched above. 
+ UPDATE public.thoughts SET + updated_at = now(), + metadata = public.thoughts.metadata || v_metadata, + type = COALESCE(v_type, public.thoughts.type), + source_type = COALESCE(v_source_type, public.thoughts.source_type), + importance = COALESCE(v_importance, public.thoughts.importance), + quality_score = COALESCE(v_quality_score, public.thoughts.quality_score), + sensitivity_tier = COALESCE(v_sensitivity_tier, public.thoughts.sensitivity_tier), + status = COALESCE(v_status, public.thoughts.status), status_updated_at = CASE - WHEN EXCLUDED.status IS DISTINCT FROM public.thoughts.status THEN now() + WHEN COALESCE(v_status, public.thoughts.status) + IS DISTINCT FROM public.thoughts.status THEN now() ELSE public.thoughts.status_updated_at END - RETURNING id INTO v_id; + WHERE public.thoughts.id = v_id; + END IF; v_result := jsonb_build_object('id', v_id, 'fingerprint', v_fingerprint); RETURN v_result; @@ -441,5 +587,122 @@ $$ LANGUAGE plpgsql; GRANT EXECUTE ON FUNCTION public.upsert_thought(TEXT, JSONB) TO service_role; +-- ============================================================ +-- 7. SUPERSEDED-AWARE SEMANTIC SEARCH RPC (opt-in variant) +-- match_thoughts_superseded_aware — same shape as the core +-- match_thoughts from docs/01-getting-started.md plus a new +-- superseded_by UUID column. Thoughts that have been replaced +-- (the TARGET of a 'supersedes' edge in schemas/typed-reasoning-edges) +-- receive a 0.8x similarity penalty so fresh thoughts rank above +-- their stale predecessors. Superseded rows are NEVER excluded — +-- only ranked down — so agents can still read historical context. +-- +-- The core match_thoughts RPC is NOT replaced: callers opt into this +-- variant by name, mirroring the house pattern from +-- schemas/recency-boosted-match-thoughts. 
+-- +-- Supersession source of truth (verified against the repo, not the +-- ExoCortex inventory): public.thought_edges, relation = 'supersedes', +-- where from_thought_id is the newer replacement (A) and to_thought_id +-- is the older/stale thought (B) — see schemas/typed-reasoning-edges +-- relation vocabulary. superseded_by returns the newest superseder's id +-- (from_thought_id of the most recent 'supersedes' edge pointing at the +-- row, by edge created_at), or NULL. +-- +-- PREREQUISITE: schemas/typed-reasoning-edges must be applied (it +-- creates public.thought_edges). If that table is absent this function +-- is not created and a NOTICE is raised; the rest of this migration +-- still applies. Re-run after installing typed-reasoning-edges to add +-- it. +-- +-- PERFORMANCE (ported from the ExoCortex rerank-plan fix): the inner +-- query oversamples 3x using the vector index (its LIMIT is an +-- optimization fence, so the planner runs it with the same fast plan as +-- the core match_thoughts), then a LATERAL probe looks up supersession +-- per candidate row and the small window is re-ranked by penalized +-- similarity. This avoids the materialization regression that ordering +-- by a penalized expression over the full table would cause. +-- ============================================================ + +DO $$ +BEGIN + IF NOT EXISTS ( + SELECT 1 FROM information_schema.tables + WHERE table_schema = 'public' AND table_name = 'thought_edges' + ) THEN + RAISE NOTICE + 'enhanced-thoughts: skipping match_thoughts_superseded_aware — ' + 'public.thought_edges not found. 
Apply schemas/typed-reasoning-edges ' + 'first, then re-run this migration to install it.'; + RETURN; + END IF; + + EXECUTE $fn$ + CREATE OR REPLACE FUNCTION public.match_thoughts_superseded_aware( + query_embedding vector(1536), + match_threshold float DEFAULT 0.7, + match_count int DEFAULT 10, + filter jsonb DEFAULT '{}'::jsonb + ) + RETURNS TABLE ( + id uuid, + content text, + metadata jsonb, + similarity float, + created_at timestamptz, + superseded_by uuid + ) + LANGUAGE sql + STABLE + SET statement_timeout = '30s' + AS $body$ + SELECT + sub.id, + sub.content, + sub.metadata, + CASE + WHEN sl.newest_superseder IS NOT NULL THEN sub.base_similarity * 0.8 + ELSE sub.base_similarity + END AS similarity, + sub.created_at, + sl.newest_superseder AS superseded_by + FROM ( + -- Core single-phase NN query, oversampled 3x for re-ranking. The + -- LIMIT fences this subquery so the planner runs it with the same + -- fast plan the core match_thoughts uses. + SELECT + t.id, + t.content, + t.metadata, + t.created_at, + 1 - (t.embedding <=> query_embedding) AS base_similarity + FROM public.thoughts t + WHERE t.embedding IS NOT NULL + AND (filter = '{}'::jsonb OR t.metadata @> filter) + AND 1 - (t.embedding <=> query_embedding) >= match_threshold + ORDER BY t.embedding <=> query_embedding + LIMIT greatest(1, least(match_count, 200)) * 3 + ) sub + LEFT JOIN LATERAL ( + -- Newest superseder: the from_thought_id of the most recent + -- 'supersedes' edge pointing at this thought. Ordered by the edge's + -- created_at (UUID ids are not a recency signal), tie-broken by the + -- monotonic BIGSERIAL edge id so the result is deterministic. 
+ SELECT te.from_thought_id AS newest_superseder + FROM public.thought_edges te + WHERE te.relation = 'supersedes' + AND te.to_thought_id = sub.id + ORDER BY te.created_at DESC, te.id DESC + LIMIT 1 + ) sl ON true + ORDER BY similarity DESC, sub.base_similarity DESC + LIMIT greatest(1, least(match_count, 200)); + $body$; + $fn$; + + EXECUTE 'GRANT EXECUTE ON FUNCTION public.match_thoughts_superseded_aware(' + || 'vector(1536), float, int, jsonb) TO service_role'; +END $$; + -- Reload PostgREST schema cache NOTIFY pgrst, 'reload schema'; From 17db7fe6e8456487aa353df26771c6f46d416f53 Mon Sep 17 00:00:00 2001 From: Alan Shurafa Date: Sun, 14 Jun 2026 15:57:41 -0400 Subject: [PATCH 2/3] [schemas] enhanced-thoughts: preserve omitted fields on re-upsert A metadata-only re-upsert recomputed importance/quality_score/type/ source_type/sensitivity_tier to hardcoded defaults, so the merge-path COALESCE never fell through to the existing column and silently overwrote it -- including a sensitivity_tier restricted->standard privacy downgrade. Split the locals into insert-only defaults (new rows) and explicit-incoming values that are NULL when the payload omits the key, so an omitted field preserves the existing column while an explicit value still updates. Extend the user-edit guard to the promoted scalar columns so a human-owned field keeps column and metadata in agreement, strip both source/source_type aliases when either is guarded, and re-derive the task/idea status seed from the effective post-guard type so a rejected type can't seed a stale status. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- schemas/enhanced-thoughts/README.md | 3 +- schemas/enhanced-thoughts/schema.sql | 121 ++++++++++++++++++++++----- 2 files changed, 100 insertions(+), 24 deletions(-) diff --git a/schemas/enhanced-thoughts/README.md b/schemas/enhanced-thoughts/README.md index 8cd3f3de1..fa596b7fc 100644 --- a/schemas/enhanced-thoughts/README.md +++ b/schemas/enhanced-thoughts/README.md @@ -79,7 +79,8 @@ What changed for an existing install: - **`upsert_thought` gained two dedup/merge guards.** - *Original-fingerprint fallback*: when a thought's content is corrected, its fingerprint changes. If your update path appends the pre-edit fingerprint to an append-only array `metadata.original_fingerprints[]`, a later reimport of the original source text now lands on the corrected row as a dedup hit instead of inserting a stale sibling that "outvotes" the correction. Exact-fingerprint match still wins first. If you never write `original_fingerprints`, behavior is identical to v1 (an extra indexed lookup on miss, no semantic change). - - *User-edit guard*: keys listed in `metadata.user_edits` are treated as human-owned. On the merge path they (and the system-managed `user_edits` / `original_fingerprints` maps) are stripped from the incoming patch so an automated reimport cannot resurrect stale values over a human correction. This guards **metadata keys only** — the top-level structured columns (`type`, `importance`, etc.) keep v1's overwrite-on-merge behavior. If you never write `user_edits`, behavior is identical to v1. + - *User-edit guard*: keys listed in `metadata.user_edits` are treated as human-owned. On the merge path they (and the system-managed `user_edits` / `original_fingerprints` maps) are stripped from the incoming patch so an automated reimport cannot resurrect stale values over a human correction. 
The guard now also covers the **promoted scalar columns**: if a field (`type`, `importance`, `quality_score`, `source_type`, `sensitivity_tier`) is marked human-owned, the scalar column is preserved too, so the column and `metadata.<field>` stay in agreement instead of the column silently overwriting. If you never write `user_edits`, behavior is identical to v1. + - *Merge preserves omitted fields*: on a re-upsert that **omits** a structured field, the existing column is now kept rather than reset to a hardcoded default. A metadata-only re-upsert (new tags, a note) no longer rewrites `importance`/`quality_score`/`type`/`source_type`/`sensitivity_tier` — in particular it no longer silently downgrades `sensitivity_tier` from `restricted` to `standard`. An explicitly provided value still updates the column, and brand-new rows still get the documented insert defaults. - To make the fallback possible the function now does an explicit fingerprint lookup and branches INSERT vs UPDATE instead of using `ON CONFLICT`. The visible result is the same `{id, fingerprint}` payload. - **Importance stays on the 0-100 scale (deliberate deviation).** ExoCortex widened its own importance to a 0-6 scale. Open Brain's `upsert_thought` already accepts a wider 0-100 range, so it does not clip 0-6 values — adopting 0-6 here would retroactively rescale every existing row's importance, which is a breaking data change, not an additive one. The column default remains 3; payload values are clamped to 0-100. Treat 0-6 as a subset if you want cross-system parity. diff --git a/schemas/enhanced-thoughts/schema.sql b/schemas/enhanced-thoughts/schema.sql index de3f703d6..fc4d3b48a 100644 --- a/schemas/enhanced-thoughts/schema.sql +++ b/schemas/enhanced-thoughts/schema.sql @@ -427,32 +427,65 @@ DECLARE v_result JSONB; v_id UUID; v_metadata JSONB; + -- "explicit_*" vars are NULL when the payload OMITS the field, and the + -- caller-supplied value when it is present. 
They drive the merge path so an + -- omitted field PRESERVES the existing column instead of resetting it to a + -- hardcoded default (the v1.1 silent-overwrite / privacy-downgrade bug). + v_explicit_type TEXT; + v_explicit_source_type TEXT; + v_explicit_importance SMALLINT; + v_explicit_quality_score NUMERIC(5,2); + v_explicit_sensitivity_tier TEXT; + -- "insert_*" vars fold the INSERT-only defaults in. Used only on the INSERT + -- (new-row) path, never to overwrite an existing column on merge. + v_insert_type TEXT; + v_insert_source_type TEXT; + v_insert_importance SMALLINT; + v_insert_quality_score NUMERIC(5,2); + v_insert_sensitivity_tier TEXT; v_type TEXT; - v_source_type TEXT; - v_importance SMALLINT; - v_quality_score NUMERIC(5,2); - v_sensitivity_tier TEXT; v_status TEXT; + v_status_explicit BOOLEAN := false; v_existing_metadata JSONB; v_user_edits JSONB := '{}'::jsonb; v_protected_keys TEXT[] := ARRAY[]::text[]; v_inserted BOOLEAN := false; BEGIN v_metadata := COALESCE(p_payload->'metadata', '{}'::jsonb); - v_type := COALESCE(NULLIF(v_metadata->>'type', ''), 'observation'); - v_source_type := COALESCE(NULLIF(v_metadata->>'source_type', ''), NULLIF(v_metadata->>'source', ''), 'unknown'); - v_importance := CASE + + -- Explicit-incoming values: NULL when the key is absent/blank/unparseable, + -- so the merge path can distinguish "omitted" from "explicitly provided". 
+ v_explicit_type := NULLIF(v_metadata->>'type', ''); + v_explicit_source_type := COALESCE( + NULLIF(v_metadata->>'source_type', ''), + NULLIF(v_metadata->>'source', '') + ); + v_explicit_importance := CASE WHEN COALESCE(v_metadata->>'importance', '') ~ '^[0-9]+(\.[0-9]+)?$' THEN LEAST(100, GREATEST(0, ROUND((v_metadata->>'importance')::numeric)))::smallint - ELSE 50 + ELSE NULL END; - v_quality_score := CASE + v_explicit_quality_score := CASE WHEN COALESCE(v_metadata->>'quality_score', '') ~ '^[0-9]+(\.[0-9]+)?$' THEN LEAST(100, GREATEST(0, (v_metadata->>'quality_score')::numeric)) - ELSE 70 + ELSE NULL END; - v_sensitivity_tier := COALESCE(NULLIF(v_metadata->>'sensitivity_tier', ''), 'standard'); + v_explicit_sensitivity_tier := NULLIF(v_metadata->>'sensitivity_tier', ''); + + -- INSERT-path values: apply the v1.1 hardcoded defaults for brand-new rows. + v_insert_type := COALESCE(v_explicit_type, 'observation'); + v_insert_source_type := COALESCE(v_explicit_source_type, 'unknown'); + v_insert_importance := COALESCE(v_explicit_importance, 50); + v_insert_quality_score := COALESCE(v_explicit_quality_score, 70); + v_insert_sensitivity_tier := COALESCE(v_explicit_sensitivity_tier, 'standard'); + + -- v_type drives status seeding below; on insert it is the resolved type. + v_type := v_insert_type; v_status := COALESCE(NULLIF(p_payload->>'status', ''), NULLIF(v_metadata->>'status', '')); + v_status_explicit := v_status IS NOT NULL; + -- INSERT-path auto-seed: task/idea get status 'new'. On the merge path this + -- is re-derived from the EFFECTIVE (post-user-edit-guard) type so a guard- + -- rejected incoming type cannot seed a 'new' status it never gets to set. 
IF v_status IS NULL AND v_type IN ('task', 'idea') THEN v_status := 'new'; END IF; @@ -519,11 +552,11 @@ BEGIN p_content, v_fingerprint, v_metadata, - v_type, - v_source_type, - v_importance, - v_quality_score, - v_sensitivity_tier, + v_insert_type, + v_insert_source_type, + v_insert_importance, + v_insert_quality_score, + v_insert_sensitivity_tier, v_status, CASE WHEN v_status IS NULL THEN NULL ELSE now() END ) @@ -557,20 +590,62 @@ BEGIN INTO v_protected_keys FROM jsonb_object_keys(v_user_edits) k; v_metadata := v_metadata - v_protected_keys; + + -- Keep the promoted scalar column in sync with the metadata guard: if a + -- field is marked human-owned, drop the incoming scalar too so the + -- existing column is preserved (column and metadata stay in agreement + -- instead of the column being overwritten while metadata.<field> is kept). + IF v_user_edits ? 'type' THEN + v_explicit_type := NULL; + END IF; + IF v_user_edits ? 'source_type' OR v_user_edits ? 'source' THEN + v_explicit_source_type := NULL; + -- source / source_type are aliases for one scalar column. If either + -- is human-owned, strip BOTH metadata keys so the unprotected alias + -- can't merge in and diverge from the preserved column. + v_metadata := v_metadata - 'source_type' - 'source'; + END IF; + IF v_user_edits ? 'importance' THEN + v_explicit_importance := NULL; + END IF; + IF v_user_edits ? 'quality_score' THEN + v_explicit_quality_score := NULL; + END IF; + IF v_user_edits ? 'sensitivity_tier' THEN + v_explicit_sensitivity_tier := NULL; + END IF; END IF; v_metadata := v_metadata - 'user_edits'; v_metadata := v_metadata - 'original_fingerprints'; - -- Recompute status the same way the v1 ON CONFLICT branch did, but - -- against the existing row's values fetched above. + -- Re-derive the auto-seeded status from the EFFECTIVE type (the value the + -- merge will actually write: explicit-if-provided-and-not-guarded, else the + -- existing column). 
Without this, an incoming type that the user-edit guard + -- rejects could still leave v_status='new' seeded from it. An explicitly + -- supplied status is never overridden. + IF NOT v_status_explicit THEN + SELECT COALESCE(v_explicit_type, t.type) + INTO v_type + FROM public.thoughts t + WHERE t.id = v_id; + IF v_type IN ('task', 'idea') THEN + v_status := 'new'; + ELSE + v_status := NULL; + END IF; + END IF; + + -- Merge path: an OMITTED scalar field PRESERVES the existing column + -- (v_explicit_* is NULL when the payload did not provide it); an explicit + -- value still updates. Insert-only defaults never apply here. UPDATE public.thoughts SET updated_at = now(), metadata = public.thoughts.metadata || v_metadata, - type = COALESCE(v_type, public.thoughts.type), - source_type = COALESCE(v_source_type, public.thoughts.source_type), - importance = COALESCE(v_importance, public.thoughts.importance), - quality_score = COALESCE(v_quality_score, public.thoughts.quality_score), - sensitivity_tier = COALESCE(v_sensitivity_tier, public.thoughts.sensitivity_tier), + type = COALESCE(v_explicit_type, public.thoughts.type), + source_type = COALESCE(v_explicit_source_type, public.thoughts.source_type), + importance = COALESCE(v_explicit_importance, public.thoughts.importance), + quality_score = COALESCE(v_explicit_quality_score, public.thoughts.quality_score), + sensitivity_tier = COALESCE(v_explicit_sensitivity_tier, public.thoughts.sensitivity_tier), status = COALESCE(v_status, public.thoughts.status), status_updated_at = CASE WHEN COALESCE(v_status, public.thoughts.status) From c6f972b9907357e8ee85fe6e4a943e09782bfda0 Mon Sep 17 00:00:00 2001 From: Alan Shurafa Date: Sun, 14 Jun 2026 16:17:17 -0400 Subject: [PATCH 3/3] [schemas] enhanced-thoughts: honor metadata sensitivity, guard stale reimports Restricted rows leaked through search_thoughts_text when the tier lived only in metadata (column default 'standard'); restrict if either source says 'restricted', matching how 
provenance-chains reads the tier. Original-fingerprint dedup returns the corrected row, but p_content is the old text; surface matched_via so open-brain-rest skips overwriting the corrected row's embedding with a stale-text vector. Co-Authored-By: Claude Opus 4.8 (1M context) --- integrations/open-brain-rest/index.ts | 16 ++++++- schemas/enhanced-thoughts/README.md | 8 +++- schemas/enhanced-thoughts/schema.sql | 62 ++++++++++++++++++++++++--- 3 files changed, 75 insertions(+), 11 deletions(-) diff --git a/integrations/open-brain-rest/index.ts b/integrations/open-brain-rest/index.ts index 84281a2a0..b12e2e8f5 100644 --- a/integrations/open-brain-rest/index.ts +++ b/integrations/open-brain-rest/index.ts @@ -424,8 +424,17 @@ async function createThought(body: z.infer) { ? "new" : null; - const update = { - embedding, + // When upsert_thought resolved the row via the original-fingerprint fallback, + // `content`/`embedding` here were computed from the OLD pre-correction text + // (a reimport of the original source). The matched row is the CORRECTED one, + // so overwriting its embedding with the stale-text vector would silently + // desync it from its corrected content. Skip the embedding overwrite on that + // path — the RPC already merged metadata and left content untouched, so this + // becomes a pure dedup hit. + const matchedViaOriginalFingerprint = + upsert.data?.matched_via === "original_fingerprint"; + + const update: Record = { metadata, type, source_type: sourceType, @@ -435,6 +444,9 @@ async function createThought(body: z.infer) { status, status_updated_at: status ? 
new Date().toISOString() : null, }; + if (!matchedViaOriginalFingerprint) { + update.embedding = embedding; + } const { error } = await supabase.from("thoughts").update(update).eq("id", thoughtId); if (error) throw new Error(error.message); diff --git a/schemas/enhanced-thoughts/README.md b/schemas/enhanced-thoughts/README.md index fa596b7fc..5ae9ae13a 100644 --- a/schemas/enhanced-thoughts/README.md +++ b/schemas/enhanced-thoughts/README.md @@ -75,10 +75,10 @@ v1.1 brings the RPCs in line with how the reference Open Brain install runs them What changed for an existing install: -- **`search_thoughts_text` now reads three control keys out of `p_filter`.** `start_date` and `end_date` (ISO 8601 timestamps) filter `created_at` to that range; `exclude_restricted` (boolean) drops `sensitivity_tier = 'restricted'` rows. These keys are stripped from the metadata-containment predicate, so they no longer require a literal metadata key of the same name. Any other `p_filter` key keeps its original `metadata @> filter` behavior. If you were (accidentally) relying on a metadata key literally named `start_date`/`end_date`/`exclude_restricted`, it is now interpreted as a control key instead. +- **`search_thoughts_text` now reads three control keys out of `p_filter`.** `start_date` and `end_date` (ISO 8601 timestamps) filter `created_at` to that range; `exclude_restricted` (boolean) drops restricted rows. A row counts as restricted when **either** the promoted `sensitivity_tier` column **or** `metadata->>'sensitivity_tier'` is `'restricted'`, so rows captured before this schema — or by canonical flows that keep the tier only in `metadata` (the same place `schemas/provenance-chains` reads it) — do not leak through on the column's `'standard'` default. These keys are stripped from the metadata-containment predicate, so they no longer require a literal metadata key of the same name. Any other `p_filter` key keeps its original `metadata @> filter` behavior. 
If you were (accidentally) relying on a metadata key literally named `start_date`/`end_date`/`exclude_restricted`, it is now interpreted as a control key instead. - **`upsert_thought` gained two dedup/merge guards.** - - *Original-fingerprint fallback*: when a thought's content is corrected, its fingerprint changes. If your update path appends the pre-edit fingerprint to an append-only array `metadata.original_fingerprints[]`, a later reimport of the original source text now lands on the corrected row as a dedup hit instead of inserting a stale sibling that "outvotes" the correction. Exact-fingerprint match still wins first. If you never write `original_fingerprints`, behavior is identical to v1 (an extra indexed lookup on miss, no semantic change). + - *Original-fingerprint fallback*: when a thought's content is corrected, its fingerprint changes. If your update path appends the pre-edit fingerprint to an append-only array `metadata.original_fingerprints[]`, a later reimport of the original source text now lands on the corrected row as a dedup hit instead of inserting a stale sibling that "outvotes" the correction. Exact-fingerprint match still wins first. If you never write `original_fingerprints`, behavior is identical to v1 (an extra indexed lookup on miss, no semantic change). On this fallback path the incoming `p_content` is the **old, pre-correction** text, so the return payload now carries a `matched_via` field (`'inserted'`, `'fingerprint'`, or `'original_fingerprint'`) alongside the unchanged `{id, fingerprint}`. A caller that recomputes an embedding from `p_content` (such as `integrations/open-brain-rest`) checks for `matched_via = 'original_fingerprint'` and skips writing the stale-text embedding over the corrected row. The RPC itself never writes `content` or `embedding` on the merge path, so older callers that ignore `matched_via` keep their previous behavior. - *User-edit guard*: keys listed in `metadata.user_edits` are treated as human-owned. 
On the merge path they (and the system-managed `user_edits` / `original_fingerprints` maps) are stripped from the incoming patch so an automated reimport cannot resurrect stale values over a human correction. The guard now also covers the **promoted scalar columns**: if a field (`type`, `importance`, `quality_score`, `source_type`, `sensitivity_tier`) is marked human-owned, the scalar column is preserved too, so the column and `metadata.<field>` stay in agreement instead of the column being silently overwritten. If you never write `user_edits`, behavior is identical to v1. - *Merge preserves omitted fields*: on a re-upsert that **omits** a structured field, the existing column is now kept rather than reset to a hardcoded default. A metadata-only re-upsert (new tags, a note) no longer rewrites `importance`/`quality_score`/`type`/`source_type`/`sensitivity_tier` — in particular it no longer silently downgrades `sensitivity_tier` from `restricted` to `standard`. An explicitly provided value still updates the column, and brand-new rows still get the documented insert defaults. - To make the fallback possible the function now does an explicit fingerprint lookup and branches INSERT vs UPDATE instead of using `ON CONFLICT`. The visible result is the same `{id, fingerprint}` payload. @@ -109,3 +109,7 @@ Solution: Confirm your thoughts have content populated. Try a simple query first **Issue: brain_stats_aggregate returns empty types or topics** Solution: The function filters by `created_at`. Pass `p_since_days := 0` for all-time stats. Also confirm that your thoughts have the `type` column populated. If you use non-canonical type values in `metadata->>'type'` (anything outside `idea`, `task`, `person_note`, `reference`, `decision`, `lesson`, `meeting`, `journal`), call the backfill RPC with your own allowlist, e.g. `SELECT backfill_thought_types(ARRAY['idea','task','article','quote']);`, or `SELECT backfill_thought_types(NULL);` to accept whatever is present. 
+ +## More from Nate + +Open Brain is built in the open by Nate B. Jones — more practical systems like this on his [Substack](https://substack.com/@natesnewsletter) and at [natebjones.com](https://natebjones.com). diff --git a/schemas/enhanced-thoughts/schema.sql b/schemas/enhanced-thoughts/schema.sql index fc4d3b48a..cea624255 100644 --- a/schemas/enhanced-thoughts/schema.sql +++ b/schemas/enhanced-thoughts/schema.sql @@ -39,7 +39,15 @@ CREATE INDEX IF NOT EXISTS idx_thoughts_content_tsvector -- start_date / end_date — ISO 8601 timestamps; filter created_at -- to the [start_date, end_date] range. -- exclude_restricted — boolean; when true, drop rows whose --- sensitivity_tier is 'restricted'. +-- sensitivity_tier is 'restricted'. A row is +-- treated as restricted if EITHER the promoted +-- sensitivity_tier column OR +-- metadata->>'sensitivity_tier' is 'restricted', +-- so rows captured before this schema (or by +-- canonical flows that store the tier only in +-- metadata, as provenance-chains reads it) do not +-- leak through on the column default of +-- 'standard'. -- These keys are stripped from the containment predicate so they do -- not accidentally require a literal metadata key of the same name. 
-- All other p_filter keys keep their original `metadata @> filter` @@ -101,7 +109,8 @@ BEGIN AND to_tsvector('simple', coalesce(t.content, '')) @@ q.ts_query AND t.metadata @> v_meta_filter AND (NOT v_exclude_restricted - OR coalesce(t.sensitivity_tier, 'standard') <> 'restricted') + OR (coalesce(t.sensitivity_tier, 'standard') <> 'restricted' + AND coalesce(t.metadata->>'sensitivity_tier', 'standard') <> 'restricted')) AND (v_start_date IS NULL OR t.created_at >= v_start_date) AND (v_end_date IS NULL OR t.created_at <= v_end_date) LIMIT 2000 @@ -116,7 +125,8 @@ BEGIN AND t.content ILIKE '%' || q.raw_query || '%' AND t.metadata @> v_meta_filter AND (NOT v_exclude_restricted - OR coalesce(t.sensitivity_tier, 'standard') <> 'restricted') + OR (coalesce(t.sensitivity_tier, 'standard') <> 'restricted' + AND coalesce(t.metadata->>'sensitivity_tier', 'standard') <> 'restricted')) AND (v_start_date IS NULL OR t.created_at >= v_start_date) AND (v_end_date IS NULL OR t.created_at <= v_end_date) AND NOT EXISTS (SELECT 1 FROM tsvector_hits th WHERE th.hit_id = t.id) @@ -404,7 +414,12 @@ WHERE source_type IS NULL AND metadata->>'source' IS NOT NULL; -- metadata.original_fingerprints[]. This RPC now treats an incoming -- fingerprint that matches that array as a dedup hit on the -- corrected row (merge metadata; never insert; never touch content). --- Exact content_fingerprint match still wins over the fallback. +-- Exact content_fingerprint match still wins over the fallback. On this +-- fallback path p_content is the OLD pre-correction text, so the return +-- payload reports matched_via = 'original_fingerprint' to tell callers +-- that recompute an embedding/content from p_content (e.g. +-- integrations/open-brain-rest) NOT to overwrite the corrected row with +-- those stale values. Other paths report 'fingerprint' or 'inserted'. -- -- b) User-edit guard. Keys listed in metadata.user_edits are owned by -- the human. 
On the merge path they are stripped from the incoming @@ -450,6 +465,17 @@ DECLARE v_user_edits JSONB := '{}'::jsonb; v_protected_keys TEXT[] := ARRAY[]::text[]; v_inserted BOOLEAN := false; + -- How the row was resolved, surfaced in the return payload so callers that + -- recompute an embedding/content from p_content can tell when they are + -- looking at a CORRECTED row reached via the original-fingerprint fallback + -- (p_content is the OLD, pre-correction text) and skip overwriting the + -- corrected row's embedding/content with stale values. Values: + -- 'inserted' — brand-new row (p_content is authoritative) + -- 'fingerprint' — exact content_fingerprint dedup hit + -- 'original_fingerprint' — landed on a corrected row via the + -- metadata.original_fingerprints[] fallback; + -- p_content is STALE, do not overwrite content/embedding + v_matched_via TEXT := 'inserted'; BEGIN v_metadata := COALESCE(p_payload->'metadata', '{}'::jsonb); @@ -502,8 +528,17 @@ BEGIN WHERE t.content_fingerprint = v_fingerprint FOR UPDATE; + IF v_id IS NOT NULL THEN + v_matched_via := 'fingerprint'; + END IF; + -- (a) Original-fingerprint fallback: land on the corrected row instead - -- of inserting a stale sibling that outvotes the correction. + -- of inserting a stale sibling that outvotes the correction. p_content + -- here is the OLD pre-correction text, so this is a PURE dedup hit: the + -- row's content/embedding belong to the correction and must not be + -- overwritten with the stale incoming text. The merge below never touches + -- content, and the 'original_fingerprint' signal in the return payload + -- tells the caller to skip recomputing the embedding from p_content. 
IF v_id IS NULL THEN SELECT t.id, t.metadata INTO v_id, v_existing_metadata @@ -513,6 +548,9 @@ BEGIN ORDER BY t.created_at ASC, t.id ASC LIMIT 1 FOR UPDATE; + IF v_id IS NOT NULL THEN + v_matched_via := 'original_fingerprint'; + END IF; END IF; IF v_id IS NULL THEN @@ -564,7 +602,9 @@ BEGIN v_inserted := true; EXCEPTION WHEN unique_violation THEN -- Another transaction inserted this fingerprint first. Adopt its row - -- and continue into the merge branch below. + -- and continue into the merge branch below. This is an exact-fingerprint + -- dedup hit (same content), so the caller may still write content/embedding. + v_matched_via := 'fingerprint'; SELECT t.id, t.metadata INTO v_id, v_existing_metadata FROM public.thoughts t @@ -655,7 +695,15 @@ BEGIN WHERE public.thoughts.id = v_id; END IF; - v_result := jsonb_build_object('id', v_id, 'fingerprint', v_fingerprint); + -- {id, fingerprint} is the unchanged v1 contract; matched_via is additive + -- (existing callers that read only id/fingerprint are unaffected) and lets a + -- caller skip overwriting a corrected row's content/embedding with stale text + -- when the match came via the original-fingerprint fallback. + v_result := jsonb_build_object( + 'id', v_id, + 'fingerprint', v_fingerprint, + 'matched_via', v_matched_via + ); RETURN v_result; END; $$ LANGUAGE plpgsql;