From 6070264cc19d75283de6b6117de68feb12e1ee76 Mon Sep 17 00:00:00 2001 From: Dmitry Voropaev Date: Sun, 21 Jun 2026 02:14:50 +0300 Subject: [PATCH] feat: cross-file entity links + Tier-3 entity questions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the entities extractor so one `entity` artifact spans multiple files — the structural reason knowbase beats RAG — mirroring how the API extractor grounds a route on its handler + response_model across files. - entities.py: two-pass extract. Pass 1 classifies every class and indexes entities by short name; pass 2 resolves each entity's field-type references and SQLAlchemy relationship() targets against that index, adding `related_entity` grounding edges (cross-file when the target lives elsewhere) + a `related_entities` payload list. extractor_version 1 -> 2 (derived_from set changes, so ids rotate; gated per DESIGN §6). FK / transitive imports are documented gaps. - embed/text.py: enrich entity embed text (qualified name, field + related names) so entity questions rank in search_knowledge. - tier1_entities_test.py: add a cross-file Cart -> Order pair and assert the artifact is grounded on both files (role related_entity). - questions.py + tier3_rag_test.py: a two-file Order/LineItem entity fixture and 3 entity questions; Tier-3 indexes both extractors and asserts knowbase cross-file recall@5 == 1.0 for entity questions too (now 11 questions). 52 eval tests pass; ruff + mypy --strict clean. End-to-end on knowbase itself: 9/25 entities resolve links incl. a true cross-file one (ExtractContext -> ParsedSpan). --- CHANGELOG.md | 17 ++- DESIGN.md | 2 +- README.md | 6 +- src/kb/embed/text.py | 10 ++ src/kb/eval/questions.py | 43 ++++++- src/kb/eval/tier1_entities_test.py | 26 ++++ src/kb/eval/tier3_rag_test.py | 13 +- src/kb/extract/deterministic/entities.py | 156 +++++++++++++++++++---- 8 files changed, 229 insertions(+), 44 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c288541..975ac77 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,13 +12,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Deterministic entities extractor** (`kb.extract.deterministic.entities`): a fully static (tree-sitter) extractor that emits one `entity` artifact per domain class — pydantic `BaseModel`, `@dataclass`, and SQLAlchemy declarative model — with its fields, grounded on the class-definition - span. Detection signals and limits are recorded in the payload (transitive bases / imperative - SQLAlchemy mapping are documented gaps, not silent losses); `framework_versions` (pydantic / - sqlalchemy) is folded into the artifact key. Surfaced via MCP `get_knowledge`/`search_knowledge`. + span **and, across files, on the first-party entities it references** (resolved from field-type + annotations and SQLAlchemy `relationship()` targets; role `related_entity`). One `entity:Order` + artifact then spans every file it depends on — the cross-file shape RAG-over-chunks misses. + Detection signals and limits are recorded in the payload (transitive bases, imperative SQLAlchemy + mapping, and `ForeignKey("table.col")` resolution are documented gaps, not silent losses); + `framework_versions` (pydantic / sqlalchemy) is folded into the artifact key. Surfaced via MCP + `get_knowledge`/`search_knowledge` (entity embed text enriched with field + related-entity names). - **Tier-1 entities gate** (`kb.eval.tier1_entities_test`): a hand-labeled HARD gate — extracted entities + fields match the oracle, a bare declarative `Base` is not an entity, a `create_model(...)` - model is asserted as a known gap, and every entity is grounded on a `class` span. Brings the headline - HARD gates to **eight**. + model is asserted as a known gap, every entity is grounded on a `class` span, and a cross-file + reference (`Cart` → `Order`) is grounded on both files. Brings the headline HARD gates to **eight**. +- **Tier-3 entity questions** (`kb.eval.questions`): the knowledge-vs-RAG A/B now also covers domain + entities (a two-file `Order`/`LineItem` fixture), asserting knowbase cross-file recall@k == 1.0 for + entity questions as well as API-contract questions. ## [0.2.0] - 2026-06-02 diff --git a/DESIGN.md b/DESIGN.md index b33d72c..329b88d 100644 --- a/DESIGN.md +++ b/DESIGN.md @@ -305,7 +305,7 @@ freshness(current|stale@sha)`, with a deterministic tie-break for reproducible e | Module | Responsibility | Key tech | |--------|----------------|----------| | `kb.structural` | Parse Python without executing it; enumerate symbols/imports/call-sites with per-SHA byte/line ranges; compute content-addressed span identity; incremental reparse. Hidden behind a `StructuralIndex`/`PathEngine` interface so a SCIP backend can replace tree-sitter later. | tree-sitter + tree-sitter-python (canonical bindings) | -| `kb.extract.deterministic` | No-LLM extractors → exact artifacts (confidence=1.0): import graph; FastAPI API contract (static, cross-file grounded); domain entities (pydantic/dataclass/SQLAlchemy, static, hand-labeled gate); griffe library surface (planned). | grimp, tree-sitter queries, griffe (static) | +| `kb.extract.deterministic` | No-LLM extractors → exact artifacts (confidence=1.0): import graph; FastAPI API contract (static, cross-file grounded); domain entities (pydantic/dataclass/SQLAlchemy, static, cross-file links to referenced entities, hand-labeled gate); griffe library surface (planned). | grimp, tree-sitter queries, griffe (static) | | `kb.introspect` | Eval-only runtime oracle: runs a FastAPI app in a network-blocked sandbox and emits `app.openapi()` for the Tier-1 API gate. Never on the index path. | subprocess sandbox, fastapi | | `kb.embed` | Replaceable embedding adapters + snapshot population for `search_knowledge`. Torch isolated behind the `embed` extra and a lazy import. | sentence-transformers (default), OpenAI (optional), pgvector | | `kb.rag` | Frozen pgvector RAG-over-source baseline — the "other arm" of the knowledge-vs-RAG A/B (no provenance/grounding). | deterministic line-window chunker, pgvector | diff --git a/README.md b/README.md index cad81f7..f38359f 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ flowchart LR **v0.2 — spine + the first knowledge extractors, MCP serving, and the knowledge-vs-RAG gate.** Everything here grounds what it claims, and nothing it cannot: - **Provenance spine** — content-addressed `span_id` (LOCKED); tree-sitter spans with a normalized S-expression fingerprint and per-SHA location; a single-Postgres, Alembic-managed store with content-addressed idempotent writes; the ≥ 1 `derived_from` anti-hallucination invariant enforced in-app *and* by a deferred DB trigger; pygit2 git ingest (no checkout) with a diff-based invalidation seed. -- **Deterministic extractors** — the **import / dependency graph** (grimp resolves the edge, tree-sitter grounds it on the exact import statement, with an honest `approximate` fallback for re-exports / relative / unmappable imports — never a silent loss); the **FastAPI API-contract** extractor, which grounds a single route **across files** (handler in `routes.py` + `response_model` class in `schemas.py`); and the **domain-entity** extractor (pydantic / dataclass / SQLAlchemy classes and their fields, grounded on the class definition — purely static, with documented detection limits). +- **Deterministic extractors** — the **import / dependency graph** (grimp resolves the edge, tree-sitter grounds it on the exact import statement, with an honest `approximate` fallback for re-exports / relative / unmappable imports — never a silent loss); the **FastAPI API-contract** extractor, which grounds a single route **across files** (handler in `routes.py` + `response_model` class in `schemas.py`); and the **domain-entity** extractor (pydantic / dataclass / SQLAlchemy classes and their fields, grounded on the class definition **and cross-file on the entities they reference** — purely static, with documented detection limits). - **`kb introspect`** — a sandboxed, network-blocked `app.openapi()` oracle, eval-only and never on the index path, that the API gate scores the static contract against. - **Read-only MCP server** — `find_provenance`, `get_knowledge`, and `search_knowledge`, each returning provenance-carrying units (method + confidence + freshness). - **pgvector embeddings + semantic search** — a replaceable embedding provider (sentence-transformers by default, OpenAI optional) populated by a separate `kb embed` pass; torch stays out of the index path. @@ -173,7 +173,7 @@ A Python package `kb` (uv, src-layout). Modules and their responsibilities: | `kb.git` | pygit2 ingest — reads blobs at a SHA (no checkout) — plus the diff-based invalidation seed. | | `kb.extract.deterministic.imports` | Deterministic import / dependency edges: tree-sitter spans grounded by line, grimp edge resolution. | | `kb.extract.deterministic.fastapi_contract` | Static FastAPI API-contract extractor; grounds a route across files (handler + `response_model` class), never imports user code. | -| `kb.extract.deterministic.entities` | Static domain-entity extractor — pydantic / dataclass / SQLAlchemy classes + their fields, grounded on the class definition; detection signals and limits recorded in the payload. | +| `kb.extract.deterministic.entities` | Static domain-entity extractor — pydantic / dataclass / SQLAlchemy classes + their fields, grounded on the class definition **and, across files, on the entities they reference** (field types + `relationship()`); detection signals and limits recorded in the payload. | | `kb.introspect` | Sandboxed, network-blocked `app.openapi()` oracle — eval-only ground truth for the API gate, never on the index path. | | `kb.mcp` | Read-only MCP server and its provenance-carrying records: `find_provenance`, `get_knowledge`, `search_knowledge`. | | `kb.embed` | Replaceable embedding adapters (sentence-transformers default, OpenAI optional) + snapshot population. Torch isolated behind the `embed` extra and a lazy import. | @@ -199,7 +199,7 @@ CI (GitHub Actions, workflow **"CI"**, `.github/workflows/ci.yml`) runs ruff, `m 3. **Tier-1 import oracle** — extracted import edges match a hand-labeled oracle, grounded on the actual import statement span; a dynamic import is asserted as a *known* gap, not a silent loss. 4. **Tier-1 API oracle** — the statically-extracted FastAPI contract equals the app's own `openapi()` (from the sandboxed introspect oracle), and the route's cross-file grounding (handler + `response_model`) is asserted. 5. **Tier-1 entities oracle** — extracted pydantic / dataclass / SQLAlchemy entities + their fields match a hand-labeled oracle, each grounded on its class span; a bare declarative `Base` is correctly *not* an entity and a `create_model(...)` model is asserted as a *known* gap. -6. **Tier-3 knowledge-vs-RAG recall** — knowbase cross-file recall@k == 1.0 for every contract question (a *structural* floor: one artifact already spans both files, so it holds regardless of embedding quality); the RAG arm is reported but **never asserted**, so a model bump can't redden CI. +6. **Tier-3 knowledge-vs-RAG recall** — knowbase cross-file recall@k == 1.0 for every cross-file question (API contracts **and** domain entities: in each case one artifact already spans both files, so the floor is *structural*, independent of embedding quality); the RAG arm is reported but **never asserted**, so a model bump can't redden CI. 7. **Tier-4 one-hop invalidation** — a content diff invalidates *exactly* the artifacts whose grounding span changed (set-equality: no over-invalidation, no stale survivors); a version bump invalidates everything. 8. **Invariants** — zero orphans (every snapshot artifact is grounded), and re-indexing the same SHA yields the identical set of artifact ids. diff --git a/src/kb/embed/text.py b/src/kb/embed/text.py index a2fc52c..d64670c 100644 --- a/src/kb/embed/text.py +++ b/src/kb/embed/text.py @@ -25,4 +25,14 @@ def embed_text(kind: str, payload: dict[str, Any]) -> str: return " ".join(p for p in parts if p.strip()) if kind == "import_edge": return f"{head} import {payload.get('importer', '')} {payload.get('imported', '')}" + if kind == "entity": + parts = [ + head, + f"entity {payload.get('qualified_name', '')}", + f"framework {payload.get('framework', '')}", + "fields " + " ".join(str(f.get("name", "")) for f in payload.get("fields", [])), + "related " + + " ".join(str(r.get("name", "")) for r in payload.get("related_entities", [])), + ] + return " ".join(p for p in parts if p.strip()) return head diff --git a/src/kb/eval/questions.py b/src/kb/eval/questions.py index 995e640..532414b 100644 --- a/src/kb/eval/questions.py +++ b/src/kb/eval/questions.py @@ -1,8 +1,12 @@ -"""Cross-file-contract questions for the knowledge-vs-RAG comparison (DESIGN.md §9, §10). +"""Cross-file questions for the knowledge-vs-RAG comparison (DESIGN.md §9, §10). -Every expected answer spans `src/app/routes.py` (the route/handler) AND `src/app/schemas.py` (the -pydantic model) of the Tier-1 FastAPI fixture — the case RAG-over-chunks fumbles. Reused by the -deterministic gate (PR-3a) and the nightly LLM A/B (PR-3b). +Two families, both spanning two files — the case RAG-over-chunks fumbles while a single grounded +knowbase artifact already covers both: + * **API contracts** — `src/app/routes.py` (route/handler) + `src/app/schemas.py` (response model), + from the Tier-1 FastAPI fixture (`FILES`). + * **Domain entities** — `src/app/domain/order.py` (the `Order` entity) + + `src/app/domain/line_item.py` (the `LineItem` it references), from `ENTITY_FILES` below. +Reused by the deterministic Tier-3 gate (PR-3a) and the nightly LLM A/B (PR-3b). """ from __future__ import annotations @@ -13,6 +17,30 @@ SCHEMAS = "src/app/schemas.py" CROSS_FILE = frozenset({ROUTES, SCHEMAS}) +ORDER_ENTITY = "src/app/domain/order.py" +LINE_ITEM_ENTITY = "src/app/domain/line_item.py" +ENTITY_CROSS_FILE = frozenset({ORDER_ENTITY, LINE_ITEM_ENTITY}) + +# A two-file entity fixture: Order references LineItem across files (the cross-file link). +ENTITY_FILES = { + "src/app/domain/__init__.py": "", + "src/app/domain/line_item.py": ( + "from dataclasses import dataclass\n\n\n" + "@dataclass\n" + "class LineItem:\n" + " sku: str\n" + " qty: int = 1\n" + ), + "src/app/domain/order.py": ( + "from dataclasses import dataclass\n" + "from app.domain.line_item import LineItem\n\n\n" + "@dataclass\n" + "class Order:\n" + " id: int\n" + " items: list[LineItem]\n" + ), +} + @dataclass(frozen=True) class Question: @@ -39,4 +67,11 @@ class Question: CROSS_FILE, frozenset({"api:GET /api/orders"})), Question("q8", "Which endpoint returns OrderOut and where is that model defined?", CROSS_FILE, frozenset({"api:GET /api/orders"})), + # Domain-entity questions — answered by the cross-file-grounded `entity:...Order` artifact. + Question("e1", "What does the Order entity contain, including its line items?", + ENTITY_CROSS_FILE, frozenset({"entity:app.domain.order.Order"})), + Question("e2", "What fields does the Order domain model have and what type are its items?", + ENTITY_CROSS_FILE, frozenset({"entity:app.domain.order.Order"})), + Question("e3", "Which model does the Order entity's items field reference, and where is it?", + ENTITY_CROSS_FILE, frozenset({"entity:app.domain.order.Order"})), ] diff --git a/src/kb/eval/tier1_entities_test.py b/src/kb/eval/tier1_entities_test.py index fe6ecbb..cab664a 100644 --- a/src/kb/eval/tier1_entities_test.py +++ b/src/kb/eval/tier1_entities_test.py @@ -16,6 +16,7 @@ from kb.eval._fixtures import make_git_repo from kb.extract.deterministic.entities import EntityExtractor from kb.store import models as m +from kb.store.queries import provenance_for_artifact # A src-layout module: a pydantic model, a dataclass, a SQLAlchemy model (plus a bare declarative # Base that is NOT an entity), and a dynamically-built model (invisible to static parsing). @@ -48,6 +49,15 @@ "\n\n" 'Dynamic = create_model("Dynamic", x=(int, ...))\n' ), + # A second module whose entity references one in shop/models.py (the cross-file link). + "src/shop/cart.py": ( + "from dataclasses import dataclass\n" + "from shop.models import Order\n" + "\n\n" + "@dataclass\n" + "class Cart:\n" + " orders: list[Order]\n" + ), } # Hand-labeled oracle: (framework, fq class). `Base` and `Dynamic` are deliberately absent. @@ -55,11 +65,13 @@ ("pydantic", "shop.models.Order"), ("dataclass", "shop.models.LineItem"), ("sqlalchemy", "shop.models.User"), + ("dataclass", "shop.cart.Cart"), } EXPECTED_FIELDS = { "shop.models.Order": {"id", "total", "note"}, "shop.models.LineItem": {"sku", "qty"}, "shop.models.User": {"id", "name", "legacy"}, # __tablename__ is metadata, not a field + "shop.cart.Cart": {"orders"}, } KNOWN_GAP = "shop.models.Dynamic" # create_model(): dynamic, invisible to static analysis @@ -132,3 +144,17 @@ def test_entities_grounded_on_class_spans(engine: Engine, tmp_path: Path) -> Non for row in rows: assert row.span_kind == "class" assert row.payload["span_mapping"] == "exact" + + +def test_cross_file_entity_links_grounded(engine: Engine, tmp_path: Path) -> None: + """`Cart` (cart.py) references `Order` (models.py) -> the artifact spans BOTH files.""" + sha = _index(engine, tmp_path) + with engine.connect() as conn: + prov = provenance_for_artifact(conn, sha, "entity:shop.cart.Cart") + by_role = {(p.file_path, p.role) for p in prov} + assert ("src/shop/cart.py", "class_definition") in by_role + assert ("src/shop/models.py", "related_entity") in by_role # cross-file grounding + + cart = next(p for p in _entity_payloads(engine, sha) if p["qualified_name"] == "shop.cart.Cart") + related = {(r["name"], r["target_fq"], r["via"]) for r in cart["related_entities"]} + assert ("Order", "shop.models.Order", "field_type") in related diff --git a/src/kb/eval/tier3_rag_test.py b/src/kb/eval/tier3_rag_test.py index 4cca382..db4fe88 100644 --- a/src/kb/eval/tier3_rag_test.py +++ b/src/kb/eval/tier3_rag_test.py @@ -15,8 +15,9 @@ from kb.daemon.pipeline import index_commit from kb.embed.population import embed_snapshot from kb.eval._fixtures import make_git_repo -from kb.eval.questions import QUESTIONS +from kb.eval.questions import ENTITY_FILES, QUESTIONS from kb.eval.tier1_api_test import FILES +from kb.extract.deterministic.entities import EntityExtractor from kb.extract.deterministic.fastapi_contract import FastAPIExtractor from kb.rag.baseline import index_rag_baseline, rag_retrieve from kb.store import queries as q @@ -28,8 +29,14 @@ @pytest.fixture(scope="module") def prepared(engine: Engine, tmp_path_factory, st_provider) -> tuple[Engine, str]: repo = tmp_path_factory.mktemp("tier3") - sha = make_git_repo(repo, [FILES])[0] - index_commit(engine, str(repo), sha, extractors=[FastAPIExtractor()], first_party_root="src") + sha = make_git_repo(repo, [{**FILES, **ENTITY_FILES}])[0] + index_commit( + engine, + str(repo), + sha, + extractors=[FastAPIExtractor(), EntityExtractor()], + first_party_root="src", + ) embed_snapshot(engine, sha, st_provider) index_rag_baseline(engine, str(repo), sha, st_provider) return engine, sha diff --git a/src/kb/extract/deterministic/entities.py b/src/kb/extract/deterministic/entities.py index 9c10f9f..082fbed 100644 --- a/src/kb/extract/deterministic/entities.py +++ b/src/kb/extract/deterministic/entities.py @@ -1,8 +1,13 @@ """Deterministic domain-entity extractor — pydantic / dataclass / SQLAlchemy (DESIGN.md §4, §14). Produces one ``entity`` artifact per domain class, grounded on that class's span (role -``class_definition``). Fully static: re-parses each class span's source with tree-sitter (the same -discipline as the FastAPI contract extractor); it never imports or executes user code. +``class_definition``) AND — across files — on the class spans of the **first-party entities it +references** (role ``related_entity``), resolved from field-type annotations and SQLAlchemy +``relationship(...)`` targets. That cross-file link is what RAG-over-chunks misses (mirrors how the +FastAPI extractor grounds a route on its handler + ``response_model``). + +Fully static: re-parses each class span's source with tree-sitter (same discipline as the FastAPI +contract extractor); it never imports or executes user code. Detection is best-effort and the signals are recorded in the payload (never a silent guess): * **dataclass** — a decorator whose dotted name ends in ``dataclass``. @@ -16,6 +21,7 @@ from __future__ import annotations +import re import textwrap import tomllib from collections.abc import Sequence @@ -30,13 +36,14 @@ from kb.structural.interface import ParsedSpan EXTRACTOR_ID = "entities" -EXTRACTOR_VERSION = "1" +EXTRACTOR_VERSION = "2" # v2: cross-file related_entity grounding (was v1: class span only) _LANGUAGE = Language(tsp.language()) _PYDANTIC_BASES = frozenset({"BaseModel", "BaseSettings"}) _SA_COLUMN_CALLS = frozenset({"Column", "mapped_column"}) _OPTIONAL_MARKERS = ("Optional[", "| None", "None |") _VERSIONED = ("pydantic", "sqlalchemy") +_IDENT = re.compile(r"[A-Za-z_][A-Za-z0-9_]*") @dataclass(frozen=True) @@ -56,19 +63,22 @@ def __init__(self) -> None: def extract(self, ctx: ExtractContext) -> list[ExtractedArtifact]: versions = _framework_versions(ctx, _VERSIONED) - artifacts: list[ExtractedArtifact] = [] + # Pass 1: parse every class into an entity record; index by short name (cross-file lookup). + parsed: list[_ParsedClass] = [] for module, spans in ctx.spans_by_module.items(): for span in spans: if span.span_kind != "class": continue - art = self._build_artifact(module, span, versions) - if art is not None: - artifacts.append(art) - return artifacts - - def _build_artifact( - self, module: str, span: ParsedSpan, versions: dict[str, str] - ) -> ExtractedArtifact | None: + pc = self._parse_class(module, span) + if pc is not None: + parsed.append(pc) + index: dict[str, list[_ParsedClass]] = {} + for pc in parsed: + index.setdefault(_basename(pc.span.fq_symbol_path), []).append(pc) + # Pass 2: build artifacts, grounding each entity on the spans of the entities it refers to. + return [self._build_artifact(pc, index, versions) for pc in parsed] + + def _parse_class(self, module: str, span: ParsedSpan) -> _ParsedClass | None: root = self._parser.parse(textwrap.dedent(span.raw_text).encode("utf-8")).root_node deco = _first_child_of_type(root, "decorated_definition") cls = ( @@ -78,23 +88,45 @@ def _build_artifact( ) if cls is None: return None - decorators = _decorator_names(deco) if deco is not None else [] bases = _base_names(cls) body = cls.child_by_field_name("body") tablename, raw_fields, relationships = _parse_body(body) - framework, signals, limitations = _classify(decorators, bases, tablename, raw_fields) if framework is None: return None + return _ParsedClass( + module=module, + span=span, + framework=framework, + fields=_select_fields(framework, raw_fields), + relationships=relationships, + tablename=tablename, + bases=bases, + signals=signals, + limitations=limitations, + ) + + def _build_artifact( + self, + pc: _ParsedClass, + index: dict[str, list[_ParsedClass]], + versions: dict[str, str], + ) -> ExtractedArtifact: + grounding: dict[bytes, DerivedEdge] = { + pc.span.span_id: DerivedEdge(pc.span.span_id, "class_definition") + } + related = _resolve_related(pc, index) + for r in related: + sid: bytes = r["span_id"] + grounding.setdefault(sid, DerivedEdge(sid, "related_entity")) - fields = _select_fields(framework, raw_fields) payload: dict[str, Any] = { - "framework": framework, - "class_name": span.fq_symbol_path.rsplit(".", 1)[-1], - "qualified_name": span.fq_symbol_path, - "module": module, - "bases": bases, + "framework": pc.framework, + "class_name": _basename(pc.span.fq_symbol_path), + "qualified_name": pc.span.fq_symbol_path, + "module": pc.module, + "bases": pc.bases, "fields": [ { "name": f.name, @@ -103,22 +135,27 @@ def _build_artifact( "required": f.required, "source": f.source, } - for f in fields + for f in pc.fields ], - "tablename": tablename, - "relationships": relationships, - "detection_signals": signals, + "tablename": pc.tablename, + "relationships": pc.relationships, + "related_entities": [ + {k: v for k, v in r.items() if k != "span_id"} for r in related + ], + "detection_signals": pc.signals, "span_mapping": "exact", - "limitations": limitations, + "limitations": pc.limitations, } framework_versions = ( - {} if framework == "dataclass" else {framework: versions.get(framework, "unknown")} + {} + if pc.framework == "dataclass" + else {pc.framework: versions.get(pc.framework, "unknown")} ) return ExtractedArtifact( kind="entity", - logical_key=f"entity:{span.fq_symbol_path}", + logical_key=f"entity:{pc.span.fq_symbol_path}", payload=payload, - derived_from=[DerivedEdge(span.span_id, "class_definition")], + derived_from=list(grounding.values()), extractor_id=self.extractor_id, extractor_version=self.extractor_version, framework_versions=framework_versions, @@ -137,6 +174,69 @@ class _Field: source: str # "annotated" | "column" +@dataclass +class _ParsedClass: + """A classified entity from pass 1, carrying everything pass 2 needs to ground cross-file.""" + + module: str + span: ParsedSpan + framework: str + fields: list[_Field] + relationships: list[dict[str, str | None]] + tablename: str | None + bases: list[str] + signals: list[str] + limitations: list[str] + + +def _resolve_related( + pc: _ParsedClass, index: dict[str, list[_ParsedClass]] +) -> list[dict[str, Any]]: + """Resolve ``pc``'s field-type and ``relationship()`` references to first-party entity classes. + + Each resolved target becomes a ``related_entity`` grounding edge (cross-file when it lives in + another module). Self-references and non-entity types are dropped; the raw ``relationships`` + stay in the payload so unresolved/external targets remain visible. + """ + refs: list[tuple[str, str]] = [] # (short name, via) + for field in pc.fields: + if field.annotation: + for token in _IDENT.findall(field.annotation): + if token in index: + refs.append((token, "field_type")) + for rel in pc.relationships: + target = rel.get("target") + if target: + name = _basename(target.strip("\"'")) + if name in index: + refs.append((name, "relationship")) + + out: list[dict[str, Any]] = [] + seen: set[str] = set() + for name, via in refs: + matches = [t for t in index[name] if t.span.span_id != pc.span.span_id] + ambiguous = len({t.span.fq_symbol_path for t in matches}) > 1 + for tgt in matches: + if tgt.span.fq_symbol_path in seen: + continue + seen.add(tgt.span.fq_symbol_path) + out.append( + { + "name": name, + "via": via, + "target_fq": tgt.span.fq_symbol_path, + "target_module": tgt.module, + "ambiguous": ambiguous, + "span_id": tgt.span.span_id, + } + ) + return out + + +def _basename(fq: str) -> str: + return fq.rsplit(".", 1)[-1] + + def _select_fields(framework: str, raw: Sequence[_RawField]) -> list[_Field]: out: list[_Field] = [] for rf in raw: