diff --git a/.gitignore b/.gitignore index a43f97d..96e22f2 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ bench/real/ .gitnexus/ # internal program/audit working docs — provenance only, never shipped in the release +/docs/design/C4_IMPORT_BINDING_REPORT.md /RESEARCH_REPORT_DORIAN_0_11_0.md /V1_IMPLEMENTATION_TRACKER.md /V1_ALIGNMENT_REPORT.md diff --git a/CHANGELOG.md b/CHANGELOG.md index ecaed77..d56611a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,33 @@ All notable changes to dorian (`dorian-vwp`) are recorded here. Full per-release [`docs/releases/`](docs/releases/). The warrant format, checker grammar, exit codes, and trust semantics have been stable since 1.0.0. +## [Unreleased] + +C4 import-aware dependency binding. **No breaking changes** (a re-check *trigger* widening only; +warrant schema, checker grammar, exit codes, fold policy, and security posture are unchanged). + +### Added +- **C4 import-aware binding** (`src/dorian/test_deps.py`). A `pytest:` checker proves behavior *when + it runs*, but its sealed watch was only the nodeid's test file — so an edit to the implementation the + test imports could be silently skipped at revalidation even though an adequate behavior checker + existed (a re-check *trigger* gap, not a truth gap). `dorian verify` and `dorian rebind` now + statically parse the test file (stdlib `ast`, read-only — **no import execution, no `sys.path` + mutation, no package introspection, no network**) and add the tracked repo-local `.py` files it + imports to the claim's watch and auto-captured read-set. A source edit then re-runs the existing C4 + checker; **the checker still decides truth** (a file change never marks a claim `BROKEN` by itself). + Conservative: an import resolving to zero or to more than one tracked file is skipped, not guessed. 
+- **`dorian bench c4-import-binding`** — a deterministic, known-truth synthetic suite for the above: + the pre-fix test-file-only watcher selects 0% of implementation-only edits, the import-aware watcher + 100% of direct-import ones, with zero false `BROKEN` from a behavior-preserving edit. +- **`dorian bind-suggest`** now reports a third provenance, `bind_test_deps` / `bind (test-dep)`, for + the implementation files a claim's C4 test imports (content-free; paths only). + +### Changed +- The `bindings` / `--binding-gate` `trigger-only-symbol` diagnostic now treats a C4 test's + import-derived watches as **checker-exercised** (the test imports and runs them), so widening a + behavior claim's watch never spuriously flags it — and `--binding-gate=fail` does not start refusing + good C4 behavior claims. + ## [1.1.1] — 2026-06-19 Golden-path polish. **No breaking changes** (a scaffold default only; verification, warrant format, diff --git a/README.md b/README.md index 22df131..b9ea95b 100644 --- a/README.md +++ b/README.md @@ -234,6 +234,18 @@ closes a silent-skip gap, but it is the honest half of the story: **binding wide re-checked; the checker still decides whether it's true.** A watched file changing never makes a claim `BROKEN` by itself. +The same trigger-coverage idea extends to behavior claims backed by a `pytest:` test. A C4 test proves +behavior *when it runs*, but its sealed watch used to be only the test file — so an edit to the +implementation the test imports could be silently skipped. `dorian` now statically parses the test +file (stdlib `ast`, read-only — no import execution, no `sys.path` mutation) and also watches the +repo-local files it imports, so a source edit re-runs the existing test even when the claim text names +no uniquely indexed symbol. It is the same honest split: **the test still decides truth; an imported +file changing only triggers the re-check.** Ambiguity is skipped, not guessed, and it is **not** a +sandbox. 
The `dorian bench c4-import-binding` suite measures it: the pre-fix test-file-only watcher +selects **0%** of implementation-only edits, the import-aware watcher **100%** of direct-import ones, +with zero false `BROKEN` from a behavior-preserving edit (the verdict tracks the test, not the file +change). + The binding-lifecycle benchmark measures exactly that split over **808 (artifact, mutation) pairs** across 63 invented domains, with two mechanically-frozen labels per edit — *should re-check* and *should alarm*: @@ -442,15 +454,16 @@ claims. `behavior` claim backed only by an existence checker, a vacuous pytest node). Informational, never a gate; output carries file paths only, never matched content. - `dorian bind-suggest --claims claims.json` — read-only preview of the files `verify` would auto-bind - for each claim, **with provenance** (symbol-definer vs config-key), the ambiguous symbols/keys it - would skip, and any unparseable config file. Writes nothing, never a gate. + for each claim, **with provenance** (symbol-definer, config-key, and C4 test-import dependency), the + ambiguous symbols/keys it would skip, and any unparseable config file. Writes nothing, never a gate. - `dorian revalidate --checker-source base` (also Action `checker_trust: base`; default `head`) — resolve each claim's checker spec from the `--since` base ref so a PR-added or PR-modified executable checker is never executed (public/fork PRs). Fail-closed, **not a sandbox** — pair with `--deny-exec`. -- `dorian rebind ` — re-derive a warrant's symbol-definer watches with the current binding - logic and re-seal it (born-verifiable, superseding the old id), so a warrant sealed before the symbol - index existed gains the wider watches. The watch only ever widens; a claim that has since become false - refuses the re-seal (exit 4) rather than being laundered into a fresh trusted state. 
+- `dorian rebind ` — re-derive a warrant's symbol-definer **and C4 test-import** watches with + the current binding logic and re-seal it (born-verifiable, superseding the old id), so a warrant sealed + before the symbol index or C4 import binding existed gains the wider watches. The watch only ever + widens; a claim that has since become false refuses the re-seal (exit 4) rather than being laundered + into a fresh trusted state. - `dorian suggest-data-checks [--columns ...] [--out f]` — born-verifiable C5 checker suggestions from a data file's current state, for review and pasting into a claim's `checkers` list. - `dorian suggest-claims [--out f]` — born-verifiable C3 claim suggestions (`symbol:` for @@ -514,10 +527,13 @@ work perishable, so you find out when it expired. daily and recording more of the breaks it catches that would otherwise have shipped. - **The binding gap, narrowed and measured** — a symbol→defining-file index now re-checks a claim when its symbol's definer changes, closing the silent-skip *trigger* gap - ([`docs/BENCHMARK_BINDING_LIFECYCLE.md`](docs/BENCHMARK_BINDING_LIFECYCLE.md)). What remains is the - honest ceiling: a definer change triggers a re-check, but only a behavior checker proves a behavior - change (the gutted-body case), and ambiguous or non-Python symbols are still left for explicit - binding ([`docs/NEXT_ALGORITHMIC_BETS.md`](docs/NEXT_ALGORITHMIC_BETS.md)). + ([`docs/BENCHMARK_BINDING_LIFECYCLE.md`](docs/BENCHMARK_BINDING_LIFECYCLE.md)). C4 behavior claims + get the same treatment: `dorian` statically resolves the repo-local files a `pytest:` test imports + and watches them too, so an implementation edit re-runs the test even when the claim text names no + symbol (`dorian bench c4-import-binding`). 
What remains is the honest ceiling: a trigger fires the + re-check, but only the behavior checker proves a behavior change (the gutted-body case), and + ambiguous or non-Python imports are still left for explicit binding + ([`docs/NEXT_ALGORITHMIC_BETS.md`](docs/NEXT_ALGORITHMIC_BETS.md)). - **A public benchmark on real repositories** — the `dorian bench public-repos` harness now runs **machine-derived** structural claims (operands extracted from source; known-truth observed by running the checker on the mutated copy) against frozen public-repo SHAs. Two subjects diff --git a/bench/c4_import_binding.py b/bench/c4_import_binding.py new file mode 100644 index 0000000..a1af713 --- /dev/null +++ b/bench/c4_import_binding.py @@ -0,0 +1,400 @@ +"""C4 import-aware binding benchmark — known-truth, trigger-vs-truth honest. + +Measures the C4 import-aware dependency binding fix. A C4 ``pytest:`` checker proves +behavior WHEN IT RUNS, but the pre-fix watch is only the nodeid's test file, so an edit +to the implementation the test imports was never SELECTED for re-check — the claim stayed +trusted on stale confidence. This suite scores, on synthetic scenarios with labels frozen +BEFORE measurement, two layers kept deliberately separate: + +SELECTION (trigger) layer — did a watcher SELECT the claim when the imported impl changed? + test_file_watcher the pre-fix C4 watch: the nodeid's test file ONLY (the ablation). + import_aware_watcher the fix: the test file UNION the repo-local files it imports. + +VERDICT (truth) layer — when the import-aware watcher selects and the C4 test RUNS, does +the verdict track the test? A broken implementation FAILs the test (BROKEN); a behavior- +preserving edit re-runs the test and PASSes (not BROKEN). A file change is a re-check +TRIGGER, never proof a claim is false — so the import-aware watcher's extra selections do +NOT become extra alarms. 
+ +Scope / limits (stated up front) +-------------------------------- +- Invented synthetic fixtures authored and scored by the same tool. These numbers are a + reproducible demonstration of the MECHANISM on this suite, not evidence about any real + repository, and binding is NOT behavior proof — only the C4 test decides truth. +- Determinism: the summary is byte-identical across runs — no sha, warrant id, wall-clock + timestamp, or host path is emitted; provenance is a content digest of the scenarios plus + a deterministic run id. + +Usage +----- + python -m bench.c4_import_binding [--quick] [--out summary.json] [--md-out doc.md] + (or: dorian bench c4-import-binding ... from a dorian checkout) +""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import subprocess +import sys +from pathlib import Path + +_REPO_ROOT = Path(__file__).resolve().parents[1] +if str(_REPO_ROOT) not in sys.path: # bench/ is a repo-root package, not installed + sys.path.insert(0, str(_REPO_ROOT)) + +from dorian import gitio, test_deps # noqa: E402 +from dorian.capture.manual import parse_manual # noqa: E402 +from dorian.model import CheckerSpec, Claim # noqa: E402 +from dorian.revalidate import revalidate # noqa: E402 +from dorian.seal import seal_artifact # noqa: E402 + +SCHEMA = "dorian-c4-import-binding-v1" +BENCHMARK_ID = "c4_import_binding" + +FORBIDDEN_WORDS = ( + "proven", + "validated", + "production-grade", + "production-ready", + "universal", + "real-world validated", + "guaranteed", + "semantic proof", + "behavior proof", # binding is explicitly NOT behavior proof; never assert it +) + +GIT_ENV = { + "GIT_AUTHOR_NAME": "dorian-bench", + "GIT_AUTHOR_EMAIL": "bench@dorian.local", + "GIT_COMMITTER_NAME": "dorian-bench", + "GIT_COMMITTER_EMAIL": "bench@dorian.local", + "GIT_AUTHOR_DATE": "2026-01-01T00:00:00Z", + "GIT_COMMITTER_DATE": "2026-01-01T00:00:00Z", +} + +_PKG_INIT = ("src/__init__.py", "") # makes `from src.x import ...` import under 
`python -m` +_AUTH_DEF = "def verify_token(t):\n return t\n" + +# Each SELECTION scenario: a synthetic repo where a C4 test imports an impl file in one +# import style. `changed` is the implementation file the test exercises (the edit a +# reviewer would make); `should_recheck=True` means a watcher SHOULD select the claim. +# Direct-import styles are the recall target (import-aware -> 1.0); the ambiguous style is +# a deliberate, honest miss (a wrong watch is a false alarm, so the conservative resolver +# skips it — scored as a miss, never credited as a win). +_SELECTION_SCENARIOS: tuple[dict, ...] = ( + { + "name": "absolute_from", + "style": "from src.auth import verify_token", + "files": {_PKG_INIT[0]: _PKG_INIT[1], "src/auth.py": _AUTH_DEF}, + "test": "tests/test_x.py", + "test_src": "import os\nfrom src.auth import verify_token\n\ndef test_x():\n pass\n", + "nodeid": "tests/test_x.py::test_x", + "changed": "src/auth.py", + "should_recheck": True, + "kind": "direct", + }, + { + "name": "absolute_import", + "style": "import src.auth", + "files": {_PKG_INIT[0]: _PKG_INIT[1], "src/auth.py": _AUTH_DEF}, + "test": "tests/test_x.py", + "test_src": "import src.auth\n\ndef test_x():\n pass\n", + "nodeid": "tests/test_x.py::test_x", + "changed": "src/auth.py", + "should_recheck": True, + "kind": "direct", + }, + { + "name": "package_init", + "style": "import pkg (-> pkg/__init__.py)", + "files": {"pkg/__init__.py": "VALUE = 1\n"}, + "test": "tests/test_x.py", + "test_src": "import pkg\n\ndef test_x():\n pass\n", + "nodeid": "tests/test_x.py::test_x", + "changed": "pkg/__init__.py", + "should_recheck": True, + "kind": "direct", + }, + { + "name": "relative", + "style": "from .impl import f", + "files": {"pkg/__init__.py": "", "pkg/impl.py": "def f():\n return 1\n"}, + "test": "pkg/test_impl.py", + "test_src": "from .impl import f\n\ndef test_x():\n pass\n", + "nodeid": "pkg/test_impl.py::test_x", + "changed": "pkg/impl.py", + "should_recheck": True, + "kind": "direct", + 
}, + { + "name": "nested_module", + "style": "from src.sub.deep import g", + "files": { + "src/__init__.py": "", + "src/sub/__init__.py": "", + "src/sub/deep.py": "def g():\n return 1\n", + }, + "test": "tests/test_x.py", + "test_src": "from src.sub.deep import g\n\ndef test_x():\n pass\n", + "nodeid": "tests/test_x.py::test_x", + "changed": "src/sub/deep.py", + "should_recheck": True, + "kind": "direct", + }, + { + "name": "ambiguous", + "style": "import auth (two tracked auth.py: definer unknowable)", + "files": {"auth.py": "X = 1\n", "src/auth.py": "Y = 2\n"}, + "test": "tests/test_x.py", + "test_src": "import auth\n\ndef test_x():\n pass\n", + "nodeid": "tests/test_x.py::test_x", + "changed": "src/auth.py", + "should_recheck": True, + "kind": "ambiguous_skip", # honest miss: conservative resolver leaves it unbound + }, +) + +# VERDICT scenarios: the import-aware watcher selects the claim; the C4 test then RUNS and +# its result decides truth. `broken` makes the test fail (-> BROKEN); `benign` preserves +# behavior (-> not BROKEN). Same starting impl + test; only the edit differs. 
# Starting state shared by every VERDICT scenario: a package marker, the implementation
# under test, and the C4 test that imports it. Only the later edit to src/auth.py differs.
_VERDICT_FILES = {
    "src/__init__.py": "",
    "src/auth.py": "def verify_token(token):\n return token == 'good'\n",
    "tests/test_login.py": (
        "from src.auth import verify_token\n\n\n"
        "def test_login():\n"
        " assert verify_token('good') is True\n"
        " assert verify_token('bad') is False\n"
    ),
}
_VERDICT_NODEID = "tests/test_login.py::test_login"
# (scenario name, replacement source for src/auth.py, expected verdict label)
_VERDICT_CASES = (
    ("broken", "def verify_token(token):\n return token == 'nope'\n", "BROKEN"),
    ("benign", "def verify_token(token):\n # tidy\n return token == 'good'\n", "VERIFIED"),
)


def _git(repo: Path, *args: str) -> None:
    """Run ``git <args>`` inside ``repo`` with the pinned author/committer identity.

    Raises ``subprocess.CalledProcessError`` on a non-zero exit (``check=True``); output
    is captured rather than echoed.
    """
    import os

    subprocess.run(
        ["git", *args], cwd=repo, env={**os.environ, **GIT_ENV}, check=True, capture_output=True
    )


def _build_repo(repo: Path, files: dict[str, str]) -> None:
    """Create a git repository at ``repo`` holding ``files`` in one initial commit.

    ``files`` maps repo-relative paths to text content; parent directories are created
    as needed.
    """
    repo.mkdir(parents=True, exist_ok=True)
    _git(repo, "init", "-q", "-b", "main")
    for rel, content in files.items():
        p = repo / rel
        p.parent.mkdir(parents=True, exist_ok=True)
        # explicit encoding: fixture bytes must not depend on the host locale
        p.write_text(content, encoding="utf-8")
    _git(repo, "add", "-A")
    _git(repo, "commit", "-q", "-m", "init")


def _selection_records(workspace: Path) -> list[dict]:
    """Build one SELECTION-layer record per entry of ``_SELECTION_SCENARIOS``.

    Each scenario gets its own repo under ``workspace/sel/<name>``. The record states
    whether the scenario's ``changed`` file falls inside the test-file-only watch and
    inside the import-aware watch (test file plus resolved import dependencies).
    """
    records: list[dict] = []
    for sc in _SELECTION_SCENARIOS:
        repo = workspace / "sel" / sc["name"]
        files = dict(sc["files"])
        files[sc["test"]] = sc["test_src"]
        _build_repo(repo, files)
        deps = test_deps.python_import_dependencies(repo, sc["test"])
        test_file_watch = {sc["test"]}
        import_aware_watch = test_file_watch | set(deps)
        records.append(
            {
                "layer": "selection",
                "scenario": sc["name"],
                "style": sc["style"],
                "kind": sc["kind"],
                "changed": sc["changed"],
                "should_recheck": sc["should_recheck"],
                "selected_test_file_watcher": sc["changed"] in test_file_watch,
                "selected_import_aware_watcher": sc["changed"] in import_aware_watch,
                # precision guard: every resolved dep is a repo-local tracked file (never a
                # stdlib/third-party module) — content-free, paths only
                # NOTE(review): assumes python_import_dependencies returns a stably ordered
                # sequence; if it is a set, sort here to keep run_id deterministic — confirm
                "resolved_deps": list(deps),
            }
        )
    return records


def _verdict_records(workspace: Path) -> list[dict]:
    """Build one VERDICT-layer record per entry of ``_VERDICT_CASES``.

    For each case: seal a behavior claim backed by the C4 test with the import-aware
    watch, overwrite ``src/auth.py`` with the case's implementation, revalidate since the
    sealed HEAD, and record the resulting verdict label next to the expected one.
    """
    records: list[dict] = []
    for name, mutated_impl, expected in _VERDICT_CASES:
        repo = workspace / "verdict" / name
        _build_repo(repo, dict(_VERDICT_FILES))
        claim = Claim(
            id="login-behavior",
            text="Login rejects invalid tokens.",  # names no uniquely indexed symbol
            kind="behavior",
            load_bearing=True,
            checkers=(CheckerSpec(type="C4", program=f"pytest:{_VERDICT_NODEID}"),),
        )
        # seal the way `dorian verify` does: union the import-aware watch into extra_watch +
        # read-set, so the impl file is watched and captured. Born-verifiable: the test passes.
        extra = test_deps.c4_dependency_watch_paths(repo, [claim])
        readset = parse_manual(["tests/test_login.py", *extra.get("login-behavior", ())], repo)
        seal_artifact(repo, "tests/test_login.py", readset, [claim], extra_watch=extra)
        base = gitio.head_ref(repo)
        # the only edit: implementation (explicit encoding keeps the bytes locale-independent)
        (repo / "src/auth.py").write_text(mutated_impl, encoding="utf-8")
        res = revalidate(repo, since=base)
        if res.broken:
            verdict = "BROKEN"
        elif res.errored:
            verdict = "ERRORED"
        elif res.passed or res.relocated:
            verdict = "VERIFIED"
        else:
            verdict = "NOT_SELECTED"
        records.append(
            {
                "layer": "verdict",
                "scenario": name,
                "selected": res.candidates >= 1,
                "verdict": verdict,
                "expected": expected,
                "match": verdict == expected,
            }
        )
    return records


def _recall(records: list[dict], watcher: str, kind: str) -> tuple[int, int]:
    """Return ``(hits, total)`` over selection records of ``kind``.

    ``watcher`` is the record key holding that watcher's boolean selection result.
    """
    rel = [r for r in records if r["layer"] == "selection" and r["kind"] == kind]
    hit = sum(1 for r in rel if r[watcher])
    return hit, len(rel)


def _summarize(records: list[dict]) -> dict:
    """Fold per-scenario ``records`` into the JSON summary (schema ``SCHEMA``)."""
    direct_old = _recall(records, "selected_test_file_watcher", "direct")
    direct_new = _recall(records, "selected_import_aware_watcher", "direct")
    amb_new = _recall(records, "selected_import_aware_watcher", "ambiguous_skip")
    verdicts = [r for r in records if r["layer"] == "verdict"]
    alarm_correct = sum(1 for r in verdicts if r["match"])
    false_broken = sum(
        1 for r in verdicts if r["scenario"] == "benign" and r["verdict"] == "BROKEN"
    )

    def rate(hl: tuple[int, int]) -> float:
        # an empty denominator reports 0.0 rather than dividing by zero
        return round(hl[0] / hl[1], 4) if hl[1] else 0.0

    return {
        "schema": SCHEMA,
        "provenance": {"benchmark_id": BENCHMARK_ID, "run_id": _run_id(records)},
        "composition": {
            "selection_scenarios": sum(1 for r in records if r["layer"] == "selection"),
            "direct_import_scenarios": direct_new[1],
            "ambiguous_skip_scenarios": amb_new[1],
            "verdict_scenarios": len(verdicts),
        },
        "selection_layer": {
            # the headline: the pre-fix test-file-only watcher misses every implementation
            # edit (it watches only the test file); the import-aware watcher selects them all
            "test_file_watcher_recall_direct": rate(direct_old),
            "import_aware_watcher_recall_direct": rate(direct_new),
            "import_aware_selected_ambiguous": amb_new[0],  # 0 = the honest, conservative skip
        },
        "verdict_layer": {
            # selection is not alarm: the import-aware watcher's extra re-checks only become
            # BROKEN when the C4 test actually fails
            "alarm_match": f"{alarm_correct}/{len(verdicts)}",
            "false_broken_from_benign_edit": false_broken,  # MUST be 0 (survival condition)
        },
        "limits": (
            "Synthetic fixtures authored and scored by one tool: a reproducible demonstration"
            " of the mechanism on this suite, not evidence about any real repository. Binding"
            " widens the re-check TRIGGER set; the C4 test still decides truth."
        ),
    }


def _run_id(records: list[dict]) -> str:
    """Return a 16-hex-char id: the SHA-256 prefix of the canonical JSON of ``records``."""
    payload = json.dumps(records, sort_keys=True, separators=(",", ":")).encode("utf-8")
    return hashlib.sha256(payload).hexdigest()[:16]


def run_benchmark(workspace: Path, *, quick: bool = False) -> tuple[dict, list[dict]]:
    """Run both layers in ``workspace`` and return (summary, records). Deterministic:
    no sha/timestamp/host path is emitted. ``quick`` is accepted for CLI parity (the
    suite is already small); both layers always run."""
    records = _selection_records(workspace)
    records += _verdict_records(workspace)
    return _summarize(records), records


def _render_md(summary: dict) -> str:
    """Render ``summary`` as the markdown report (selection table plus verdict bullets)."""
    s = summary["selection_layer"]
    v = summary["verdict_layer"]
    c = summary["composition"]
    return "\n".join(
        [
            "# C4 import-aware binding benchmark (current)",
            "",
            f"_schema `{summary['schema']}`, run_id `{summary['provenance']['run_id']}`_",
            "",
            "Synthetic, known-truth. Binding widens the re-check **trigger** set; the C4 test"
            " still decides truth (a watched file changing never makes a claim BROKEN by"
            " itself).",
            "",
            "## Selection (trigger) layer",
            "",
            "| watcher | recall on direct-import impl edits |",
            "| --- | ---: |",
            f"| pre-fix (test-file-only) | {s['test_file_watcher_recall_direct']} |",
            f"| import-aware (this fix) | {s['import_aware_watcher_recall_direct']} |",
            "",
            f"- direct-import scenarios: {c['direct_import_scenarios']};"
            f" ambiguous (honest skip): {c['ambiguous_skip_scenarios']}"
            f" — import-aware selected {s['import_aware_selected_ambiguous']} of them"
            " (a wrong watch is a false alarm, so ambiguity is skipped, not guessed).",
            "",
            "## Verdict (truth) layer",
            "",
            f"- alarm match (broken impl -> BROKEN, benign impl -> not BROKEN): {v['alarm_match']}",
            f"- false BROKEN from a behavior-preserving edit: {v['false_broken_from_benign_edit']}"
            " (must be 0 — file drift is a trigger, not a verdict).",
            "",
            f"_{summary['limits']}_",
            "",
        ]
    )


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: run the suite in a temp workspace, guard the prose, write outputs.

    Returns 0 on success and 2 when the overclaim guard finds a forbidden word in the
    human-readable report (nothing is written in that case). The JSON summary is always
    printed to stdout on success.
    """
    ap = argparse.ArgumentParser(prog="dorian bench c4-import-binding", description=__doc__)
    ap.add_argument("--quick", action="store_true", help="(accepted for parity; suite is small)")
    ap.add_argument("--out", type=Path, help="write the JSON summary here")
    ap.add_argument("--md-out", type=Path, help="write a markdown report here")
    ap.add_argument("--records", type=Path, help="write per-scenario records (JSONL) here")
    args = ap.parse_args(argv)

    import tempfile

    with tempfile.TemporaryDirectory(prefix="dorian-c4ib-") as tmp:
        summary, records = run_benchmark(Path(tmp), quick=args.quick)

    blob = json.dumps(summary, indent=2, sort_keys=True)
    md = _render_md(summary)  # rendered once; reused by the guard and by --md-out
    # overclaim guard over the HUMAN-READABLE prose (the markdown body + the limits line),
    # not the JSON structure — so a structural key like "provenance" never trips "proven".
    prose = (md + " " + summary["limits"]).lower()
    for word in FORBIDDEN_WORDS:
        if word in prose:
            print(f"bench c4-import-binding: overclaim guard tripped on {word!r}", file=sys.stderr)
            return 2
    # explicit UTF-8: the markdown contains non-ASCII dashes, and the outputs are meant to
    # be byte-identical across runs regardless of the host locale
    if args.out:
        args.out.write_text(blob + "\n", encoding="utf-8")
    if args.md_out:
        args.md_out.write_text(md, encoding="utf-8")
    if args.records:
        args.records.write_text(
            "\n".join(json.dumps(r, sort_keys=True) for r in records) + "\n", encoding="utf-8"
        )
    print(blob)
    return 0


if __name__ == "__main__":
    sys.exit(main())
- Bounds C3 `regex:` patterns to 500 chars, compile-guards them, and runs the match in a worker process killed at `timeout_s` so catastrophic backtracking cannot stall the run (ERROR `regex_timeout`). diff --git a/docs/V1_SCOPE.md b/docs/V1_SCOPE.md index 29504c8..f445c12 100644 --- a/docs/V1_SCOPE.md +++ b/docs/V1_SCOPE.md @@ -22,6 +22,10 @@ All additive and backward-compatible; default behavior is unchanged unless you o pytest node). Advisory; it never changes a verdict, trust state, or exit code. - **Multi-index binding** — config keys in tracked `.toml`/`.json` files now widen a claim's re-check trigger set (with provenance in `bind-suggest`). Conservative and trigger-only. +- **C4 import-aware binding** — a `pytest:` test's statically resolved repo-local imports + (stdlib `ast`, read-only — no execution) now widen the behavior claim's re-check trigger set, + so an implementation edit re-runs the test even when the claim text names no symbol. Trigger + coverage only — the test still decides truth (`dorian bench c4-import-binding`). - **Trusted-base checker-source mode** — `revalidate --checker-source base` / Action `checker_trust: base` runs only base-approved checker specs, for public/fork PRs. - **Warrant-quality harness** — `dorian bench warrant-quality` scores per-claim whether a diff --git a/spec/checkers.md b/spec/checkers.md index 54a56e1..65c28c1 100644 --- a/spec/checkers.md +++ b/spec/checkers.md @@ -140,9 +140,23 @@ Runs `python -m pytest ` (PATH interpreter, stripped env, repo cwd, 4 FAIL `test_gone` only on the nodeid-gone stderr signatures ("ERROR: file or directory not found" / "ERROR: not found:") — any other exit 4 (broken conftest, bad ini/plugin, unimportable target file) is ERROR, as are exits -2/3, timeouts, spawn failures, and a PATH python lacking pytest. Derived -watch: the nodeid's file part. The file part resolves through the rename log, -so a renamed test file is not "gone". 
+2/3, timeouts, spawn failures, and a PATH python lacking pytest. The file part +resolves through the rename log, so a renamed test file is not "gone". + +**Derived watch (import-aware).** The nodeid's test file, *plus* the repo-local +implementation files that test statically imports. When a claim is sealed through +`dorian verify` or `dorian rebind`, dorian parses the test file with the stdlib +`ast` (read-only — **no import execution, no `sys.path` mutation, no package +introspection, no network**) and adds the tracked `.py` files its imports resolve +to (`src/dorian/test_deps.py`). So an edit to the implementation a behavior test +exercises re-runs that test even when the claim text names no uniquely indexed +symbol — closing a silent re-check skip. This widens the re-check *trigger* set +only: the C4 test still decides truth, and an imported file changing never makes a +claim `BROKEN` by itself (the test does). An import resolving to zero or to more +than one tracked file (ambiguous) is skipped, not guessed. The plain explicit +`dorian seal --readset` path keeps its given watch unchanged; run `dorian rebind` +(or `dorian bind-suggest` to preview) to add the import-derived watches to an +older warrant. This is not a sandbox — see `docs/SECURITY_BOUNDARY.md`. 
## C5 — data reconciliation diff --git a/src/dorian/bindings.py b/src/dorian/bindings.py index 976cb91..42bf9da 100644 --- a/src/dorian/bindings.py +++ b/src/dorian/bindings.py @@ -125,9 +125,9 @@ def analyze_candidate( amb = ambiguous.get(claim.id, {}) if any(not any(_covered(f, cover) for f in files) for files in amb.values()): flags.append("ambiguous-mention") - named = _checker_named_files(claim, entry_uris) + exercised = _checker_exercised_files(repo, claim, entry_uris) if claim.load_bearing and any( - w not in named for spec in claim.checkers for w in spec.watch + w not in exercised for spec in claim.checkers for w in spec.watch ): flags.append("trigger-only-symbol") mentions: list[dict] = [] @@ -193,10 +193,11 @@ def weak_binding_lines(diags: list[dict]) -> list[str]: def _checker_named_files(claim: Claim, entry_uris: dict[str, str]) -> set[str]: - """The files a claim's checker PROGRAMS name (the truth they verify), independent of - symbol-definer watch paths added at verify time. A watch path NOT in this set is a - re-check TRIGGER that no checker exercises — the binding fix's trigger != truth gap, - which the 'trigger-only-symbol' flag surfaces.""" + """The files a claim's checker PROGRAMS literally name (the truth they verify), + independent of symbol-definer watch paths added at verify time. 
def _checker_exercised_files(repo: Path, claim: Claim, entry_uris: dict[str, str]) -> set[str]:
    """Return the files a claim's checkers EXERCISE when they run.

    That is the checker-named files, widened — for every checker whose C4 test file can
    be determined — by the repo-local files that test statically imports. A watch path
    outside this set is a re-check TRIGGER no checker exercises, which is what the
    'trigger-only-symbol' flag reports. Counting a C4 test's import dependencies as
    exercised keeps import-aware binding from spuriously flagging a good behavior claim
    (and so from tripping --binding-gate=fail): the test imports and runs those files.
    """
    # lazy import: cycle-safe and cached after first load; the expensive dependency
    # resolution below only happens for checkers that yield a test file
    from dorian import test_deps

    files = _checker_named_files(claim, entry_uris)
    # generator, so each spec is inspected and resolved in turn, in checker order
    c4_test_files = (test_deps._c4_test_file(spec) for spec in claim.checkers)
    for path in c4_test_files:
        if not path:
            # presumably a non-C4 / non-pytest checker: nothing to resolve
            continue
        files.update(test_deps.python_import_dependencies(repo, path))
    return files
diff --git a/src/dorian/commands.py b/src/dorian/commands.py index 23ff276..b246230 100644 --- a/src/dorian/commands.py +++ b/src/dorian/commands.py @@ -34,6 +34,7 @@ strength, suggestclaims, symbol_index, + test_deps, ) from dorian.blast import blast_conn from dorian.capture.manual import parse_manual @@ -267,8 +268,14 @@ def cmd_verify(args: argparse.Namespace) -> int: except gitio.GitError: definers = None config_index, unparseable_config = symbol_index.config_key_index(repo) - # multi-index binding: Python symbol-definers + pyproject scripts + config keys - symbol_watch = symbol_index.claim_watch_paths(repo, claims, definers, config_index) + # multi-index binding: Python symbol-definers + pyproject scripts + config keys, + # UNIONed with the repo-local implementation files a C4 pytest test statically + # imports (test_deps) — so an edit to the code a behavior test exercises re-runs + # that C4 checker even when the claim text names no uniquely indexed symbol. + symbol_watch = test_deps.merge_watch_maps( + symbol_index.claim_watch_paths(repo, claims, definers, config_index), + test_deps.c4_dependency_watch_paths(repo, claims), + ) for path in sorted({p for ps in symbol_watch.values() for p in ps}): if path not in paths: paths.append(path) @@ -561,9 +568,10 @@ def cmd_bind_suggest(args: argparse.Namespace) -> int: except (ValueError, OSError) as exc: print(f"dorian bind-suggest: {exc}", file=sys.stderr) return EXIT_USAGE - # multi-index binding with provenance: symbol-definer/script vs config-key + # multi-index binding with provenance: symbol-definer/script vs config-key vs C4 test-dep watch = symbol_index.claim_symbol_watch_paths(repo, claims) config_watch = symbol_index.claim_config_watch_paths(repo, claims) + test_dep_watch = test_deps.c4_dependency_watch_paths(repo, claims) ambiguous = symbol_index.ambiguous_symbol_mentions(repo, claims) ambiguous_config = symbol_index.ambiguous_config_mentions(repo, claims) _, unparseable_config = 
symbol_index.config_key_index(repo) @@ -575,14 +583,16 @@ def cmd_bind_suggest(args: argparse.Namespace) -> int: covered = set() # C1 span / C5 shell: no auto-derivable read-set to compare bind = [f for f in watch.get(c.id, ()) if f not in covered] bind_config = [f for f in config_watch.get(c.id, ()) if f not in covered] + bind_test_deps = [f for f in test_dep_watch.get(c.id, ()) if f not in covered] amb = {s: list(files) for s, files in ambiguous.get(c.id, {}).items()} amb_cfg = {k: list(files) for k, files in ambiguous_config.get(c.id, {}).items()} - if bind or bind_config or amb or amb_cfg: + if bind or bind_config or bind_test_deps or amb or amb_cfg: suggestions.append( { "claim_id": c.id, "bind": bind, # symbol-definer / console-script provenance "bind_config": bind_config, # config-key provenance + "bind_test_deps": bind_test_deps, # C4 test-import impl-file provenance "ambiguous": amb, "ambiguous_config": amb_cfg, } @@ -600,6 +610,8 @@ def cmd_bind_suggest(args: argparse.Namespace) -> int: print(f"{s['claim_id']} bind (symbol): {', '.join(s['bind'])}") if s["bind_config"]: print(f"{s['claim_id']} bind (config): {', '.join(s['bind_config'])}") + if s["bind_test_deps"]: + print(f"{s['claim_id']} bind (test-dep): {', '.join(s['bind_test_deps'])}") for sym, files in sorted(s["ambiguous"].items()): print(f"{s['claim_id']} ambiguous symbol: {sym} ({len(files)} definers, unbound)") for key, files in sorted(s["ambiguous_config"].items()): @@ -647,7 +659,10 @@ def cmd_rebind(args: argparse.Namespace) -> int: file=sys.stderr, ) return EXIT_USAGE - symbol_watch = symbol_index.claim_watch_paths(repo, claims) # symbol-definer + config-key + symbol_watch = test_deps.merge_watch_maps( + symbol_index.claim_watch_paths(repo, claims), # symbol-definer + config-key + test_deps.c4_dependency_watch_paths(repo, claims), # C4 test-import impl files + ) new_paths = {p for ps in symbol_watch.values() for p in ps} already_watched = {w for c in claims for spec in c.checkers for w in 
spec.watch} if new_paths <= already_watched: @@ -793,6 +808,7 @@ def cmd_report(args: argparse.Namespace) -> int: "mutation": ("bench.controlled_mutation", False), "large-mutation": ("bench.large_mutation", False), "binding-lifecycle": ("bench.binding_lifecycle", False), + "c4-import-binding": ("bench.c4_import_binding", False), "realworld-usecases": ("bench.realworld_usecases", False), "warrant-quality": ("bench.warrant_quality", False), "public-repos": ("bench.public_repos", False), diff --git a/src/dorian/test_deps.py b/src/dorian/test_deps.py new file mode 100644 index 0000000..65046e2 --- /dev/null +++ b/src/dorian/test_deps.py @@ -0,0 +1,257 @@ +"""Deterministic C4 test-import dependency binding (stdlib ``ast`` only). + +A C4 ``pytest:`` checker proves behavior WHEN IT RUNS, but the watch +``seal._derive_watch`` derives for it is only the nodeid's test file. If the +implementation file the test exercises changes and the claim text names no +uniquely indexed symbol/config key, ``revalidate`` can skip the claim entirely +even though an adequate behavior checker exists — a re-check TRIGGER gap (the +silent revalidation skip), not a truth-check failure. + +This module statically resolves the repo-local Python files a test file imports +and ``c4_dependency_watch_paths`` maps each claim to the implementation files its +C4 test imports. ``verify`` / ``rebind`` add those to the claim's watch + read-set +(``seal`` keeps the truth decision in the checker), so a source edit re-runs the +existing C4 checker. A file change is a re-check TRIGGER, never a BROKEN: whether +the claim breaks is still decided by the test result. 
+ +Read-only and conservative by construction, mirroring ``symbol_index``: + +- stdlib ``ast`` only — never imports application modules, executes setup code, + mutates ``sys.path``, inspects installed packages, or touches the network; +- only TRACKED ``.py`` files are parsed and resolved; a stdlib import is skipped + outright via the interpreter's module-name table, and a third-party import resolves + to nothing UNLESS the repo happens to contain a file whose path tail matches the + module name — a bounded, conservative false re-check TRIGGER, never a false BROKEN + (the C4 test still decides truth); +- an import that resolves to EXACTLY ONE tracked file is bound; zero or more than + one (ambiguous tail match) is skipped, because a wrong watch is a false-BROKEN + risk and a false BROKEN is what gets the tool suppressed; +- syntax errors, symlinked / unreadable / oversized files, a non-git repo, an + untracked test file, and relative imports that climb above the repo root all degrade + to a partial result or ``()`` — the resolver is never the reason a seal fails; +- content-free: only repo-relative paths are produced, never source text. + +``revalidate`` never calls this — the wider watch set is baked into the sealed +sidecar at verify/rebind time (the permanent "revalidate stays symbol/import +blind" design constraint). +""" + +from __future__ import annotations + +import ast +import sys +from collections.abc import Iterable, Sequence +from pathlib import Path + +from dorian import gitio +from dorian.model import CheckerSpec, Claim + +_MAX_FILE_BYTES = 1 << 20 # skip files > 1 MiB (mirrors symbol_index / bindings) + +# Top-level names of THIS interpreter's standard library. An ABSOLUTE import whose first +# dotted component is a stdlib module (`import json`, `from os import path`) is never +# repo-local, so it is skipped before any path-tail match — otherwise a tracked repo file +# that happens to share the basename (e.g. 
`pkg/json.py`) would be a false re-check +# trigger. Read-only interpreter data: no import, no `sys.path`, no package introspection. +# Relative imports are repo-local by construction and are never checked against this set. +_STDLIB_TOP = frozenset(sys.stdlib_module_names) + + +def _is_stdlib_absolute(module: str) -> bool: + """True when an absolute module's top-level package is a stdlib module.""" + return module.split(".", 1)[0] in _STDLIB_TOP + + +def python_import_dependencies(repo: Path, file: str, *, max_depth: int = 1) -> tuple[str, ...]: + """The sorted, unique repo-relative tracked ``.py`` files statically imported by + ``file`` (a repo-relative tracked test file), to ``max_depth`` import hops. + + Pure and read-only: no execution, no ``sys.path`` mutation, no application + imports. Returns ``()`` for a non-git repo, an untracked/oversized/unreadable + or unparseable ``file``, or when nothing repo-local resolves; the file itself is + never listed. ``max_depth=1`` (the default and first-release setting) resolves + only the test file's direct imports. + """ + repo = repo.resolve() + try: + tracked = set(gitio.ls_files(repo)) + except gitio.GitError: + return () # not a git checkout: no tracked-file set to resolve against + return _resolve_imports(repo, file, tracked, max_depth) + + +def _resolve_imports(repo: Path, file: str, tracked: set[str], max_depth: int) -> tuple[str, ...]: + """Core resolver over a PRE-BUILT tracked-file set, so a multi-claim caller + (``c4_dependency_watch_paths``) builds the set once per verify/rebind rather than + once per test file. 
Same result as ``python_import_dependencies``: ``()`` when the + file is untracked; the file itself is never listed.""" + if file not in tracked: + return () # parse only TRACKED .py files (per spec); also blocks `..`/abs paths + out: set[str] = set() + seen: set[str] = {file} + frontier = [file] + depth = 0 + while frontier and depth < max_depth: + nxt: list[str] = [] + for rel in frontier: + for dep in _imports_of_file(repo, rel, tracked): + if dep not in seen: + seen.add(dep) + out.add(dep) + nxt.append(dep) + frontier = nxt + depth += 1 + return tuple(sorted(out)) + + +def _imports_of_file(repo: Path, rel: str, tracked: set[str]) -> list[str]: + """Repo-local tracked ``.py`` files the single file ``rel`` imports (one hop). + Unreadable / oversized / unparseable / pathological files yield ``[]`` — never + raise, so one bad tracked file cannot break the whole resolution.""" + path = repo / rel + try: + # skip symlinks: a tracked symlink-to-.py can point OUTSIDE the repo, and following + # it would ast.parse out-of-repo bytes (is_file()/stat() resolve the link target). 
+ if path.is_symlink() or not path.is_file() or path.stat().st_size > _MAX_FILE_BYTES: + return [] + tree = ast.parse(path.read_bytes()) # bytes: honour the PEP 263 coding cookie + except (OSError, SyntaxError, ValueError, RecursionError, MemoryError): + return [] + deps: set[str] = set() + file_dir_parts = rel.split("/")[:-1] + for node in ast.walk(tree): + if isinstance(node, ast.Import): + for alias in node.names: + if not _is_stdlib_absolute(alias.name): + _add_if_unique(_module_files(alias.name, tracked), deps) + elif isinstance(node, ast.ImportFrom): + if node.level: + _resolve_relative(node, file_dir_parts, tracked, deps) + else: + _resolve_absolute(node, tracked, deps) + return sorted(deps) + + +def _resolve_absolute(node: ast.ImportFrom, tracked: set[str], deps: set[str]) -> None: + """``from pkg.mod import a, b`` (level 0): bind the module ``pkg.mod`` and, for + each imported name, the candidate submodule ``pkg.mod.<name>`` — each only when it + resolves to exactly one tracked file (a name may be an attribute OR a submodule; + we cannot tell without importing, so consider both, conservatively). A stdlib + ``from`` target (``from os.path import join``) is skipped, like ``import`` above.""" + if not node.module or _is_stdlib_absolute(node.module): + return + _add_if_unique(_module_files(node.module, tracked), deps) + for alias in node.names: + if alias.name != "*": + _add_if_unique(_module_files(f"{node.module}.{alias.name}", tracked), deps) + + +def _resolve_relative( + node: ast.ImportFrom, file_dir_parts: list[str], tracked: set[str], deps: set[str] +) -> None: + """``from . import x`` / ``from .mod import y`` / ``from ..pkg import z``: resolve + against the importing file's package, derived from its path (no ``sys.path``). A + relative import that climbs above the repo root is skipped.
Candidate paths are + EXACT (relative imports resolve to a precise location), bound only when exactly + one of ``x.py`` / ``x/__init__.py`` is tracked.""" + up = node.level - 1 + if up > len(file_dir_parts): + return # climbs above the repo root: unresolvable here + base = file_dir_parts[: len(file_dir_parts) - up] + if node.module: + target = [*base, *node.module.split(".")] + _add_if_unique(_exact_tracked(_module_path_candidates(target), tracked), deps) + for alias in node.names: + if alias.name != "*": + cands = _module_path_candidates([*target, alias.name]) + _add_if_unique(_exact_tracked(cands, tracked), deps) + else: # from . import names -> the package __init__ plus each submodule + _add_if_unique(_exact_tracked(["/".join([*base, "__init__.py"])], tracked), deps) + for alias in node.names: + if alias.name != "*": + cands = _module_path_candidates([*base, alias.name]) + _add_if_unique(_exact_tracked(cands, tracked), deps) + + +def _module_files(module: str, tracked: set[str]) -> list[str]: + """Tracked ``.py`` files whose repo-relative path matches a dotted ABSOLUTE module + by path tail (so ``pkg.cli`` matches ``pkg/cli.py``, ``src/pkg/cli.py``, or + ``pkg/cli/__init__.py``). 
A pure string match — never imports, executes, or + consults ``sys.path``; mirrors ``symbol_index._module_candidate_files``.""" + tail = module.replace(".", "/") + wants = (tail + ".py", tail + "/__init__.py") + return sorted(f for f in tracked if any(f == w or f.endswith("/" + w) for w in wants)) + + +def _module_path_candidates(parts: list[str]) -> list[str]: + """EXACT repo-relative candidate paths for a relative-import target (no tail + match): ``a/b.py`` and ``a/b/__init__.py``.""" + stem = "/".join(parts) + return [stem + ".py", stem + "/__init__.py"] + + +def _exact_tracked(candidates: Iterable[str], tracked: set[str]) -> list[str]: + return sorted({c for c in candidates if c in tracked}) + + +def _add_if_unique(files: list[str], deps: set[str]) -> None: + """Bind only an UNAMBIGUOUS resolution: exactly one tracked file. Zero or more + than one is skipped (a wrong watch is a false-BROKEN risk).""" + if len(files) == 1: + deps.add(files[0]) + + +def _c4_test_file(spec: CheckerSpec) -> str | None: + """The nodeid's file part of a C4 ``pytest:`` checker, else ``None``. + Mirrors ``seal._derive_watch`` so the derived test-file watch and the + import-derived watches resolve from the same nodeid parse.""" + if spec.type != "C4": + return None + prefix, sep, nodeid = spec.program.partition(":") + if prefix != "pytest" or not sep: + return None + file = nodeid.partition("::")[0].strip() + return file or None + + +def c4_dependency_watch_paths( + repo: Path, claims: Sequence[Claim], *, max_depth: int = 1 +) -> dict[str, tuple[str, ...]]: + """claim id -> the sorted repo-local implementation files its C4 ``pytest:`` test + files statically import. Claims with no C4 checker (or whose tests import nothing + repo-local) are omitted. Each distinct test file is parsed at most once. 
Additive + and trigger-only — a resolved file is a re-check trigger; the C4 test still decides + truth when it runs.""" + repo = repo.resolve() + # build the tracked-file set ONCE per call (not once per claim/test file): a pure + # function of the repo tree, threaded into every resolution below. A non-git repo + # self-degrades to {} exactly as the symbol helpers do. + try: + tracked = set(gitio.ls_files(repo)) + except gitio.GitError: + return {} + cache: dict[str, tuple[str, ...]] = {} + out: dict[str, tuple[str, ...]] = {} + for claim in claims: + paths: set[str] = set() + for spec in claim.checkers: + tf = _c4_test_file(spec) + if tf is None: + continue + if tf not in cache: + cache[tf] = _resolve_imports(repo, tf, tracked, max_depth) + paths.update(cache[tf]) + if paths: + out[claim.id] = tuple(sorted(paths)) + return out + + +def merge_watch_maps(*maps: dict[str, tuple[str, ...]]) -> dict[str, tuple[str, ...]]: + """Union per-claim watch maps (claim id -> paths), sorted and deduped. Mirrors + ``symbol_index.claim_watch_paths``' own merge so the combined ``extra_watch`` is + byte-identical regardless of source order.""" + merged: dict[str, set[str]] = {} + for m in maps: + for cid, paths in m.items(): + merged.setdefault(cid, set()).update(paths) + return {cid: tuple(sorted(paths)) for cid, paths in merged.items()} diff --git a/tests/test_c4_import_binding.py b/tests/test_c4_import_binding.py new file mode 100644 index 0000000..a321509 --- /dev/null +++ b/tests/test_c4_import_binding.py @@ -0,0 +1,537 @@ +"""C4 import-aware dependency binding: a C4 ``pytest:`` checker proves behavior +WHEN IT RUNS, but its sealed watch is only the nodeid's test file. If the +implementation the test exercises changes and the claim text names no uniquely +indexed symbol/config key, revalidation can skip the claim entirely even though +an adequate behavior checker exists — a re-check TRIGGER gap (silent skip), not a +truth gap. 
+ +``test_deps.python_import_dependencies`` statically resolves the repo-local Python +files a test file imports (stdlib ``ast`` only — no import execution, no sys.path +mutation, no package introspection); ``verify``/``rebind`` add those to the C4 +claim's watch + read-set so a source edit re-runs the existing C4 checker. The +checker still decides truth: a file change is a trigger, never a BROKEN. +""" + +from __future__ import annotations + +import os +import sys +from pathlib import Path + +import pytest + +from conftest import commit_all, write +from dorian import claims_io, cli, gitio +from dorian.model import CheckerSpec, Claim, Warrant +from dorian.revalidate import revalidate + +# --- unit coverage for the static import resolver ------------------------------------- + + +def _repo(tmp_path: Path) -> Path: + repo = tmp_path / "repo" + repo.mkdir() + gitio_init(repo) + return repo + + +def gitio_init(repo: Path) -> None: + from conftest import git + + git(repo, "init", "-q", "-b", "main") + + +def test_resolves_repo_local_absolute_imports(tmp_path: Path) -> None: + from dorian import test_deps + + repo = _repo(tmp_path) + write(repo, "src/auth.py", "def verify_token(t):\n return bool(t)\n") + write(repo, "src/rate_limit.py", "limiter = object()\n") + write( + repo, + "tests/test_auth.py", + "import os\n" + "from src.auth import verify_token\n" + "from src.rate_limit import limiter\n" + "import pytest\n\n" + "def test_x():\n assert verify_token('x')\n", + ) + commit_all(repo, "init") + deps = test_deps.python_import_dependencies(repo, "tests/test_auth.py") + # repo-local imports resolve; stdlib (os) and third-party (pytest) are skipped + assert deps == ("src/auth.py", "src/rate_limit.py") + + +def test_resolves_plain_module_import(tmp_path: Path) -> None: + from dorian import test_deps + + repo = _repo(tmp_path) + write(repo, "src/config.py", "TIMEOUT = 30\n") + write(repo, "tests/test_cfg.py", "import src.config\n\ndef test_x():\n pass\n") + commit_all(repo, 
"init") + assert test_deps.python_import_dependencies(repo, "tests/test_cfg.py") == ("src/config.py",) + + +def test_resolves_package_init(tmp_path: Path) -> None: + from dorian import test_deps + + repo = _repo(tmp_path) + write(repo, "pkg/__init__.py", "VALUE = 1\n") + write(repo, "tests/test_pkg.py", "import pkg\n\ndef test_x():\n pass\n") + commit_all(repo, "init") + assert test_deps.python_import_dependencies(repo, "tests/test_pkg.py") == ("pkg/__init__.py",) + + +def test_ambiguous_module_tail_is_skipped(tmp_path: Path) -> None: + from dorian import test_deps + + repo = _repo(tmp_path) + # `auth` as a module tail matches BOTH files -> the definer is unknowable, skip it + # (a wrong watch is a false-BROKEN risk; mirrors symbol_index ambiguity handling) + write(repo, "auth.py", "X = 1\n") + write(repo, "src/auth.py", "Y = 2\n") + write(repo, "tests/test_a.py", "import auth\n\ndef test_x():\n pass\n") + commit_all(repo, "init") + assert test_deps.python_import_dependencies(repo, "tests/test_a.py") == () + + +def test_relative_import_within_package(tmp_path: Path) -> None: + from dorian import test_deps + + repo = _repo(tmp_path) + write(repo, "pkg/__init__.py", "") + write(repo, "pkg/impl.py", "def f():\n return 1\n") + write(repo, "pkg/test_impl.py", "from .impl import f\n\ndef test_x():\n assert f()\n") + commit_all(repo, "init") + assert test_deps.python_import_dependencies(repo, "pkg/test_impl.py") == ("pkg/impl.py",) + + +def test_relative_import_above_root_is_skipped(tmp_path: Path) -> None: + from dorian import test_deps + + repo = _repo(tmp_path) + write(repo, "pkg/test_x.py", "from ... 
import nope\n\ndef test_x():\n pass\n") + commit_all(repo, "init") + # too many leading dots to resolve inside the repo: skip, never crash + assert test_deps.python_import_dependencies(repo, "pkg/test_x.py") == () + + +def test_third_party_and_stdlib_never_resolve(tmp_path: Path) -> None: + from dorian import test_deps + + repo = _repo(tmp_path) + write( + repo, + "tests/test_x.py", + "import os\nimport pytest\nfrom collections import OrderedDict\n", + ) + commit_all(repo, "init") + assert test_deps.python_import_dependencies(repo, "tests/test_x.py") == () + + +def test_syntax_error_file_is_non_fatal(tmp_path: Path) -> None: + from dorian import test_deps + + repo = _repo(tmp_path) + write(repo, "tests/test_x.py", "def oops(:\n pass\n") # unparseable + commit_all(repo, "init") + assert test_deps.python_import_dependencies(repo, "tests/test_x.py") == () + + +def test_oversized_file_is_skipped(tmp_path: Path) -> None: + from dorian import test_deps + + repo = _repo(tmp_path) + write(repo, "src/m.py", "def f():\n return 1\n") + big = "from src.m import f\n" + ("# pad\n" * 200_000) # > 1 MiB + write(repo, "tests/test_big.py", big) + commit_all(repo, "init") + assert (repo / "tests/test_big.py").stat().st_size > (1 << 20) + assert test_deps.python_import_dependencies(repo, "tests/test_big.py") == () + + +def test_non_git_repo_yields_no_deps(tmp_path: Path) -> None: + from dorian import test_deps + + (tmp_path / "tests").mkdir() + (tmp_path / "src.py").write_text("X = 1\n") + (tmp_path / "tests" / "test_x.py").write_text("import src\n") + assert test_deps.python_import_dependencies(tmp_path, "tests/test_x.py") == () + + +def test_untracked_test_file_yields_no_deps(tmp_path: Path) -> None: + from dorian import test_deps + + repo = _repo(tmp_path) + write(repo, "src/m.py", "def f():\n return 1\n") + commit_all(repo, "init") + write(repo, "tests/test_untracked.py", "from src.m import f\n") # never git-added + assert test_deps.python_import_dependencies(repo, 
"tests/test_untracked.py") == () + + +def test_self_import_excluded_and_deterministic(tmp_path: Path) -> None: + from dorian import test_deps + + repo = _repo(tmp_path) + write(repo, "src/a.py", "X = 1\n") + write(repo, "src/b.py", "Y = 2\n") + write( + repo, + "tests/test_ab.py", + "from src.b import Y\nfrom src.a import X\n\ndef test_x():\n pass\n", + ) + commit_all(repo, "init") + a = test_deps.python_import_dependencies(repo, "tests/test_ab.py") + b = test_deps.python_import_dependencies(repo, "tests/test_ab.py") + assert a == b == ("src/a.py", "src/b.py") # sorted, deduped, deterministic + assert "tests/test_ab.py" not in a # never lists itself + + +def test_stdlib_name_collision_is_not_bound(tmp_path: Path) -> None: + from dorian import test_deps + + repo = _repo(tmp_path) + # a repo file shares a basename with a stdlib module. A stdlib `import json` must NOT + # bind it: the resolver consults the interpreter's stdlib module-name table, so a + # same-named repo file is never a false re-check trigger for a stdlib import (which is + # what the bare path-tail match would otherwise do). + write(repo, "pkg/json.py", "X = 1\n") + write(repo, "tests/test_x.py", "import json\n\ndef test_x():\n pass\n") + commit_all(repo, "init") + assert test_deps.python_import_dependencies(repo, "tests/test_x.py") == () + + +def test_symlinked_test_file_is_not_followed_out_of_repo(tmp_path: Path) -> None: + from conftest import git + from dorian import test_deps + + repo = _repo(tmp_path) + write(repo, "src/m.py", "def f():\n return 1\n") + # an out-of-repo source the symlink points at; following it would ast.parse bytes + # OUTSIDE the tracked repo. The resolver must skip symlinks, not follow them. 
+ outside = tmp_path / "outside.py" + outside.write_text("from src.m import f\n") + link = repo / "tests" / "test_link.py" + link.parent.mkdir(parents=True, exist_ok=True) + os.symlink(outside, link) + git(repo, "add", "-A") + git(repo, "commit", "-q", "-m", "track a symlinked test file") + assert "tests/test_link.py" in gitio.ls_files(repo) # git tracks it (mode 120000) + # tracked, but a symlink to an out-of-repo file: skipped, so nothing is read or bound + assert test_deps.python_import_dependencies(repo, "tests/test_link.py") == () + + +def test_c4_dependency_watch_paths_maps_claims(tmp_path: Path) -> None: + from dorian import test_deps + + repo = _repo(tmp_path) + write(repo, "src/auth.py", "def verify_token(t):\n return bool(t)\n") + write( + repo, + "tests/test_auth.py", + "from src.auth import verify_token\n\ndef test_x():\n pass\n", + ) + commit_all(repo, "init") + claim = Claim( + id="c1", + text="login works", + kind="behavior", + load_bearing=True, + checkers=(CheckerSpec(type="C4", program="pytest:tests/test_auth.py::test_x"),), + ) + other = Claim( + id="c2", + text="no c4 here", + kind="reference", + load_bearing=False, + checkers=(CheckerSpec(type="C3", program="path:src/auth.py"),), + ) + out = test_deps.c4_dependency_watch_paths(repo, [claim, other]) + assert out == {"c1": ("src/auth.py",)} # only the C4 claim, only the imported impl + + +def test_merge_watch_maps_unions_per_claim(tmp_path: Path) -> None: + from dorian import test_deps + + a = {"c1": ("x.py",), "c2": ("y.py",)} + b = {"c1": ("z.py",), "c3": ("w.py",)} + assert test_deps.merge_watch_maps(a, b) == { + "c1": ("x.py", "z.py"), + "c2": ("y.py",), + "c3": ("w.py",), + } + + +# --- integration: verify / revalidate / rebind / bindings / scope / deny-exec --------- +# +# The gap this patch closes: a C4-backed BEHAVIOR claim whose text names NO uniquely +# indexed symbol, so the symbol index binds nothing. 
Without C4 import binding the +# sealed watch is only the test file, so an edit to the implementation the test +# imports is never selected for revalidation — the claim stays trusted on stale +# confidence. With it, the imported impl file is watched, so the edit re-runs the +# existing C4 checker and the checker decides truth. + +AUTH_SRC = "def verify_token(token):\n return token == 'good'\n" + +TEST_LOGIN = ( + "from src.auth import verify_token\n\n\n" + "def test_login_rejects_invalid_token():\n" + " assert verify_token('good') is True\n" + " assert verify_token('bad') is False\n" +) + +# vague behavior claim: the text names no snake_case/CamelCase/backtick/path token, so +# symbol_index binds nothing — the ONLY source of src/auth.py in the watch is the C4 +# test's `from src.auth import verify_token` (pinned in the test below). +C4_BEHAVIOR_CLAIM = Claim( + id="login-behavior", + text="Login rejects invalid tokens.", + kind="behavior", + load_bearing=True, + checkers=( + CheckerSpec( + type="C4", + program="pytest:tests/test_login.py::test_login_rejects_invalid_token", + ), + ), +) + + +@pytest.fixture +def c4_import_repo(tmp_path: Path) -> Path: + repo = _repo(tmp_path) + write(repo, "docs/design.md", "# Design\n\nLogin rejects invalid tokens.\n") + write(repo, "src/__init__.py", "") # `from src.auth import ...` resolves under `python -m` + write(repo, "src/auth.py", AUTH_SRC) + write(repo, "tests/test_login.py", TEST_LOGIN) + commit_all(repo, "init c4 import scenario") + return repo + + +@pytest.fixture +def interpreter_on_path(monkeypatch): + # the C4 checker spawns bare `python -m pytest`; ensure it resolves an interpreter + # that has pytest regardless of how this suite was invoked (mirrors test_c4.py) + monkeypatch.setenv("PATH", str(Path(sys.executable).parent), prepend=os.pathsep) + + +def _verify(repo: Path, claims: list[Claim], *extra: str) -> int: + return cli.main( + ["--repo", str(repo), "verify", "docs/design.md", "--claims", _saved(repo, claims), 
*extra] + ) + + +def _warrant(repo: Path) -> Warrant: + return Warrant.load(repo / "docs/design.md.warrant") + + +@pytest.mark.slow +def test_verify_widens_watch_and_captures_imported_impl( + c4_import_repo: Path, interpreter_on_path +) -> None: + from dorian import symbol_index, test_deps + + repo = c4_import_repo + # isolate the contribution: the claim text binds NO symbol-definer / config watch + assert symbol_index.claim_watch_paths(repo, [C4_BEHAVIOR_CLAIM]) == {} + assert test_deps.c4_dependency_watch_paths(repo, [C4_BEHAVIOR_CLAIM]) == { + "login-behavior": ("src/auth.py",) + } + + assert _verify(repo, [C4_BEHAVIOR_CLAIM]) == 0 # born verifiable: the pytest passes now + claim = _warrant(repo).claims[0] + # the C4 checker named only the test file; import binding added src/auth.py to both + # the sealed watch and the auto-captured (hashed, scope-linted) read-set + assert claim.checkers[0].watch == ("tests/test_login.py", "src/auth.py") + assert "src/auth.py" in {e.uri for e in _warrant(repo).read_set} + + +@pytest.mark.slow +def test_impl_change_makes_claim_broken(c4_import_repo: Path, interpreter_on_path) -> None: + repo = c4_import_repo + assert _verify(repo, [C4_BEHAVIOR_CLAIM]) == 0 + base = gitio.head_ref(repo) + # break the behavior the test asserts; the test file and claim text are untouched, so + # WITHOUT import binding this drift is silently skipped (the old test-file-only watch) + write(repo, "src/auth.py", "def verify_token(token):\n return token == 'nope'\n") + + res = revalidate(repo, since=base) + assert res.candidates >= 1 # was 0 before the fix: the claim was never re-checked + assert "login-behavior" in {cid for _, cid, _ in res.broken} # a real CATCH + assert res.exit_code == 4 # load-bearing claim broke -> warrant REVOKED + + +@pytest.mark.slow +def test_benign_impl_change_re_runs_but_does_not_alarm( + c4_import_repo: Path, interpreter_on_path +) -> None: + repo = c4_import_repo + assert _verify(repo, [C4_BEHAVIOR_CLAIM]) == 0 + base = 
gitio.head_ref(repo) + # a behavior-preserving edit: the test still passes. The widened watch selects the + # claim (the imported file changed) but the verdict comes from the test -> PASS, not + # BROKEN. File drift is a re-check TRIGGER, never proof a claim is false. + write( + repo, + "src/auth.py", + "def verify_token(token):\n # tidy up\n return token == 'good'\n", + ) + + res = revalidate(repo, since=base) + assert res.candidates >= 1 + assert "login-behavior" in {cid for _, cid, _ in res.passed} + assert "login-behavior" not in {cid for _, cid, _ in res.broken} + assert res.exit_code == 0 + + +@pytest.mark.slow +def test_deny_exec_makes_selected_claim_errored_not_broken( + c4_import_repo: Path, interpreter_on_path +) -> None: + from dorian.policy import ExecutionPolicy + + repo = c4_import_repo + assert _verify(repo, [C4_BEHAVIOR_CLAIM]) == 0 + base = gitio.head_ref(repo) + # would fail the test if executed: + write(repo, "src/auth.py", "def verify_token(token):\n return token == 'nope'\n") + + res = revalidate(repo, since=base, policy=ExecutionPolicy(allow_exec=False)) + assert res.candidates >= 1 # the wider watch still SELECTS the claim + assert "login-behavior" in {cid for _, cid, _ in res.errored} # blocked C4 -> ERRORED + assert "login-behavior" not in {cid for _, cid, _ in res.broken} # never BROKEN under deny-exec + assert res.exit_code == 5 # UNKNOWN (fail-closed), not REVOKED and not OK + + +def test_scope_lint_sees_import_derived_path(c4_import_repo: Path) -> None: + # the import-derived impl file is captured into the read-set, so it is scope-linted + # like any other entry: a restricted impl file refuses the seal (exit 6) unless allowed. + # No interpreter fixture needed: scope lint runs BEFORE any checker. 
+    repo = c4_import_repo
+    write(repo, "pyproject.toml", '[tool.dorian.scopes]\nrestricted = ["src/auth.py"]\n')
+    commit_all(repo, "restrict src/auth.py")
+    cp = _saved(repo, [C4_BEHAVIOR_CLAIM])
+    base = ["--repo", str(repo), "verify", "docs/design.md", "--claims", cp]
+    assert cli.main(base) == 6  # EXIT_SCOPE: import-derived src/auth.py is restricted
+    assert not (repo / "docs/design.md.warrant").exists()
+    assert cli.main([*base, "--allow-restricted"]) == 0  # explicit allowance seals
+
+
+@pytest.mark.slow
+def test_bindings_does_not_flag_import_exercised_watch_as_trigger_only(
+    c4_import_repo: Path, capsys
+) -> None:
+    import json
+
+    repo = c4_import_repo
+    assert _verify(repo, [C4_BEHAVIOR_CLAIM]) == 0
+    capsys.readouterr()  # flush verify's stdout before reading the bindings JSON
+    assert cli.main(["--repo", str(repo), "--json", "bindings", "docs/design.md"]) == 0
+    diags = json.loads(capsys.readouterr().out)["claims"]
+    claim_diag = next(c for c in diags if c["claim_id"] == "login-behavior")
+    # PIN THE WIDENING FIRST: import binding must have added src/auth.py to the watch.
+    # Without this, the test is vacuous — a narrow test-file-only watch is trivially
+    # checker-exercised, so 'trigger-only-symbol' would be absent even if the feature
+    # were reverted. The flag assertion only means something once widening is proven.
+    assert "src/auth.py" in claim_diag["watch"]
+    # the C4 test IMPORTS src/auth.py, so when it runs it exercises that file: the widened
+    # watch is checker-exercised, NOT trigger-only. Flagging it would (a) be wrong and (b)
+    # trip --binding-gate=fail on a good behavior claim.
+    assert "trigger-only-symbol" not in claim_diag["flags"]
+
+
+@pytest.mark.slow
+def test_binding_gate_fail_still_seals_good_c4_behavior_claim(
+    c4_import_repo: Path,
+) -> None:
+    # the accidental-gate guard: --binding-gate=fail must NOT refuse a load-bearing C4
+    # claim merely because import binding widened its watch (it would, if the import-deps
+    # were treated as trigger-only). It seals (exit 0).
+    repo = c4_import_repo
+    cp = _saved(repo, [C4_BEHAVIOR_CLAIM])
+    rc = cli.main(
+        ["--repo", str(repo), "verify", "docs/design.md", "--claims", cp, "--binding-gate", "fail"]
+    )
+    assert rc == 0
+    # PIN THE WIDENING: prove the gate tolerated an IMPORT-WIDENED watch specifically (not
+    # just an un-widened claim). The sealed C4 watch must contain the import-derived
+    # src/auth.py — else rc==0 would hold even with the feature reverted (vacuous pass).
+    assert "src/auth.py" in _warrant(repo).claims[0].checkers[0].watch
+
+
+def test_bind_suggest_surfaces_test_deps(c4_import_repo: Path, capsys) -> None:
+    import json
+
+    repo = c4_import_repo
+    cp = _saved(repo, [C4_BEHAVIOR_CLAIM])
+    assert cli.main(["--repo", str(repo), "--json", "bind-suggest", "--claims", cp]) == 0
+    sugg = json.loads(capsys.readouterr().out)["suggestions"]
+    s = next(x for x in sugg if x["claim_id"] == "login-behavior")
+    assert s["bind_test_deps"] == ["src/auth.py"]  # import-derived, content-free (path only)
+
+
+@pytest.mark.slow
+def test_rebind_upgrades_old_c4_warrant_watch(c4_import_repo: Path, interpreter_on_path) -> None:
+    from dorian.capture.manual import parse_manual
+    from dorian.seal import seal_artifact
+
+    repo = c4_import_repo
+    # simulate a PRE-import-binding warrant: seal with extra_watch=None so the imported
+    # impl (src/auth.py) is NOT watched and a change there would be silently skipped.
+    old = seal_artifact(
+        repo, "docs/design.md", parse_manual(["tests/test_login.py"], repo), [C4_BEHAVIOR_CLAIM]
+    )
+    assert old.claims[0].checkers[0].watch == ("tests/test_login.py",)  # narrow (pre-fix)
+
+    assert cli.main(["--repo", str(repo), "rebind", "docs/design.md"]) == 0
+    new = _warrant(repo)
+    assert new.claims[0].checkers[0].watch == ("tests/test_login.py", "src/auth.py")  # widened
+    assert new.supersedes == old.id
+    assert "src/auth.py" in {e.uri for e in new.read_set}
+
+
+@pytest.mark.slow
+def test_rebind_refuses_to_launder_a_since_broken_c4_claim(
+    c4_import_repo: Path, interpreter_on_path
+) -> None:
+    from dorian.capture.manual import parse_manual
+    from dorian.seal import seal_artifact
+
+    repo = c4_import_repo
+    # a PRE-import-binding warrant: narrow test-file-only watch (src/auth.py not watched).
+    old = seal_artifact(
+        repo, "docs/design.md", parse_manual(["tests/test_login.py"], repo), [C4_BEHAVIOR_CLAIM]
+    )
+    assert old.claims[0].checkers[0].watch == ("tests/test_login.py",)
+    # break the behavior the C4 test asserts. rebind re-runs every checker (born-verifiable),
+    # so the now-FAILING pytest must REFUSE the re-seal (exit 4) rather than launder a false
+    # claim into a fresh trusted sidecar with a widened watch.
+    write(repo, "src/auth.py", "def verify_token(token):\n return token == 'nope'\n")
+    assert cli.main(["--repo", str(repo), "rebind", "docs/design.md"]) == 4
+    # nothing written: the old sidecar (and its id and narrow watch) is left intact
+    assert _warrant(repo).id == old.id
+    assert _warrant(repo).claims[0].checkers[0].watch == ("tests/test_login.py",)
+
+
+@pytest.mark.slow
+def test_trusted_base_mode_runs_base_spec_with_widened_watch(
+    c4_import_repo: Path, interpreter_on_path
+) -> None:
+    # checker-source=base: the wider watch (baked into the head sidecar) still SELECTS the
+    # claim; base mode resolves the (here identical) checker spec from the base ref and runs
+    # it against head sources. A broken impl on head -> BROKEN, exit 4. Import binding does
+    # not weaken trusted-base fail-closed semantics.
+    repo = c4_import_repo
+    assert _verify(repo, [C4_BEHAVIOR_CLAIM]) == 0
+    commit_all(repo, "commit warrant so it exists on the base ref")
+    base = gitio.head_ref(repo)
+    write(repo, "src/auth.py", "def verify_token(token):\n return token == 'nope'\n")
+
+    res = revalidate(repo, since=base, checker_source="base")
+    assert "login-behavior" in {cid for _, cid, _ in res.broken}
+    assert res.exit_code == 4
+
+
+def _saved(repo: Path, claims: list[Claim]) -> str:
+    claims_io.save_claims(repo / "claims.json", claims)
+    return str(repo / "claims.json")
diff --git a/tests/test_c4_import_binding_bench.py b/tests/test_c4_import_binding_bench.py
new file mode 100644
index 0000000..a1a6b3d
--- /dev/null
+++ b/tests/test_c4_import_binding_bench.py
@@ -0,0 +1,104 @@
+"""Integration tests for the C4 import-aware binding benchmark.
+
+Pins the EVIDENCE properties, not just that it runs: the pre-fix (test-file-only)
+watcher misses implementation-only edits while the import-aware watcher selects
+them; ambiguity is an honest skip (not credited); the verdict layer tracks the
+test (broken impl -> BROKEN, benign impl -> not BROKEN) with zero false BROKEN
+from a behavior-preserving edit; the summary is deterministic and content-safe.
+
+Slow: the verdict layer seals + revalidates with real pytest (C4) subprocesses.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+from pathlib import Path
+
+import pytest
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+if str(REPO_ROOT) not in sys.path:  # bench/ is a repo-root package, not installed
+    sys.path.insert(0, str(REPO_ROOT))
+
+from bench import c4_import_binding as bench  # noqa: E402
+
+pytestmark = pytest.mark.slow
+
+
+@pytest.fixture(scope="module")
+def _interpreter_on_path():
+    # the verdict layer spawns `python -m pytest`; make pytest resolvable on PATH.
+    # Module-scoped fixtures cannot use the function-scoped `monkeypatch`, so open a
+    # MonkeyPatch context: the PATH edit is undone at module teardown, not leaked.
+    with pytest.MonkeyPatch.context() as mp:
+        mp.setenv("PATH", str(Path(sys.executable).parent), prepend=os.pathsep)
+        yield
+
+
+@pytest.fixture(scope="module")
+def run(tmp_path_factory, _interpreter_on_path) -> tuple[dict, list[dict]]:
+    # one quick-mode benchmark run, shared by every test in this module (module scope)
+    ws = tmp_path_factory.mktemp("c4ib-bench")
+    return bench.run_benchmark(ws, quick=True)
+
+
+@pytest.fixture(scope="module")
+def summary(run) -> dict:
+    # the benchmark result is a (summary, records) pair; expose the summary half
+    return run[0]
+
+
+def test_schema_and_composition(summary: dict) -> None:
+    assert summary["schema"] == bench.SCHEMA
+    comp = summary["composition"]
+    assert comp["direct_import_scenarios"] >= 4
+    assert comp["ambiguous_skip_scenarios"] >= 1
+    assert comp["verdict_scenarios"] == 2
+
+
+def test_selection_recall_is_the_headline_improvement(summary: dict) -> None:
+    sel = summary["selection_layer"]
+    # the gap this patch closes: the pre-fix test-file-only watcher selects NONE of the
+    # implementation-only edits; the import-aware watcher selects ALL direct-import ones.
+    assert sel["test_file_watcher_recall_direct"] == 0.0
+    assert sel["import_aware_watcher_recall_direct"] == 1.0
+
+
+def test_ambiguous_import_is_an_honest_skip(summary: dict) -> None:
+    # a wrong watch is a false alarm, so an ambiguous import (two tracked tail matches) is
+    # left unbound — selected 0 times, scored as a miss rather than credited as a win.
+    assert summary["selection_layer"]["import_aware_selected_ambiguous"] == 0
+
+
+def test_verdict_layer_tracks_the_test_not_the_file_change(summary: dict) -> None:
+    v = summary["verdict_layer"]
+    # selection is not alarm: of the selected+rechecked claims, the verdict matches the test
+    # (broken impl -> BROKEN, benign impl -> not BROKEN) and a benign edit NEVER alarms.
+    assert v["alarm_match"] == "2/2"
+    assert v["false_broken_from_benign_edit"] == 0
+
+
+def test_deterministic_rerun_is_byte_identical(tmp_path_factory, _interpreter_on_path) -> None:
+    # two independent runs in fresh workspaces must serialise to the same canonical JSON
+    a = bench.run_benchmark(tmp_path_factory.mktemp("c4ib-a"))[0]
+    b = bench.run_benchmark(tmp_path_factory.mktemp("c4ib-b"))[0]
+    assert json.dumps(a, sort_keys=True) == json.dumps(b, sort_keys=True)
+
+
+def test_no_forbidden_overclaim_wording(summary: dict) -> None:
+    prose = (bench._render_md(summary) + " " + summary["limits"]).lower()
+    for word in bench.FORBIDDEN_WORDS:
+        # prose is lower-cased, so fold the needle too: a capitalised entry would
+        # otherwise never match and the check would pass vacuously
+        assert word.lower() not in prose, f"overclaim wording in benchmark output: {word!r}"
+
+
+def test_records_are_content_free_paths_only(run) -> None:
+    # resolved deps are repo-relative paths, never source content
+    _summary, records = run
+    for r in records:
+        for dep in r.get("resolved_deps", []):
+            assert dep.endswith(".py")
+            assert "\n" not in dep and "def " not in dep