diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..4dc14d5 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,14 @@ +version: 2 +updates: + # Keep the SHA-pinned GitHub Actions fresh: Dependabot bumps each `@<sha> # vX` + # to the next version's SHA with an updated comment, so pinning does not freeze + # us on a stale (potentially vulnerable) action version. + - package-ecosystem: github-actions + directory: "/" + schedule: + interval: weekly + # Python deps (reads pyproject.toml: dev group + optional extras). + - package-ecosystem: pip + directory: "/" + schedule: + interval: weekly diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 05f5c17..3ab3288 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,8 +13,8 @@ jobs: matrix: python: ["3.11", "3.12", "3.13"] steps: - - uses: actions/checkout@v6.0.3 - - uses: astral-sh/setup-uv@v8.2.0 + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + - uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0 with: { python-version: "${{ matrix.python }}" } - run: uv sync --all-extras - run: uv run ruff check src tests bench diff --git a/.github/workflows/public-microbench.yml b/.github/workflows/public-microbench.yml index 1659fb1..0f23797 100644 --- a/.github/workflows/public-microbench.yml +++ b/.github/workflows/public-microbench.yml @@ -27,14 +27,14 @@ jobs: microbench: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6.0.3 - - uses: astral-sh/setup-uv@v8.2.0 + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + - uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0 with: { python-version: "3.12" } - run: uv sync --all-extras # Cache cloned public subjects keyed by the manifest's frozen SHAs, so a re-run is offline. 
- name: Cache frozen subject checkouts - uses: actions/cache@v4 + uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4 with: path: .bench/public-work/checkouts key: public-microbench-checkouts-${{ hashFiles('bench/public/manifest.v1.yaml') }} @@ -55,7 +55,7 @@ jobs: - name: Upload results if: always() - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 with: name: public-microbench-results path: bench/public/results/ diff --git a/.github/workflows/publish-testpypi.yml b/.github/workflows/publish-testpypi.yml index 98dc5bc..12c044d 100644 --- a/.github/workflows/publish-testpypi.yml +++ b/.github/workflows/publish-testpypi.yml @@ -27,17 +27,17 @@ jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6.0.3 + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 with: ref: ${{ inputs.ref }} - - uses: actions/setup-python@v5 + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: { python-version: "3.12" } - name: Build sdist + wheel run: | python -m pip install --upgrade build twine python -m build python -m twine check dist/* - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 with: name: dist path: dist/ @@ -51,11 +51,11 @@ jobs: permissions: id-token: write # OIDC: mint a short-lived token, no stored secret steps: - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 with: name: dist path: dist/ - name: Publish to TestPyPI (Trusted Publishing dry-run) - uses: pypa/gh-action-pypi-publish@release/v1 + uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b # v1.14.0 with: repository-url: https://test.pypi.org/legacy/ diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index ab78b75..b785e00 100644 --- a/.github/workflows/publish.yml +++ 
b/.github/workflows/publish.yml @@ -26,10 +26,10 @@ jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 with: ref: ${{ inputs.ref }} - - uses: actions/setup-python@v5 + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: python-version: "3.12" - name: Build sdist + wheel @@ -48,7 +48,7 @@ jobs: PY env: REF: ${{ inputs.ref }} - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 with: name: dist path: dist/ @@ -62,9 +62,9 @@ jobs: permissions: id-token: write # OIDC: mint a short-lived token, no stored secret steps: - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 with: name: dist path: dist/ - name: Publish to PyPI (Trusted Publishing) - uses: pypa/gh-action-pypi-publish@release/v1 + uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b # v1.14.0 diff --git a/.github/workflows/release-gate.yml b/.github/workflows/release-gate.yml index a327cec..19637d7 100644 --- a/.github/workflows/release-gate.yml +++ b/.github/workflows/release-gate.yml @@ -31,11 +31,11 @@ jobs: matrix: python: ["3.11", "3.12", "3.13"] steps: - - uses: actions/checkout@v6.0.3 + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 with: ref: ${{ inputs.ref || github.ref }} fetch-depth: 0 - - uses: astral-sh/setup-uv@v8.2.0 + - uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0 with: { python-version: "${{ matrix.python }}" } - run: uv sync --all-extras - run: uv run ruff check src tests bench @@ -50,11 +50,11 @@ jobs: id-token: write # OIDC for Sigstore signing — short-lived, no stored secret attestations: write # write the build-provenance attestation steps: - - uses: actions/checkout@v6.0.3 + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 with: 
ref: ${{ inputs.ref || github.ref }} fetch-depth: 0 - - uses: actions/setup-python@v5 + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: { python-version: "3.12" } - name: Build sdist + wheel run: | @@ -78,10 +78,10 @@ jobs: run: | cd dist && sha256sum * | tee SHA256SUMS - name: Attest build provenance - uses: actions/attest-build-provenance@v1 + uses: actions/attest-build-provenance@ef244123eb79f2f7a7e75d99086184180e6d0018 # v1 with: subject-path: "dist/*.whl, dist/*.tar.gz" - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 with: name: dist path: dist/ diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml new file mode 100644 index 0000000..77ceed6 --- /dev/null +++ b/.github/workflows/security.yml @@ -0,0 +1,25 @@ +name: security +on: + push: { branches: [main] } + pull_request: + schedule: + - cron: "0 6 * * 1" # weekly Monday 06:00 UTC — catch newly-disclosed CVEs +# Least privilege: this job only reads the repo and queries advisory data. +permissions: + contents: read +jobs: + sca-sast: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + - uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0 + with: { python-version: "3.12" } + - run: uv sync --all-extras + # SCA: audit the resolved dependency tree (dev + extras) for known CVEs. + # Runtime deps are [], so the value is the dev/extras transitive trees. + - name: pip-audit (SCA) + run: uvx pip-audit + # SAST: static analysis of first-party source. Excludes the documented, + # policy-gated execution primitives via [tool.bandit] in pyproject.toml. + - name: bandit (SAST) + run: uvx bandit -c pyproject.toml -r src/ diff --git a/README.md b/README.md index 42e4b97..ad4fea7 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@

Quickstart - Demo + Demo GitHub Action

@@ -42,6 +42,7 @@ now and is re-checked on every future change, so a confident summary doesn't qui ## Table of contents +- [Try it in 30 seconds](#try-it-in-30-seconds) - [The 60-second aha](#the-60-second-aha) - [We ran this on dorian itself](#we-ran-this-on-dorian-itself) - [About](#about) @@ -60,10 +61,42 @@ now and is re-checked on every future change, so a confident summary doesn't qui - [License](#license) - [Contact](#contact) +## Try it in 30 seconds + +A self-contained run on a throwaway repo — copy-paste it; it leaves nothing behind but a +temp directory. (This exact sequence is pinned by a black-box test, so it is executable and +kept working, not just illustrative.) + +```bash +tmp=$(mktemp -d) && cd "$tmp" && git init -q +printf 'def handler():\n return 200\n' > app.py +printf '# change note\n\n`handler()` lives in app.py.\n' > note.md +git add -A && git commit -q -m "app + note" + +cat > claims.json <<'JSON' +{"claims": [ + {"id": "handler-exists", "text": "handler() lives in app.py.", + "kind": "behavior", "load_bearing": true, + "checkers": [{"type": "C3", "program": "symbol:app.py::handler"}]} +]} +JSON + +dorian verify note.md --claims claims.json # -> verified 1/1 claim(s) (exit 0) + +# now a refactor renames the function the note claims exists: +printf 'def renamed():\n return 200\n' > app.py +dorian revalidate --since HEAD # -> handler-exists BROKEN; WARRANTED -> REVOKED (exit 4) +``` + +`note.md` never changed and `git`/CI stay quiet — but the warrant flips to REVOKED, naming +the exact claim that stopped being true. (Don't have `dorian` yet? See +[Getting started](#getting-started).) 
+ ## The 60-second aha -An agent finishes a change and emits the claims it just made — a `claims.json` next to the work, -each claim bound to a read-only deterministic checker: +*(Illustrative — these files are not in your checkout; run the copy-paste demo above to try it +yourself.)* An agent finishes a change and emits the claims it just made — a `claims.json` next to +the work, each claim bound to a read-only deterministic checker: ```json { @@ -121,6 +154,16 @@ those claims named made `dorian revalidate` flag exactly that claim `BROKEN` and a committed artifact and not a benchmark figure — but it is evidence that the mechanism can catch this kind of checked break on real code, for zero model tokens. +We have since recorded a **documented, reproducible cross-PR catch on a public repo**. A +load-bearing claim sealed against [`encode/httpx`](https://github.com/encode/httpx) at one +commit — `requires-python` is `">=3.8"` — was flipped `WARRANTED → REVOKED` (exit 4) by a real +*later* upstream PR ([#3592](https://github.com/encode/httpx/pull/3592), "Drop Python 3.8 +support", which moved it to `">=3.9"`), while httpx's own test suite stayed green (no test +references `requires-python`) and no stateless per-PR review bot would have re-opened the +original claim. The full command output and a from-scratch reproduction on the public repo are +in [`docs/REAL_CATCH_LOG.md`](docs/REAL_CATCH_LOG.md) — one documented catch, with honest +scope, not a validation claim. + ## About An AI agent writes the code and then a confident account of what it did — a PR description, a commit @@ -286,8 +329,15 @@ rebuildable at any time with `dorian sync` — and is never committed. ## Getting started -The distribution is `dorian-vwp`; the import and CLI are `dorian`. The first PyPI release is on the -roadmap — until it lands, install from source: +The distribution is `dorian-vwp`; the import and CLI are `dorian`. 
Install from PyPI: + +```bash +pip install dorian-vwp # core, zero runtime dependencies +pip install 'dorian-vwp[data]' # + duckdb for parquet data claims +pip install 'dorian-vwp[extract]' # + anthropic for LLM claim drafting (frozen/experimental) +``` + +To install the latest unreleased changes, install from source instead: ```bash pip install 'dorian-vwp @ git+https://github.com/ajaysurya1221/dorian.git' @@ -297,14 +347,6 @@ pip install 'dorian-vwp[data] @ git+https://github.com/ajaysurya1221/dorian.git' pip install 'dorian-vwp[extract] @ git+https://github.com/ajaysurya1221/dorian.git' # + anthropic for LLM claim drafting (frozen/experimental) ``` -After the first PyPI release: - -```bash -pip install dorian-vwp # core, zero runtime dependencies -pip install 'dorian-vwp[data]' # + duckdb for parquet data claims -pip install 'dorian-vwp[extract]' # + anthropic for LLM claim drafting (frozen/experimental) -``` - Then run `dorian verify --claims claims.json` on one change. For CI, add the composite [GitHub Action](action/README.md) — it revalidates the claims a pull request touches and posts a sticky PR comment. **Read its @@ -334,35 +376,8 @@ jobs: install: 'dorian-vwp @ git+https://github.com/ajaysurya1221/dorian.git' ``` -### Try it in 30 seconds - -A self-contained run on a throwaway repo — copy-paste it; it leaves nothing behind but a -temp directory. (This exact sequence is pinned by a black-box test, so it is executable and -kept working, not just illustrative.) 
- -```bash -tmp=$(mktemp -d) && cd "$tmp" && git init -q -printf 'def handler():\n return 200\n' > app.py -printf '# change note\n\n`handler()` lives in app.py.\n' > note.md -git add -A && git commit -q -m "app + note" - -cat > claims.json <<'JSON' -{"claims": [ - {"id": "handler-exists", "text": "handler() lives in app.py.", - "kind": "behavior", "load_bearing": true, - "checkers": [{"type": "C3", "program": "symbol:app.py::handler"}]} -]} -JSON - -dorian verify note.md --claims claims.json # -> verified 1/1 claim(s) (exit 0) - -# now a refactor renames the function the note claims exists: -printf 'def renamed():\n return 200\n' > app.py -dorian revalidate --since HEAD # -> handler-exists BROKEN; WARRANTED -> REVOKED (exit 4) -``` - -`note.md` never changed and `git`/CI stay quiet — but the warrant flips to REVOKED, naming -the exact claim that stopped being true. +Now that `dorian` is installed, the copy-paste runnable demo at the top — +[Try it in 30 seconds](#try-it-in-30-seconds) — runs end to end against a throwaway repo. ## Writing claims an agent can be held to @@ -374,13 +389,17 @@ shape-tolerant checks like `regex:`/`symbol:`/typed-C5 over brittle `string:`) path/symbol/string/regex plus the V1 structural forms `py-signature:`/`py-const:` and the comment/docstring-stripped `code:`, C4 `pytest:`, C5 typed data) are documented in [`spec/checkers.md`](spec/checkers.md). What V1 strengthening does and does not promise is in -[`docs/V1_SCOPE.md`](docs/V1_SCOPE.md). +[`docs/V1_SCOPE.md`](docs/V1_SCOPE.md). Worked good/bad claim pairs — and the gutted-body +ceiling, where an existence check is too weak and you need a C4/C5 behavior check — are in +[`docs/WRITING_GOOD_CLAIMS.md`](docs/WRITING_GOOD_CLAIMS.md). > **Checker programs are executable.** `dorian verify` *runs* every checker at seal time. 
C3 and typed > C5 only inspect files, but C4 (`pytest:`) and C5 `shell:` execute code — review an agent-emitted > `claims.json` exactly as you would review agent-emitted code, and never run `verify` on claims from > an untrusted source. In untrusted contexts add `--deny-exec` to refuse the executable families -> (fail-closed, not a sandbox — see [SECURITY.md](SECURITY.md)). +> (fail-closed, not a sandbox — see [SECURITY.md](SECURITY.md)). For one copy-paste safe recipe for +> public/untrusted fork PRs (`checker_trust: base` + `deny_exec`), see +> [`docs/SECURITY_AND_SAFE_RUNNERS.md`](docs/SECURITY_AND_SAFE_RUNNERS.md). ## Command surface @@ -416,6 +435,14 @@ claims. refuses the re-seal (exit 4) rather than being laundered into a fresh trusted state. - `dorian suggest-data-checks <file> [--columns ...] [--out f]` — born-verifiable C5 checker suggestions from a data file's current state, for review and pasting into a claim's `checkers` list. +- `dorian suggest-claims <file.py> [--out f]` — born-verifiable C3 claim suggestions (`symbol:` for + defs/classes, `py-const:` for literal constants) for a Python file: each candidate is run and only + passing ones are emitted, `load_bearing` defaults to false, ambiguous symbols are skipped. Review + scaffolding (existence/value, not behavior) — see + [`docs/design/SUGGEST_CLAIMS.md`](docs/design/SUGGEST_CLAIMS.md). +- `dorian export --in-toto <artifact>` — project a sealed `.warrant` into an experimental in-toto + `ClaimVerification` Statement (deterministic, no signing, zero deps); experimental interop — + see [`docs/ATTESTATION_INTEROP.md`](docs/ATTESTATION_INTEROP.md). - `dorian report --audit` — the full event log as `dorian-audit-v1` JSONL, byte-identical across runs; checker details truncated to 160 chars to bound source-content carryover. - `dorian revalidate --format md|json` — `md` is the PR-comment body posted by the @@ -464,8 +491,9 @@ work perishable, so you find out when it expired. 
## Roadmap -- **Real catches on real repos** — the dogfood above made the loop usable; next is using it daily and - recording the breaks it catches that would otherwise have shipped. +- **Real catches on real repos** — the loop is usable and the first documented cross-PR catch is + recorded ([`docs/REAL_CATCH_LOG.md`](docs/REAL_CATCH_LOG.md), on `encode/httpx`); next is using it + daily and recording more of the breaks it catches that would otherwise have shipped. - **The binding gap, narrowed and measured** — a symbol→defining-file index now re-checks a claim when its symbol's definer changes, closing the silent-skip *trigger* gap ([`docs/BENCHMARK_BINDING_LIFECYCLE.md`](docs/BENCHMARK_BINDING_LIFECYCLE.md)). What remains is the @@ -479,8 +507,8 @@ work perishable, so you find out when it expired. ([`docs/BENCHMARK_PUBLIC_REAL_REPOS.md`](docs/BENCHMARK_PUBLIC_REAL_REPOS.md)). These are **reproducible on those frozen SHAs only** — not a real-world performance claim; the trigger and truth layers are reported separately. -- **PyPI trusted publishing** — tagged releases now ship (latest: **`v1.0.0rc2`**, a V1 release - candidate / prerelease); publishing `dorian-vwp` to PyPI via a Trusted Publisher is next. +- **PyPI trusted publishing** — `dorian-vwp` is published to PyPI via a Trusted Publisher + (latest: **`v1.0.0`**); `pip install dorian-vwp` installs the released package. Non-goals stay non-goals: no servers, no dashboards, no hosted control plane, no model at check time. Local-first is the design center. 
diff --git a/action/README.md b/action/README.md index 51b2254..736db1d 100644 --- a/action/README.md +++ b/action/README.md @@ -25,11 +25,11 @@ jobs: with: fetch-depth: 0 # REQUIRED: revalidate diffs against the PR base # sha, which a shallow clone does not contain - - uses: ajaysurya1221/dorian/action@main + - uses: ajaysurya1221/dorian/action@v1.0.0 with: fail_on: revoked - # until the first PyPI release, install from source: - install: 'dorian-vwp @ git+https://github.com/ajaysurya1221/dorian.git' + # install defaults to the published PyPI package (dorian-vwp); + # override only to pin a version or install unreleased changes. ``` `fetch-depth: 0` is required because `dorian revalidate --since` runs @@ -77,7 +77,7 @@ self-attested-verdict problem for *non-executable* checkers — that is what ```yaml # untrusted / public-fork posture -- uses: ajaysurya1221/dorian/action@main +- uses: ajaysurya1221/dorian/action@v1.0.0 with: deny_exec: "true" # C4/C5 ERROR instead of executing ``` @@ -93,7 +93,7 @@ executed). Implemented and proven by the ```yaml # public / forked-PR posture: trusted checker specs + no code execution -- uses: ajaysurya1221/dorian/action@main +- uses: ajaysurya1221/dorian/action@v1.0.0 with: checker_trust: base # run only base-approved checker specs deny_exec: "true" # and refuse to execute even those (belt and braces) @@ -126,12 +126,13 @@ Hard rules either way: | --------------- | -------------------------------------------- | ------------------------------------------------------------------------ | | `fail_on` | `revoked` | when to fail the step: `revoked` (exit 4 only), `degraded` (3 or 4), `never` | | `base` | `${{ github.event.pull_request.base.sha }}` | git ref passed to `dorian revalidate --since` | -| `install` | `dorian-vwp` | pip spec; until the first PyPI release use the git source spec (below), or `.` for checkout installs | +| `install` | `dorian-vwp` | pip spec; defaults to the published PyPI package. 
Use the git source spec (below) for unreleased changes, or `.` for checkout installs | | `deny_exec` | `false` | refuse to run executable checkers (C4 pytest, C5 shell): they ERROR. For untrusted/fork PRs; fail-closed, not a sandbox | | `deny_shell` | `false` | narrower than `deny_exec`: block only C5 shell, still allow C4 pytest | | `checker_trust` | `head` | `head` runs the checked-out checker spec (trusted repos); `base` runs the base-ref spec so PR-authored executable checkers never run (public/fork PRs) | -Until the first PyPI release of `dorian-vwp`, set `install` to a source spec: +To install unreleased changes instead of the published `dorian-vwp` package, set +`install` to a source spec: `install: 'dorian-vwp @ git+https://github.com/ajaysurya1221/dorian.git'`. ## Behavior diff --git a/action/action.yml b/action/action.yml index 67d5697..0f9b11e 100644 --- a/action/action.yml +++ b/action/action.yml @@ -23,9 +23,10 @@ inputs: default: ${{ github.event.pull_request.base.sha }} install: description: >- - pip requirement spec for dorian. Until the first PyPI release, use a git - source spec ('dorian-vwp @ git+https://github.com/ajaysurya1221/dorian.git'), - pass '.' to install the checked-out source, or pin a tag once published. + pip requirement spec for dorian. Defaults to the published PyPI package + ('dorian-vwp'); pin a version to lock it. Use a git source spec + ('dorian-vwp @ git+https://github.com/ajaysurya1221/dorian.git') for + unreleased changes, or pass '.' to install the checked-out source. required: false default: dorian-vwp deny_exec: diff --git a/bench/public/README.md b/bench/public/README.md index be9a258..6e48f6d 100644 --- a/bench/public/README.md +++ b/bench/public/README.md @@ -1,24 +1,33 @@ -# bench/public — public-repo micro-benchmark (scaffold) +# bench/public — public-repo micro-benchmark -Scaffold for the public-repo micro-benchmark. 
**No harness and no results live here yet** — this is -the home a future `dorian bench public-repos` subcommand will occupy. +Home of the public-repo micro-benchmark. The `dorian bench public-repos` harness is **implemented** +(`bench/public_repos.py`) and has been **executed**; the results doc is +[`../../docs/BENCHMARK_PUBLIC_REAL_REPOS.md`](../../docs/BENCHMARK_PUBLIC_REAL_REPOS.md). The design is pre-registered in -[`../../docs/PUBLIC_BENCHMARK_PROTOCOL.md`](../../docs/PUBLIC_BENCHMARK_PROTOCOL.md). In brief: +[`../../docs/PUBLIC_BENCHMARK_PROTOCOL.md`](../../docs/PUBLIC_BENCHMARK_PROTOCOL.md); the shipped run +diverged from that pre-registration on two points, recorded in its +[§9 Amendment (shipped)](../../docs/PUBLIC_BENCHMARK_PROTOCOL.md#9-amendment-shipped). In brief: -- Two genuinely public repos at frozen SHAs (`encode/httpx`, `pallets/click`). -- The committed public manifest is [`repos.public.json`](repos.public.json), which is evidence - scaffolding only. -- Manual claims only (no `--extract`). -- Trigger/selection metrics and truth/alarm metrics reported separately. -- Published results must add a reproducibility manifest (repo URL + SHA, artifact, claim ids, exact - rerun command, tool versions) so results reproduce byte-for-byte. -- Reproducibility evidence on the pinned set — not a real-world performance claim. +- **Executed repos:** `humanize` (`2ddb5903cdc1`, MIT) and `python-dotenv` (`36004e0e34be`, + BSD-3-Clause), each with 4 machine-derived claims (8 total). The candidate top-5 starter set is + pinned in [`manifest.v1.yaml`](manifest.v1.yaml); `tomli`/`bandit`/`jaffle_shop_duckdb` remain + `NO_CLAIMS` and `sigstore-python` is excluded (unconfirmed `NOASSERTION` license). +- The original two-repo pre-registration (`encode/httpx`, `pallets/click`) stays pinned in + [`repos.public.json`](repos.public.json) as the frozen pre-registration inputs; those two were not + executed. 
+- **Claims are machine-derived**, not hand-authored: `bench/public_claims.py` extracts each operand + from source (stdlib `ast`/`tomllib`/`json`) and derives the ground-truth label by + Chain-of-Verification — running the real C3 checker on the mutated copy and recording the observed + verdict. `--extract` stays frozen and is not used. Because the label is the checker's own verdict, + this is determinism / reproducibility on these frozen SHAs, not a measure of catch power. +- Trigger/selection metrics and truth/alarm metrics are reported separately; `ERRORED` is its own + bucket and is never an alarm. +- Results carry a reproducibility manifest (repo URL + SHA, claim ids, exact rerun command, tool + versions); each subject was run twice and the output compared byte-for-byte. +- Reproducibility evidence on the pinned set — not a real-world performance claim, and it does not + transfer to other repositories. The frozen clones used for local development may sit under `bench/real/` (gitignored: clones and worktrees, never committed, never linted). Private or local clones are excluded from any published public benchmark — see §2 of the protocol. - -When the harness lands, it goes here and is wired into `_BENCH_DISPATCH` in -`src/dorian/commands.py`; results are published in a separate `docs/BENCHMARK_PUBLIC_REPOS.md` that -cites the protocol. No benchmark results are published yet. diff --git a/docs/ATTESTATION_INTEROP.md b/docs/ATTESTATION_INTEROP.md new file mode 100644 index 0000000..0a40047 --- /dev/null +++ b/docs/ATTESTATION_INTEROP.md @@ -0,0 +1,60 @@ +# Attestation interop — `dorian export --in-toto` (experimental) + +dorian can project a sealed `.warrant` into an [in-toto](https://in-toto.io/) Statement so a +claim-verification result can ride the same attestation rails as build provenance (SLSA), +Test Result, and Vulnerability predicates. This is **experimental interop**, not an official +predicate standard. 
+ +## Why + +in-toto's vetted predicate registry has no `ClaimVerification` type (the closest are Test +Result, Vulnerability, and the Verification Summary Attestation). A sealed warrant — a +natural-language claim bound to a deterministic checker, verified at a content digest — is a +natural fit for one. dorian emits its own experimental predicate type until/unless a standard +one exists. + +## Usage + +```bash +dorian export --in-toto path/to/note.md # -> JSON in-toto Statement on stdout +``` + +It reads `path/to/note.md.warrant`, verifies the sidecar's content-addressed id, and prints a +Statement. It is a **pure, deterministic projection** of the warrant: no signing, no DSSE +envelope, no network, no model, no extra dependencies. Pipe it to a file or a signer of your +choice. + +## Shape + +```json +{ + "_type": "https://in-toto.io/Statement/v1", + "subject": [{ "name": "<artifact path>", "digest": { "sha256": "<hex digest>" } }], + "predicateType": "https://dorian.vwp/attestations/ClaimVerification/v0.1", + "predicate": { + "warrantId": "sha256:...", + "specVersion": "...", + "dorianVersion": "1.0.0", + "gitRef": "<git ref>", + "sealedAt": "<timestamp>", + "bornVerifiable": true, + "producedBy": { "runner": "...", "captured_at": "..." }, + "claims": [ + { "id": "...", "text": "...", "kind": "...", "loadBearing": true, "backed": true, + "checkers": [{ "type": "C3", "program": "config-value:..." }] } + ] + } +} +``` + +## Honest scope + +- The Statement attests **what dorian sealed and verified at seal time** (`bornVerifiable` + means the seal exists only because every backed claim passed its checker then). The **live** + trust state — which can later flip to REVOKED as code drifts — comes from `dorian status` / + `dorian revalidate`, **not** from this static export. Re-export after a revalidate to capture + a newer state. +- `predicateType` is a dorian-namespaced, **experimental** URI (`…/v0.1`); it is not a + registered in-toto predicate and may change. +- No signing is performed. 
Wrap the Statement in DSSE / sign with your own tooling (cosign, + etc.) if you need a verifiable envelope. diff --git a/docs/BENCHMARK_CURRENT.md b/docs/BENCHMARK_CURRENT.md index 373d283..19653db 100644 --- a/docs/BENCHMARK_CURRENT.md +++ b/docs/BENCHMARK_CURRENT.md @@ -10,20 +10,22 @@ and are kept as-is for provenance. | field | value | | --- | --- | -| dorian version | `1.0.0rc2` (V1 release candidate) | +| dorian version | `1.0.1` | | metric commit | `33e9eaf` (the benchmark figures were measured here, during the release audit) | -| release commit | rc2 changes after the metric commit are release-state tooling, docs, and version metadata; the figures below remain the stamped metric-run evidence, not a fresh rc2 benchmark claim | +| release commit | `81cebbc` (1.0.1). Changes since the metric commit include checker edge-case fixes (C4 leading-dash nodeid rejection, C5 reconcile per-query timeout), a byte-identical index-once `verify` refactor, and two additive commands (`suggest-claims`, `export --in-toto`); both suites below were **re-run at 1.0.1 and reproduce the metric-commit figures exactly** (binding-lifecycle to the same content-derived `run_id`), so these changes do not move what the suites measure | | Python | 3.12.4 | | platform | darwin (CI matrix: 3.11 / 3.12 / 3.13) | | reproduce | `dorian bench large-mutation` · `dorian bench binding-lifecycle` · `dorian bench realworld-usecases` | -These numbers were re-run at the `1.0.0rc1` commit *after* the adversarial-review fixes -landed AND again during the independent release audit, confirming those fixes (py-const type -check, `code:` docstring handling, config-key stopwords) did not move the benchmark figures — -expected, since the suites exercise C1/C3 (symbol/regex/string/path)/C5, not the new -structural/config-binding paths. The `1.0.0rc2` stamp keeps this current-version doc aligned -with the source package version without upgrading the benchmark claim beyond the recorded -metric-run evidence. 
+These numbers were re-run at the `1.0.0rc1` commit after the adversarial-review fixes landed, +again during the independent release audit, and again at `1.0.1` — each time reproducing the +metric-commit figures unchanged (the binding-lifecycle rerun lands on the same content-derived +`run_id 168b50d9aa631d52`, a byte-identical result). The suites exercise C1/C3 +(symbol/regex/string/path) and C5, not malformed-nodeid, pathological-query, or the +structural/config-binding paths, so the `1.0.1` checker fixes and the two additive commands do +not — and did not — move them. The version stamp keeps this current-version doc aligned with the +source package version without upgrading the benchmark claim beyond the recorded metric-run +evidence. ## Results diff --git a/docs/PUBLIC_BENCHMARK_PROTOCOL.md b/docs/PUBLIC_BENCHMARK_PROTOCOL.md index 46eacac..cb10f02 100644 --- a/docs/PUBLIC_BENCHMARK_PROTOCOL.md +++ b/docs/PUBLIC_BENCHMARK_PROTOCOL.md @@ -1,5 +1,14 @@ # Public micro-benchmark protocol (pre-registration) +> **Amendment (shipped) — read first.** The §2 repository table below (httpx/click) and the §3 +> "claims authored by hand" rule are the **original pre-registration**, kept verbatim so the design +> cannot be retro-edited to flatter the outcome. **What actually shipped diverged**, by design — see +> [§9 Amendment (shipped)](#9-amendment-shipped) for the executed repos (`humanize`, `python-dotenv`), +> the machine-derived (Chain-of-Verification) ground-truth method that replaced hand-authored claims, +> and why. The executed results live in +> [`BENCHMARK_PUBLIC_REAL_REPOS.md`](BENCHMARK_PUBLIC_REAL_REPOS.md). Sections 1 and 4 (what this +> proves; the two-layer split) carried through to the shipped run unchanged. 
+ **Status: protocol only — no results are published in this document.** This is the next rung of [`SOLO_VALIDATION_LADDER.md`](SOLO_VALIDATION_LADDER.md): moving from invented synthetic fixtures (the [v0.7.0 controlled-mutation benchmark](BENCHMARK_v0.7.0.md) and the @@ -75,7 +84,13 @@ fields per (repo, artifact): 3. Publish results in a separate doc that cites this protocol; never edit measured numbers to match the expectation — record the mismatch. -## 7. Reproduce (once a harness lands) +## 7. Reproduce + +> **Shipped status:** the `dorian bench public-repos` subcommand **is now implemented** +> (`bench/public_repos.py`) and was executed against the [§9](#9-amendment-shipped) repo set; the +> exact reproduce commands are in [`BENCHMARK_PUBLIC_REAL_REPOS.md`](BENCHMARK_PUBLIC_REAL_REPOS.md). +> The paragraph below is the **original pre-registration text** (written when the harness was still a +> scaffold) and is kept for the audit trail. A `dorian bench public-repos` subcommand is **not yet implemented** — see [`bench/public/README.md`](../bench/public/README.md) for the scaffold and @@ -91,3 +106,40 @@ the pinned set", "reproducibility evidence, not a real-world performance claim". proven, validated, real-world validated, universal, guaranteed, production-ready, production-grade, semantic proof, "works on real repos", "fully solves" anything. Benchmark contributions carry aggregate numbers only — never private repository content. + +## 9. Amendment (shipped) + +The shipped run diverged from §2–§3 above on two points. Both divergences are recorded here rather +than by editing the frozen pre-registration, so the change is visible. + +**(a) Repositories executed.** §2 pre-registered `encode/httpx` and `pallets/click` as the candidate +inputs. 
The executed run instead used two smaller, stable-API repos with frozen SHAs: + +| repo | frozen SHA | license | status | +|---|---|---|---| +| `humanize` | `2ddb5903cdc1` | MIT | PASS (4 claims) | +| `python-dotenv` | `36004e0e34be` | BSD-3-Clause | PASS (4 claims) | + +The candidate top-5 starter set (`humanize`, `python-dotenv`, `tomli`, `bandit`, `jaffle_shop_duckdb`, +with `sigstore-python` excluded for an unconfirmed `NOASSERTION` license) is pinned in +[`bench/public/manifest.v1.yaml`](../bench/public/manifest.v1.yaml). Only `humanize` and +`python-dotenv` produced a `benchmark-ready` claim set; the other three remain `NO_CLAIMS` and emit no +number. `httpx`/`click` were not executed; their SHAs stay pinned in +[`bench/public/repos.public.json`](../bench/public/repos.public.json) as the original frozen inputs. + +**(b) How claims and ground truth are produced.** §3 pre-registered **hand-authored** claims. The +shipped run replaced hand-authoring with a **deterministic, machine-derived** synthesizer +([`bench/public_claims.py`](../bench/public_claims.py)) — `--extract` stays frozen and is still not +used. The synthesizer extracts each claim operand from source with the stdlib (`ast` for Python, +`tomllib`/`json` for config), then derives the ground-truth label by **Chain-of-Verification**: +*"the truth label is machine-observed, not a human assertion"* — it applies the pre-declared mutation +to a one-file copy, runs the real C3 checker, and records the **observed** verdict (`FAIL → BROKEN`, +`PASS → TRUSTED`). Because the label comes from the checker's **own** verdict, this measures +**determinism / reproducibility on these frozen SHAs**, not catch power: it cannot show the checker +catches drift it would itself miss. A target whose clean claim does not PASS, or whose mutation does +not produce the declared verdict, is auto-rejected and never forced into the set. 
+ +**(c) What carried through unchanged.** §1 (reproducibility evidence only, not a real-world +performance claim) and §4 (trigger/selection and truth/alarm layers reported separately, with +`ERRORED` as its own bucket that is never an alarm) held for the shipped run. The §8 wording +discipline is enforced mechanically by the report renderer. diff --git a/docs/REAL_CATCH_LOG.md b/docs/REAL_CATCH_LOG.md index 5c33b69..7501a24 100644 --- a/docs/REAL_CATCH_LOG.md +++ b/docs/REAL_CATCH_LOG.md @@ -45,6 +45,143 @@ checker could not confirm — trigger-vs-truth gap), or **weak-binding-warning** ## Entries -_None yet. This file ships empty on purpose: dorian has not yet accumulated real -external catches, and inventing them would violate [VALIDATION_HONESTY.md](VALIDATION_HONESTY.md). -The first honest entry here is worth more than any number in the benchmark docs._ +### 2026-06-17 — httpx `requires-python` floor `>=3.8` → `>=3.9` (real upstream PR #3592) + +- **Claim:** "httpx declares a minimum supported Python of 3.8 (pyproject project.requires-python is `">=3.8"`)." +- **Checker:** `C3 config-value:pyproject.toml:project.requires-python:">=3.8"` +- **Repo / project:** [`encode/httpx`](https://github.com/encode/httpx) (public-safe? 
**yes** — BSD-3-Clause, frozen public SHAs) +- **Source commit that sealed it:** `336204f0121a9aefdebac5cacd81f912bafe8057` (commit A) +- **Change that triggered revalidation:** `4fb9528c2f5ac000441c3634d297e77da23067cd` — real upstream **"Drop Python 3.8 support (#3592)"** by Alex Grönholm +- **Outcome:** **true-catch** +- **Verdict dorian gave:** **BROKEN** (`WARRANTED → REVOKED`, exit 4) +- **Would you have shipped the break otherwise?** **yes** — `requires-python` is packaging metadata covered by no test (`grep -rn requires-python tests/` is empty; B's diff touches 8 files, none under `tests/`), so httpx's CI is green at B; and a per-PR review bot is stateless — it has no memory of commit A's claim and PR #3592's own diff is self-consistent, so nothing re-opens the old note. +- **User time spent (setup + review):** ~10 minutes +- **Reviewer notes:** dogfood on a real public repo at frozen SHAs; **independently reproduced** end-to-end. The other three sealed claims (`Client` defined in `_client.py`, `Client` exported, version `0.28.1`) stayed VERIFIED — dorian narrowed revalidation to the **1 candidate** whose source (`pyproject.toml`) actually changed. Full captured output and reproduction below. + +#### Captured output + +**1. Seal at A — `dorian verify` (exit 0):** + +``` +$ dorian verify note.md --claims claims.json +sha256:7db02138b329729b4f84b20d37a1154e237c07993783750a3c26e3531334b8a2 +verified 4/4 claim(s) against current sources -> note.md.warrant +# exit 0 +``` + +The warrant id is `sha256(canonical_json(body))` over the sealed body — **tamper-evident** +(any later edit to the warrant is detected on load via an id mismatch). The body includes the +seal timestamp, so a *fresh* seal of the same inputs produces a *different* id; the id shown +here is from this run. What reproduces across runs is the **outcome**, not the id: a seal at A +(exit 0, 4/4) and a flip to REVOKED at B (exit 4). + +**2. 
The real upstream drift — `git show 4fb9528 --stat`:** + +``` + Drop Python 3.8 support (#3592) + .github/workflows/publish.yml | 2 +- + .github/workflows/test-suite.yml | 2 +- + CHANGELOG.md | 6 ++++++ + README.md | 2 +- + docs/async.md | 2 +- + docs/index.md | 2 +- + pyproject.toml | 3 +-- + requirements.txt | 3 +-- + 8 files changed, 13 insertions(+), 9 deletions(-) + +# the pyproject.toml hunk: -requires-python = ">=3.8" +requires-python = ">=3.9" +``` + +**3. The drift is silent to the test suite** — PR #3592 touches no test file, and no test +references the key: + +``` +$ grep -rn "requires-python" tests/ +# (no matches) +``` + +**4. Re-check at B — `dorian revalidate --since A` (exit 4):** + +``` +$ dorian revalidate --since 336204f0121a9aefdebac5cacd81f912bafe8057 +checked 1 candidate claim(s) +BROKEN sha256:7db02138b329729b httpx-python-floor-38 C3: config_value_mismatch: project.requires-python +fold sha256:7db02138b329729b WARRANTED -> REVOKED +# exit 4 +``` + +**5. Resulting state — `dorian status` (exit 4):** + +``` +$ dorian status note.md +REVOKED note.md sha256:7db02138b329729b BROKEN=1 VERIFIED=3 +``` + +#### The change-note and claims (verbatim, so the run is reproducible) + +`note.md`: + +```markdown +# Change note: pin our integration to httpx's supported Python floor + +We depend on `httpx` and need our CI matrix to track the library's own support +window. As of this change, the facts our integration relies on are: + +- httpx's packaging declares a minimum Python of **3.8** (`project.requires-python` + is `">=3.8"` in `pyproject.toml`), so our service may still run on Python 3.8. +- The public `Client` class is defined in `httpx/_client.py`. +- `Client` is listed in the top-level `httpx` package exports (`httpx/__init__.py`). +- The pinned library version is `0.28.1` (`httpx/__version__.py`). + +If httpx raises its supported Python floor, our 3.8 CI lane must be dropped in the +same change — that is the load-bearing fact below. 
+``` + +`claims.json`: + +```json +{ + "claims": [ + {"id": "httpx-python-floor-38", "text": "httpx declares a minimum supported Python of 3.8 (pyproject project.requires-python is \">=3.8\").", + "kind": "quantity", "load_bearing": true, + "checkers": [{"type": "C3", "program": "config-value:pyproject.toml:project.requires-python:\">=3.8\""}]}, + {"id": "httpx-client-defined", "text": "The public Client class is defined in httpx/_client.py.", + "kind": "behavior", "load_bearing": true, + "checkers": [{"type": "C3", "program": "symbol:httpx/_client.py::Client"}]}, + {"id": "httpx-client-exported", "text": "Client is listed in the top-level httpx package exports.", + "kind": "behavior", "load_bearing": false, + "checkers": [{"type": "C3", "program": "string:httpx/__init__.py::\"Client\""}]}, + {"id": "httpx-version-0281", "text": "The pinned httpx version is 0.28.1.", + "kind": "quantity", "load_bearing": false, + "checkers": [{"type": "C3", "program": "py-const:httpx/__version__.py::__version__::\"0.28.1\""}]} + ] +} +``` + +#### Reproduce it yourself (public repo, frozen SHAs) + +```bash +pip install dorian-vwp +git clone https://github.com/encode/httpx && cd httpx +git checkout -b dorian-catch 336204f0121a9aefdebac5cacd81f912bafe8057 # A +# write note.md and claims.json exactly as above, then: +dorian verify note.md --claims claims.json # -> verified 4/4, exit 0 +git add note.md note.md.warrant claims.json && git commit -m "seal at A" +git cherry-pick 4fb9528c2f5ac000441c3634d297e77da23067cd # real upstream B: Drop Python 3.8 (#3592) +dorian revalidate --since 336204f0121a9aefdebac5cacd81f912bafe8057 +# -> httpx-python-floor-38 BROKEN; WARRANTED -> REVOKED; exit 4 +dorian status note.md # -> REVOKED BROKEN=1 VERIFIED=3 +``` + +#### Honest scope — what this does and does **not** show + +**Does show:** on a real public repo, a load-bearing claim sealed at A was flipped to REVOKED +by a real, unrelated later commit, deterministically and reproducibly, when no test, 
no CI +signal, and no stateless per-PR review would have re-opened it. + +**Does not show:** that dorian flags drift it was not bound to, that one example "proves" httpx +correct, or that this result extrapolates. It re-checks **only the properties you explicitly +bound** — here, one config value. A claim bound only to a structural existence checker (e.g. `symbol:`) would **not** +flip if a function were gutted while keeping its name (the trigger≠truth / gutted-body +ceiling — see [WRITING_GOOD_CLAIMS.md](WRITING_GOOD_CLAIMS.md)). This is **one documented +catch**, not a benchmark; inventing more would violate [VALIDATION_HONESTY.md](VALIDATION_HONESTY.md). diff --git a/docs/ROADMAP_BACKLOG.md b/docs/ROADMAP_BACKLOG.md index f0b916f..938ea5f 100644 --- a/docs/ROADMAP_BACKLOG.md +++ b/docs/ROADMAP_BACKLOG.md @@ -79,11 +79,10 @@ before marketing, deterministic verification before AI automation.* - id: pypi-trusted-publishing title: PyPI Trusted Publishing workflow (manual, OIDC, no token) - status: PARTIAL - problem: Source install works; PyPI install reduces friction and signals maturity. - evidence: .github/workflows/publish.yml (workflow_dispatch only; environment-gated; OIDC). - remaining: A maintainer must create the PyPI Trusted Publisher + `pypi` GitHub environment, then trigger manually. Nothing publishes automatically. - human_review_required: yes # credentials / PyPI project ownership + status: DONE + evidence: .github/workflows/publish.yml (workflow_dispatch only; environment-gated; OIDC); `dorian-vwp` 1.0.0 is live on PyPI (`pip install dorian-vwp`). + acceptance_criteria: PyPI Trusted Publisher + `pypi` GitHub environment configured; a tagged release published `dorian-vwp` to PyPI via OIDC (no token). 
+ human_review_required: no confidence: high - id: public-microbenchmark-execution diff --git a/docs/SECURITY_AND_SAFE_RUNNERS.md b/docs/SECURITY_AND_SAFE_RUNNERS.md new file mode 100644 index 0000000..9416d1e --- /dev/null +++ b/docs/SECURITY_AND_SAFE_RUNNERS.md @@ -0,0 +1,146 @@ +# Security and safe runners + +One opinionated, copy-paste recipe for running dorian's GitHub Action on +**public / untrusted fork PRs**, plus the reasoning behind each setting. For the +full trust model — which checkers execute code, what the env strip does, what +scope-lint does and does not contain — read +[docs/SECURITY_BOUNDARY.md](SECURITY_BOUNDARY.md). This page is the runner recipe; +that page is the boundary. + +## The one safe recipe (public / untrusted fork PRs) + +Drop this into a workflow file (for example `.github/workflows/dorian.yml`): + +```yaml +on: pull_request # never pull_request_target for untrusted forks +permissions: + contents: read + pull-requests: write + +jobs: + dorian: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 # the action needs full history to revalidate --since base + - uses: ajaysurya1221/dorian/action@v1.0.0 + with: + checker_trust: base # resolve checker SPECs from the trusted base ref + deny_exec: "true" # C4 pytest / C5 shell ERROR instead of executing + fail_on: never # start report-only; tighten once you trust the signal +``` + +Three settings carry the safety. The rest of this page explains why each one is +load-bearing and, just as importantly, what it does **not** buy you. + +## This is policy, not a sandbox + +`checker_trust: base` is a trust-**root** control, not execution isolation. With +`checker_trust: base` the action reads each claim's checker SPEC from the **base +ref** (the branch the PR targets) and runs it against the PR-head sources. 
So a +fork PR cannot introduce a *new* executing checker, and it cannot rewrite an +existing checker to self-attest a passing verdict — the base-approved spec wins, +and the change is surfaced in the PR comment. A missing or tampered base sidecar +fails closed (ERRORED, never executed). + +What it is **not**: it does not isolate execution. A `pytest:` checker that was +already approved in the base ref still runs `python -m pytest` against the +**PR-head** test and source files, so it can import and execute code the fork +author wrote. `checker_trust: base` decides *which checker specs are allowed to +run*; it does nothing about *what those allowed checkers then do*. That is why +the recipe also sets `deny_exec: "true"`. Never describe `checker_trust: base` as +a sandbox — it is a checker-source trust root. + +## Why `deny_exec: "true"` matters on untrusted forks + +`deny_exec: "true"` removes the ability to execute code at all. It fail-closes the +two checker families that spawn a process: + +- **C4 `pytest:`** — would run `python -m pytest`. +- **C5 `shell:`** — would run an arbitrary command. + +Under deny-exec, a blocked checker becomes **ERROR — never PASS and never FAIL**. +A checker that was refused permission to run has not proven the claim true and has +not proven it false, and the surrounding protocol already fails closed on ERROR +(seal refuses to be born; revalidate folds ERROR to ERRORED, never to a silent +pass). So a fork PR that tries to smuggle an executing checker cannot make this +action run its code, and cannot make a load-bearing claim silently pass either. + +The pairing matters: `checker_trust: base` stops a fork from *introducing* a new +executing checker; `deny_exec: "true"` stops even a *base-approved* executing +checker from running PR-head code. For untrusted forks, use both. + +**One honest caveat.** deny-exec gates only the process-spawning families (C4 +pytest, C5 shell). 
The **typed C5 data reads** — `rowcount`, `schema`, +`nullrate`, `domain`, `freshness`, `snapshot`, `reconcile` — read CSV / SQLite / +parquet **in-process**; they are deliberately not deny-exec-gated because they do +not spawn a command. They are bounded instead: SQLite reconcile queries run under +a read-only authorizer and a per-query wall-clock timeout (5s) that interrupts a +pathological query (for example an infinite recursive CTE) and reports ERROR. So +deny-exec is about *code execution*, and the typed data path has its own bounded +in-process protection rather than relying on deny-exec. + +## Why never `pull_request_target` for untrusted forks + +`pull_request_target` runs the workflow in the **base** repository's context — with +a read/write `GITHUB_TOKEN` and access to repository secrets — while checking out +a tree the fork author controls. That combination (write-capable secrets plus an +attacker-controlled tree) is exactly what turns any of the exposure above into +secret exfiltration. Always trigger on `pull_request`, where fork PRs get a +read-only token and no secrets. Never use `pull_request_target` for untrusted +forks. (On fork `pull_request` runs the default token is read-only, so the sticky +PR-comment step needs `pull-requests: write` in `permissions` — that is the write +scope in the recipe above, and it is scoped to PR comments, not to code or +secrets.) + +## Trusted / internal repos + +If everyone who can open a PR is already trusted to run code in your CI — your own +repo, an internal repo, a team you trust — you can run the executing checkers +(C4 pytest, C5 shell) and get the full signal. 
Use the defaults: + +```yaml +on: pull_request +permissions: + contents: read + pull-requests: write + +jobs: + dorian: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - uses: ajaysurya1221/dorian/action@v1.0.0 + # checker_trust defaults to head, deny_exec defaults to false: + # executing checkers run, because contributors are trusted. + with: + fail_on: revoked # fail the check when a warranted claim is revoked +``` + +Here `checker_trust: head` (the default) runs the checked-out checker spec and +`deny_exec: false` (the default) lets C4/C5 execute — correct precisely because +the people who write the sidecars are trusted. Review a `.warrant` file the same +way you review code. + +## Supply-chain note + +dorian's own repository hardens its CI supply chain: + +- Every third-party Action it uses is **pinned to a full commit SHA** (not a + floating tag) across its workflows, so a re-tagged upstream action cannot + silently change what runs. +- The `security` workflow runs **pip-audit** (SCA — audits the resolved + dependency tree for known CVEs, including a weekly scheduled run to catch + newly-disclosed advisories) and **bandit** (SAST — static analysis of + first-party source). See [`.github/workflows/security.yml`](../.github/workflows/security.yml). + +The action itself is composite and stdlib-only: it pulls in no third-party +actions, and installs only the `dorian-vwp` package you pin via the `install` +input. + +--- + +See also [docs/SECURITY_BOUNDARY.md](SECURITY_BOUNDARY.md) for the full trust model. diff --git a/docs/USE_WITH_CLAUDE_CODE.md b/docs/USE_WITH_CLAUDE_CODE.md index 8013a5a..f2a382d 100644 --- a/docs/USE_WITH_CLAUDE_CODE.md +++ b/docs/USE_WITH_CLAUDE_CODE.md @@ -19,6 +19,15 @@ emit the claims. 
A complete, runnable version of everything below is in > `--deny-exec` (env `DORIAN_DENY_EXEC=1`) so C4/C5 `shell:` ERROR instead of running — fail-closed, > but **not a sandbox** (see [`SECURITY.md`](../SECURITY.md)). +> **⚠️ The agent only PROPOSES claims — dorian VERIFIES them. No model runs at check time.** The +> agent writes prose (`change-note.md`) and a `claims.json`; that is *all* it decides. `dorian verify` +> then runs each claim's deterministic checker against the real source — stdlib AST parsing, regex, +> file/symbol lookups, an actual `pytest` run — with **no LLM, no model, no judgment call** anywhere +> in the loop. A warrant is *born verifiable*: if any load-bearing checker FAILs or ERRORs, the seal +> is **refused** and nothing is written (exit 4). **An agent cannot make a false claim seal** — a +> failing checker refuses it, no matter how confidently the prose was worded. The agent's job is to +> name what it did and pick a check; dorian's job is to decide whether the check passes. + ## 1. The loop ```bash @@ -104,21 +113,82 @@ The claims ([`examples/claude-code/claims.json`](../examples/claude-code/claims. } ``` +The output below is what running the shipped example prints (dorian 1.0.0). 
The leading +`sha256:<id>` is the content-addressed warrant id — its exact digits vary per run, so it is +shown abbreviated here; everything else is stable: + ```text $ dorian verify change-note.md --claims claims.json -verified 2/2 claim(s) against current sources -> change-note.md.warrant # exit 0 +sha256:<id> +verified 2/2 claim(s) against current sources -> change-note.md.warrant # exit 0 # later, app.py renames login_handler and drops the timeout to 10: $ dorian revalidate --since HEAD -BROKEN login-handler-added C3: symbol_missing -BROKEN login-timeout-30s C3: regex_missing -fold WARRANTED -> REVOKED # exit 4 +checked 2 candidate claim(s) +BROKEN sha256:<id> login-handler-added C3: symbol_missing +BROKEN sha256:<id> login-timeout-30s C3: regex_missing +fold sha256:<id> WARRANTED -> REVOKED # exit 4 ``` Run it end-to-end with the copy-paste block in -[`examples/claude-code/README.md`](../examples/claude-code/README.md). +[`examples/claude-code/README.md`](../examples/claude-code/README.md) — the exact commands and +expected output are in §4 below. + +## 4. Run the example yourself + +[`examples/claude-code/`](../examples/claude-code/) is a self-contained, **runnable** version of the +files above (`app.py`, `change-note.md`, `claims.json`).
Copy the three core files into a throwaway +git repo so the sealed `.warrant` lands there, not in your working tree, then run: + +```bash +cd examples/claude-code +tmp=$(mktemp -d) && cp app.py change-note.md claims.json "$tmp" && cd "$tmp" && git init -q +git add -A && git commit -q -m "login handler + note" + +dorian verify change-note.md --claims claims.json +# then a refactor renames the function and drops the timeout — the note never changes: +printf 'LOGIN_TIMEOUT = 10\n\n\ndef signin(request):\n return {"ok": True}\n' > app.py +dorian revalidate --since HEAD +``` + +Expected output (dorian 1.0.0; the `sha256:<id>` digits vary per run, the rest is stable): + +```text +$ dorian verify change-note.md --claims claims.json +sha256:<id> +verified 2/2 claim(s) against current sources -> change-note.md.warrant # exit 0 + +$ dorian revalidate --since HEAD +checked 2 candidate claim(s) +BROKEN sha256:<id> login-handler-added C3: symbol_missing +BROKEN sha256:<id> login-timeout-30s C3: regex_missing +fold sha256:<id> WARRANTED -> REVOKED # exit 4 +``` + +`change-note.md` still reads perfectly and `git`/CI stay quiet — but the warrant flipped to REVOKED, +naming the exact claims that stopped being true. (This whole flow is pinned by +`tests/test_examples_claude_code.py`, so it fails CI if the example ever stops working.) + +**Reading the binding flags.** Before you trust a seal, run `dorian bindings change-note.md` (or +`dorian verify … --binding-gate warn`).
On this example it flags `login-handler-added` as **high +risk** — a `behavior` claim backed only by a `symbol:` existence checker can prove the function still +*exists*, not that it still *behaves*: + +```text +$ dorian bindings change-note.md +login-handler-added flags: single-file + strength: existence risk: high (adequacy_mismatch) + adequacy_mismatch: 'behavior' claim backed only by existence — only a C4 pytest checker proves behavior +login-timeout-30s flags: single-file + strength: raw_text risk: low +2 claim(s), 2 flagged +``` + +That is the tool doing its job: existence is the right *trigger* but not *proof of behavior*. To +actually pin behavior you would add a `C4 pytest:` checker (which RUNS the test). The flag is a review +smell, **not** a claim being false — the seal still succeeds. -## 4. Permissions +## 5. Permissions [`examples/claude-code/settings.example.json`](../examples/claude-code/settings.example.json) is a review-first snippet to merge into your project's `.claude/settings.json`. By default it pre-allows @@ -137,14 +207,14 @@ only when you trust the local repo and review agent-emitted `claims.json` before trusted-local sample pre-allows `verify`/`revalidate` for speed, but it is not the default and still does not pre-allow `seal`, `--extract`, or arbitrary shell. -## 5. Claim extraction is frozen — emit claims, don't extract them +## 6. Claim extraction is frozen — emit claims, don't extract them `dorian seal --extract` (drafting claims with an LLM from a blank file) still works but is **frozen and experimental** — it failed its stability gate twice. The supported path is the agent emitting `claims.json` directly, as above; treat any `--extract` output as a draft for review, never a stable warrant input. dorian itself runs **no model at check time, ever** — that is the point. -## 6. What dorian is / is not +## 7. 
What dorian is / is not **Is:** a local-first, git-native CLI that turns the checkable claims in an AI-authored change into deterministic, token-free checks, seals them into a content-addressed `.warrant` sidecar, and diff --git a/docs/WRITING_GOOD_CLAIMS.md b/docs/WRITING_GOOD_CLAIMS.md new file mode 100644 index 0000000..dec1caa --- /dev/null +++ b/docs/WRITING_GOOD_CLAIMS.md @@ -0,0 +1,258 @@ +# Writing good claims + +`dorian` does not understand your prose. When sources change it re-checks **only +the properties you explicitly bind** — and it re-checks them with the checker you +chose, nothing more. So the quality of a warrant is the quality of its checkers. + +> **A claim is only as strong as its checker.** The natural-language `text` is for +> humans; the `checkers` list is the contract dorian enforces. A perfectly worded +> claim bound to a weak checker is a weak claim. + +This page is about *choosing the right checker for what you actually mean*. For the +full grammar of every checker program string, see +[`../spec/checkers.md`](../spec/checkers.md); for the `claims.json` schema and the +authoring workflow, see [`AGENT_CLAIMS.md`](AGENT_CLAIMS.md). + +The truth-strength ladder dorian uses to score a checker (low → high) is: + +``` +existence < raw_text < semantic_text < snapshot < data < structural < behavioral +``` + +`symbol:`/`path:` are `existence`; `string:`/`regex:` are `raw_text`; `code:` is +`semantic_text`; `py-signature:`/`py-const:`/`config-value:` are `structural`; +typed C5 is `data`; `pytest:` is `behavioral`. Pick the highest rung your claim +actually warrants. + +--- + +## Three good / bad claim pairs + +Each pair shows the weak version (unbound, ambiguous, or under-checked) and the +good version (bound to the checker that actually falsifies the claim). Every +program string below is real dorian grammar, verified by running `dorian verify` +and `dorian revalidate`. 
+ +### Pair 1 — "the timeout is 30 seconds" (a *quantity* claim) + +**Bad** — bound to a bare substring `30`. A two-character literal near-matches half +the file; it also passes if `30` survives only in a comment or an unrelated line. +dorian seals it, but the binding is weak (it flags `short-literal`). + +```json +{"id": "timeout-30", "text": "the request timeout is 30 seconds", + "kind": "quantity", "load_bearing": true, + "checkers": [{"type": "C3", "program": "string:src/auth.py::30"}]} +``` + +**Good** — bound to the *value* of the named constant. `py-const:` parses the AST +and compares the literal value (`30` matches `0x1E`, tolerant of formatting), so it +FAILs the instant the number changes and cannot be satisfied by a stray `30` +elsewhere. + +```json +{"id": "timeout-30", "text": "the request timeout (TIMEOUT) is 30 seconds", + "kind": "quantity", "load_bearing": true, + "checkers": [{"type": "C3", "program": "py-const:src/auth.py::TIMEOUT::30"}]} +``` + +``` +py-const:<file>::<NAME>::<value> # structural (Python AST) +``` + +On a real run, editing `TIMEOUT = 30` to `25` flips this to +`BROKEN C3: const_mismatch`; the value is genuinely bound. (If the constant is not +a Python module/class assignment — e.g. it lives in TOML — use `config-value:`; if +the fact must survive whitespace reformatting in raw text, an anchored +`regex:src/auth.py::TIMEOUT\s*=\s*30` is the `raw_text` middle ground.) + +### Pair 2 — "verify_token takes (token, algo)" (a signature *fact*) + +**Bad** — bound to `symbol:`, which only proves a `def verify_token` *exists*. It +passes even after every parameter is renamed or reordered, so it does not actually +hold the signature. + +```json +{"id": "sig", "text": "verify_token takes (token, algo)", + "kind": "behavior", "load_bearing": true, + "checkers": [{"type": "C3", "program": "symbol:src/auth.py::verify_token"}]} +``` + +**Good** — bound to `py-signature:`, which parses the AST and compares parameter +names, order, and kind.
Adding a parameter flips it to +`BROKEN C3: signature_mismatch: verify_token: param count 3 != expected 2`. + +```json +{"id": "sig", "text": "verify_token takes (token, algo)", + "kind": "behavior", "load_bearing": true, + "checkers": [{"type": "C3", "program": "py-signature:src/auth.py::verify_token::token, algo"}]} +``` + +``` +py-signature:<file>::<function>::<signature> # names/order/kind always compared; + # annotations/defaults/return/async + # compared ONLY when you state them +``` + +State exactly as much as you mean: `token, algo` checks names and order; +`token: str, algo: str = "RS256" -> bool` additionally pins the annotations, +default, and return type. (Note: `py-signature:` still does **not** prove the +function *behaves* correctly — see the gutted-body ceiling below.) + +### Pair 3 — "requires-python is >=3.9" (a config *quantity* claim) + +**Bad** — a bare `regex:` over `pyproject.toml`. It matches the literal text but +treats the value as a string blob, and a regex on a structured file is brittle to +quoting and key relocation. + +```json +{"id": "pyfloor", "text": "requires-python is >=3.9", + "kind": "quantity", "load_bearing": true, + "checkers": [{"type": "C3", "program": "regex:pyproject.toml::requires-python\\s*=\\s*\">=3.9\""}]} +``` + +**Good** — bound to `config-value:`, which parses the TOML and compares the value at +the dotted key path **by value and type**. A real upstream PR that bumped this floor +flipped it to `BROKEN C3: config_value_mismatch: project.requires-python` (see +[`REAL_CATCH_LOG.md`](REAL_CATCH_LOG.md)). + +```json +{"id": "pyfloor", "text": "requires-python is >=3.9", + "kind": "quantity", "load_bearing": true, + "checkers": [{"type": "C3", "program": "config-value:pyproject.toml:project.requires-python:\">=3.9\""}]} +``` + +``` +config-value:<file>:<dotted.key>:<value> # structural (TOML/JSON), + # single ':' separators, + # value AND type compared +``` + +`config-value:` uses single-`:` separators (unlike the `::` C3 forms) and is +TOML/JSON only — no YAML in v1.
+ +--- + +## The gutted-body ceiling (trigger ≠ truth) + +This is the single most important limitation to internalize, and the one that most +often disappoints. **A structural existence checker re-checks that a name/signature +still exists — it does not re-check what the code does.** If a function's *body* is +gutted while its name and signature stay the same, a `symbol:` or `py-signature:` +claim stays GREEN. + +Worked example, reproduced end-to-end. We seal a `symbol:` claim, then replace the +function body with `return True` (name and signature untouched): + +```python +# before +def verify_token(token, algo="RS256"): + return token.algo == algo + +# after — GUTTED: name and signature identical, behavior destroyed +def verify_token(token, algo="RS256"): + return True +``` + +```json +{"id": "verify-exists", "text": "verify_token exists", + "kind": "behavior", "load_bearing": true, + "checkers": [{"type": "C3", "program": "symbol:src/auth.py::verify_token"}]} +``` + +`dorian revalidate` on the gutted commit — the binding *fires the re-check*, but the +existence checker still passes, so the warrant does **not** flip: + +``` +checked 1 candidate claim(s) +VERIFIED sha256:dc07f6acd661d841 verify-exists +fold sha256:dc07f6acd661d841 WARRANTED -> TRUSTED +# exit 0 — the body was gutted and dorian stayed green +``` + +The *only* thing that catches this is a checker that observes behavior. 
With a +**C4 `pytest:`** test bound to the same claim, the same gutted commit flips: + +```json +{"id": "verify-behavior", "text": "verify_token returns False on an algorithm mismatch", + "kind": "behavior", "load_bearing": true, + "checkers": [{"type": "C4", "program": "pytest:tests/test_auth.py::test_rs256"}]} +``` + +``` +checked 1 candidate claim(s) +BROKEN sha256:b4f5a06cac89ab6e verify-behavior C4: test_failing +fold sha256:b4f5a06cac89ab6e WARRANTED -> REVOKED +# exit 4 — the behavior checker caught the gutted body +``` + +This is honest and by design: **structural checkers re-check existence and shape, +not behavior.** `py-signature:` is blind to a body-only change for exactly the same +reason (the signature is unchanged, so it PASSes). A **C4 `pytest:`** test, or a +**C5** data checker for a data fact, is the only thing that can falsify a behavior +or value-of-output claim. If your claim is about what the code *does*, an existence +checker is not enough — it only tells you the symbol is still there. + +--- + +## Authoring checklist + +Run through this for every claim before you seal it: + +- [ ] **Is the claim explicit?** The `text` names the specific symbol, file, key, or + value — not "we improved auth." A vague claim cannot be bound to a precise + checker. +- [ ] **Is it load-bearing?** Set `load_bearing: true` only when a downstream + decision depends on it (a load-bearing break folds the warrant to REVOKED; a + non-load-bearing one only to DEGRADED). Don't inflate everything. 
+- [ ] **Is the checker structural, behavioral, or data-backed — and does that match + the claim?** Match the rung to the meaning: + - existence of a name/file → `symbol:` / `path:` + - presence of a literal → `string:` (short) / `regex:` (reformat-tolerant) + - a signature / a constant value / a config value → `py-signature:` / + `py-const:` / `config-value:` (structural) + - what the code **does** → `pytest:` (C4, behavioral) + - a data property (rows, schema, nullrate, snapshot) → typed **C5** +- [ ] **What edit SHOULD revoke this claim?** Name it. Then convince yourself the + checker actually FAILs on that edit. If you can't, the checker is too weak. +- [ ] **What edit should NOT revoke it?** A whitespace reformat, a rename of an + unrelated symbol, a comment edit. Prefer `regex:`/`py-const:`/`py-signature:` + over `string:` so benign churn doesn't false-alarm. +- [ ] **For every behavior claim: is there a C4/C5 check — not just a `symbol:` + existence check?** If `kind` is `behavior` and the only checker is `symbol:`/ + `py-signature:`, you have a gutted-body blind spot. Bind a `pytest:` test. + +--- + +## Let the strength advisory find your weak claims + +You don't have to eyeball this. `dorian bindings <note.md>` reports each claim's +**trigger** flags (when it gets re-checked) *and* its **truth strength** and risk +(whether the checker can actually falsify it). It never runs a checker, never +changes a verdict — it is purely advisory. 
+ +On the two weak claims from the pairs above (a `behavior` claim backed only by +`symbol:`, and a `quantity` claim backed only by a short string literal), the real +output is: + +``` +$ dorian bindings note.md +verify-exists flags: single-file + strength: existence risk: high (adequacy_mismatch) + adequacy_mismatch: 'behavior' claim backed only by existence — only a C4 pytest checker proves behavior +timeout-30 flags: single-file, short-literal + strength: raw_text risk: medium (binding:short-literal) +2 claim(s), 2 flagged +``` + +`adequacy_mismatch` is the advisory telling you the checker is too weak for the +claim's `kind`; `short-literal` warns that a tiny literal near-matches too much. +Treat a `high` risk on a load-bearing claim as a prompt to upgrade the checker +before you ship the warrant. + +--- + +See [`../README.md`](../README.md) for how `verify` / `revalidate` fit together, and +[`REAL_CATCH_LOG.md`](REAL_CATCH_LOG.md) for the running ledger of what dorian has +actually caught (and missed) on real changes — including the trigger-vs-truth +ceiling described above, on a real repo. diff --git a/docs/design/BENCHMARK_C4_C5.md b/docs/design/BENCHMARK_C4_C5.md new file mode 100644 index 0000000..a566c2c --- /dev/null +++ b/docs/design/BENCHMARK_C4_C5.md @@ -0,0 +1,41 @@ +# Design note — broadening the public benchmark to executed C4 + C5 cases + +The shipped public benchmark (`dorian bench public-repos`, subjects `humanize` and +`python-dotenv`) executes **structural C3** claims only and is **byte-deterministic across two +runs** on frozen SHAs — that determinism is the property it exists to demonstrate +([../BENCHMARK_PUBLIC_REAL_REPOS.md](../BENCHMARK_PUBLIC_REAL_REPOS.md)). The plan asks to add +one executed **C4 (`pytest:`)** and one executed **C5 (typed data)** case so the corpus is not +structural-only. 
+ +## Why this is deferred (not just unbuilt) + +The two checker families differ sharply in how safely they fit a *portable, deterministic* +benchmark: + +- **C5 typed data** (`rowcount:`/`schema:`/`reconcile:` over a committed `.csv`/`.db`) is an + in-process, read-only check — **deterministic and portable**. The only friction is that the + current public subjects are code libraries with no committed data fixture, so a C5 case needs + either a new frozen data-bearing subject or a synthetic-but-labeled data fixture (which is what + the existing controlled-mutation benchmark already covers). +- **C4 `pytest:`** spawns a real `pytest` subprocess against a cloned repo. Its verdict depends + on the **runner environment** (installed deps, Python version, plugin set). Folding that into a + benchmark advertised as *byte-deterministic on frozen SHAs* would make the headline determinism + claim environment-dependent — i.e. it could **weaken the exact honesty property** the benchmark + is for. A flaky benchmark row is worse than no row. + +So adding C4 here is intentionally deferred until it can be done **without** compromising +determinism, and C5 is deferred behind picking a stable data-bearing public subject. + +## Intended approach when built + +1. **C5 row:** add a frozen public (or vendored, clearly-labeled) data fixture; author 1–2 + typed-data claims; extend the `bench/public` manifest + `public_claims.py` machine-derived + labelling to cover the C5 verdict; assert byte-determinism ×2, same as today. +2. **C4 row:** pin the subject's full dependency closure to hashes and run inside the benchmark's + own locked environment so the pytest verdict is reproducible; record the exact toolchain in the + report; keep the overclaim guard. If reproducibility across machines cannot be guaranteed, keep + C4 out of the *public* benchmark and demonstrate it only in the controlled-mutation suite + (which already executes C4 deterministically against synthetic fixtures). 
+ +Both extensions preserve the existing honesty framing: reproducibility/determinism on frozen +SHAs, never broad validation; trigger and truth layers reported separately. diff --git a/docs/design/SUGGEST_CLAIMS.md b/docs/design/SUGGEST_CLAIMS.md new file mode 100644 index 0000000..23f850d --- /dev/null +++ b/docs/design/SUGGEST_CLAIMS.md @@ -0,0 +1,47 @@ +# Design note — `dorian suggest-claims` + +`suggest-claims` is the C3 counterpart to `suggest-data-checks`: a deterministic, +zero-model helper that lowers the authoring tax by proposing born-verifiable claims for a +Python file. It is **scaffolding for review**, not auto-application, and never a substitute +for thinking about what is load-bearing. + +## Shipped (v1.0.x) + +`dorian suggest-claims <file.py> [--out F]` (implemented in `src/dorian/suggestclaims.py`): + +- Proposes `symbol:<file>::<name>` for every non-private top-level `def`/`class`. A name + defined in more than one tracked file is **skipped loudly** (ambiguous binding) and noted + on stderr. +- Proposes `py-const:<file>::<NAME>::<value>` for every non-private module-level assignment + whose RHS is a simple Python literal (int/float/str/bool/None). Containers are skipped + (conservative). +- **Runs every candidate** against current source via the real C3 checker and emits **only + the ones that PASS** — so the `{"claims": [...]}` fragment seals unmodified + (`dorian verify --claims <F>` → exit 0). Pinned by + `tests/test_suggest_claims.py::test_suggest_claims_output_seals_unmodified`. +- `load_bearing` defaults to **false** on every suggestion; the reviewer promotes the ones + that matter. + +## Honest scope / known ceiling + +These suggestions check **existence and value, not behavior**. A `symbol:` claim stays green +when a function body is gutted (trigger ≠ truth — see +[../WRITING_GOOD_CLAIMS.md](../WRITING_GOOD_CLAIMS.md)). The command exists to remove +boilerplate, not to certify behavior; pair behavior claims with a C4 `pytest:` or C5 check. 
+ +## Deferred (gated on adoption demand, per the launch plan) + +The launch analysis (four research packets + red-team) cautions against racing breadth here: +auto-suggestion is a commoditized space, and the durable value is the persisted cross-PR +re-check, not the generator. So the following are **deferred until real users ask**, each a +clean, additive extension of the same run-and-keep-green pattern: + +- `--since <ref>` diff mode: propose claims only for symbols/constants that **changed** + between a base ref and HEAD (uses `gitio` for the changed-file set). +- `py-signature:` suggestions (propose the current signature of changed public functions). +- `config-value:` suggestions for TOML/JSON keys with literal values. +- `path:` suggestions for referenced files. +- A `--load-bearing <ids>` convenience to flag specific suggestions up front. + +None of these change the seal/verify contract; each is "add a candidate generator, run it, +keep the green ones." They are intentionally not built yet. diff --git a/docs/releases/v1.0.1.md b/docs/releases/v1.0.1.md new file mode 100644 index 0000000..491f6bd --- /dev/null +++ b/docs/releases/v1.0.1.md @@ -0,0 +1,73 @@ +# dorian 1.0.1 + +A hardening, DX, and interop patch on top of 1.0.0. No breaking changes; the warrant format, +checker grammar, exit codes, and trust semantics are unchanged. The headline addition is the +first **documented, reproducible cross-PR catch on a public repo**. + +## Proof + +- **`docs/REAL_CATCH_LOG.md`** — one documented catch on [`encode/httpx`](https://github.com/encode/httpx) + (BSD-3): a load-bearing claim sealed when `requires-python` was `">=3.8"` was flipped + `WARRANTED → REVOKED` (exit 4) by a real later upstream PR ([#3592](https://github.com/encode/httpx/pull/3592), + "Drop Python 3.8 support") while httpx's own test suite stayed green and no stateless per-PR + review would have re-opened the original claim. From-scratch reproduction included. 
This is + one documented catch with honest scope, not a validation claim. + +## Security + +- **C4 hardening**: a `pytest:` checker nodeid whose file part is empty or starts with `-` + (e.g. `pytest:-pevil`, `pytest:--collect-only`) is now rejected as `ERROR(bad_program)` + before any subprocess spawns — it can no longer reach pytest as an option. Red/green tested. +- **C5 sqlite reconcile timeout**: a pathological reconcile query (e.g. an infinite recursive + CTE the read-only authorizer permits) is now bounded by a per-query wall-clock deadline and + returns `ERROR(query_timeout)` instead of hanging the process — closing a DoS that survived + `--deny-exec` (typed C5 reads are deliberately not exec-gated). Red/green tested. +- **Supply chain**: every third-party GitHub Action is pinned to an immutable commit SHA (each + verified via `git ls-remote`); a new `security.yml` runs `pip-audit` (SCA) and `bandit` + (SAST), and Dependabot keeps the pins and deps fresh. bandit excludes only dorian's + documented, policy-gated execution primitives, with a reason per check. + +## Performance + +- `dorian verify` now builds the whole-repo Python-symbol and config-key indexes **once** per + run instead of 2×/3×; output is byte-identical (pinned by a call-count spy + the existing + watch/read-set assertions). + +## Features (additive, opt-in) + +- **`dorian suggest-claims <file.py>`** — a deterministic, zero-model C3 counterpart to + `suggest-data-checks`. Proposes `symbol:` claims for non-private defs/classes and `py-const:` + claims for literal module constants, runs each, and emits only the passing ones, so the + `{"claims": [...]}` fragment seals unmodified. `load_bearing` defaults to false; ambiguous + symbols are skipped. Scaffolding for review (existence/value, not behavior) — see + `docs/design/SUGGEST_CLAIMS.md`. 
+- **`dorian export --in-toto `** — project a sealed `.warrant` into an experimental + in-toto `ClaimVerification` Statement (deterministic, no signing, no network, zero deps). + Experimental interop — see `docs/ATTESTATION_INTEROP.md`. + +## Docs / DX + +- The runnable "Try it in 30 seconds" demo is promoted above the fold and the Demo badge points + at it; the illustrative `/login` story is clearly labeled. +- New: `docs/WRITING_GOOD_CLAIMS.md` (worked good/bad claim pairs + the gutted-body ceiling), + `docs/SECURITY_AND_SAFE_RUNNERS.md` (one safe public-fork recipe), a sharpened + `docs/USE_WITH_CLAUDE_CODE.md`, and the public benchmark protocol reconciled with what shipped. + +## Honest scope (unchanged from 1.0.0) + +The public benchmark is reproducibility evidence on frozen SHAs only, not general real-world +validation. Trigger and truth layers are reported separately, and `ERROR` is not `BROKEN`. +`--deny-exec`/`--deny-shell` are fail-closed policies, **not** sandboxes; `checker_trust: base` +is a checker-source trust root, not a sandbox. `suggest-claims` checks existence/value, not +behavior (a gutted body keeps a `symbol:` claim green); the in-toto export is experimental. +A warrant id is content-addressed and **tamper-evident**, but its body includes the seal +timestamp, so a fresh seal yields a different id — what reproduces is the outcome, not the id. + +## Install + +```bash +pip install dorian-vwp +``` + +PyPI publishing is a separate step and is **not** performed by this GitHub Release; `pip` will +serve 1.0.0 until 1.0.1 is published to PyPI via the Trusted Publisher workflow. diff --git a/pyproject.toml b/pyproject.toml index 4317825..9ebf823 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "dorian-vwp" -version = "1.0.0" +version = "1.0.1" description = "Hold AI agents to what they said they did: deterministic, token-free verification of claims about a change." 
readme = "README.md" requires-python = ">=3.11" @@ -47,3 +47,22 @@ addopts = "-q" markers = [ "slow: heavy test (benchmark run, wheel build, or real pytest subprocess) — deselect with -m 'not slow'", ] + +[tool.bandit] +# dorian is a deterministic checker-EXECUTION tool: by design (and only through the +# fail-closed deny-exec/deny-shell policy in src/dorian/policy.py — see +# docs/SECURITY_BOUNDARY.md) it spawns subprocesses (C4 `pytest:`, `git`) and runs +# read-only, authorizer-gated SQL (C5 data checks). The checks below flag exactly those +# reviewed-by-design primitives, so they are excluded here; bandit still gates every OTHER +# vulnerability class (eval/exec, pickle, weak crypto, yaml.load, requests verify=False, …). +# The real execution control is policy.py, covered by tests/test_deny_exec_policy.py. +skips = [ + "B101", # assert_used: asserts are internal invariants, not access control + "B105", # hardcoded_password: false positive — "PASS" is a Verdict label + "B404", # import subprocess: required to run the gated pytest/git primitives + "B603", # subprocess_without_shell: list-form spawn (the SAFE, non-shell form) + "B607", # start_process_with_partial_path: bare `python`/`git` via PATH, by design + "B602", # subprocess shell=True: the GATED C5 `shell:` checker (deny-shell/deny-exec) + "B604", # function shell=True: same gated shell path via run_readonly + "B608", # hardcoded_sql: read-only, authorizer-gated SQL in the data-check suggester +] diff --git a/src/dorian/__init__.py b/src/dorian/__init__.py index 4f14916..ae71b93 100644 --- a/src/dorian/__init__.py +++ b/src/dorian/__init__.py @@ -3,4 +3,4 @@ PyPI distribution: `dorian-vwp`; import package: `dorian`; CLI: `dorian`. 
""" -__version__ = "1.0.0" +__version__ = "1.0.1" diff --git a/src/dorian/checkers/c4_test.py b/src/dorian/checkers/c4_test.py index 82b5ba5..a52738a 100644 --- a/src/dorian/checkers/c4_test.py +++ b/src/dorian/checkers/c4_test.py @@ -44,7 +44,12 @@ def check(ctx: CheckContext, spec: CheckerSpec) -> CheckResult: file, sep, rest = nodeid.partition("::") file = ctx.rename_map.get(file, file) - if not file or not (ctx.repo / file).resolve().is_relative_to(ctx.repo.resolve()): + if not file or file.startswith("-"): + # an empty or leading-dash file part would reach pytest as an OPTION + # (-p / -c / --collect-only), not a file; reject before .resolve() and + # before any subprocess (the argv carries no `--` fence) + return CheckResult(Verdict.ERROR, detail="bad_program") + if not (ctx.repo / file).resolve().is_relative_to(ctx.repo.resolve()): # a hostile nodeid ('..' or absolute) must not probe files outside the repo return CheckResult(Verdict.ERROR, detail="bad_program") nodeid = file + sep + rest diff --git a/src/dorian/checkers/c5_data.py b/src/dorian/checkers/c5_data.py index c5754ac..48a7095 100644 --- a/src/dorian/checkers/c5_data.py +++ b/src/dorian/checkers/c5_data.py @@ -21,6 +21,7 @@ import csv import re import sqlite3 +import time from pathlib import Path from dorian.checkers import registry @@ -44,6 +45,10 @@ class _ColumnGone(Exception): pass +class _QueryTimeout(Exception): + pass + + _OPS = { "==": lambda a, b: a == b, "!=": lambda a, b: a != b, @@ -220,6 +225,13 @@ def _deny_non_read(op: int, *_: object) -> int: return sqlite3.SQLITE_OK if op in _SQLITE_READ_OPS else sqlite3.SQLITE_DENY +# typed C5 sqlite reads are deliberately NOT deny-exec-gated (only shell: is), so a +# pathological query — e.g. an infinite recursive CTE the read-only authorizer permits +# (SQLITE_RECURSIVE is a read op) — could otherwise hang the process even under +# --deny-exec. Bound every reconcile query by a wall-clock deadline -> ERROR on timeout. 
+_SQLITE_QUERY_TIMEOUT_S = 5.0 + + def _reconcile_side(ctx: CheckContext, side: str) -> int: engine, _, body = side.strip().partition(":") if engine == "csv": @@ -231,10 +243,27 @@ def _reconcile_side(ctx: CheckContext, side: str) -> int: raise _BadProgram(f"sqlite side expects '::