From ac7937f3a5a7b1336283e8bfa6aa1e024706dc68 Mon Sep 17 00:00:00 2001
From: Colin Son <colinson@fastmail.com>
Date: Mon, 18 May 2026 19:12:05 -0500
Subject: [PATCH 1/2] fix: restore phase 0 qa trust

---
 docs/premium-qa-architecture-plan.md          | 416 ++++++++++++++++++
 src/retrace/storage/core.py                   |   4 +-
 src/retrace/storage/repositories/incidents.py |  47 +-
 src/retrace/tester/__init__.py                |  15 +-
 src/retrace/tester/assertions.py              | 196 +++++++--
 src/retrace/tester/specs.py                   |   5 +
 6 files changed, 640 insertions(+), 43 deletions(-)
 create mode 100644 docs/premium-qa-architecture-plan.md

diff --git a/docs/premium-qa-architecture-plan.md b/docs/premium-qa-architecture-plan.md
new file mode 100644
index 0000000..c800f2a
--- /dev/null
+++ b/docs/premium-qa-architecture-plan.md
@@ -0,0 +1,416 @@
+# Premium QA Architecture Plan
+
+**Status:** Definitive direction document  
+**Date:** 2026-05-18  
+**Audience:** vibe coders, indie developers, and contributors building Retrace  
+
+## Thesis
+
+Retrace should become the free, open-source QA architecture for small teams that
+ship with coding agents: real-user failure capture, deterministic reproduction,
+agent-ready repair context, and verification in one local-first loop.
+
+The premium bar is not "more AI." The premium bar is evidence quality:
+
+```text
+production signal -> incident -> replay/test evidence -> likely code -> repair task -> verified fix
+```
+
+The product should feel as if Sentry, PostHog, Playwright, OpenReplay,
+PR-Agent, Pullfrog, Langfuse, and Vercel-style evals had been compressed into
+one small-team workflow, with every expensive or hosted assumption removed.
+
+## Current Position
+
+Retrace already has more of the right architecture than a normal alpha:
+
+- Unified `qa_incidents` across replay findings, UI tests, API tests,
+  Sentry-compatible/OTel monitor events, and PR review findings.
+- First-party browser replay capture plus PostHog replay import.
+- Deterministic detectors for console/network/render/interaction failures.
+- Native HTTP and Playwright-backed tester execution.
+- API testing, OpenAPI import, HAR import, and API suite storage.
+- Source-map upload, deploy correlation, alert rules, retention, digest, and
+  issue sinks.
+- GitHub/local repo linking, code matching, prompt generation, worktree-based
+  repair, and draft PR creation.
+- Browser SDK, Python SDK, Docker, CI, issue templates, security policy,
+  contribution docs, and broad automated coverage.
+
+The immediate credibility issue was a red default branch after modularization.
+That is now fixed locally by restoring repository row mappers, tester facade
+exports, deterministic evidence IDs for tests, deploy correlation updates, and
+consensus/classification behavior.
+
+## External References To Steal From
+
+### Pullfrog: GitHub-Native Agent Orchestration
+
+Pullfrog positions itself as an open-source orchestration layer for async
+development inside GitHub. Its strongest ideas for Retrace are:
+
+- GitHub as the control plane: trigger work from issues, PRs, comments, CI
+  failures, and reviews.
+- BYOK and model-agnostic operation.
+- Automated triggers for CI failures and PR reviews.
+- Self-healing PRs with loop prevention.
+- GitHub Actions as user-owned compute, secrets, and cost boundary.
+
+Reference: https://pullfrog.mintlify.app/
+
+### Vercel Next Evals: Agent Quality As Versioned Fixtures
+
+Vercel's Next evals package agent tasks as self-contained projects with a
+`PROMPT.md`, source files, and withheld `EVAL.ts` assertions. The runner uses
+memoization so new models or evals run only missing pairs.
+
+Retrace should copy the shape, not the domain:
+
+- Every generated reproduction should become a self-contained eval fixture.
+- The prompt given to an agent and the assertions used to judge the result must
+  be separate artifacts.
+- Non-product failures such as timeouts/infra errors should be classified apart
+  from real model or app failures.
+- Results should be exportable as benchmark data.
+
+Reference: https://github.com/vercel/next-evals-oss
+
+### Vercel AI SDK: Provider Abstraction And Developer Ergonomics
+
+Vercel AI SDK wins by being a clean TypeScript abstraction over many models and
+agent workflows, with a large ecosystem and frequent releases.
+
+Retrace should copy:
+
+- A small, stable provider interface for local and hosted coding agents.
+- Provider-specific adapters behind one API.
+- Great examples for Next.js, React, Python, and plain script users.
+- Versioned SDKs with boring upgrade paths.
+
+Reference: https://github.com/vercel/ai
+
+### Playwright: Deterministic Browser Runtime
+
+Playwright is the execution substrate to trust: browser isolation, auto-waiting,
+web-first assertions, resilient locators, traces, screenshots, videos, and
+parallelism.
+
+Retrace should not compete with Playwright. It should generate better Playwright
+inputs from real incidents and preserve incident evidence beside Playwright
+artifacts.
+
+Reference: https://github.com/microsoft/playwright
+
+### Cypress: App-Centric Testing UX
+
+Cypress's durable insight is that browser testing adoption depends on fast local
+feedback and approachable authoring, not just CI.
+
+Retrace should copy:
+
+- A local UI that makes tests, failures, screenshots, and logs inspectable.
+- Low-ceremony install and a "tested with Retrace"-style success path.
+- Component/API/e2e language in docs that users already understand.
+
+Reference: https://github.com/cypress-io/cypress
+
+### OpenReplay And PostHog: Session Replay Plus Product Context
+
+OpenReplay's best ideas are self-hosted replay, network/console/state context,
+privacy controls, and integrations that connect frontend behavior to backend
+logs. PostHog's best ideas are an all-in-one developer product stack, a clear
+free/open-source posture, feature flags/experiments, and replay as product
+evidence rather than a separate QA silo.
+
+Retrace should not become product analytics. It should ingest enough session and
+product context to rank QA incidents by user impact.
+
+References:
+
+- https://github.com/openreplay/openreplay
+- https://github.com/PostHog/posthog
+
+### Sentry: Error Identity And SDK Coverage
+
+Sentry's strength is event identity: grouping, releases, source maps,
+breadcrumbs, affected users, SDK breadth, and developer workflow integration.
+
+Retrace should keep Sentry compatibility while focusing on a narrower promise:
+turn errors into repro tests and repair tasks.
+
+Reference: https://github.com/getsentry/sentry
+
+### PR-Agent: Review Tools And Token-Aware Context
+
+PR-Agent's useful patterns are separate tools (`describe`, `review`, `improve`,
+`ask`), GitHub Actions/CLI/self-hosted deployment, PR compression, dynamic
+context, self-reflection, and platform-agnostic review flows.
+
+Retrace should apply these patterns to QA evidence: compress incident evidence,
+rank source context, cap suggestions, and keep review comments actionable.
+
+Reference: https://github.com/The-PR-Agent/pr-agent
+
+### Langfuse: Observability, Evals, Datasets
+
+Langfuse connects traces, prompts, evals, datasets, user feedback, and manual
+labels. Retrace should copy the "observability plus evals" mental model for QA:
+every real failure can become a dataset item, and every fix attempt can be
+evaluated against withheld assertions.
+
+Reference: https://github.com/langfuse/langfuse
+
+## Product Principles
+
+1. **Evidence before agents.** LLMs summarize and repair only after deterministic
+   capture, detection, and reproduction produce durable artifacts.
+2. **Generated tests are the product.** A detected bug is not valuable until it
+   becomes a repeatable test or a clearly classified non-repro.
+3. **GitHub-native, local-first.** Small teams should run this from a repo,
+   issue, PR, or GitHub Action without adopting a hosted control plane.
+4. **BYOK and provider-neutral.** OpenAI, Anthropic, local models, OpenRouter,
+   and future coding agents must sit behind stable adapters.
+5. **Privacy is a default, not a setting.** Inputs masked by default, secrets
+   redacted before storage/prompting, and prompt artifacts marked safe/unsafe.
+6. **No magical certainty.** Code matching must explain why files are likely,
+   preserve confidence, and let agents reject weak candidates.
+7. **OSS quality is the paid-tier substitute.** The free product must be the
+   premium product: reproducible setup, docs, CI, examples, and real fixtures.
+
+## Target Architecture
+
+```text
+Capture
+  browser SDK, Python SDK, Sentry DSN, OTLP, PostHog import, PR webhooks
+
+Normalize
+  canonical failures, evidence rows, replay sessions, deploys, source maps
+
+Triage
+  qa_incidents, severity, affected users, grouping, lifecycle, digest, sinks
+
+Reproduce
+  replay-to-test, API specs, Playwright/native runner, visual baselines,
+  flake classification, artifact manifests
+
+Repair
+  repo matching, source-map/code context, repair bundle, agent prompt,
+  worktree, draft PR, validation commands
+
+Verify
+  generated spec reruns, CI integration, resolved/regressed lifecycle,
+  eval dataset export
+```
+
+The boundary that matters: `qa_incidents` is the product spine. Every capture
+surface, test engine, review finding, and repair workflow must read/write this
+shape.
+
+## Roadmap
+
+### Phase 0: Restore Default-Branch Trust
+
+Goal: the project must be boringly green before claiming reliability.
+
+- Keep `master` green across `ruff`, full `pytest`, browser SDK tests/build,
+  Python SDK tests, Playwright runner tests, Postgres smoke, Docker build, and
+  e2e tests.
+- Add a CI badge and "known green command set" to the README.
+- Make branch protection require the same jobs listed in `.github/workflows`.
+- Add a short maintainer rule: no roadmap work merges while default branch is
+  red.
+
+Acceptance:
+
+- `uv run ruff check src tests`
+- `uv run pytest -q`
+- `cd packages/browser && npm ci && npm test && npm run build`
+- CI passes on `master`.
+
+### Phase 1: Killer Demo As A Contract
+
+Goal: `retrace demo all && retrace qa auto --repo local/demo-checkout --no-pr`
+must prove the architecture without external services.
+
+Build:
+
+- One seeded fixture for each signal source: replay, UI test, API test, monitor
+  event, PR review.
+- A local demo app with a real bug, source map, route, and generated repair
+  candidate.
+- Withheld assertions modeled after Vercel evals: agent sees prompt/evidence;
+  verifier sees expected behavior.
+- Exportable `qa-eval-result.json` with pass/fail, artifacts, classification,
+  and cost metadata.
+
+Why:
+
+Vercel-style evals turn agent quality into repeatable fixtures. Retrace needs
+the same for QA repair quality.
+
+### Phase 2: Incident Detail UX
+
+Goal: an indie developer can understand an incident in under one minute.
+
+Build:
+
+- Incident page with replay, timeline, console/network/error evidence, deploy
+  marker, source-map frame, generated test, candidate files, repair task, and
+  verification history.
+- Artifact viewer for screenshots, Playwright traces, request/response bodies,
+  source-map diagnostics, and prompt JSON.
+- "Why this file?" explanations for code matching.
+- "Promote to issue" and "open repair PR" flows from the same page.
+
+Reference:
+
+- OpenReplay-style replay plus DevTools context.
+- Sentry-style event identity and release/source-map context.
+
+### Phase 3: Test Generation Quality
+
+Goal: generated tests should survive CI, not just demos.
+
+Build:
+
+- Selector ranking: `data-testid`, role/name, label, placeholder, stable ID,
+  text fallback, coordinate fallback.
+- Playwright trace capture on first retry.
+- Flake quarantine with failure classification: app bug, test bug, auth
+  failure, selector drift, timeout, environment failure.
+- Parallel `run-all` with deterministic artifacts.
+- Visual baseline accept/compare flow documented for CI.
+- API sequence generation from captured network calls.
+
+Reference:
+
+- Playwright locators, traces, browser isolation, auto-waiting.
+- Cypress local testing UX and status visibility.
+
+### Phase 4: GitHub-Native Agent Loop
+
+Goal: Retrace behaves like a QA-native Pullfrog.
+
+Build:
+
+- GitHub App commands:
+  - `@retrace triage`
+  - `@retrace reproduce`
+  - `@retrace fix`
+  - `@retrace verify`
+  - `@retrace explain`
+- GitHub Actions workflow templates for:
+  - replay/API/test incident ingestion
+  - generated spec reruns
+  - PR review filing
+  - self-healing failed Retrace PRs with attempt caps
+- Loop prevention: max attempts per incident/PR, stale branch detection,
+  validation gate before new commits.
+- BYOK secrets stored in GitHub Actions secrets for fully user-owned compute.
+
+Reference:
+
+- Pullfrog's GitHub-first trigger model, self-healing PRs, and Actions-backed
+  execution.
+- PR-Agent's tool split and PR compression.
+
+### Phase 5: Agent Provider And Repair Adapter Layer
+
+Goal: coding-agent integration is swappable and testable.
+
+Build:
+
+- Common `RepairAgent` interface:
+  - `prepare(bundle) -> command/spec`
+  - `apply(worktree, bundle) -> result`
+  - `validate(worktree, commands) -> result`
+  - `summarize(result) -> incident event`
+- Adapters for local `codex`, `claude`, shell command, and no-op prompt-only
+  mode.
+- Provider cost and token accounting for LLM review/repair prompts.
+- Prompt safety tests that prove user evidence cannot become instructions.
+- Repair eval fixtures exported like Vercel Next evals.
+
+Reference:
+
+- Vercel AI SDK provider ergonomics.
+- Langfuse observability/eval separation.
+
+### Phase 6: Self-Host Operations
+
+Goal: users can run this as infrastructure, not a laptop script.
+
+Build:
+
+- Production Docker Compose profile with API, UI, worker, browser runner, cron,
+  SQLite/Postgres, and optional object storage.
+- Postgres backend moved from compatibility chassis to supported mode.
+- Storage sizing guide: replay volume, retention, source maps, artifacts.
+- Backup/restore and upgrade tests.
+- Retention defaults for raw replay, redacted evidence, source maps, and
+  generated artifacts.
+- Health checks and `/readyz` surfaces for each service.
+
+Reference:
+
+- OpenReplay self-hosting posture.
+- PostHog's explicit open-source/self-host tradeoff language.
+
+### Phase 7: Community And Extension System
+
+Goal: contributors can add detectors, runners, sinks, and matchers without
+reading the whole codebase.
+
+Build:
+
+- Detector plugin contract with fixture tests and reason-code docs.
+- Test runner plugin contract for native HTTP, Playwright, API, visual, and
+  future mobile.
+- Sink contract for GitHub Issues, Linear, Jira, Slack, webhooks.
+- Matching contract for route manifests, source maps, stack traces, selectors,
+  framework component graphs.
+- Example apps:
+  - Next.js checkout bug
+  - SaaS dashboard auth bug
+  - API contract regression
+  - source-mapped frontend exception
+
+Acceptance:
+
+- A contributor can add one detector with one fixture and one docs page in under
+  an hour.
+
+## Non-Goals
+
+- Replacing Playwright or Cypress as general-purpose test frameworks.
+- Replacing PostHog as product analytics.
+- Replacing Sentry as a broad multi-language observability platform.
+- Auto-merging fixes without tests and human review.
+- Treating LLM judgment as proof of failure.
+- Building a paid cloud tier before the open-source loop is excellent.
+
+## Launch Bar
+
+Retrace is ready to call itself premium open-source QA architecture when:
+
+- A fresh-checkout demo works within 10 minutes without external services.
+- Default branch is green for 30 consecutive days.
+- At least five real-world fixture apps are covered.
+- Generated Playwright/API specs run in CI with stable artifact output.
+- `qa auto` can take a replay or monitor incident to a draft PR with a
+  validation command.
+- A failed repair attempt is classified and preserved as an eval result.
+- Docs explain the architecture, threat model, privacy model, and extension
+  points.
+- The browser SDK and Python SDK have versioned packages and upgrade notes.
+
+## Immediate Next Moves
+
+1. Merge the default-branch fix and confirm GitHub CI is green.
+2. Add the killer-demo contract as a CI e2e job.
+3. Convert `docs/roadmap.md` into execution milestones that map to this
+   architecture.
+4. Build the incident detail UX around the single `qa_incidents` spine.
+5. Add GitHub App command triggers and loop prevention.
+6. Export repair attempts as eval fixtures.
+
diff --git a/src/retrace/storage/core.py b/src/retrace/storage/core.py
index e357bf4..8d83447 100644
--- a/src/retrace/storage/core.py
+++ b/src/retrace/storage/core.py
@@ -1322,8 +1322,8 @@ def _append_failure_evidence(
             payload_json = json.dumps(evidence.payload, sort_keys=True)
         except (TypeError, ValueError) as exc:
             raise ValueError("evidence payload must be JSON-serializable") from exc
-        evidence_id = _id("ev")
-        created_at = _now_iso_microseconds()
+        evidence_id = self._id("ev")
+        created_at = self._now_iso_microseconds()
         conn.execute(
             """
             INSERT OR IGNORE INTO failure_evidence
diff --git a/src/retrace/storage/repositories/incidents.py b/src/retrace/storage/repositories/incidents.py
index f49ed6d..955f56e 100644
--- a/src/retrace/storage/repositories/incidents.py
+++ b/src/retrace/storage/repositories/incidents.py
@@ -7,7 +7,8 @@
 from ..helpers import (
     _SEVERITY_ORDER, _rollup_severity, _string_values,
     _normalize_app_error_incident_status, _id, _public_id, _dt,
-    _safe_json_obj, _merge_string_lists, APP_ERROR_FAILURE_STATUS_BY_INCIDENT_STATUS
+    _safe_json_obj, _merge_string_lists, _parse_string_list_json,
+    APP_ERROR_FAILURE_STATUS_BY_INCIDENT_STATUS
 )
 from ..models import (
     FailureRow, EvidenceRow, IncidentRow, IncidentLifecycleEventRow,
@@ -1286,6 +1287,37 @@ def nearest_deploy_marker(
             ).fetchone()
         return self._deploy_marker_from_row(row) if row is not None else None
 
+    def update_failure_deploy(self, *, failure_id: str, deploy_sha: str) -> None:
+        now = datetime.now(timezone.utc).isoformat()
+        with self._conn() as conn:
+            conn.execute(
+                """
+                UPDATE failures
+                SET related_deploy_sha = ?,
+                    updated_at = ?
+                WHERE id = ? OR public_id = ?
+                """,
+                (deploy_sha.strip(), now, failure_id, failure_id),
+            )
+
+    def _deploy_marker_from_row(self, row: sqlite3.Row) -> DeployMarkerRow:
+        return DeployMarkerRow(
+            id=str(row["id"]),
+            public_id=str(row["public_id"]),
+            project_id=str(row["project_id"]),
+            environment_id=str(row["environment_id"]),
+            sha=str(row["sha"]),
+            branch=str(row["branch"] or ""),
+            author=str(row["author"] or ""),
+            deployed_at_ms=int(row["deployed_at_ms"] or 0),
+            changed_files=_merge_string_lists(
+                _parse_string_list_json(row["changed_files_json"])
+            ),
+            metadata=dict(_safe_json_obj(row["metadata_json"])),
+            created_at=_dt(row["created_at"]) or datetime.now(timezone.utc),
+            updated_at=_dt(row["updated_at"]) or datetime.now(timezone.utc),
+        )
+
 
     def upsert_source_map(
         self,
@@ -1393,6 +1425,19 @@ def list_recent_source_maps(
             ).fetchall()
         return [self._source_map_from_row(row) for row in rows]
 
+    def _source_map_from_row(self, row: sqlite3.Row) -> SourceMapRow:
+        return SourceMapRow(
+            id=str(row["id"]),
+            public_id=str(row["public_id"]),
+            project_id=str(row["project_id"]),
+            environment_id=str(row["environment_id"]),
+            release=str(row["release"]),
+            dist=str(row["dist"] or ""),
+            artifact_url=str(row["artifact_url"]),
+            source_map=dict(_safe_json_obj(row["source_map_json"])),
+            uploaded_at=_dt(row["uploaded_at"]) or datetime.now(timezone.utc),
+        )
+
     def _validate_source_map_payload(self, source_map: dict[str, Any]) -> None:
         if source_map.get("version") != 3:
             raise ValueError("source_map must be a supported Source Map v3 object")
diff --git a/src/retrace/tester/__init__.py b/src/retrace/tester/__init__.py
index e3cbfd0..77d6621 100644
--- a/src/retrace/tester/__init__.py
+++ b/src/retrace/tester/__init__.py
@@ -41,11 +41,24 @@
     _redacted_response_headers as _redacted_response_headers,
     _response_assertion_evidence as _response_assertion_evidence,
 )
+from . import harness as _harness
 from .harness import (
-    run_spec as run_spec,
     load_run_summaries as load_run_summaries,
     enqueue_spec_run as enqueue_spec_run,
     run_queued_spec_once as run_queued_spec_once,
     set_explore_factories as set_explore_factories,
     set_visual_factories as set_visual_factories,
+    _run_playwright_spec as _run_playwright_spec,
+    _run_shell as _run_shell,
 )
+
+
+def run_spec(*args, **kwargs):
+    """Run a tester spec through the current package-level shell hook.
+
+    Older tests and integrations monkeypatch `retrace.tester._run_shell`.
+    The implementation now lives in `retrace.tester.harness`, so keep that
+    facade contract by syncing the harness hook immediately before execution.
+    """
+    _harness._run_shell = _run_shell
+    return _harness.run_spec(*args, **kwargs)
diff --git a/src/retrace/tester/assertions.py b/src/retrace/tester/assertions.py
index c01c7db..6c97b37 100644
--- a/src/retrace/tester/assertions.py
+++ b/src/retrace/tester/assertions.py
@@ -73,8 +73,11 @@ def _evaluate_consensus_assertion(
     assertion: dict[str, Any],
     *,
     consensus_group: str,
+    response: Optional[httpx.Response] = None,
+    evidence: Optional[dict[str, Any]] = None,
+    arbiter_vote: bool | None = None,
 ) -> TesterAssertionResult:
-    votes = _collect_consensus_votes(assertion)
+    votes = list(assertion.get("__collected_votes") or _collect_consensus_votes(assertion))
     if not votes:
         return _assertion_result(
             assertion=assertion,
@@ -85,24 +88,59 @@ def _evaluate_consensus_assertion(
         )
     ok_votes = [v for v in votes if _bool_from_vote(v) is True]
     fail_votes = [v for v in votes if _bool_from_vote(v) is False]
-    ok = len(ok_votes) >= len(fail_votes)
+    retry_count = int(
+        assertion.get("__retry_count")
+        if assertion.get("__retry_count") is not None
+        else len(
+            [
+                vote
+                for vote in votes
+                if bool(vote.get("retry")) or vote in assertion.get("retry_votes", [])
+            ]
+        )
+    )
+    disagreement = bool(ok_votes and fail_votes)
+    decision = "majority"
+    if arbiter_vote is None and "arbiter_vote" in assertion:
+        arbiter_vote = _coerce_arbiter_vote(assertion.get("arbiter_vote"))
+    if disagreement and arbiter_vote is not None:
+        ok = arbiter_vote
+        decision = "arbiter"
+    else:
+        ok = len(ok_votes) >= len(fail_votes)
+    selected_votes = ok_votes if ok else fail_votes
     confidence = (
-        max([_coerce_confidence(v.get("confidence"), default=0.5) for v in ok_votes])
-        if ok and ok_votes
-        else max([_coerce_confidence(v.get("confidence"), default=0.5) for v in fail_votes])
-        if fail_votes
-        else 0.5
+        max([_coerce_confidence(v.get("confidence"), default=1.0) for v in selected_votes])
+        if selected_votes
+        else 1.0
     )
+    if assertion.get("confidence") is not None:
+        confidence = _coerce_confidence(assertion.get("confidence"), default=confidence)
     message = (
         f"Consensus reached (OK={len(ok_votes)}, FAIL={len(fail_votes)})."
         if ok
         else f"Consensus failed (OK={len(ok_votes)}, FAIL={len(fail_votes)})."
     )
+    actual_evidence = evidence
+    if actual_evidence is None:
+        actual_evidence = _response_assertion_evidence(
+            response,
+            capture_body=bool(assertion.get("capture_body_evidence")),
+        )
+    actual = {
+        "decision": decision,
+        "disagreement": disagreement,
+        "pass_votes": len(ok_votes),
+        "fail_votes": len(fail_votes),
+        "retry_count": retry_count,
+        "arbiter_vote": arbiter_vote,
+        "evidence": actual_evidence,
+    }
     return _assertion_result(
         assertion={**assertion, "consensus_group": consensus_group, "model_votes": votes},
         ok=ok,
         expected=f"majority OK in {consensus_group}",
-        actual=f"{len(ok_votes)} OK / {len(fail_votes)} FAIL",
+        actual=actual,
         message=message,
         confidence=confidence,
     )
@@ -114,44 +152,100 @@ def _evaluate_model_backed_consensus_assertion(
     response: Optional[httpx.Response],
 ) -> TesterAssertionResult:
     models = list(assertion.get("models") or [])
-    if not models:
-        return _assertion_result(
-            assertion=assertion,
-            ok=False,
-            expected="at least one model",
-            actual=None,
-            message="No models configured for model-backed consensus assertion.",
-        )
-    prompt = str(assertion.get("prompt") or assertion.get("text") or "")
-    if not prompt:
-        return _assertion_result(
-            assertion=assertion,
-            ok=False,
-            expected="non-empty prompt",
-            actual=None,
-            message="Prompt is required for model-backed consensus assertion.",
-        )
     evidence = assertion.get("evidence")
+    capture_body = bool(assertion.get("capture_body_evidence") or models)
     if not evidence and response:
         evidence = _response_assertion_evidence(
-            response, capture_body=bool(assertion.get("capture_body_evidence"))
+            response, capture_body=capture_body
         )
-    votes = _call_consensus_models(
-        models=models,
-        prompt=prompt,
-        snapshot=evidence or {},
-        provider=str(assertion.get("provider") or "openai"),
-        base_url=str(assertion.get("base_url") or ""),
-        api_key=assertion.get("api_key"),
-        timeout=float(assertion.get("timeout") or 30.0),
-        retry=bool(assertion.get("retry")),
+    elif isinstance(evidence, dict) and response:
+        response_evidence = _response_assertion_evidence(
+            response, capture_body=capture_body
+        )
+        evidence = {**evidence, **response_evidence}
+    elif not isinstance(evidence, dict):
+        evidence = _response_assertion_evidence(None, capture_body=False)
+
+    if models:
+        prompt = str(assertion.get("prompt") or assertion.get("text") or "")
+        if not prompt:
+            return _assertion_result(
+                assertion=assertion,
+                ok=False,
+                expected="non-empty prompt",
+                actual=None,
+                message="Prompt is required for model-backed consensus assertion.",
+            )
+        votes = _call_consensus_models(
+            models=models,
+            prompt=prompt,
+            snapshot=evidence or {},
+            provider=str(assertion.get("provider") or "openai"),
+            base_url=str(assertion.get("base_url") or ""),
+            api_key=assertion.get("api_key"),
+            timeout=float(assertion.get("timeout") or 30.0),
+            retry=bool(assertion.get("retry")),
+        )
+    else:
+        votes = _collect_consensus_votes(assertion)
+
+    arbiter_vote = None
+    parsed_votes = [_bool_from_vote(vote) for vote in votes]
+    disagreement = any(v is True for v in parsed_votes) and any(
+        v is False for v in parsed_votes
     )
+    arbiter_model = str(assertion.get("arbiter_model") or "").strip()
+    prompt = str(assertion.get("prompt") or assertion.get("text") or "")
+    if disagreement and arbiter_model and prompt:
+        arbiter_votes = _call_consensus_models(
+            models=[arbiter_model],
+            prompt=prompt,
+            snapshot=evidence or {},
+            provider=str(assertion.get("provider") or "openai"),
+            base_url=str(assertion.get("base_url") or ""),
+            api_key=assertion.get("api_key"),
+            timeout=float(assertion.get("timeout") or 30.0),
+            retry=False,
+        )
+        if arbiter_votes:
+            arbiter_vote = _bool_from_vote(arbiter_votes[0])
+    elif "arbiter_vote" in assertion:
+        arbiter_vote = _coerce_arbiter_vote(assertion.get("arbiter_vote"))
+
     return _evaluate_consensus_assertion(
-        {**assertion, "model_votes": votes},
-        consensus_group=f"models:{','.join(models)}",
+        {
+            **assertion,
+            "model_votes": votes,
+            "__collected_votes": votes,
+            "__retry_count": len(
+                [vote for vote in votes if vote in assertion.get("retry_votes", [])]
+            ),
+        },
+        consensus_group=str(
+            assertion.get("consensus_group")
+            or (f"models:{','.join(models)}" if models else "model_consensus")
+        ),
+        response=response,
+        evidence=evidence,
+        arbiter_vote=arbiter_vote,
     )
 
 
+def _coerce_arbiter_vote(raw: Any) -> bool | None:
+    if raw is None:
+        return None
+    if isinstance(raw, bool):
+        return raw
+    if isinstance(raw, (int, float)):
+        return bool(raw)
+    normalized = str(raw).strip().lower()
+    if normalized in {"pass", "passed", "true", "1", "yes", "ok"}:
+        return True
+    if normalized in {"fail", "failed", "false", "0", "no"}:
+        return False
+    return None
+
+
 def _call_consensus_models(
     *,
     models: list[str],
@@ -437,13 +531,27 @@ def _classify_failure(
         ]
     ):
         return "environment_failure"
+    if any(
+        k in merged
+        for k in [
+            "invalid username or password",
+            "login failed",
+            "unauthorized",
+            "forbidden",
+            "auth failure",
+            "missing jwt",
+            "401",
+            "403",
+        ]
+    ):
+        return "auth_failure"
     if any(k in merged for k in ["timeout", "timed out", "deadline exceeded"]):
         return "timeout"
     if any(
         k in merged
-        for k in ["invalid username or password", "login failed", "unauthorized", "401"]
+        for k in ["invalid_regex", "unsupported native step", "unsupported assertion"]
     ):
-        return "auth_failure"
+        return "test_bug"
     if _failed_selector_assertion(failed_assertions):
         return "selector_drift"
     if failed_assertions or exit_code != 0:
@@ -462,6 +570,16 @@ def _assertion_text_for_classification(items: list[dict[str, Any]]) -> str:
 
 def _failed_selector_assertion(items: list[dict[str, Any]]) -> bool:
     for item in items:
+        assertion_type = str(
+            item.get("assertion_type") or item.get("type") or ""
+        ).lower()
+        if assertion_type in {
+            "selector_visible",
+            "element_visible",
+            "selector_count",
+            "element_count",
+        }:
+            return True
         msg = str(item.get("message") or "").lower()
         if "selector" in msg or "not found" in msg or "could not find" in msg:
             return True
diff --git a/src/retrace/tester/specs.py b/src/retrace/tester/specs.py
index 84f2a59..f338ba3 100644
--- a/src/retrace/tester/specs.py
+++ b/src/retrace/tester/specs.py
@@ -364,6 +364,11 @@ def select_execution_engine(spec: TesterSpec) -> EngineSelection:
                 "requires credential-aware execution"
             ),
         )
+    if spec.exploratory_goals and bool(spec.browser_settings.get("visual")):
+        return EngineSelection(
+            execution_engine="visual",
+            reason="auto selected visual for screenshot-guided exploratory goals",
+        )
     if spec.exploratory_goals:
         return EngineSelection(
             execution_engine="explore",

From 2c07ef265dfcf805a9ccd8863b9a8a7b55c63c19 Mon Sep 17 00:00:00 2001
From: Colin Son <colinson@fastmail.com>
Date: Mon, 18 May 2026 19:29:06 -0500
Subject: [PATCH 2/2] fix: address phase 0 review comments

---
 docs/premium-qa-architecture-plan.md          | 3 +--
 src/retrace/storage/repositories/incidents.py | 9 +++++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/docs/premium-qa-architecture-plan.md b/docs/premium-qa-architecture-plan.md
index c800f2a..4e8e77a 100644
--- a/docs/premium-qa-architecture-plan.md
+++ b/docs/premium-qa-architecture-plan.md
@@ -216,7 +216,7 @@ Goal: the project must be boringly green before claiming reliability.
   Python SDK tests, Playwright runner tests, Postgres smoke, Docker build, and
   e2e tests.
 - Add a CI badge and "known green command set" to the README.
-- Make branch protection require the same jobs listed in `.github/workflows`.
+- Make GitHub branch protection require the same jobs listed in `.github/workflows`.
 - Add a short maintainer rule: no roadmap work merges while default branch is
   red.
 
@@ -413,4 +413,3 @@ Retrace is ready to call itself premium open-source QA architecture when:
 4. Build the incident detail UX around the single `qa_incidents` spine.
 5. Add GitHub App command triggers and loop prevention.
 6. Export repair attempts as eval fixtures.
-
diff --git a/src/retrace/storage/repositories/incidents.py b/src/retrace/storage/repositories/incidents.py
index 955f56e..6f539af 100644
--- a/src/retrace/storage/repositories/incidents.py
+++ b/src/retrace/storage/repositories/incidents.py
@@ -1288,17 +1288,22 @@ def nearest_deploy_marker(
         return self._deploy_marker_from_row(row) if row is not None else None
 
     def update_failure_deploy(self, *, failure_id: str, deploy_sha: str) -> None:
+        clean_failure_id = failure_id.strip()
+        if not clean_failure_id:
+            raise ValueError("failure_id is required")
         now = datetime.now(timezone.utc).isoformat()
         with self._conn() as conn:
-            conn.execute(
+            cur = conn.execute(
                 """
                 UPDATE failures
                 SET related_deploy_sha = ?,
                     updated_at = ?
                 WHERE id = ? OR public_id = ?
                 """,
-                (deploy_sha.strip(), now, failure_id, failure_id),
+                (deploy_sha.strip(), now, clean_failure_id, clean_failure_id),
             )
+            if int(cur.rowcount) == 0:
+                raise ValueError(f"unknown failure_id: {failure_id}")
 
     def _deploy_marker_from_row(self, row: sqlite3.Row) -> DeployMarkerRow:
         return DeployMarkerRow(