From ac7937f3a5a7b1336283e8bfa6aa1e024706dc68 Mon Sep 17 00:00:00 2001 From: Colin Son Date: Mon, 18 May 2026 19:12:05 -0500 Subject: [PATCH 1/2] fix: restore phase 0 qa trust --- docs/premium-qa-architecture-plan.md | 416 ++++++++++++++++++ src/retrace/storage/core.py | 4 +- src/retrace/storage/repositories/incidents.py | 47 +- src/retrace/tester/__init__.py | 15 +- src/retrace/tester/assertions.py | 196 +++++++-- src/retrace/tester/specs.py | 5 + 6 files changed, 640 insertions(+), 43 deletions(-) create mode 100644 docs/premium-qa-architecture-plan.md diff --git a/docs/premium-qa-architecture-plan.md b/docs/premium-qa-architecture-plan.md new file mode 100644 index 0000000..c800f2a --- /dev/null +++ b/docs/premium-qa-architecture-plan.md @@ -0,0 +1,416 @@ +# Premium QA Architecture Plan + +**Status:** Definitive direction document +**Date:** 2026-05-18 +**Audience:** vibe coders, indie developers, and contributors building Retrace + +## Thesis + +Retrace should become the free, open-source QA architecture for small teams that +ship with coding agents: real-user failure capture, deterministic reproduction, +agent-ready repair context, and verification in one local-first loop. + +The premium bar is not "more AI." The premium bar is evidence quality: + +```text +production signal -> incident -> replay/test evidence -> likely code -> repair task -> verified fix +``` + +The product should feel like Sentry, PostHog, Playwright, OpenReplay, +PR-Agent, Pullfrog, Langfuse, and Vercel-style evals were compressed into one +small-team workflow, with every expensive or hosted assumption removed. + +## Current Position + +Retrace already has more of the right architecture than a normal alpha: + +- Unified `qa_incidents` across replay findings, UI tests, API tests, + Sentry-compatible/OTel monitor events, and PR review findings. +- First-party browser replay capture plus PostHog replay import. +- Deterministic detectors for console/network/render/interaction failures. +- Native HTTP and Playwright-backed tester execution. +- API testing, OpenAPI import, HAR import, and API suite storage. +- Source-map upload, deploy correlation, alert rules, retention, digest, and + issue sinks. +- GitHub/local repo linking, code matching, prompt generation, worktree-based + repair, and draft PR creation. +- Browser SDK, Python SDK, Docker, CI, issue templates, security policy, + contribution docs, and broad automated coverage. + +The immediate credibility issue was a red default branch after modularization. +That is now fixed locally by restoring repository row mappers, tester facade +exports, deterministic evidence IDs for tests, deploy correlation updates, and +consensus/classification behavior. + +## External References To Steal From + +### Pullfrog: GitHub-Native Agent Orchestration + +Pullfrog positions itself as an open-source orchestration layer for async +development inside GitHub. Its strongest ideas for Retrace are: + +- GitHub as the control plane: trigger work from issues, PRs, comments, CI + failures, and reviews. +- BYOK and model-agnostic operation. +- Automated triggers for CI failures and PR reviews. +- Self-healing PRs with loop prevention. +- GitHub Actions as user-owned compute, secrets, and cost boundary. + +Reference: https://pullfrog.mintlify.app/ + +### Vercel Next Evals: Agent Quality As Versioned Fixtures + +Vercel's Next evals package agent tasks as self-contained projects with a +`PROMPT.md`, source files, and withheld `EVAL.ts` assertions. The runner uses +memoization so new models or evals run only missing pairs. + +Retrace should copy the shape, not the domain: + +- Every generated reproduction should become a self-contained eval fixture. +- The prompt given to an agent and the assertions used to judge the result must + be separate artifacts. +- Non-product failures such as timeouts/infra errors should be classified apart + from real model or app failures. +- Results should be exportable as benchmark data. + +Reference: https://github.com/vercel/next-evals-oss + +### Vercel AI SDK: Provider Abstraction And Developer Ergonomics + +Vercel AI SDK wins by being a clean TypeScript abstraction over many models and +agent workflows, with a large ecosystem and frequent releases. + +Retrace should copy: + +- A small, stable provider interface for local and hosted coding agents. +- Provider-specific adapters behind one API. +- Great examples for Next.js, React, Python, and plain script users. +- Versioned SDKs with boring upgrade paths. + +Reference: https://github.com/vercel/ai + +### Playwright: Deterministic Browser Runtime + +Playwright is the execution substrate to trust: browser isolation, auto-waiting, +web-first assertions, resilient locators, traces, screenshots, videos, and +parallelism. + +Retrace should not compete with Playwright. It should generate better Playwright +inputs from real incidents and preserve incident evidence beside Playwright +artifacts. + +Reference: https://github.com/microsoft/playwright + +### Cypress: App-Centric Testing UX + +Cypress's durable insight is that browser testing adoption depends on fast local +feedback and approachable authoring, not just CI. + +Retrace should copy: + +- A local UI that makes tests, failures, screenshots, and logs inspectable. +- Low ceremony install and a "tested with Retrace" style success path. +- Component/API/e2e language in docs that users already understand. + +Reference: https://github.com/cypress-io/cypress + +### OpenReplay And PostHog: Session Replay Plus Product Context + +OpenReplay's best ideas are self-hosted replay, network/console/state context, +privacy controls, and integrations that connect frontend behavior to backend +logs. PostHog's best ideas are an all-in-one developer product stack, a clear +free/open-source posture, feature flags/experiments, and replay as product +evidence rather than a separate QA silo. + +Retrace should not become product analytics. It should ingest enough session and +product context to rank QA incidents by user impact. + +References: + +- https://github.com/openreplay/openreplay +- https://github.com/PostHog/posthog + +### Sentry: Error Identity And SDK Coverage + +Sentry's strength is event identity: grouping, releases, source maps, +breadcrumbs, affected users, SDK breadth, and developer workflow integration. + +Retrace should keep Sentry compatibility while focusing on a narrower promise: +turn errors into repro tests and repair tasks. + +Reference: https://github.com/getsentry/sentry + +### PR-Agent: Review Tools And Token-Aware Context + +PR-Agent's useful patterns are separate tools (`describe`, `review`, `improve`, +`ask`), GitHub Actions/CLI/self-hosted deployment, PR compression, dynamic +context, self-reflection, and platform-agnostic review flows. + +Retrace should apply these patterns to QA evidence: compress incident evidence, +rank source context, cap suggestions, and keep review comments actionable. + +Reference: https://github.com/The-PR-Agent/pr-agent + +### Langfuse: Observability, Evals, Datasets + +Langfuse connects traces, prompts, evals, datasets, user feedback, and manual +labels. Retrace should copy the "observability plus evals" mental model for QA: +every real failure can become a dataset item, and every fix attempt can be +evaluated against withheld assertions. + +Reference: https://github.com/langfuse/langfuse + +## Product Principles + +1. **Evidence before agents.** LLMs summarize and repair only after deterministic + capture, detection, and reproduction produce durable artifacts. +2. **Generated tests are the product.** A detected bug is not valuable until it + becomes a repeatable test or a clearly classified non-repro. +3. **GitHub-native, local-first.** Small teams should run this from a repo, + issue, PR, or GitHub Action without adopting a hosted control plane. +4. **BYOK and provider-neutral.** OpenAI, Anthropic, local models, OpenRouter, + and future coding agents must sit behind stable adapters. +5. **Privacy is a default, not a setting.** Inputs masked by default, secrets + redacted before storage/prompting, and prompt artifacts marked safe/unsafe. +6. **No magical certainty.** Code matching must explain why files are likely, + preserve confidence, and let agents reject weak candidates. +7. **OSS quality is the paid-tier substitute.** The free product must be the + premium product: reproducible setup, docs, CI, examples, and real fixtures. + +## Target Architecture + +```text +Capture + browser SDK, Python SDK, Sentry DSN, OTLP, PostHog import, PR webhooks + +Normalize + canonical failures, evidence rows, replay sessions, deploys, source maps + +Triage + qa_incidents, severity, affected users, grouping, lifecycle, digest, sinks + +Reproduce + replay-to-test, API specs, Playwright/native runner, visual baselines, + flake classification, artifact manifests + +Repair + repo matching, source-map/code context, repair bundle, agent prompt, + worktree, draft PR, validation commands + +Verify + generated spec reruns, CI integration, resolved/regressed lifecycle, + eval dataset export +``` + +The boundary that matters: `qa_incidents` is the product spine. Every capture +surface, test engine, review finding, and repair workflow must read/write this +shape. + +## Roadmap + +### Phase 0: Restore Default-Branch Trust + +Goal: the project must be boringly green before claiming reliability. + +- Keep `master` green across `ruff`, full `pytest`, browser SDK tests/build, + Python SDK tests, Playwright runner tests, Postgres smoke, Docker build, and + e2e tests. +- Add a CI badge and "known green command set" to the README. +- Make branch protection require the same jobs listed in `.github/workflows`. +- Add a short maintainer rule: no roadmap work merges while default branch is + red. + +Acceptance: + +- `uv run ruff check src tests` +- `uv run pytest -q` +- `cd packages/browser && npm ci && npm test && npm run build` +- CI passes on `master`. + +### Phase 1: Killer Demo As A Contract + +Goal: `retrace demo all && retrace qa auto --repo local/demo-checkout --no-pr` +must prove the architecture without external services. + +Build: + +- One seeded fixture for each signal source: replay, UI test, API test, monitor + event, PR review. +- A local demo app with a real bug, source map, route, and generated repair + candidate. +- Withheld assertions modeled after Vercel evals: agent sees prompt/evidence; + verifier sees expected behavior. +- Exportable `qa-eval-result.json` with pass/fail, artifacts, classification, + and cost metadata. + +Why: + +Vercel-style evals turn agent quality into repeatable fixtures. Retrace needs +the same for QA repair quality. + +### Phase 2: Incident Detail UX + +Goal: an indie developer can understand an incident in under one minute. + +Build: + +- Incident page with replay, timeline, console/network/error evidence, deploy + marker, source-map frame, generated test, candidate files, repair task, and + verification history. +- Artifact viewer for screenshots, Playwright traces, request/response bodies, + source-map diagnostics, and prompt JSON. +- "Why this file?" explanations for code matching. +- "Promote to issue" and "open repair PR" flows from the same page. + +Reference: + +- OpenReplay-style replay plus DevTools context. +- Sentry-style event identity and release/source-map context. + +### Phase 3: Test Generation Quality + +Goal: generated tests should survive CI, not just demos. + +Build: + +- Selector ranking: `data-testid`, role/name, label, placeholder, stable ID, + text fallback, coordinate fallback. +- Playwright trace capture on first retry. +- Flake quarantine with failure classification: app bug, test bug, auth + failure, selector drift, timeout, environment failure. +- Parallel `run-all` with deterministic artifacts. +- Visual baseline accept/compare flow documented for CI. +- API sequence generation from captured network calls. + +Reference: + +- Playwright locators, traces, browser isolation, auto-waiting. +- Cypress local testing UX and status visibility. + +### Phase 4: GitHub-Native Agent Loop + +Goal: Retrace behaves like a QA-native Pullfrog. + +Build: + +- GitHub App commands: + - `@retrace triage` + - `@retrace reproduce` + - `@retrace fix` + - `@retrace verify` + - `@retrace explain` +- GitHub Actions workflow templates for: + - replay/API/test incident ingestion + - generated spec reruns + - PR review filing + - self-healing failed Retrace PRs with attempt caps +- Loop prevention: max attempts per incident/PR, stale branch detection, + validation gate before new commits. +- BYOK secrets stored in GitHub Actions secrets for fully user-owned compute. + +Reference: + +- Pullfrog's GitHub-first trigger model, self-healing PRs, and Actions-backed + execution. +- PR-Agent's tool split and PR compression. + +### Phase 5: Agent Provider And Repair Adapter Layer + +Goal: coding-agent integration is swappable and testable. + +Build: + +- Common `RepairAgent` interface: + - `prepare(bundle) -> command/spec` + - `apply(worktree, bundle) -> result` + - `validate(worktree, commands) -> result` + - `summarize(result) -> incident event` +- Adapters for local `codex`, `claude`, shell command, and no-op prompt-only + mode. +- Provider cost and token accounting for LLM review/repair prompts. +- Prompt safety tests that prove user evidence cannot become instructions. +- Repair eval fixtures exported like Vercel Next evals. + +Reference: + +- Vercel AI SDK provider ergonomics. +- Langfuse observability/eval separation. + +### Phase 6: Self-Host Operations + +Goal: users can run this as infrastructure, not a laptop script. + +Build: + +- Production Docker Compose profile with API, UI, worker, browser runner, cron, + SQLite/Postgres, and optional object storage. +- Postgres backend moved from compatibility chassis to supported mode. +- Storage sizing guide: replay volume, retention, source maps, artifacts. +- Backup/restore and upgrade tests. +- Retention defaults for raw replay, redacted evidence, source maps, and + generated artifacts. +- Health checks and `/readyz` surfaces for each service. + +Reference: + +- OpenReplay self-hosting posture. +- PostHog's explicit open-source/self-host tradeoff language. + +### Phase 7: Community And Extension System + +Goal: contributors can add detectors, runners, sinks, and matchers without +reading the whole codebase. + +Build: + +- Detector plugin contract with fixture tests and reason-code docs. +- Test runner plugin contract for native HTTP, Playwright, API, visual, and + future mobile. +- Sink contract for GitHub Issues, Linear, Jira, Slack, webhooks. +- Matching contract for route manifests, source maps, stack traces, selectors, + framework component graphs. +- Example apps: + - Next.js checkout bug + - SaaS dashboard auth bug + - API contract regression + - source-mapped frontend exception + +Acceptance: + +- A contributor can add one detector with one fixture and one docs page in under + an hour. + +## Non-Goals + +- Replacing Playwright or Cypress as general-purpose test frameworks. +- Replacing PostHog as product analytics. +- Replacing Sentry as a broad multi-language observability platform. +- Auto-merging fixes without tests and human review. +- Treating LLM judgment as proof of failure. +- Building a paid cloud tier before the open-source loop is excellent. + +## Launch Bar + +Retrace is ready to call itself premium open-source QA architecture when: + +- Fresh checkout demo works in 10 minutes without external services. +- Default branch is green for 30 consecutive days. +- At least five real-world fixture apps are covered. +- Generated Playwright/API specs run in CI with stable artifact output. +- `qa auto` can take a replay or monitor incident to a draft PR with a + validation command. +- A failed repair attempt is classified and preserved as an eval result. +- Docs explain the architecture, threat model, privacy model, and extension + points. +- The browser SDK and Python SDK have versioned packages and upgrade notes. + +## Immediate Next Moves + +1. Merge the default-branch fix and confirm GitHub CI is green. +2. Add the killer-demo contract as a CI e2e job. +3. Convert `docs/roadmap.md` into execution milestones that map to this + architecture. +4. Build the incident detail UX around the single `qa_incidents` spine. +5. Add GitHub App command triggers and loop prevention. +6. Export repair attempts as eval fixtures. + diff --git a/src/retrace/storage/core.py b/src/retrace/storage/core.py index e357bf4..8d83447 100644 --- a/src/retrace/storage/core.py +++ b/src/retrace/storage/core.py @@ -1322,8 +1322,8 @@ def _append_failure_evidence( payload_json = json.dumps(evidence.payload, sort_keys=True) except (TypeError, ValueError) as exc: raise ValueError("evidence payload must be JSON-serializable") from exc - evidence_id = _id("ev") - created_at = _now_iso_microseconds() + evidence_id = self._id("ev") + created_at = self._now_iso_microseconds() conn.execute( """ INSERT OR IGNORE INTO failure_evidence diff --git a/src/retrace/storage/repositories/incidents.py b/src/retrace/storage/repositories/incidents.py index f49ed6d..955f56e 100644 --- a/src/retrace/storage/repositories/incidents.py +++ b/src/retrace/storage/repositories/incidents.py @@ -7,7 +7,8 @@ from ..helpers import ( _SEVERITY_ORDER, _rollup_severity, _string_values, _normalize_app_error_incident_status, _id, _public_id, _dt, - _safe_json_obj, _merge_string_lists, APP_ERROR_FAILURE_STATUS_BY_INCIDENT_STATUS + _safe_json_obj, _merge_string_lists, _parse_string_list_json, + APP_ERROR_FAILURE_STATUS_BY_INCIDENT_STATUS ) from ..models import ( FailureRow, EvidenceRow, IncidentRow, IncidentLifecycleEventRow, @@ -1286,6 +1287,37 @@ def nearest_deploy_marker( ).fetchone() return self._deploy_marker_from_row(row) if row is not None else None + def update_failure_deploy(self, *, failure_id: str, deploy_sha: str) -> None: + now = datetime.now(timezone.utc).isoformat() + with self._conn() as conn: + conn.execute( + """ + UPDATE failures + SET related_deploy_sha = ?, + updated_at = ? + WHERE id = ? OR public_id = ? + """, + (deploy_sha.strip(), now, failure_id, failure_id), + ) + + def _deploy_marker_from_row(self, row: sqlite3.Row) -> DeployMarkerRow: + return DeployMarkerRow( + id=str(row["id"]), + public_id=str(row["public_id"]), + project_id=str(row["project_id"]), + environment_id=str(row["environment_id"]), + sha=str(row["sha"]), + branch=str(row["branch"] or ""), + author=str(row["author"] or ""), + deployed_at_ms=int(row["deployed_at_ms"] or 0), + changed_files=_merge_string_lists( + _parse_string_list_json(row["changed_files_json"]) + ), + metadata=dict(_safe_json_obj(row["metadata_json"])), + created_at=_dt(row["created_at"]) or datetime.now(timezone.utc), + updated_at=_dt(row["updated_at"]) or datetime.now(timezone.utc), + ) + def upsert_source_map( self, @@ -1393,6 +1425,19 @@ def list_recent_source_maps( ).fetchall() return [self._source_map_from_row(row) for row in rows] + def _source_map_from_row(self, row: sqlite3.Row) -> SourceMapRow: + return SourceMapRow( + id=str(row["id"]), + public_id=str(row["public_id"]), + project_id=str(row["project_id"]), + environment_id=str(row["environment_id"]), + release=str(row["release"]), + dist=str(row["dist"] or ""), + artifact_url=str(row["artifact_url"]), + source_map=dict(_safe_json_obj(row["source_map_json"])), + uploaded_at=_dt(row["uploaded_at"]) or datetime.now(timezone.utc), + ) + def _validate_source_map_payload(self, source_map: dict[str, Any]) -> None: if source_map.get("version") != 3: raise ValueError("source_map must be a supported Source Map v3 object") diff --git a/src/retrace/tester/__init__.py b/src/retrace/tester/__init__.py index e3cbfd0..77d6621 100644 --- a/src/retrace/tester/__init__.py +++ b/src/retrace/tester/__init__.py @@ -41,11 +41,24 @@ _redacted_response_headers as _redacted_response_headers, _response_assertion_evidence as _response_assertion_evidence, ) +from . import harness as _harness from .harness import ( - run_spec as run_spec, load_run_summaries as load_run_summaries, enqueue_spec_run as enqueue_spec_run, run_queued_spec_once as run_queued_spec_once, set_explore_factories as set_explore_factories, set_visual_factories as set_visual_factories, + _run_playwright_spec as _run_playwright_spec, + _run_shell as _run_shell, ) + + +def run_spec(*args, **kwargs): + """Run a tester spec through the current package-level shell hook. + + Older tests and integrations monkeypatch `retrace.tester._run_shell`. + The implementation now lives in `retrace.tester.harness`, so keep that + facade contract by syncing the harness hook immediately before execution. + """ + _harness._run_shell = _run_shell + return _harness.run_spec(*args, **kwargs) diff --git a/src/retrace/tester/assertions.py b/src/retrace/tester/assertions.py index c01c7db..6c97b37 100644 --- a/src/retrace/tester/assertions.py +++ b/src/retrace/tester/assertions.py @@ -73,8 +73,11 @@ def _evaluate_consensus_assertion( assertion: dict[str, Any], *, consensus_group: str, + response: Optional[httpx.Response] = None, + evidence: Optional[dict[str, Any]] = None, + arbiter_vote: bool | None = None, ) -> TesterAssertionResult: - votes = _collect_consensus_votes(assertion) + votes = list(assertion.get("__collected_votes") or _collect_consensus_votes(assertion)) if not votes: return _assertion_result( assertion=assertion, @@ -85,24 +88,59 @@ def _evaluate_consensus_assertion( ) ok_votes = [v for v in votes if _bool_from_vote(v) is True] fail_votes = [v for v in votes if _bool_from_vote(v) is False] - ok = len(ok_votes) >= len(fail_votes) + retry_count = int( + assertion.get("__retry_count") + if assertion.get("__retry_count") is not None + else len( + [ + vote + for vote in votes + if bool(vote.get("retry")) or vote in assertion.get("retry_votes", []) + ] + ) + ) + disagreement = bool(ok_votes and fail_votes) + decision = "majority" + if arbiter_vote is None and "arbiter_vote" in assertion: + arbiter_vote = _coerce_arbiter_vote(assertion.get("arbiter_vote")) + if disagreement and arbiter_vote is not None: + ok = arbiter_vote + decision = "arbiter" + else: + ok = len(ok_votes) >= len(fail_votes) + selected_votes = ok_votes if ok else fail_votes confidence = ( - max([_coerce_confidence(v.get("confidence"), default=0.5) for v in ok_votes]) - if ok and ok_votes - else max([_coerce_confidence(v.get("confidence"), default=0.5) for v in fail_votes]) - if fail_votes - else 0.5 + max([_coerce_confidence(v.get("confidence"), default=1.0) for v in selected_votes]) + if selected_votes + else 1.0 ) + if assertion.get("confidence") is not None: + confidence = _coerce_confidence(assertion.get("confidence"), default=confidence) message = ( f"Consensus reached (OK={len(ok_votes)}, FAIL={len(fail_votes)})." if ok else f"Consensus failed (OK={len(ok_votes)}, FAIL={len(fail_votes)})." ) + actual_evidence = evidence + if actual_evidence is None: + actual_evidence = _response_assertion_evidence( + response, + capture_body=bool(assertion.get("capture_body_evidence")), + ) + actual = { + "decision": decision, + "disagreement": disagreement, + "pass_votes": len(ok_votes), + "fail_votes": len(fail_votes), + "retry_count": retry_count, + "arbiter_vote": arbiter_vote, + "evidence": actual_evidence, + } return _assertion_result( assertion={**assertion, "consensus_group": consensus_group, "model_votes": votes}, ok=ok, expected=f"majority OK in {consensus_group}", - actual=f"{len(ok_votes)} OK / {len(fail_votes)} FAIL", + actual=actual, message=message, confidence=confidence, ) @@ -114,44 +152,100 @@ def _evaluate_model_backed_consensus_assertion( response: Optional[httpx.Response], ) -> TesterAssertionResult: models = list(assertion.get("models") or []) - if not models: - return _assertion_result( - assertion=assertion, - ok=False, - expected="at least one model", - actual=None, - message="No models configured for model-backed consensus assertion.", - ) - prompt = str(assertion.get("prompt") or assertion.get("text") or "") - if not prompt: - return _assertion_result( - assertion=assertion, - ok=False, - expected="non-empty prompt", - actual=None, - message="Prompt is required for model-backed consensus assertion.", - ) evidence = assertion.get("evidence") + capture_body = bool(assertion.get("capture_body_evidence") or models) if not evidence and response: evidence = _response_assertion_evidence( - response, capture_body=bool(assertion.get("capture_body_evidence")) + response, capture_body=capture_body ) - votes = _call_consensus_models( - models=models, - prompt=prompt, - snapshot=evidence or {}, - provider=str(assertion.get("provider") or "openai"), - base_url=str(assertion.get("base_url") or ""), - api_key=assertion.get("api_key"), - timeout=float(assertion.get("timeout") or 30.0), - retry=bool(assertion.get("retry")), + elif isinstance(evidence, dict) and response: + response_evidence = _response_assertion_evidence( + response, capture_body=capture_body + ) + evidence = {**evidence, **response_evidence} + elif not isinstance(evidence, dict): + evidence = _response_assertion_evidence(None, capture_body=False) + + if models: + prompt = str(assertion.get("prompt") or assertion.get("text") or "") + if not prompt: + return _assertion_result( + assertion=assertion, + ok=False, + expected="non-empty prompt", + actual=None, + message="Prompt is required for model-backed consensus assertion.", + ) + votes = _call_consensus_models( + models=models, + prompt=prompt, + snapshot=evidence or {}, + provider=str(assertion.get("provider") or "openai"), + base_url=str(assertion.get("base_url") or ""), + api_key=assertion.get("api_key"), + timeout=float(assertion.get("timeout") or 30.0), + retry=bool(assertion.get("retry")), + ) + else: + votes = _collect_consensus_votes(assertion) + + arbiter_vote = None + parsed_votes = [_bool_from_vote(vote) for vote in votes] + disagreement = any(v is True for v in parsed_votes) and any( + v is False for v in parsed_votes ) + arbiter_model = str(assertion.get("arbiter_model") or "").strip() + prompt = str(assertion.get("prompt") or assertion.get("text") or "") + if disagreement and arbiter_model and prompt: + arbiter_votes = _call_consensus_models( + models=[arbiter_model], + prompt=prompt, + snapshot=evidence or {}, + provider=str(assertion.get("provider") or "openai"), + base_url=str(assertion.get("base_url") or ""), + api_key=assertion.get("api_key"), + timeout=float(assertion.get("timeout") or 30.0), + retry=False, + ) + if arbiter_votes: + arbiter_vote = _bool_from_vote(arbiter_votes[0]) + elif "arbiter_vote" in assertion: + arbiter_vote = _coerce_arbiter_vote(assertion.get("arbiter_vote")) + return _evaluate_consensus_assertion( - {**assertion, "model_votes": votes}, - consensus_group=f"models:{','.join(models)}", + { + **assertion, + "model_votes": votes, + "__collected_votes": votes, + "__retry_count": len( + [vote for vote in votes if vote in assertion.get("retry_votes", [])] + ), + }, + consensus_group=str( + assertion.get("consensus_group") + or (f"models:{','.join(models)}" if models else "model_consensus") + ), + response=response, + evidence=evidence, + arbiter_vote=arbiter_vote, ) +def _coerce_arbiter_vote(raw: Any) -> bool | None: + if raw is None: + return None + if isinstance(raw, bool): + return raw + if isinstance(raw, (int, float)): + return bool(raw) + normalized = str(raw).strip().lower() + if normalized in {"pass", "passed", "true", "1", "yes", "ok"}: + return True + if normalized in {"fail", "failed", "false", "0", "no"}: + return False + return None + + def _call_consensus_models( *, models: list[str], @@ -437,13 +531,27 @@ def _classify_failure( ] ): return "environment_failure" + if any( + k in merged + for k in [ + "invalid username or password", + "login failed", + "unauthorized", + "forbidden", + "auth failure", + "missing jwt", + "401", + "403", + ] + ): + return "auth_failure" if any(k in merged for k in ["timeout", "timed out", "deadline exceeded"]): return "timeout" if any( k in merged - for k in ["invalid username or password", "login failed", "unauthorized", "401"] + for k in ["invalid_regex", "unsupported native step", "unsupported assertion"] ): - return "auth_failure" + return "test_bug" if _failed_selector_assertion(failed_assertions): return "selector_drift" if failed_assertions or exit_code != 0: @@ -462,6 +570,16 @@ def _assertion_text_for_classification(items: list[dict[str, Any]]) -> str: def _failed_selector_assertion(items: list[dict[str, Any]]) -> bool: for item in items: + assertion_type = str( + item.get("assertion_type") or item.get("type") or "" + ).lower() + if assertion_type in { + "selector_visible", + "element_visible", + "selector_count", + "element_count", + }: + return True msg = str(item.get("message") or "").lower() if "selector" in msg or "not found" in msg or "could not find" in msg: return True diff --git a/src/retrace/tester/specs.py b/src/retrace/tester/specs.py index 84f2a59..f338ba3 100644 --- a/src/retrace/tester/specs.py +++ b/src/retrace/tester/specs.py @@ -364,6 +364,11 @@ def select_execution_engine(spec: TesterSpec) -> EngineSelection: "requires credential-aware execution" ), ) + if spec.exploratory_goals and bool(spec.browser_settings.get("visual")): + return EngineSelection( + execution_engine="visual", + reason="auto selected visual for screenshot-guided exploratory goals", + ) if spec.exploratory_goals: return EngineSelection( execution_engine="explore", From 2c07ef265dfcf805a9ccd8863b9a8a7b55c63c19 Mon Sep 17 00:00:00 2001 From: Colin Son Date: Mon, 18 May 2026 19:29:06 -0500 Subject: [PATCH 2/2] fix: address phase 0 review comments --- docs/premium-qa-architecture-plan.md | 3 +-- src/retrace/storage/repositories/incidents.py | 9 +++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/docs/premium-qa-architecture-plan.md b/docs/premium-qa-architecture-plan.md index c800f2a..4e8e77a 100644 --- a/docs/premium-qa-architecture-plan.md +++ b/docs/premium-qa-architecture-plan.md @@ -216,7 +216,7 @@ Goal: the project must be boringly green before claiming reliability. Python SDK tests, Playwright runner tests, Postgres smoke, Docker build, and e2e tests. - Add a CI badge and "known green command set" to the README. -- Make branch protection require the same jobs listed in `.github/workflows`. +- Make GitHub branch protection require the same jobs listed in `.github/workflows`. - Add a short maintainer rule: no roadmap work merges while default branch is red. @@ -413,4 +413,3 @@ Retrace is ready to call itself premium open-source QA architecture when: 4. Build the incident detail UX around the single `qa_incidents` spine. 5. Add GitHub App command triggers and loop prevention. 6. Export repair attempts as eval fixtures. - diff --git a/src/retrace/storage/repositories/incidents.py b/src/retrace/storage/repositories/incidents.py index 955f56e..6f539af 100644 --- a/src/retrace/storage/repositories/incidents.py +++ b/src/retrace/storage/repositories/incidents.py @@ -1288,17 +1288,22 @@ def nearest_deploy_marker( return self._deploy_marker_from_row(row) if row is not None else None def update_failure_deploy(self, *, failure_id: str, deploy_sha: str) -> None: + clean_failure_id = failure_id.strip() + if not clean_failure_id: + raise ValueError("failure_id is required") now = datetime.now(timezone.utc).isoformat() with self._conn() as conn: - conn.execute( + cur = conn.execute( """ UPDATE failures SET related_deploy_sha = ?, updated_at = ? WHERE id = ? OR public_id = ? """, - (deploy_sha.strip(), now, failure_id, failure_id), + (deploy_sha.strip(), now, clean_failure_id, clean_failure_id), ) + if int(cur.rowcount) == 0: + raise ValueError(f"unknown failure_id: {failure_id}") def _deploy_marker_from_row(self, row: sqlite3.Row) -> DeployMarkerRow: return DeployMarkerRow(