diff --git a/challenge-benchmark-holdout-leakage-guard/README.md b/challenge-benchmark-holdout-leakage-guard/README.md new file mode 100644 index 0000000..3f07f1e --- /dev/null +++ b/challenge-benchmark-holdout-leakage-guard/README.md @@ -0,0 +1,41 @@ +# Challenge Benchmark Holdout Leakage Guard + +This module adds a benchmark holdout leakage guard for SCIBASE issue +[#18](https://github.com/SCIBASE-AI/SCIBASE.AI/issues/18). It protects +scientific bounty scoring by checking whether private holdout data was exposed +before leaderboard or award decisions. + +The slice is intentionally narrow. It does not duplicate rubric readiness, +milestone progress, evidence freeze, IP redaction, payout eligibility, reviewer +consensus, solver prequalification, data-room access, clarification freeze, +award transparency, deliverable acceptance, or reproducibility environment +work. + +## What It Checks + +- Holdout artifacts have stable hashes before scoring. +- Holdout release events use sealed scoring channels only. +- Holdouts are not released before the submission deadline. +- Solver workspaces are not granted sealed holdout paths before the submission deadline. +- Public baseline hashes do not match holdout hashes. +- Sponsor exceptions are approved outside the freeze window. +- Submission packages do not contain holdout artifact hashes. + +## Local Usage + +```bash +cd challenge-benchmark-holdout-leakage-guard +npm run check +npm test +npm run demo +``` + +`npm run demo` writes reviewer artifacts under `reports/`: + +- `benchmark-leakage-packet.json` +- `benchmark-leakage-report.md` +- `summary.svg` +- `demo.mp4` (only when `ffmpeg` is available on `PATH`; otherwise a warning is logged and the other artifacts are still written) + +All examples use synthetic challenge metadata. No external services, accounts, +private challenge files, or payment information are required. 
diff --git a/challenge-benchmark-holdout-leakage-guard/acceptance-notes.md b/challenge-benchmark-holdout-leakage-guard/acceptance-notes.md new file mode 100644 index 0000000..6eca32d --- /dev/null +++ b/challenge-benchmark-holdout-leakage-guard/acceptance-notes.md @@ -0,0 +1,26 @@ +# Acceptance Notes + +## Reviewer Checklist + +- Self-contained under `challenge-benchmark-holdout-leakage-guard/`. +- Dependency-free Node.js implementation. +- Synthetic challenge metadata only. +- Tests cover ready-to-score, sponsor-review, and quarantine decisions. +- Demo artifacts include JSON, Markdown, SVG, and MP4 outputs. + +## Commands Run + +```bash +npm run check +npm test +npm run demo +ffprobe -v error -show_entries format=duration,size -show_entries stream=codec_name,width,height -of default=noprint_wrappers=1 reports/demo.mp4 +git diff --check +``` + +## Limitations + +- This is a deterministic pre-scoring guard, not a live challenge storage or + data-room integration. +- Production integration should replace sample policy values and synthetic + events with signed SCIBASE challenge audit logs. 
diff --git a/challenge-benchmark-holdout-leakage-guard/demo.js b/challenge-benchmark-holdout-leakage-guard/demo.js new file mode 100644 index 0000000..952de8f --- /dev/null +++ b/challenge-benchmark-holdout-leakage-guard/demo.js @@ -0,0 +1,80 @@ +const fs = require("node:fs") +const path = require("node:path") +const { spawnSync } = require("node:child_process") +const { evaluateChallengePortfolio } = require("./index") +const { leakagePolicy, challenges } = require("./sample-data") + +const reportsDir = path.join(__dirname, "reports") +fs.mkdirSync(reportsDir, { recursive: true }) + +const packet = evaluateChallengePortfolio({ challenges, leakagePolicy }) +const { summary } = packet + +fs.writeFileSync( + path.join(reportsDir, "benchmark-leakage-packet.json"), + `${JSON.stringify(packet, null, 2)}\n`, +) + +const markdown = [ + "# Challenge Benchmark Holdout Leakage Guard Report", + "", + `Generated challenges: ${summary.totalChallenges}`, + `Ready to score: ${summary.score}`, + `Needs review: ${summary.review}`, + `Quarantined: ${summary.quarantine}`, + `Scoring actions: ${summary.scoringActions}`, + `Audit digest: \`${packet.audit.digest}\``, + "", + "## Challenge Decisions", + ...packet.decisions.flatMap((decision) => [ + "", + `### ${decision.id}: ${decision.title}`, + `- Status: ${decision.status}`, + `- Holdout artifacts: ${decision.holdoutArtifacts}`, + `- Release events: ${decision.releaseEvents}`, + `- Solver grants: ${decision.solverWorkspaceGrants}`, + `- Findings: ${decision.findings.map((finding) => finding.code).join(", ") || "none"}`, + `- First action: ${decision.scoringActions[0]?.message || "none"}`, + ]), + "", +] + +fs.writeFileSync(path.join(reportsDir, "benchmark-leakage-report.md"), markdown.join("\n")) + +const svg = ` + + Challenge Benchmark Holdout Leakage Guard + Synthetic scoring readiness packet for SCIBASE issue #18 + + ${summary.score} + score + + ${summary.review} + review + + ${summary.quarantine} + quarantine + Checks: holdout 
// Definitions of index.js. The module's `require("node:crypto")` header and its
// `module.exports` footer are unchanged and intentionally not repeated here.

const MS_PER_HOUR = 3600000

/**
 * Serialises a value to canonical JSON text with object keys sorted, so equal
 * data always produces the same string regardless of key insertion order.
 *
 * Undefined object members are skipped and undefined array slots become
 * `null`, matching `JSON.stringify`. Without this, a digest computed in memory
 * could not be reproduced from the packet that was written to disk.
 *
 * @param {*} value - JSON-compatible value.
 * @returns {string} Canonical JSON text.
 */
function stableJson(value) {
  if (Array.isArray(value)) {
    return `[${value.map((item) => stableJson(item)).join(",")}]`
  }
  if (value && typeof value === "object") {
    const members = Object.keys(value)
      .filter((key) => value[key] !== undefined)
      .sort()
      .map((key) => `${JSON.stringify(key)}:${stableJson(value[key])}`)
    return `{${members.join(",")}}`
  }
  // JSON.stringify returns undefined for undefined/functions; emit null instead.
  return JSON.stringify(value) ?? "null"
}

/**
 * @param {*} value - JSON-compatible value.
 * @returns {string} Hex SHA-256 of the canonical JSON form of `value`.
 */
function digestFor(value) {
  return crypto.createHash("sha256").update(stableJson(value)).digest("hex")
}

/**
 * @param {string|number|Date} a - Start instant.
 * @param {string|number|Date} b - End instant.
 * @returns {number} Hours from `a` to `b` (negative when `b` is earlier), or
 *   NaN when either instant cannot be parsed.
 */
function hoursBetween(a, b) {
  return (new Date(b).getTime() - new Date(a).getTime()) / MS_PER_HOUR
}

/**
 * @param {*} value - Timestamp as found in challenge metadata.
 * @returns {number} Epoch milliseconds, or NaN when missing or unparseable.
 */
function toEpochMs(value) {
  return value == null ? Number.NaN : new Date(value).getTime()
}

/** Builds a finding record. */
function finding(code, severity, message, detail = {}) {
  return { code, severity, message, detail }
}

/** Builds a scoring action record. */
function action(code, owner, message) {
  return { code, owner, message }
}

/**
 * Evaluates one challenge for holdout leakage before scoring.
 *
 * The guard fails closed: when a release/grant timestamp or the submission
 * deadline is missing or unparseable, the event is reported as a blocker
 * because it cannot be shown to have happened after the deadline. Optional
 * collections that are absent are treated as empty.
 *
 * @param {object} challenge - Challenge metadata (`id`, `title`,
 *   `submissionDeadline`, `holdoutArtifacts`, `releaseEvents`,
 *   `solverWorkspaceGrants`, `publicBaselines`, `sponsorExceptions`,
 *   `submissions`).
 * @param {object} policy - Leakage policy (`minHoldoutHashLength`,
 *   `allowedReleaseChannels`, `freezeWindowHours`).
 * @returns {object} Decision with `status` ("score" | "review" |
 *   "quarantine"), counts, `findings`, `scoringActions` and `auditDigest`.
 */
function evaluateChallenge(challenge, policy) {
  const findings = []
  const holdoutArtifacts = challenge.holdoutArtifacts ?? []
  const releaseEvents = challenge.releaseEvents ?? []
  const solverWorkspaceGrants = challenge.solverWorkspaceGrants ?? []
  const publicBaselines = challenge.publicBaselines ?? []
  const sponsorExceptions = challenge.sponsorExceptions ?? []
  const submissions = challenge.submissions ?? []

  const deadlineMs = toEpochMs(challenge.submissionDeadline)
  const holdoutHashes = new Set()
  const holdoutPaths = new Set(holdoutArtifacts.map((artifact) => artifact.path))

  for (const artifact of holdoutArtifacts) {
    const { hash } = artifact
    // A non-string hash has no meaningful length, so it counts as missing.
    const hashUsable = typeof hash === "string" && hash.length > 0 && hash.length >= policy.minHoldoutHashLength
    if (!hashUsable) {
      findings.push(finding("HOLDOUT_HASH_MISSING", "blocker", "Holdout artifact needs a stable hash before scoring can start.", {
        artifactId: artifact.id,
        path: artifact.path,
      }))
      continue
    }
    holdoutHashes.add(hash)
  }

  for (const event of releaseEvents) {
    if (!policy.allowedReleaseChannels.includes(event.channel)) {
      findings.push(finding("HOLDOUT_RELEASED_TO_PUBLIC_CHANNEL", "blocker", "Holdout artifact was released through a non-scoring channel.", {
        artifactId: event.artifactId,
        channel: event.channel,
        at: event.at,
        actor: event.actor,
      }))
    }

    const releasedAtMs = toEpochMs(event.at)
    if (Number.isNaN(releasedAtMs) || Number.isNaN(deadlineMs)) {
      // Comparisons against NaN are always false, which would silently pass.
      findings.push(finding("HOLDOUT_RELEASE_TIME_UNVERIFIED", "blocker", "Holdout release time or submission deadline is missing or invalid, so a post-deadline release cannot be verified.", {
        artifactId: event.artifactId,
        channel: event.channel,
        at: event.at,
        submissionDeadline: challenge.submissionDeadline,
      }))
    } else if (releasedAtMs < deadlineMs) {
      findings.push(finding("HOLDOUT_RELEASED_BEFORE_DEADLINE", "blocker", "Holdout release happened before the submission deadline.", {
        artifactId: event.artifactId,
        channel: event.channel,
        at: event.at,
        submissionDeadline: challenge.submissionDeadline,
      }))
    }
  }

  for (const grant of solverWorkspaceGrants) {
    const leakedPaths = (grant.paths ?? []).filter((path) => holdoutPaths.has(path))
    if (leakedPaths.length === 0) {
      continue
    }

    const grantedAtMs = toEpochMs(grant.grantedAt)
    if (Number.isNaN(grantedAtMs) || Number.isNaN(deadlineMs)) {
      findings.push(finding("SOLVER_WORKSPACE_GRANT_TIME_UNVERIFIED", "blocker", "Solver workspace received holdout paths and the grant time or submission deadline is missing or invalid.", {
        solverId: grant.solverId,
        leakedPaths,
        grantedAt: grant.grantedAt,
      }))
    } else if (grantedAtMs < deadlineMs) {
      findings.push(finding("SOLVER_WORKSPACE_HOLDOUT_GRANT", "blocker", "Solver workspace received holdout paths before the deadline.", {
        solverId: grant.solverId,
        leakedPaths,
        grantedAt: grant.grantedAt,
      }))
    }
  }

  for (const baseline of publicBaselines) {
    if (holdoutHashes.has(baseline.hash)) {
      findings.push(finding("PUBLIC_BASELINE_HASH_OVERLAP", "blocker", "A public baseline artifact matches the sealed holdout hash.", {
        baselineId: baseline.id,
        channel: baseline.channel,
        hash: baseline.hash,
      }))
    }
  }

  for (const exception of sponsorExceptions) {
    if (!exception.approved) {
      findings.push(finding("UNAPPROVED_SPONSOR_EXCEPTION", "warning", "Sponsor exception was logged but not approved before scoring.", {
        exceptionId: exception.id,
        artifactId: exception.artifactId,
        reason: exception.reason,
      }))
      continue
    }

    // A missing approval time counts as approved at the deadline (0 hours).
    const hoursToDeadline = hoursBetween(exception.approvedAt || challenge.submissionDeadline, challenge.submissionDeadline)
    // NaN means the approval time could not be parsed: send it to review.
    if (Number.isNaN(hoursToDeadline) || hoursToDeadline <= policy.freezeWindowHours) {
      findings.push(finding("LATE_SPONSOR_EXCEPTION_REVIEW", "warning", "Approved sponsor exception landed inside the benchmark freeze window.", {
        exceptionId: exception.id,
        artifactId: exception.artifactId,
        approvedAt: exception.approvedAt,
        freezeWindowHours: policy.freezeWindowHours,
      }))
    }
  }

  for (const submission of submissions) {
    const overlappingHashes = (submission.artifactHashes ?? []).filter((hash) => holdoutHashes.has(hash))
    if (overlappingHashes.length > 0) {
      findings.push(finding("SUBMISSION_CONTAINS_HOLDOUT_HASH", "blocker", "Submission package contains an artifact hash matching the sealed holdout.", {
        solverId: submission.solverId,
        submittedAt: submission.submittedAt,
        overlappingHashes,
      }))
    }
  }

  const hasBlocker = findings.some((item) => item.severity === "blocker")
  const hasWarning = findings.some((item) => item.severity === "warning")
  const status = hasBlocker ? "quarantine" : (hasWarning ? "review" : "score")

  const decision = {
    id: challenge.id,
    title: challenge.title,
    status,
    holdoutArtifacts: holdoutArtifacts.length,
    releaseEvents: releaseEvents.length,
    solverWorkspaceGrants: solverWorkspaceGrants.length,
    publicBaselines: publicBaselines.length,
    submissions: submissions.length,
    findings,
    scoringActions: buildScoringActions(status, findings),
  }

  // The digest covers the decision without the digest field itself.
  return {
    ...decision,
    auditDigest: digestFor(decision),
  }
}

/**
 * Derives the follow-up actions for a decision.
 *
 * @param {string} status - "score", "review" or "quarantine".
 * @param {Array<object>} findings - Findings produced for the challenge.
 * @returns {Array<object>} One action per distinct action code; a quarantine
 *   additionally ends with a leaderboard freeze.
 */
function buildScoringActions(status, findings) {
  if (status === "score") {
    return [action("RELEASE_TO_SCORING", "scoring-admin", "Proceed with sealed evaluator scoring.")]
  }

  const actions = []
  for (const item of findings) {
    if (item.severity === "blocker") {
      actions.push(action(`QUARANTINE_${item.code}`, "challenge-ops", item.message))
    } else {
      actions.push(action(`REVIEW_${item.code}`, "sponsor-reviewer", item.message))
    }
  }

  if (status === "quarantine") {
    actions.push(action("FREEZE_LEADERBOARD", "challenge-ops", "Freeze leaderboard publication until leakage review is resolved."))
  }

  // Repeated findings of the same kind collapse into a single action.
  return [...new Map(actions.map((item) => [item.code, item])).values()]
}

/**
 * Evaluates every challenge and assembles the review packet.
 *
 * @param {object} input
 * @param {Array<object>} input.challenges - Challenges to evaluate.
 * @param {object} input.leakagePolicy - Policy applied to every challenge.
 * @param {string} [input.generatedAt] - Packet timestamp. Defaults to the
 *   fixed value used for the deterministic demo output; it is not part of
 *   the audit digest.
 * @returns {object} Packet with `generatedAt`, `policy`, `summary`,
 *   `decisions` and `audit`.
 */
function evaluateChallengePortfolio({ challenges, leakagePolicy, generatedAt = "2026-05-21T14:25:00.000Z" }) {
  const decisions = challenges.map((challenge) => evaluateChallenge(challenge, leakagePolicy))
  const countWithStatus = (status) => decisions.filter((decision) => decision.status === status).length
  const summary = {
    totalChallenges: decisions.length,
    score: countWithStatus("score"),
    review: countWithStatus("review"),
    quarantine: countWithStatus("quarantine"),
    scoringActions: decisions.reduce((sum, decision) => sum + decision.scoringActions.length, 0),
  }

  return {
    generatedAt,
    policy: leakagePolicy,
    summary,
    decisions,
    audit: {
      source: "synthetic-challenge-benchmark-leakage-review",
      digest: digestFor({ summary, decisions }),
    },
  }
}
+ } + ], + "auditDigest": "db3398dc5bbe2a1fa4442274984e4fe7822f939e6c5302683486243143ff830e" + }, + { + "id": "CH-LEAK-002", + "title": "Climate forecast benchmark with accidental public release", + "status": "quarantine", + "holdoutArtifacts": 1, + "releaseEvents": 2, + "solverWorkspaceGrants": 1, + "publicBaselines": 1, + "submissions": 1, + "findings": [ + { + "code": "HOLDOUT_RELEASED_TO_PUBLIC_CHANNEL", + "severity": "blocker", + "message": "Holdout artifact was released through a non-scoring channel.", + "detail": { + "artifactId": "HOLD-CF-7", + "channel": "documentation", + "at": "2026-06-03T14:00:00Z", + "actor": "sponsor-editor" + } + }, + { + "code": "HOLDOUT_RELEASED_BEFORE_DEADLINE", + "severity": "blocker", + "message": "Holdout release happened before the submission deadline.", + "detail": { + "artifactId": "HOLD-CF-7", + "channel": "documentation", + "at": "2026-06-03T14:00:00Z", + "submissionDeadline": "2026-06-05T23:59:00Z" + } + }, + { + "code": "SOLVER_WORKSPACE_HOLDOUT_GRANT", + "severity": "blocker", + "message": "Solver workspace received holdout paths before the deadline.", + "detail": { + "solverId": "team-beta", + "leakedPaths": [ + "sealed/climate/region-7-holdout.parquet" + ], + "grantedAt": "2026-06-02T08:00:00Z" + } + }, + { + "code": "PUBLIC_BASELINE_HASH_OVERLAP", + "severity": "blocker", + "message": "A public baseline artifact matches the sealed holdout hash.", + "detail": { + "baselineId": "BASE-CF-2", + "channel": "public-baseline", + "hash": "ff00aa7711cc884422dd" + } + }, + { + "code": "UNAPPROVED_SPONSOR_EXCEPTION", + "severity": "warning", + "message": "Sponsor exception was logged but not approved before scoring.", + "detail": { + "exceptionId": "EXC-44", + "artifactId": "HOLD-CF-7", + "reason": "Sponsor preview request after freeze" + } + }, + { + "code": "SUBMISSION_CONTAINS_HOLDOUT_HASH", + "severity": "blocker", + "message": "Submission package contains an artifact hash matching the sealed holdout.", + "detail": { + 
"solverId": "team-beta", + "submittedAt": "2026-06-05T22:30:00Z", + "overlappingHashes": [ + "ff00aa7711cc884422dd" + ] + } + } + ], + "scoringActions": [ + { + "code": "QUARANTINE_HOLDOUT_RELEASED_TO_PUBLIC_CHANNEL", + "owner": "challenge-ops", + "message": "Holdout artifact was released through a non-scoring channel." + }, + { + "code": "QUARANTINE_HOLDOUT_RELEASED_BEFORE_DEADLINE", + "owner": "challenge-ops", + "message": "Holdout release happened before the submission deadline." + }, + { + "code": "QUARANTINE_SOLVER_WORKSPACE_HOLDOUT_GRANT", + "owner": "challenge-ops", + "message": "Solver workspace received holdout paths before the deadline." + }, + { + "code": "QUARANTINE_PUBLIC_BASELINE_HASH_OVERLAP", + "owner": "challenge-ops", + "message": "A public baseline artifact matches the sealed holdout hash." + }, + { + "code": "REVIEW_UNAPPROVED_SPONSOR_EXCEPTION", + "owner": "sponsor-reviewer", + "message": "Sponsor exception was logged but not approved before scoring." + }, + { + "code": "QUARANTINE_SUBMISSION_CONTAINS_HOLDOUT_HASH", + "owner": "challenge-ops", + "message": "Submission package contains an artifact hash matching the sealed holdout." + }, + { + "code": "FREEZE_LEADERBOARD", + "owner": "challenge-ops", + "message": "Freeze leaderboard publication until leakage review is resolved." 
+ } + ], + "auditDigest": "b888af59d5efac733046439117172c204d9d596cb7cb10ea0794b0ed293dda70" + }, + { + "id": "CH-REVIEW-003", + "title": "Materials challenge with late sponsor exception", + "status": "review", + "holdoutArtifacts": 1, + "releaseEvents": 1, + "solverWorkspaceGrants": 1, + "publicBaselines": 1, + "submissions": 1, + "findings": [ + { + "code": "LATE_SPONSOR_EXCEPTION_REVIEW", + "severity": "warning", + "message": "Approved sponsor exception landed inside the benchmark freeze window.", + "detail": { + "exceptionId": "EXC-80", + "artifactId": "HOLD-MAT-3", + "approvedAt": "2026-06-30T18:00:00Z", + "freezeWindowHours": 48 + } + } + ], + "scoringActions": [ + { + "code": "REVIEW_LATE_SPONSOR_EXCEPTION_REVIEW", + "owner": "sponsor-reviewer", + "message": "Approved sponsor exception landed inside the benchmark freeze window." + } + ], + "auditDigest": "8d06de6a09c1c7703b7d7467031f5fa9fb75a8562fec7dbd973da040e7762c08" + }, + { + "id": "CH-HOLD-004", + "title": "Chemistry optimization challenge missing holdout hash", + "status": "quarantine", + "holdoutArtifacts": 1, + "releaseEvents": 1, + "solverWorkspaceGrants": 0, + "publicBaselines": 0, + "submissions": 0, + "findings": [ + { + "code": "HOLDOUT_HASH_MISSING", + "severity": "blocker", + "message": "Holdout artifact needs a stable hash before scoring can start.", + "detail": { + "artifactId": "HOLD-CHEM-9", + "path": "sealed/chemistry/holdout.sdf" + } + } + ], + "scoringActions": [ + { + "code": "QUARANTINE_HOLDOUT_HASH_MISSING", + "owner": "challenge-ops", + "message": "Holdout artifact needs a stable hash before scoring can start." + }, + { + "code": "FREEZE_LEADERBOARD", + "owner": "challenge-ops", + "message": "Freeze leaderboard publication until leakage review is resolved." 
+ } + ], + "auditDigest": "c72d2b706f728e9eae2e3e78183c65a2722a4b6e446614095d797b8de1ce62ba" + } + ], + "audit": { + "source": "synthetic-challenge-benchmark-leakage-review", + "digest": "9c448b358c70c46e32658642098d06c1a80886a70b95771c253ad9185f563723" + } +} diff --git a/challenge-benchmark-holdout-leakage-guard/reports/benchmark-leakage-report.md b/challenge-benchmark-holdout-leakage-guard/reports/benchmark-leakage-report.md new file mode 100644 index 0000000..c335d4e --- /dev/null +++ b/challenge-benchmark-holdout-leakage-guard/reports/benchmark-leakage-report.md @@ -0,0 +1,42 @@ +# Challenge Benchmark Holdout Leakage Guard Report + +Generated challenges: 4 +Ready to score: 1 +Needs review: 1 +Quarantined: 2 +Scoring actions: 11 +Audit digest: `9c448b358c70c46e32658642098d06c1a80886a70b95771c253ad9185f563723` + +## Challenge Decisions + +### CH-SAFE-001: Protein stability private holdout scoring +- Status: score +- Holdout artifacts: 1 +- Release events: 1 +- Solver grants: 1 +- Findings: none +- First action: Proceed with sealed evaluator scoring. + +### CH-LEAK-002: Climate forecast benchmark with accidental public release +- Status: quarantine +- Holdout artifacts: 1 +- Release events: 2 +- Solver grants: 1 +- Findings: HOLDOUT_RELEASED_TO_PUBLIC_CHANNEL, HOLDOUT_RELEASED_BEFORE_DEADLINE, SOLVER_WORKSPACE_HOLDOUT_GRANT, PUBLIC_BASELINE_HASH_OVERLAP, UNAPPROVED_SPONSOR_EXCEPTION, SUBMISSION_CONTAINS_HOLDOUT_HASH +- First action: Holdout artifact was released through a non-scoring channel. + +### CH-REVIEW-003: Materials challenge with late sponsor exception +- Status: review +- Holdout artifacts: 1 +- Release events: 1 +- Solver grants: 1 +- Findings: LATE_SPONSOR_EXCEPTION_REVIEW +- First action: Approved sponsor exception landed inside the benchmark freeze window. 
+ +### CH-HOLD-004: Chemistry optimization challenge missing holdout hash +- Status: quarantine +- Holdout artifacts: 1 +- Release events: 1 +- Solver grants: 0 +- Findings: HOLDOUT_HASH_MISSING +- First action: Holdout artifact needs a stable hash before scoring can start. diff --git a/challenge-benchmark-holdout-leakage-guard/reports/demo.mp4 b/challenge-benchmark-holdout-leakage-guard/reports/demo.mp4 new file mode 100644 index 0000000..b890f2c Binary files /dev/null and b/challenge-benchmark-holdout-leakage-guard/reports/demo.mp4 differ diff --git a/challenge-benchmark-holdout-leakage-guard/reports/summary.svg b/challenge-benchmark-holdout-leakage-guard/reports/summary.svg new file mode 100644 index 0000000..4d70e7c --- /dev/null +++ b/challenge-benchmark-holdout-leakage-guard/reports/summary.svg @@ -0,0 +1,16 @@ + + + Challenge Benchmark Holdout Leakage Guard + Synthetic scoring readiness packet for SCIBASE issue #18 + + 1 + score + + 1 + review + + 2 + quarantine + Checks: holdout hashes, release channels, solver grants, public baselines, exceptions, submission hashes. + Digest 9c448b358c70c46e32658642... + diff --git a/challenge-benchmark-holdout-leakage-guard/requirements-map.md b/challenge-benchmark-holdout-leakage-guard/requirements-map.md new file mode 100644 index 0000000..55b15fa --- /dev/null +++ b/challenge-benchmark-holdout-leakage-guard/requirements-map.md @@ -0,0 +1,17 @@ +# Requirements Map + +| Issue #18 capability | Coverage in this module | +| --- | --- | +| Challenge posting portal | Uses declared deadlines, release channels, and sponsor exception logs. | +| Submission engine | Checks solver workspaces and submission artifact hashes before scoring. | +| Version control and audit logs | Produces deterministic audit digests and JSON/Markdown review packets. | +| Arbitration and reward distribution | Freezes scoring and leaderboard publication when leakage blockers exist. 
| +| Public/private challenge support | Separates sealed scoring channels from public baseline/documentation channels. | +| Evaluation criteria and scoring rubric | Adds a pre-scoring fairness gate for benchmark integrity. | + +## Non-Overlap Notes + +This is a holdout benchmark leakage guard. It avoids duplicating existing #18 +work around payout eligibility, award transparency, rubric readiness, challenge +milestones, sponsor data-room access, clarification freeze, deliverable +acceptance, IP redaction, reviewer consensus, and reproducibility environments. diff --git a/challenge-benchmark-holdout-leakage-guard/sample-data.js b/challenge-benchmark-holdout-leakage-guard/sample-data.js new file mode 100644 index 0000000..09d8772 --- /dev/null +++ b/challenge-benchmark-holdout-leakage-guard/sample-data.js @@ -0,0 +1,101 @@ +const leakagePolicy = { + freezeWindowHours: 48, + minHoldoutHashLength: 16, + allowedReleaseChannels: ["scoring-service", "sealed-evaluator"], + publicChannels: ["public-baseline", "documentation", "discussion-thread"], +} + +const challenges = [ + { + id: "CH-SAFE-001", + title: "Protein stability private holdout scoring", + submissionDeadline: "2026-05-30T23:59:00Z", + scoringStartedAt: "2026-05-31T02:00:00Z", + holdoutArtifacts: [ + { id: "HOLD-PS-1", hash: "0a3f9c2e8bd4412397ea", path: "sealed/protein-stability/holdout-v2.csv" }, + ], + releaseEvents: [ + { artifactId: "HOLD-PS-1", channel: "sealed-evaluator", at: "2026-05-31T01:20:00Z", actor: "scoring-admin" }, + ], + solverWorkspaceGrants: [ + { solverId: "team-alpha", paths: ["training/protein-stability/train.csv"], grantedAt: "2026-05-18T10:00:00Z" }, + ], + publicBaselines: [ + { id: "BASE-PS-1", hash: "84c7131ef918aa0b11f1", channel: "public-baseline" }, + ], + sponsorExceptions: [], + submissions: [ + { solverId: "team-alpha", submittedAt: "2026-05-30T20:12:00Z", artifactHashes: ["84c7131ef918aa0b11f1"] }, + ], + }, + { + id: "CH-LEAK-002", + title: "Climate forecast benchmark with 
accidental public release", + submissionDeadline: "2026-06-05T23:59:00Z", + scoringStartedAt: "2026-06-06T03:00:00Z", + holdoutArtifacts: [ + { id: "HOLD-CF-7", hash: "ff00aa7711cc884422dd", path: "sealed/climate/region-7-holdout.parquet" }, + ], + releaseEvents: [ + { artifactId: "HOLD-CF-7", channel: "documentation", at: "2026-06-03T14:00:00Z", actor: "sponsor-editor" }, + { artifactId: "HOLD-CF-7", channel: "sealed-evaluator", at: "2026-06-06T01:10:00Z", actor: "scoring-admin" }, + ], + solverWorkspaceGrants: [ + { solverId: "team-beta", paths: ["sealed/climate/region-7-holdout.parquet"], grantedAt: "2026-06-02T08:00:00Z" }, + ], + publicBaselines: [ + { id: "BASE-CF-2", hash: "ff00aa7711cc884422dd", channel: "public-baseline" }, + ], + sponsorExceptions: [ + { id: "EXC-44", artifactId: "HOLD-CF-7", approved: false, reason: "Sponsor preview request after freeze" }, + ], + submissions: [ + { solverId: "team-beta", submittedAt: "2026-06-05T22:30:00Z", artifactHashes: ["ff00aa7711cc884422dd"] }, + ], + }, + { + id: "CH-REVIEW-003", + title: "Materials challenge with late sponsor exception", + submissionDeadline: "2026-07-01T23:59:00Z", + scoringStartedAt: "2026-07-02T04:00:00Z", + holdoutArtifacts: [ + { id: "HOLD-MAT-3", hash: "9c73b1d4a4e74cb39d10", path: "sealed/materials/final-holdout.jsonl" }, + ], + releaseEvents: [ + { artifactId: "HOLD-MAT-3", channel: "sealed-evaluator", at: "2026-07-02T02:00:00Z", actor: "scoring-admin" }, + ], + solverWorkspaceGrants: [ + { solverId: "team-gamma", paths: ["training/materials/train.jsonl"], grantedAt: "2026-06-20T12:00:00Z" }, + ], + publicBaselines: [ + { id: "BASE-MAT-1", hash: "0011aabbccddeeff0022", channel: "public-baseline" }, + ], + sponsorExceptions: [ + { id: "EXC-80", artifactId: "HOLD-MAT-3", approved: true, approvedAt: "2026-06-30T18:00:00Z", reason: "Independent auditor inspected sealed hash manifest" }, + ], + submissions: [ + { solverId: "team-gamma", submittedAt: "2026-07-01T21:00:00Z", artifactHashes: 
["0011aabbccddeeff0022"] }, + ], + }, + { + id: "CH-HOLD-004", + title: "Chemistry optimization challenge missing holdout hash", + submissionDeadline: "2026-07-15T23:59:00Z", + scoringStartedAt: "2026-07-16T03:00:00Z", + holdoutArtifacts: [ + { id: "HOLD-CHEM-9", hash: "", path: "sealed/chemistry/holdout.sdf" }, + ], + releaseEvents: [ + { artifactId: "HOLD-CHEM-9", channel: "sealed-evaluator", at: "2026-07-16T01:00:00Z", actor: "scoring-admin" }, + ], + solverWorkspaceGrants: [], + publicBaselines: [], + sponsorExceptions: [], + submissions: [], + }, +] + +module.exports = { + leakagePolicy, + challenges, +} diff --git a/challenge-benchmark-holdout-leakage-guard/test.js b/challenge-benchmark-holdout-leakage-guard/test.js new file mode 100644 index 0000000..0dcae9f --- /dev/null +++ b/challenge-benchmark-holdout-leakage-guard/test.js @@ -0,0 +1,34 @@ +const assert = require("node:assert/strict") +const { evaluateChallengePortfolio } = require("./index") +const { leakagePolicy, challenges } = require("./sample-data") + +const packet = evaluateChallengePortfolio({ challenges, leakagePolicy }) + +assert.equal(packet.summary.totalChallenges, 4) +assert.equal(packet.summary.score, 1) +assert.equal(packet.summary.review, 1) +assert.equal(packet.summary.quarantine, 2) +assert.match(packet.audit.digest, /^[a-f0-9]{64}$/) + +const safe = packet.decisions.find((decision) => decision.id === "CH-SAFE-001") +assert.equal(safe.status, "score") +assert.equal(safe.findings.length, 0) +assert.equal(safe.scoringActions[0].code, "RELEASE_TO_SCORING") + +const leaked = packet.decisions.find((decision) => decision.id === "CH-LEAK-002") +assert.equal(leaked.status, "quarantine") +assert.ok(leaked.findings.some((finding) => finding.code === "HOLDOUT_RELEASED_TO_PUBLIC_CHANNEL")) +assert.ok(leaked.findings.some((finding) => finding.code === "SOLVER_WORKSPACE_HOLDOUT_GRANT")) +assert.ok(leaked.findings.some((finding) => finding.code === "PUBLIC_BASELINE_HASH_OVERLAP")) 
+assert.ok(leaked.findings.some((finding) => finding.code === "SUBMISSION_CONTAINS_HOLDOUT_HASH")) +assert.ok(leaked.scoringActions.some((action) => action.code === "FREEZE_LEADERBOARD")) + +const review = packet.decisions.find((decision) => decision.id === "CH-REVIEW-003") +assert.equal(review.status, "review") +assert.ok(review.findings.some((finding) => finding.code === "LATE_SPONSOR_EXCEPTION_REVIEW")) + +const missingHash = packet.decisions.find((decision) => decision.id === "CH-HOLD-004") +assert.equal(missingHash.status, "quarantine") +assert.ok(missingHash.findings.some((finding) => finding.code === "HOLDOUT_HASH_MISSING")) + +console.log("challenge-benchmark-holdout-leakage-guard tests passed")