diff --git a/uncertainty-calibration-assistant/README.md b/uncertainty-calibration-assistant/README.md new file mode 100644 index 0000000..79c920f --- /dev/null +++ b/uncertainty-calibration-assistant/README.md @@ -0,0 +1,24 @@ +# Uncertainty Calibration Assistant + +This module adds a focused uncertainty-calibration assistant for the AI-Powered Research Assistant Suite. + +It reviews manuscript claims before submission and checks whether the wording, declared confidence, statistical evidence, replication status, and limitation disclosures agree. The assistant produces: + +- calibrated claim wording +- peer-review findings for overclaiming and missing uncertainty evidence +- reproducibility confidence +- research-gap opportunities when claims are low-confidence or replication has failed +- deterministic reviewer packets and audit digests + +## Run + +```sh +node uncertainty-calibration-assistant/test.js +node uncertainty-calibration-assistant/demo.js +``` + +The demo writes JSON and Markdown reviewer artifacts to `uncertainty-calibration-assistant/reports/`. + +## Review Surface + +The implementation is dependency-free, uses synthetic data only, and does not call external APIs or read credentials. diff --git a/uncertainty-calibration-assistant/acceptance-notes.md b/uncertainty-calibration-assistant/acceptance-notes.md new file mode 100644 index 0000000..2cd6220 --- /dev/null +++ b/uncertainty-calibration-assistant/acceptance-notes.md @@ -0,0 +1,26 @@ +# Acceptance Notes + +## What Changed + +Added `uncertainty-calibration-assistant/`, a self-contained pre-submission assistant that calibrates scientific claims against evidence strength, statistical uncertainty, replication status, and limitation disclosures. 
// Folder that receives the generated reviewer artifacts. It is created before
// anything else so both report writes below have a destination.
const reportsDir = path.join(__dirname, "reports");
fs.mkdirSync(reportsDir, { recursive: true });

// Synthetic claim: absolute wording, high declared strength, a small sample,
// no interval, and a non-deterministic rerun.
const overstatedClaim = {
  id: "claim-a",
  text: "The intervention definitively eliminates cognitive fatigue in post-viral patients.",
  strength: "high",
  limitations: [],
  evidence: {
    primaryData: true,
    statisticalTest: "mixed-effects model",
    effectSize: 0.26,
    pValue: 0.04,
    sampleSize: 22,
    citations: ["doi:10.5555/pilot-fatigue"],
    replication: { status: "non-deterministic", runId: "rerun-17" },
  },
};

// Synthetic claim: hedged wording, a disclosed limitation, an interval, two
// citations, and a passed rerun.
const hedgedClaim = {
  id: "claim-b",
  text: "The wearable sleep signal suggests a reusable early-warning marker.",
  strength: "moderate",
  limitations: ["external cohort not yet enrolled"],
  evidence: {
    primaryData: true,
    statisticalTest: "bootstrap stability",
    effectSize: 0.39,
    confidenceInterval: "95% CI 0.14-0.58",
    sampleSize: 72,
    citations: ["doi:10.5555/sleep-marker", "doi:10.5555/wearable-review"],
    replication: { status: "passed", runId: "rerun-18" },
  },
};

const samplePacket = {
  manuscriptId: "ms-cognitive-fatigue-042",
  domain: "clinical neuroscience",
  claims: [overstatedClaim, hedgedClaim],
};

const result = evaluateUncertaintyCalibration(samplePacket);
const jsonFile = path.join(reportsDir, "uncertainty-calibration-report.json");
const markdownFile = path.join(reportsDir, "uncertainty-calibration-report.md");

// The JSON artifact is written first; the Markdown summary is assembled afterwards.
fs.writeFileSync(jsonFile, JSON.stringify(result, null, 2));

const markdownLines = [
  "# Uncertainty Calibration Assistant Demo",
  "",
  `Decision: ${result.decision}`,
  `Reproducibility confidence: ${result.reproducibilityConfidence}`,
  `Audit digest: ${result.auditDigest}`,
  "",
  "## Calibrated Claims",
  "",
];
for (const entry of result.calibratedClaims) {
  markdownLines.push(`- ${entry.id}: ${entry.calibrated} (${entry.confidence})`);
}
markdownLines.push("", "## Findings", "");
for (const issue of result.findings) {
  markdownLines.push(`- ${issue.severity}: ${issue.code} - ${issue.message}`);
}
markdownLines.push("", "## Research Opportunities", "");
for (const gap of result.researchOpportunities) {
  markdownLines.push(`- ${gap.priority}: ${gap.claimId} - ${gap.opportunity}`);
}
// The trailing empty entry makes the joined document end with a newline.
markdownLines.push("");

fs.writeFileSync(markdownFile, markdownLines.join("\n"));

console.log(`Wrote ${jsonFile}`);
console.log(`Wrote ${markdownFile}`);
console.log(`${result.decision}: ${result.findings.length} finding(s), ${result.auditDigest}`);
/**
 * Coerce a value to an array.
 *
 * @param {*} value - Anything; falsy values are treated as "nothing supplied".
 * @returns {Array} `[]` for falsy input, the input itself when it is already
 *   an array, otherwise a single-element array wrapping the input.
 */
function asArray(value) {
  if (!value) return [];
  return Array.isArray(value) ? value : [value];
}

/**
 * Serialize a value to JSON text with object keys sorted, so structurally
 * equal values always produce the same string.
 *
 * Values JSON cannot represent (undefined, functions, symbols) are handled the
 * way `JSON.stringify` handles them: object members are omitted and array
 * slots become `null`. That keeps a digest of the in-memory packet equal to a
 * digest recomputed from the JSON artifact written to disk.
 *
 * @param {*} value - Value to serialize.
 * @returns {string|undefined} Canonical JSON text, or `undefined` when the
 *   top-level value itself is not representable in JSON.
 */
function stableStringify(value) {
  if (Array.isArray(value)) {
    const items = value.map((item) => {
      const rendered = stableStringify(item);
      return rendered === undefined ? "null" : rendered;
    });
    return `[${items.join(",")}]`;
  }
  if (value && typeof value === "object") {
    const members = [];
    for (const key of Object.keys(value).sort()) {
      const rendered = stableStringify(value[key]);
      if (rendered !== undefined) members.push(`${JSON.stringify(key)}:${rendered}`);
    }
    return `{${members.join(",")}}`;
  }
  return JSON.stringify(value);
}

/**
 * Hash a value's canonical JSON form.
 *
 * @param {*} value - JSON-representable value.
 * @returns {string} Hex-encoded SHA-256 of `stableStringify(value)`.
 */
function digest(value) {
  return crypto.createHash("sha256").update(stableStringify(value)).digest("hex");
}

/**
 * @param {*} value - Any value; falsy values become the empty string.
 * @returns {string} Trimmed, lower-cased string form of the value.
 */
function normalize(value) {
  return String(value || "").trim().toLowerCase();
}

/**
 * Detect high-certainty wording in claim text.
 *
 * Adverb forms ("definitively", "conclusively") are matched as well as the
 * adjectives, so detection covers every form that `calibrateWording` softens.
 *
 * @param {string} [text] - Claim statement.
 * @returns {boolean} True when the text contains an absolute term.
 */
function hasAbsoluteLanguage(text) {
  return /\b(proves?|definitive(?:ly)?|conclusive(?:ly)?|always|never|guarantees?|eliminates?|fully explains?)\b/i.test(
    text || "",
  );
}

/**
 * @param {string} severity - "blocker", "warning" or "info".
 * @returns {number} Ordering weight (3, 2, 1), or 0 for an unknown severity.
 */
function severityRank(severity) {
  return { blocker: 3, warning: 2, info: 1 }[severity] || 0;
}

/**
 * Append one finding record to `findings` (mutates the array).
 *
 * @param {Array<object>} findings - Collection to append to.
 * @param {string} severity - "blocker", "warning" or "info".
 * @param {string} code - Stable machine-readable finding code.
 * @param {string} claimId - Claim the finding refers to, or "manuscript".
 * @param {string} message - Reviewer-facing description.
 * @param {string} remediation - Suggested fix.
 */
function addFinding(findings, severity, code, claimId, message, remediation) {
  findings.push({ severity, code, claimId, message, remediation });
}

/**
 * Read the normalized replication status from an evidence record.
 *
 * @param {object} [evidence] - Evidence record; may be missing.
 * @returns {string} Lower-cased status, or "" when no status is attached.
 */
function replicationStatus(evidence) {
  return normalize(evidence && evidence.replication && evidence.replication.status);
}

/** True when a value was actually supplied (neither undefined nor null). */
function isPresent(value) {
  return value !== undefined && value !== null;
}

/**
 * Score how well an evidence record supports a claim.
 *
 * Points: primary data +2, statistical test +1, confidence interval +1,
 * effect size +1, two or more citations +1, sample size of at least 30 +1,
 * replication passed +2 / failed -3 / non-deterministic -2.
 *
 * @param {object} [evidence] - Evidence record; missing is treated as empty.
 * @returns {number} Additive support score (can be negative).
 */
function evidenceScore(evidence) {
  const record = evidence || {};
  const status = replicationStatus(record);
  let score = 0;
  if (record.primaryData) score += 2;
  if (record.statisticalTest) score += 1;
  if (record.confidenceInterval) score += 1;
  if (isPresent(record.effectSize)) score += 1;
  if (asArray(record.citations).length >= 2) score += 1;
  if (status === "passed") score += 2;
  if (status === "failed") score -= 3;
  if (status === "non-deterministic") score -= 2;
  if (record.sampleSize && record.sampleSize >= 30) score += 1;
  return score;
}

// Rewrites for evidence-strength terms. Each keeps the grammatical form of the
// word it replaces (verb inflection, adjective vs. adverb) so the calibrated
// sentence still reads correctly.
const STRENGTH_SOFTENERS = [
  [/\bprove(s?)\b/gi, (match, suffix) => `suggest${suffix.toLowerCase()}`],
  [/\bdefinitive(ly)?\b/gi, (match, adverb) => (adverb ? "preliminarily" : "preliminary")],
  [/\bconclusive(ly)?\b/gi, (match, adverb) => (adverb ? "provisionally" : "provisional")],
  [/\beliminate(s?)\b/gi, (match, suffix) => `reduce${suffix.toLowerCase()}`],
  [/\bfully explain(s?)\b/gi, (match, suffix) => `partly explain${suffix.toLowerCase()}`],
];

// Rewrites for frequency/guarantee terms. "never" maps to "rarely" so the
// polarity of the claim is preserved; "may support" works after singular and
// plural subjects alike because the modal takes the base verb form.
const FREQUENCY_SOFTENERS = [
  [/\balways\b/gi, () => "often"],
  [/\bnever\b/gi, () => "rarely"],
  [/\bguarantees?\b/gi, () => "may support"],
];

/** Apply each [pattern, replacer] pair to the text in order. */
function applySofteners(text, softeners) {
  return softeners.reduce((current, [pattern, replacer]) => current.replace(pattern, replacer), text);
}

/**
 * Produce calibrated wording for a claim based on its most severe finding.
 *
 * - blocker: every absolute term is softened.
 * - warning: only frequency/guarantee terms are softened.
 * - otherwise: the text is returned unchanged.
 *
 * @param {object} claim - Claim record; only `text` is read.
 * @param {Array<object>} findings - Findings raised for this claim.
 * @returns {string} Calibrated claim text ("" when the claim has no text).
 */
function calibrateWording(claim, findings) {
  const text = String(claim.text || "");
  const maxSeverity = findings.reduce(
    (highest, finding) => (severityRank(finding.severity) > severityRank(highest) ? finding.severity : highest),
    "info",
  );

  if (maxSeverity === "blocker") {
    return applySofteners(text, [...STRENGTH_SOFTENERS, ...FREQUENCY_SOFTENERS]);
  }

  if (maxSeverity === "warning") {
    return applySofteners(text, FREQUENCY_SOFTENERS);
  }

  return text;
}

/**
 * Collapse per-claim replication statuses into one confidence signal.
 *
 * Any failed or non-deterministic rerun yields "low". Otherwise the signal is
 * "high" when passed reruns cover all but at most one claim (and at least
 * one), else "moderate".
 *
 * @param {string[]} statuses - One normalized status per claim.
 * @returns {"low"|"moderate"|"high"}
 */
function reproducibilityConfidenceFor(statuses) {
  const count = (wanted) => statuses.filter((status) => status === wanted).length;
  if (count("failed") > 0 || count("non-deterministic") > 0) return "low";
  return count("passed") >= Math.max(1, statuses.length - 1) ? "high" : "moderate";
}

/**
 * Review a manuscript packet and calibrate each claim against its evidence.
 *
 * @param {object} [packet] - `{ manuscriptId, domain, claims }`. Each claim may
 *   carry `id`, `text`, `strength`/`confidence`, `limitations` and `evidence`.
 *   Claim entries that are not objects are reviewed as empty claims and so
 *   surface a CLAIM_IDENTITY_MISSING blocker instead of throwing.
 * @returns {object} Reviewer packet: `decision`, `manuscriptId`, `domain`,
 *   `counts`, `reproducibilityConfidence`, `calibratedClaims`, `findings`,
 *   `researchOpportunities`, plus `auditDigest` (SHA-256 of all other fields).
 */
function evaluateUncertaintyCalibration(packet) {
  const manuscript = packet || {};
  const claims = asArray(manuscript.claims).map((entry) => (entry && typeof entry === "object" ? entry : {}));
  const findings = [];
  const calibratedClaims = [];

  if (!manuscript.manuscriptId || !manuscript.domain) {
    addFinding(
      findings,
      "blocker",
      "MANUSCRIPT_CONTEXT_MISSING",
      "manuscript",
      "Manuscript id and domain are required before adaptive review calibration can run.",
      "Attach manuscript identity and domain metadata so the assistant can select the correct review standard.",
    );
  }

  if (claims.length === 0) {
    addFinding(
      findings,
      "blocker",
      "NO_CLAIMS_TO_CALIBRATE",
      "manuscript",
      "No manuscript claims were provided for calibration.",
      "Extract claim statements from the draft before running the pre-submission assistant.",
    );
  }

  for (const claim of claims) {
    const claimFindings = [];
    const score = evidenceScore(claim.evidence);
    const evidence = claim.evidence || {};
    const status = replicationStatus(evidence);
    const requestedConfidence = normalize(claim.confidence || claim.strength);

    if (!claim.id || !claim.text) {
      addFinding(
        claimFindings,
        "blocker",
        "CLAIM_IDENTITY_MISSING",
        claim.id || "unknown",
        "Claim is missing an id or statement text.",
        "Store a stable claim id and the exact manuscript statement before generating review guidance.",
      );
    }

    if (hasAbsoluteLanguage(claim.text) && score < 5) {
      addFinding(
        claimFindings,
        "blocker",
        "ABSOLUTE_LANGUAGE_UNDER_SUPPORTED",
        claim.id,
        "The claim uses high-certainty language without enough evidence support.",
        "Replace absolute wording with calibrated language or attach stronger statistical, replication, and citation evidence.",
      );
    }

    if ((requestedConfidence === "high" || requestedConfidence === "definitive") && score < 5) {
      addFinding(
        claimFindings,
        "warning",
        "CONFIDENCE_EXCEEDS_EVIDENCE",
        claim.id,
        "Declared confidence is higher than the evidence packet supports.",
        "Lower the confidence rating or add primary data, confidence intervals, and replication evidence.",
      );
    }

    // A null statistic means "not reported", matching how evidenceScore treats it.
    if (!evidence.confidenceInterval && (isPresent(evidence.pValue) || isPresent(evidence.effectSize))) {
      addFinding(
        claimFindings,
        "warning",
        "UNCERTAINTY_INTERVAL_MISSING",
        claim.id,
        "Statistical evidence lacks a confidence or credible interval.",
        "Add interval estimates so reviewers can judge magnitude and uncertainty.",
      );
    }

    if (status === "failed") {
      addFinding(
        claimFindings,
        "blocker",
        "FAILED_REPLICATION_OVERCLAIM",
        claim.id,
        "A failed replication is attached but the claim is still presented as stable.",
        "Route the claim through rebuttal, limitation, or research-gap framing before submission.",
      );
    }

    // Non-deterministic reruns block submission just like failed ones.
    if (status === "non-deterministic") {
      addFinding(
        claimFindings,
        "blocker",
        "NON_DETERMINISTIC_REPLICATION",
        claim.id,
        "A non-deterministic rerun is attached, so the result cannot be treated as reproducible.",
        "Pin seeds, environments, and inputs and rerun until results are stable, or disclose the variance as a limitation.",
      );
    }

    if (asArray(claim.limitations).length === 0 && score < 5) {
      addFinding(
        claimFindings,
        "warning",
        "LIMITATION_DISCLOSURE_MISSING",
        claim.id,
        "Under-supported claim has no limitation disclosure.",
        "Add a limitations sentence that explains sample, method, or reproducibility uncertainty.",
      );
    }

    findings.push(...claimFindings);
    calibratedClaims.push({
      id: claim.id,
      original: claim.text,
      calibrated: calibrateWording(claim, claimFindings),
      evidenceScore: score,
      confidence: score >= 6 ? "high" : score >= 3 ? "moderate" : "low",
      findingCodes: claimFindings.map((finding) => finding.code),
    });
  }

  const blockers = findings.filter((finding) => finding.severity === "blocker");
  const warnings = findings.filter((finding) => finding.severity === "warning");
  const reproducibilitySignals = claims.map((claim) => replicationStatus(claim.evidence));

  const researchOpportunities = calibratedClaims
    .filter((claim) => claim.confidence === "low" || claim.findingCodes.includes("FAILED_REPLICATION_OVERCLAIM"))
    .map((claim) => ({
      claimId: claim.id,
      opportunity: "Run targeted replication or collect stronger primary evidence before making a high-certainty statement.",
      priority: claim.findingCodes.includes("FAILED_REPLICATION_OVERCLAIM") ? "high" : "medium",
    }));

  let decision = "ready-for-review";
  if (blockers.length > 0) {
    decision = "revise-before-submission";
  } else if (warnings.length > 0) {
    decision = "calibrate-language";
  }

  const reviewPacket = {
    decision,
    manuscriptId: manuscript.manuscriptId,
    domain: manuscript.domain,
    counts: {
      blocker: blockers.length,
      warning: warnings.length,
      info: findings.filter((finding) => finding.severity === "info").length,
    },
    reproducibilityConfidence: reproducibilityConfidenceFor(reproducibilitySignals),
    calibratedClaims,
    findings,
    researchOpportunities,
  };

  // The digest covers every field of the packet except the digest itself.
  return {
    ...reviewPacket,
    auditDigest: digest(reviewPacket),
  };
}
+ }, + { + "severity": "warning", + "code": "CONFIDENCE_EXCEEDS_EVIDENCE", + "claimId": "claim-a", + "message": "Declared confidence is higher than the evidence packet supports.", + "remediation": "Lower the confidence rating or add primary data, confidence intervals, and replication evidence." + }, + { + "severity": "warning", + "code": "UNCERTAINTY_INTERVAL_MISSING", + "claimId": "claim-a", + "message": "Statistical evidence lacks a confidence or credible interval.", + "remediation": "Add interval estimates so reviewers can judge magnitude and uncertainty." + }, + { + "severity": "warning", + "code": "LIMITATION_DISCLOSURE_MISSING", + "claimId": "claim-a", + "message": "Under-supported claim has no limitation disclosure.", + "remediation": "Add a limitations sentence that explains sample, method, or reproducibility uncertainty." + } + ], + "researchOpportunities": [ + { + "claimId": "claim-a", + "opportunity": "Run targeted replication or collect stronger primary evidence before making a high-certainty statement.", + "priority": "medium" + } + ], + "auditDigest": "4a55cd374d90ac0c0abcb090e4b05005ec320b06b5405bfd242fee43352edf90" +} \ No newline at end of file diff --git a/uncertainty-calibration-assistant/reports/uncertainty-calibration-report.md b/uncertainty-calibration-assistant/reports/uncertainty-calibration-report.md new file mode 100644 index 0000000..4ecf94b --- /dev/null +++ b/uncertainty-calibration-assistant/reports/uncertainty-calibration-report.md @@ -0,0 +1,21 @@ +# Uncertainty Calibration Assistant Demo + +Decision: revise-before-submission +Reproducibility confidence: high +Audit digest: 4a55cd374d90ac0c0abcb090e4b05005ec320b06b5405bfd242fee43352edf90 + +## Calibrated Claims + +- claim-a: The intervention definitively reduces cognitive fatigue in post-viral patients. (low) +- claim-b: The wearable sleep signal suggests a reusable early-warning marker. 
(high) + +## Findings + +- blocker: ABSOLUTE_LANGUAGE_UNDER_SUPPORTED - The claim uses high-certainty language without enough evidence support. +- warning: CONFIDENCE_EXCEEDS_EVIDENCE - Declared confidence is higher than the evidence packet supports. +- warning: UNCERTAINTY_INTERVAL_MISSING - Statistical evidence lacks a confidence or credible interval. +- warning: LIMITATION_DISCLOSURE_MISSING - Under-supported claim has no limitation disclosure. + +## Research Opportunities + +- medium: claim-a - Run targeted replication or collect stronger primary evidence before making a high-certainty statement. diff --git a/uncertainty-calibration-assistant/requirements-map.md b/uncertainty-calibration-assistant/requirements-map.md new file mode 100644 index 0000000..caea713 --- /dev/null +++ b/uncertainty-calibration-assistant/requirements-map.md @@ -0,0 +1,15 @@ +# Requirements Map + +Issue #16 asks for an AI assistant suite spanning pre-submission review, reproducibility checking, and research-gap discovery. + +| Issue requirement | Implementation coverage | +| --- | --- | +| Auto peer review reports | Emits reviewer findings for overclaiming, unsupported confidence, missing intervals, and missing limitations. | +| Claims vs evidence alignment | Scores evidence packets against claim wording and declared confidence. | +| Statistical or methodological red flags | Flags missing confidence intervals, weak sample evidence, and failed replication overclaiming. | +| Reproducibility checker | Converts attached replication statuses into a reproducibility confidence signal. | +| Flags discrepancies or non-determinism | Treats failed and non-deterministic reruns as claim-calibration blockers. | +| Research gap finder | Creates follow-up research opportunities for low-confidence or failed-replication claims. | +| Adaptive templates per domain | Requires manuscript domain metadata and keeps the reviewer packet domain-scoped. 
/**
 * Build a manuscript packet whose two claims are well supported and hedged,
 * so the assistant should raise no findings.
 *
 * @param {object} [overrides] - Top-level packet fields to replace (e.g. `claims`).
 * @returns {object} Packet suitable for `evaluateUncertaintyCalibration`.
 */
function readyPacket(overrides = {}) {
  return {
    manuscriptId: "ms-neuro-uncertainty-001",
    domain: "clinical neuroscience",
    claims: [
      {
        id: "claim-1",
        text: "The intervention reduces symptom burden in the pilot cohort.",
        strength: "moderate",
        limitations: ["single-site pilot cohort"],
        evidence: {
          primaryData: true,
          statisticalTest: "mixed-effects model",
          effectSize: 0.42,
          confidenceInterval: "95% CI 0.18-0.63",
          pValue: 0.01,
          sampleSize: 84,
          citations: ["doi:10.1000/a", "doi:10.1000/b"],
          replication: { status: "passed", runId: "rr-001" },
        },
      },
      {
        id: "claim-2",
        text: "The pipeline suggests a reusable biomarker panel.",
        strength: "moderate",
        limitations: ["external cohort pending"],
        evidence: {
          primaryData: true,
          statisticalTest: "bootstrap stability",
          effectSize: 0.31,
          confidenceInterval: "95% CI 0.12-0.49",
          sampleSize: 54,
          citations: ["doi:10.1000/c"],
          replication: { status: "passed", runId: "rr-002" },
        },
      },
    ],
    ...overrides,
  };
}

// A clean packet passes straight through with high reproducibility confidence.
function testReadyPacket() {
  const result = evaluateUncertaintyCalibration(readyPacket());
  assert.strictEqual(result.decision, "ready-for-review");
  assert.strictEqual(result.counts.blocker, 0);
  assert.strictEqual(result.reproducibilityConfidence, "high");
}

// Absolute wording on thin evidence is a blocker and the wording gets softened.
function testAbsoluteLanguageIsBlocked() {
  const result = evaluateUncertaintyCalibration(
    readyPacket({
      claims: [
        {
          id: "claim-risk",
          text: "This model definitively proves the intervention eliminates relapse.",
          strength: "high",
          limitations: [],
          evidence: {
            primaryData: true,
            pValue: 0.04,
            sampleSize: 18,
            citations: ["doi:10.1000/a"],
          },
        },
      ],
    }),
  );

  assert.strictEqual(result.decision, "revise-before-submission");
  assert.ok(result.findings.some((finding) => finding.code === "ABSOLUTE_LANGUAGE_UNDER_SUPPORTED"));
  assert.ok(result.calibratedClaims[0].calibrated.includes("preliminarily"));
}

// A failed rerun lowers reproducibility confidence and yields a high-priority research gap.
function testFailedReplicationCreatesResearchOpportunity() {
  const result = evaluateUncertaintyCalibration(
    readyPacket({
      claims: [
        {
          id: "claim-repro",
          text: "The benchmark proves the method always outperforms baseline.",
          strength: "definitive",
          limitations: [],
          evidence: {
            primaryData: true,
            statisticalTest: "paired t-test",
            effectSize: 0.21,
            confidenceInterval: "95% CI -0.02-0.44",
            sampleSize: 44,
            citations: ["doi:10.1000/a"],
            replication: { status: "failed", runId: "rr-failed" },
          },
        },
      ],
    }),
  );

  assert.strictEqual(result.reproducibilityConfidence, "low");
  assert.ok(result.findings.some((finding) => finding.code === "FAILED_REPLICATION_OVERCLAIM"));
  assert.strictEqual(result.researchOpportunities[0].priority, "high");
}

// Equal input gives an equal digest; different input must give a different one,
// otherwise a constant digest would satisfy this test.
function testDeterministicDigest() {
  const first = evaluateUncertaintyCalibration(readyPacket());
  const second = evaluateUncertaintyCalibration(readyPacket());
  assert.strictEqual(first.auditDigest, second.auditDigest);

  const otherDomain = evaluateUncertaintyCalibration(readyPacket({ domain: "oncology" }));
  assert.notStrictEqual(first.auditDigest, otherDomain.auditDigest);
}

// Passed replication with an interval must outscore a failed replication on a small sample.
function testEvidenceScoreRewardsReplication() {
  assert.ok(
    evidenceScore({
      primaryData: true,
      confidenceInterval: "95% CI 0.1-0.3",
      sampleSize: 50,
      replication: { status: "passed" },
    }) > evidenceScore({ primaryData: true, sampleSize: 10, replication: { status: "failed" } }),
  );
}

testReadyPacket();
testAbsoluteLanguageIsBlocked();
testFailedReplicationCreatesResearchOpportunity();
testDeterministicDigest();
testEvidenceScoreRewardsReplication();

console.log("uncertainty-calibration-assistant tests passed");