diff --git a/knowledge-graph-recommendation-diversity-guard/README.md b/knowledge-graph-recommendation-diversity-guard/README.md new file mode 100644 index 0000000..f31337f --- /dev/null +++ b/knowledge-graph-recommendation-diversity-guard/README.md @@ -0,0 +1,23 @@ +# Knowledge Graph Recommendation Diversity Guard + +This module is a focused Scientific Knowledge Graph Integration slice for issue #17. It validates discovery-mode recommendation sets before researchers see them, so graph suggestions do not collapse into one institution, funder, method, or stale citation cluster. + +The guard checks: + +- institution, funder, domain, and method concentration +- citation-herd dominance by one highly cited node +- stale evidence dominance +- missing recommendation rationale paths +- too few visible recommendations for exploratory discovery +- per-recommendation curator actions for weak graph suggestions + +It uses only Node.js built-ins and synthetic data. + +## Run + +```bash +node knowledge-graph-recommendation-diversity-guard/test.js +node knowledge-graph-recommendation-diversity-guard/demo.js +``` + +The demo writes reviewer artifacts to `knowledge-graph-recommendation-diversity-guard/reports/`. 
diff --git a/knowledge-graph-recommendation-diversity-guard/demo.js b/knowledge-graph-recommendation-diversity-guard/demo.js new file mode 100644 index 0000000..064f1c5 --- /dev/null +++ b/knowledge-graph-recommendation-diversity-guard/demo.js @@ -0,0 +1,102 @@ +const fs = require('fs'); +const path = require('path'); +const { evaluateRecommendationDiversity } = require('./index'); +const { biasedRecommendationSet } = require('./sample-data'); + +const reportDir = path.join(__dirname, 'reports'); +fs.mkdirSync(reportDir, { recursive: true }); + +const result = evaluateRecommendationDiversity(biasedRecommendationSet); +const packetPath = path.join(reportDir, 'recommendation-diversity-packet.json'); +const reportPath = path.join(reportDir, 'recommendation-diversity-report.md'); +const svgPath = path.join(reportDir, 'summary.svg'); + +fs.writeFileSync(packetPath, `${JSON.stringify(result, null, 2)}\n`); + +const concentrationRows = result.concentrationChecks + .map((check) => `| ${check.metric} | ${check.result.value} | ${check.result.share} | ${check.limit} |`) + .join('\n'); +const itemRows = result.itemResults + .map((item) => `| ${item.id} | ${item.status} | ${item.institution || '-'} | ${item.funder || '-'} | ${item.method || '-'} | ${item.blockers.join(', ') || '-'} |`) + .join('\n'); + +fs.writeFileSync( + reportPath, + `# Knowledge Graph Recommendation Diversity Report + +Status: ${result.status} +Audit digest: ${result.auditDigest} + +## Recommendation Set + +Title: ${result.set.title} +Audience: ${result.set.audience} +Showable recommendations: ${result.showableRecommendations}/${result.totalRecommendations} + +## Concentration Checks + +| Metric | Top value | Share | Limit | +| --- | --- | ---: | ---: | +${concentrationRows} + +## Recommendation Review + +| Recommendation | Status | Institution | Funder | Method | Blockers | +| --- | --- | --- | --- | --- | --- | +${itemRows} + +## Citation Herd + +- Top cited recommendation: ${result.citationHerd.id} +- 
/**
 * Default thresholds, used for every policy key the caller does not override.
 * Share limits are fractions; a check trips only when the measured share is
 * strictly greater than its limit.
 */
const DEFAULT_POLICY = {
  minRecommendations: 5,
  minRationaleEdges: 2,
  maxInstitutionShare: 0.5,
  maxFunderShare: 0.5,
  maxDomainShare: 0.7,
  maxMethodShare: 0.5,
  maxTopCitationShare: 0.6,
  maxStaleEvidenceShare: 0.4,
  staleEvidenceBeforeYear: 2023,
};

/**
 * @param {*} value
 * @returns {Array} `value` itself when it is an array, otherwise a new empty array.
 */
function asArray(value) {
  return Array.isArray(value) ? value : [];
}

/**
 * Serializes a value to JSON text with object keys sorted, so two structurally
 * equal values always yield the same string regardless of key insertion order.
 *
 * Members that `JSON.stringify` cannot represent (undefined, functions,
 * symbols) are dropped from objects and written as `null` inside arrays, the
 * same way `JSON.stringify` treats them. That keeps a digest computed from an
 * in-memory packet reproducible from the packet after it was written to disk
 * with `JSON.stringify`.
 *
 * @param {*} value
 * @returns {string|undefined} JSON text, or undefined when the top-level value
 *   itself is not representable.
 */
function stableJson(value) {
  if (Array.isArray(value)) {
    const entries = value.map((entry) => {
      const encoded = stableJson(entry);
      return encoded === undefined ? 'null' : encoded;
    });
    return `[${entries.join(',')}]`;
  }
  if (value && typeof value === 'object') {
    const members = [];
    for (const key of Object.keys(value).sort()) {
      const encoded = stableJson(value[key]);
      if (encoded !== undefined) {
        members.push(`${JSON.stringify(key)}:${encoded}`);
      }
    }
    return `{${members.join(',')}}`;
  }
  return JSON.stringify(value);
}

/**
 * @param {*} value
 * @returns {string} Hex-encoded SHA-256 of the key-sorted JSON form of `value`.
 */
function digest(value) {
  return crypto.createHash('sha256').update(stableJson(value)).digest('hex');
}

/**
 * @param {Array} values
 * @returns {Array} The truthy entries of `values`, de-duplicated, in first-seen order.
 */
function unique(values) {
  return [...new Set(values.filter(Boolean))];
}

/**
 * @param {number} count
 * @param {number} total
 * @returns {number} `count / total` rounded to three decimals, or 0 when `total` is 0.
 */
function ratio(count, total) {
  return total === 0 ? 0 : Number((count / total).toFixed(3));
}

/**
 * Finds the most frequent value of `field` across `items`.
 * Items with a falsy field value are counted together under 'unknown'.
 * Ties go to the value seen first (the sort is stable).
 *
 * @param {Array<object>} items
 * @param {string} field
 * @returns {{field: string, value: *, count: number, share: number}}
 *   For an empty list: value 'none', count 0, share 0.
 */
function topShare(items, field) {
  const counts = new Map();
  for (const item of items) {
    const value = item[field] || 'unknown';
    counts.set(value, (counts.get(value) || 0) + 1);
  }
  const sorted = [...counts.entries()].sort((a, b) => b[1] - a[1]);
  const [value, count] = sorted[0] || ['none', 0];
  return {
    field,
    value,
    count,
    share: ratio(count, items.length),
  };
}

/**
 * Normalizes an item's citation count: anything that is not a finite,
 * positive number counts as 0, so one malformed record cannot turn the whole
 * citation share into NaN.
 */
function citationCount(item) {
  const count = Number(item.citations);
  return Number.isFinite(count) && count > 0 ? count : 0;
}

/**
 * Measures how much of the total citation mass belongs to the single most
 * cited item. Both the total and the top pick use the same normalized counts.
 *
 * @param {Array<object>} items
 * @returns {{id: *, citations: number, share: number}}
 *   `id` is 'none' when the top item has no truthy id (or the list is empty);
 *   `share` is rounded to three decimals and is 0 when no item has citations.
 */
function citationShare(items) {
  let total = 0;
  let top = items[0] || {};
  let topCount = citationCount(top);
  for (const item of items) {
    const count = citationCount(item);
    total += count;
    // Strict comparison keeps the first item on ties.
    if (count > topCount) {
      top = item;
      topCount = count;
    }
  }
  return {
    id: top.id || 'none',
    citations: topCount,
    share: total === 0 ? 0 : Number((topCount / total).toFixed(3)),
  };
}

/**
 * Decides whether an item's evidence is stale under `policy`.
 * A missing or non-numeric year is treated as stale: an unreadable year must
 * not pass as fresh evidence.
 */
function isStaleEvidence(item, policy) {
  const year = Number(item.evidenceYear);
  return !Number.isFinite(year) || year < policy.staleEvidenceBeforeYear;
}

/**
 * Reviews one recommendation.
 *
 * Blockers (status 'hold'): fewer rationale edges than
 * `policy.minRationaleEdges`, or `suppressed === true`.
 * Warnings (status 'warn'): missing institution, funder or method, or stale
 * evidence. With neither, the status is 'show'.
 *
 * @param {object} item
 * @param {object} policy
 * @returns {{id: *, title: *, domain: *, institution: *, funder: *, method: *,
 *   status: string, blockers: string[], warnings: string[], actions: string[]}}
 */
function evaluateRecommendation(item, policy) {
  const blockers = [];
  const warnings = [];
  const actions = [];
  const rationaleEdges = asArray(item.rationaleEdges);

  if (rationaleEdges.length < policy.minRationaleEdges) {
    blockers.push('insufficient_rationale_edges');
    actions.push('add_explainable_graph_path');
  }

  if (!item.institution) {
    warnings.push('institution_missing');
    actions.push('add_institution_metadata');
  }

  if (!item.funder) {
    warnings.push('funder_missing');
    actions.push('add_funder_metadata');
  }

  if (!item.method) {
    warnings.push('method_missing');
    actions.push('add_method_metadata');
  }

  if (isStaleEvidence(item, policy)) {
    warnings.push('stale_evidence');
    actions.push('refresh_or_label_evidence_age');
  }

  if (item.suppressed === true) {
    blockers.push('already_suppressed_source');
    actions.push('replace_suppressed_source');
  }

  let status = 'show';
  if (blockers.length) {
    status = 'hold';
  } else if (warnings.length) {
    status = 'warn';
  }

  return {
    id: item.id,
    title: item.title,
    domain: item.domain,
    institution: item.institution,
    funder: item.funder,
    method: item.method,
    status,
    blockers,
    warnings,
    actions: unique(actions),
  };
}

/**
 * Merges caller overrides onto DEFAULT_POLICY. Overrides that are explicitly
 * `undefined` are ignored; otherwise the limit would become `undefined` and
 * every `share > limit` comparison would silently evaluate to false.
 */
function resolvePolicy(overrides) {
  const policy = { ...DEFAULT_POLICY };
  for (const [key, value] of Object.entries(overrides || {})) {
    if (value !== undefined) {
      policy[key] = value;
    }
  }
  return policy;
}

/**
 * Evaluates a recommendation set against the diversity policy.
 *
 * Set-level blockers: too few non-held items, institution/funder/domain/method
 * concentration above the limit, stale-evidence share above the limit.
 * Set-level warning: the top-cited item's citation share above the limit.
 * Per-item blockers, warnings and actions are appended with the item id as a
 * prefix (`<id>_<code>`); items without an id use `item-<position>` instead.
 *
 * Concentration, citation and stale shares are computed over all
 * recommendations, including held ones.
 *
 * @param {{policy?: object, recommendationSet?: {id?: *, title?: *,
 *   audience?: *, recommendations?: object[]}}} [input]
 *   A missing input is evaluated as an empty recommendation set.
 * @returns {object} The review packet plus `auditDigest`, the SHA-256 of the
 *   packet's key-sorted JSON form. `status` is 'hold' when any blocker exists,
 *   'show_with_disclosures' when only warnings exist, otherwise 'show'.
 */
function evaluateRecommendationDiversity(input) {
  const request = input || {};
  const policy = resolvePolicy(request.policy);
  const recommendationSet = request.recommendationSet || {};
  const recommendations = asArray(recommendationSet.recommendations);
  const itemResults = recommendations.map((item) => evaluateRecommendation(item, policy));
  // itemResults is index-aligned with recommendations; pairing by position
  // stays correct even when ids are duplicated or missing.
  const shownItems = recommendations.filter((_, index) => itemResults[index].status !== 'hold');
  const blockers = [];
  const warnings = [];
  const actions = [];

  if (shownItems.length < policy.minRecommendations) {
    blockers.push('too_few_showable_recommendations');
    actions.push('add_more_diverse_candidates');
  }

  const concentrationChecks = [
    { metric: 'institution', result: topShare(recommendations, 'institution'), limit: policy.maxInstitutionShare },
    { metric: 'funder', result: topShare(recommendations, 'funder'), limit: policy.maxFunderShare },
    { metric: 'domain', result: topShare(recommendations, 'domain'), limit: policy.maxDomainShare },
    { metric: 'method', result: topShare(recommendations, 'method'), limit: policy.maxMethodShare },
  ];

  for (const check of concentrationChecks) {
    if (check.result.share > check.limit) {
      blockers.push(`${check.metric}_concentration`);
      actions.push(`rebalance_${check.metric}_mix`);
    }
  }

  const citation = citationShare(recommendations);
  if (citation.share > policy.maxTopCitationShare) {
    warnings.push('citation_herd_risk');
    actions.push('add_low_citation_or_recent_counterpoints');
  }

  const staleCount = recommendations.filter((item) => isStaleEvidence(item, policy)).length;
  const staleShare = ratio(staleCount, recommendations.length);
  if (staleShare > policy.maxStaleEvidenceShare) {
    blockers.push('stale_evidence_dominance');
    actions.push('refresh_recommendation_evidence');
  }

  itemResults.forEach((result, index) => {
    // A positional label keeps id-less items distinguishable; otherwise they
    // would all collapse into the same "undefined_<code>" entry.
    const label = result.id === undefined || result.id === null ? `item-${index + 1}` : result.id;
    for (const blocker of result.blockers) {
      blockers.push(`${label}_${blocker}`);
    }
    for (const warning of result.warnings) {
      warnings.push(`${label}_${warning}`);
    }
    for (const action of result.actions) {
      actions.push(`${label}_${action}`);
    }
  });

  let status = 'show';
  if (blockers.length) {
    status = 'hold';
  } else if (warnings.length) {
    status = 'show_with_disclosures';
  }

  const packet = {
    set: {
      id: recommendationSet.id,
      title: recommendationSet.title,
      audience: recommendationSet.audience,
    },
    status,
    showableRecommendations: shownItems.length,
    totalRecommendations: recommendations.length,
    concentrationChecks,
    citationHerd: citation,
    staleEvidenceShare: staleShare,
    blockers: unique(blockers),
    warnings: unique(warnings),
    curatorActions: unique(actions),
    itemResults,
  };

  return {
    ...packet,
    auditDigest: digest(packet),
  };
}
"concentrationChecks": [ + { + "metric": "institution", + "result": { + "field": "institution", + "value": "Stanford", + "count": 3, + "share": 0.6 + }, + "limit": 0.5 + }, + { + "metric": "funder", + "result": { + "field": "funder", + "value": "Funder A", + "count": 3, + "share": 0.6 + }, + "limit": 0.5 + }, + { + "metric": "domain", + "result": { + "field": "domain", + "value": "single-cell biology", + "count": 4, + "share": 0.8 + }, + "limit": 0.7 + }, + { + "metric": "method", + "result": { + "field": "method", + "value": "CRISPR screen", + "count": 4, + "share": 0.8 + }, + "limit": 0.5 + } + ], + "citationHerd": { + "id": "rec-1", + "citations": 920, + "share": 0.495 + }, + "staleEvidenceShare": 0.6, + "blockers": [ + "too_few_showable_recommendations", + "institution_concentration", + "funder_concentration", + "domain_concentration", + "method_concentration", + "stale_evidence_dominance", + "rec-1_insufficient_rationale_edges", + "rec-2_insufficient_rationale_edges", + "rec-3_insufficient_rationale_edges" + ], + "warnings": [ + "rec-1_stale_evidence", + "rec-2_stale_evidence", + "rec-3_stale_evidence" + ], + "curatorActions": [ + "add_more_diverse_candidates", + "rebalance_institution_mix", + "rebalance_funder_mix", + "rebalance_domain_mix", + "rebalance_method_mix", + "refresh_recommendation_evidence", + "rec-1_add_explainable_graph_path", + "rec-1_refresh_or_label_evidence_age", + "rec-2_add_explainable_graph_path", + "rec-2_refresh_or_label_evidence_age", + "rec-3_add_explainable_graph_path", + "rec-3_refresh_or_label_evidence_age" + ], + "itemResults": [ + { + "id": "rec-1", + "title": "Stanford CRISPR stress atlas", + "domain": "single-cell biology", + "institution": "Stanford", + "funder": "Funder A", + "method": "CRISPR screen", + "status": "hold", + "blockers": [ + "insufficient_rationale_edges" + ], + "warnings": [ + "stale_evidence" + ], + "actions": [ + "add_explainable_graph_path", + "refresh_or_label_evidence_age" + ] + }, + { + "id": "rec-2", + 
"title": "Stanford perturbation notebook", + "domain": "single-cell biology", + "institution": "Stanford", + "funder": "Funder A", + "method": "CRISPR screen", + "status": "hold", + "blockers": [ + "insufficient_rationale_edges" + ], + "warnings": [ + "stale_evidence" + ], + "actions": [ + "add_explainable_graph_path", + "refresh_or_label_evidence_age" + ] + }, + { + "id": "rec-3", + "title": "Stanford follow-up protocol", + "domain": "single-cell biology", + "institution": "Stanford", + "funder": "Funder A", + "method": "CRISPR screen", + "status": "hold", + "blockers": [ + "insufficient_rationale_edges" + ], + "warnings": [ + "stale_evidence" + ], + "actions": [ + "add_explainable_graph_path", + "refresh_or_label_evidence_age" + ] + }, + { + "id": "rec-4", + "title": "Cambridge replication dataset", + "domain": "single-cell biology", + "institution": "Cambridge", + "funder": "Funder B", + "method": "CRISPR screen", + "status": "show", + "blockers": [], + "warnings": [], + "actions": [] + }, + { + "id": "rec-5", + "title": "Tokyo microscopy validation", + "domain": "cell imaging", + "institution": "Tokyo BioLab", + "funder": "Funder C", + "method": "confocal imaging", + "status": "show", + "blockers": [], + "warnings": [], + "actions": [] + } + ], + "auditDigest": "4b0b52688d9e49051fa43f68ab9a914134dc767e5a1fd0c945096bcbda1fd5df" +} diff --git a/knowledge-graph-recommendation-diversity-guard/reports/recommendation-diversity-report.md b/knowledge-graph-recommendation-diversity-guard/reports/recommendation-diversity-report.md new file mode 100644 index 0000000..fac10f4 --- /dev/null +++ b/knowledge-graph-recommendation-diversity-guard/reports/recommendation-diversity-report.md @@ -0,0 +1,65 @@ +# Knowledge Graph Recommendation Diversity Report + +Status: hold +Audit digest: 4b0b52688d9e49051fa43f68ab9a914134dc767e5a1fd0c945096bcbda1fd5df + +## Recommendation Set + +Title: Discovery mode recommendations for CRISPR stress-screen projects +Audience: computational 
biology researchers +Showable recommendations: 2/5 + +## Concentration Checks + +| Metric | Top value | Share | Limit | +| --- | --- | ---: | ---: | +| institution | Stanford | 0.6 | 0.5 | +| funder | Funder A | 0.6 | 0.5 | +| domain | single-cell biology | 0.8 | 0.7 | +| method | CRISPR screen | 0.8 | 0.5 | + +## Recommendation Review + +| Recommendation | Status | Institution | Funder | Method | Blockers | +| --- | --- | --- | --- | --- | --- | +| rec-1 | hold | Stanford | Funder A | CRISPR screen | insufficient_rationale_edges | +| rec-2 | hold | Stanford | Funder A | CRISPR screen | insufficient_rationale_edges | +| rec-3 | hold | Stanford | Funder A | CRISPR screen | insufficient_rationale_edges | +| rec-4 | show | Cambridge | Funder B | CRISPR screen | - | +| rec-5 | show | Tokyo BioLab | Funder C | confocal imaging | - | + +## Citation Herd + +- Top cited recommendation: rec-1 +- Citation share: 0.495 + +## Stale Evidence + +- Stale evidence share: 0.6 + +## Blockers + +- too_few_showable_recommendations +- institution_concentration +- funder_concentration +- domain_concentration +- method_concentration +- stale_evidence_dominance +- rec-1_insufficient_rationale_edges +- rec-2_insufficient_rationale_edges +- rec-3_insufficient_rationale_edges + +## Curator Actions + +- add_more_diverse_candidates +- rebalance_institution_mix +- rebalance_funder_mix +- rebalance_domain_mix +- rebalance_method_mix +- refresh_recommendation_evidence +- rec-1_add_explainable_graph_path +- rec-1_refresh_or_label_evidence_age +- rec-2_add_explainable_graph_path +- rec-2_refresh_or_label_evidence_age +- rec-3_add_explainable_graph_path +- rec-3_refresh_or_label_evidence_age diff --git a/knowledge-graph-recommendation-diversity-guard/reports/summary.svg b/knowledge-graph-recommendation-diversity-guard/reports/summary.svg new file mode 100644 index 0000000..cc2a026 --- /dev/null +++ b/knowledge-graph-recommendation-diversity-guard/reports/summary.svg @@ -0,0 +1,24 @@ + + + + 
/**
 * Synthetic, deliberately skewed fixture: three of five recommendations share
 * one institution, funder and method, carry evidence years before the
 * policy's 2023 cutoff, and have a single rationale edge each.
 */
const biasedRecommendationSet = {
  policy: {
    minRecommendations: 5,
    staleEvidenceBeforeYear: 2023,
  },
  recommendationSet: {
    id: 'kg-rec-astro-bio-2026-05',
    title: 'Discovery mode recommendations for CRISPR stress-screen projects',
    audience: 'computational biology researchers',
    recommendations: [
      {
        id: 'rec-1',
        title: 'Stanford CRISPR stress atlas',
        domain: 'single-cell biology',
        institution: 'Stanford',
        funder: 'Funder A',
        method: 'CRISPR screen',
        evidenceYear: 2021,
        citations: 920,
        rationaleEdges: ['concept:CRISPR -> dataset:stress-atlas'],
      },
      {
        id: 'rec-2',
        title: 'Stanford perturbation notebook',
        domain: 'single-cell biology',
        institution: 'Stanford',
        funder: 'Funder A',
        method: 'CRISPR screen',
        evidenceYear: 2020,
        citations: 510,
        rationaleEdges: ['tool:scanpy -> notebook:perturbation'],
      },
      {
        id: 'rec-3',
        title: 'Stanford follow-up protocol',
        domain: 'single-cell biology',
        institution: 'Stanford',
        funder: 'Funder A',
        method: 'CRISPR screen',
        evidenceYear: 2019,
        citations: 315,
        rationaleEdges: ['protocol:stimulation -> paper:stress-atlas'],
      },
      {
        id: 'rec-4',
        title: 'Cambridge replication dataset',
        domain: 'single-cell biology',
        institution: 'Cambridge',
        funder: 'Funder B',
        method: 'CRISPR screen',
        evidenceYear: 2024,
        citations: 83,
        rationaleEdges: ['dataset:replication -> paper:stress-atlas', 'method:qc -> result:null'],
      },
      {
        id: 'rec-5',
        title: 'Tokyo microscopy validation',
        domain: 'cell imaging',
        institution: 'Tokyo BioLab',
        funder: 'Funder C',
        method: 'confocal imaging',
        evidenceYear: 2025,
        citations: 31,
        rationaleEdges: ['concept:stress -> image:validation', 'image:validation -> dataset:phenotype'],
      },
    ],
  },
};

/**
 * Builds a diversified variant of the biased fixture.
 *
 * Works on a deep copy, so the shared `biasedRecommendationSet` object is
 * never mutated and every call returns an independent object. The first three
 * recommendations each gain a second rationale edge and a recent evidence
 * year; the second and third are also moved to different institutions,
 * funders, methods and domains.
 *
 * @returns {object} A new input object with the same shape as `biasedRecommendationSet`.
 */
function buildBalancedRecommendationSet() {
  // structuredClone is the built-in deep copy; the fixture is plain data, so
  // the copy is equivalent to a JSON round-trip.
  const clone = structuredClone(biasedRecommendationSet);
  const [first, second, third] = clone.recommendationSet.recommendations;

  first.rationaleEdges.push('paper:stress-atlas -> limitation:sample-size');
  first.evidenceYear = 2024;
  first.citations = 300;

  second.institution = 'Max Planck';
  second.funder = 'Funder D';
  second.method = 'single-cell RNA-seq';
  second.domain = 'transcriptomics';
  second.evidenceYear = 2025;
  second.citations = 180;
  second.rationaleEdges.push('dataset:orthogonal -> concept:stress');

  third.institution = 'Toronto Genomics';
  third.funder = 'Funder E';
  third.method = 'proteomics';
  third.domain = 'systems biology';
  third.evidenceYear = 2024;
  third.citations = 160;
  third.rationaleEdges.push('dataset:proteomics -> concept:stress');

  return clone;
}
assert(result.blockers.includes('funder_concentration')); + assert(result.blockers.includes('method_concentration')); + assert(result.blockers.includes('stale_evidence_dominance')); + assert(result.blockers.includes('rec-1_insufficient_rationale_edges')); +} + +function testBalancedSetCanShow() { + const result = evaluateRecommendationDiversity(buildBalancedRecommendationSet()); + assert.strictEqual(result.status, 'show'); + assert.strictEqual(result.blockers.length, 0); + assert.strictEqual(result.warnings.length, 0); + assert.strictEqual(result.showableRecommendations, 5); +} + +function testTooFewRecommendationsBlocks() { + const data = buildBalancedRecommendationSet(); + data.recommendationSet.recommendations = data.recommendationSet.recommendations.slice(0, 3); + const result = evaluateRecommendationDiversity(data); + assert.strictEqual(result.status, 'hold'); + assert(result.blockers.includes('too_few_showable_recommendations')); +} + +function testSuppressedSourceBlocksItem() { + const data = buildBalancedRecommendationSet(); + data.recommendationSet.recommendations[4].suppressed = true; + const result = evaluateRecommendationDiversity(data); + assert.strictEqual(result.status, 'hold'); + assert(result.blockers.includes('rec-5_already_suppressed_source')); +} + +function testDigestIsStable() { + const first = evaluateRecommendationDiversity(buildBalancedRecommendationSet()).auditDigest; + const second = evaluateRecommendationDiversity(buildBalancedRecommendationSet()).auditDigest; + assert.strictEqual(first, second); + assert.strictEqual(first.length, 64); +} + +testBiasedSetIsHeld(); +testBalancedSetCanShow(); +testTooFewRecommendationsBlocks(); +testSuppressedSourceBlocksItem(); +testDigestIsStable(); + +console.log('5 knowledge graph recommendation diversity guard tests passed');