Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions data-dictionary-release-gate/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Data Dictionary Release Gate

Self-contained release-readiness module for SCIBASE issue #14, focused on scientific data and code hosting.

The gate evaluates whether tabular research artifacts have a release-ready data dictionary before a DOI, API export, or public archive package is published. It checks column coverage, measurement units, missingness explanations, semantic tags, schema.org/DataCite discovery metadata, checksums, license/access policy, and sensitive-variable handling.

## Run

```bash
npm run check
npm test
npm run demo
```

The demo writes reviewer artifacts to `reports/`:

- `data-dictionary-release-packet.json`
- `data-dictionary-release-report.md`
- `summary.svg`
- `demo.mp4` (only when `ffmpeg` is installed and on the `PATH`; the other artifacts are always written)

## Scope

This slice is intentionally narrow. It does not implement storage infrastructure, license compatibility review, sensitive file redaction, schema migration, preview rendering, retention tombstones, or execution environments. Those are covered by nearby PRs; this module focuses on dictionary and unit readiness for released datasets.
8 changes: 8 additions & 0 deletions data-dictionary-release-gate/acceptance-notes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Acceptance Notes

- Dependency-free Node.js module with deterministic output.
- Synthetic data only; no private datasets, credentials, payment data, or external services.
- Release decisions are `pass`, `revise`, or `hold`.
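- Blockers cover undocumented release columns, datasets with no release columns, missing checksums, variables without a machine-readable data type, missing or non-standard units, unexplained missingness above the blocking threshold, release-blocking licenses, missing access policies, and sensitive variables released without exclusion or de-identification.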
- Warnings cover thin descriptions, missing DOI plans, incomplete schema.org/DataCite metadata, missing file format metadata, orphan dictionary entries, missing unit authorities, categorical variables without allowed values, unexplained non-blocking missingness, sparse semantic tags, and missing ontology mappings.
- Demo artifacts are generated locally under `reports/`.
83 changes: 83 additions & 0 deletions data-dictionary-release-gate/demo.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
const fs = require("node:fs")
const path = require("node:path")
const { spawnSync } = require("node:child_process")
const { evaluateReleasePortfolio } = require("./index")
const { datasets, releasePolicy } = require("./sample-data")

// Output folder lives next to this script; create it (and any parents) up front.
const reportsDir = path.join(__dirname, "reports")
fs.mkdirSync(reportsDir, { recursive: true })

// Evaluate the datasets and policy exported by ./sample-data in one pass.
const packet = evaluateReleasePortfolio({ datasets, policy: releasePolicy })
const summary = packet.summary

// Persist the full packet as pretty-printed JSON terminated by a newline.
const packetPath = path.join(reportsDir, "data-dictionary-release-packet.json")
fs.writeFileSync(packetPath, JSON.stringify(packet, null, 2) + "\n")

// Builds the Markdown lines for one dataset decision: a blank separator, a
// heading, and one bullet per manifest/finding field.
const decisionSection = (decision) => {
  const findingCodes = decision.findings.map((finding) => finding.code).join(", ")
  const firstAction = decision.reviewerActions[0]?.message
  return [
    "",
    `### ${decision.id}: ${decision.title}`,
    `- Status: ${decision.status}`,
    `- Release target: ${decision.releaseTarget}`,
    `- Files: ${decision.manifest.files}`,
    `- Columns: ${decision.manifest.columns}`,
    `- Dictionary entries: ${decision.manifest.dictionaryEntries}`,
    `- License: ${decision.manifest.license}`,
    `- Findings: ${findingCodes || "none"}`,
    `- First action: ${firstAction || "none"}`,
  ]
}

// Portfolio totals first, then one section per dataset. The trailing empty
// entry makes the joined report end with a newline.
const markdown = [
  "# Data Dictionary Release Gate Report",
  "",
  `Generated datasets: ${summary.totalDatasets}`,
  `Passed: ${summary.passed}`,
  `Needs revision: ${summary.revise}`,
  `Held: ${summary.held}`,
  `Blockers: ${summary.blockers}`,
  `Warnings: ${summary.warnings}`,
  `Audit digest: \`${packet.audit.digest}\``,
  "",
  "## Dataset Decisions",
  ...packet.decisions.flatMap(decisionSection),
  "",
]

const reportPath = path.join(reportsDir, "data-dictionary-release-report.md")
fs.writeFileSync(reportPath, markdown.join("\n"))

// Static 960x540 summary card written to reports/summary.svg.
// Three coloured tiles show the passed / revise / held dataset counts from
// `summary`; the last text row shows the first 24 characters of the portfolio
// audit digest. Everything inside the template literal is emitted verbatim,
// so its whitespace is part of the generated file.
const svg = `<svg xmlns="http://www.w3.org/2000/svg" width="960" height="540" viewBox="0 0 960 540">
<rect width="960" height="540" fill="#0f172a"/>
<text x="48" y="78" fill="#f8fafc" font-family="Arial" font-size="34" font-weight="700">Data Dictionary Release Gate</text>
<text x="48" y="124" fill="#cbd5e1" font-family="Arial" font-size="18">Synthetic FAIR metadata and unit-readiness packet for SCIBASE issue #14</text>
<rect x="48" y="170" width="250" height="150" rx="14" fill="#047857"/>
<text x="78" y="230" fill="#ecfdf5" font-family="Arial" font-size="56" font-weight="700">${summary.passed}</text>
<text x="78" y="270" fill="#d1fae5" font-family="Arial" font-size="22">passed</text>
<rect x="355" y="170" width="250" height="150" rx="14" fill="#a16207"/>
<text x="385" y="230" fill="#fefce8" font-family="Arial" font-size="56" font-weight="700">${summary.revise}</text>
<text x="385" y="270" fill="#fef3c7" font-family="Arial" font-size="22">revise</text>
<rect x="662" y="170" width="250" height="150" rx="14" fill="#b91c1c"/>
<text x="692" y="230" fill="#fef2f2" font-family="Arial" font-size="56" font-weight="700">${summary.held}</text>
<text x="692" y="270" fill="#fee2e2" font-family="Arial" font-size="22">held</text>
<text x="48" y="390" fill="#e2e8f0" font-family="Arial" font-size="20">Checks: column coverage, units, missingness, semantic tags, checksums, license, DataCite, schema.org.</text>
<text x="48" y="430" fill="#94a3b8" font-family="Arial" font-size="16">Digest ${packet.audit.digest.slice(0, 24)}...</text>
</svg>
`
fs.writeFileSync(path.join(reportsDir, "summary.svg"), svg)

// Best-effort demo clip: a solid 960x540 background with four filled boxes
// that mirror the SVG layout. ffmpeg is optional, so a failure only warns.
const drawboxFilters = [
  "drawbox=x=48:y=170:w=250:h=150:color=0x047857@1:t=fill",
  "drawbox=x=355:y=170:w=250:h=150:color=0xa16207@1:t=fill",
  "drawbox=x=662:y=170:w=250:h=150:color=0xb91c1c@1:t=fill",
  "drawbox=x=48:y=370:w=864:h=18:color=0x38bdf8@1:t=fill",
].join(",")

const ffmpeg = spawnSync("ffmpeg", [
  "-y",
  "-f",
  "lavfi",
  "-i",
  "color=c=0x0f172a:s=960x540:d=6:r=15",
  "-vf",
  drawboxFilters,
  "-pix_fmt",
  "yuv420p",
  path.join(reportsDir, "demo.mp4"),
], { stdio: "ignore" })

if (ffmpeg.status !== 0) {
  // Output is discarded (stdio: "ignore"), so surface the only diagnostics
  // spawnSync still provides: `error` when the process could not be started
  // (for example ffmpeg is not installed), `signal` when it was killed, and
  // otherwise the non-zero exit status.
  let reason = `exit code ${ffmpeg.status}`
  if (ffmpeg.error) {
    reason = ffmpeg.error.message
  } else if (ffmpeg.signal) {
    reason = `terminated by ${ffmpeg.signal}`
  }
  console.warn(`ffmpeg video generation failed (${reason}); summary.svg and JSON/Markdown reports were still generated.`)
}

console.log(`Wrote data dictionary release artifacts to ${reportsDir}`)
252 changes: 252 additions & 0 deletions data-dictionary-release-gate/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
const crypto = require("node:crypto")

/**
 * Serialize a value to JSON-like text with object keys emitted in sorted
 * order, so structurally equal values always produce the same string.
 *
 * Arrays keep their element order. Anything that is not an array or a
 * non-null object is delegated to JSON.stringify.
 *
 * @param {*} value - Value to serialize.
 * @returns {string|undefined} Canonical text (whatever JSON.stringify returns for primitives).
 */
function stableJson(value) {
  if (Array.isArray(value)) {
    const items = value.map(stableJson)
    return "[" + items.join(",") + "]"
  }
  if (value === null || typeof value !== "object") {
    return JSON.stringify(value)
  }
  const members = []
  for (const key of Object.keys(value).sort()) {
    members.push(JSON.stringify(key) + ":" + stableJson(value[key]))
  }
  return "{" + members.join(",") + "}"
}

/**
 * Compute the hex-encoded SHA-256 digest of a value's stableJson text.
 *
 * @param {*} value - Value to fingerprint.
 * @returns {string} Hex digest string.
 */
function digestFor(value) {
  const canonicalText = stableJson(value)
  const hasher = crypto.createHash("sha256")
  hasher.update(canonicalText)
  return hasher.digest("hex")
}

/**
 * Build one finding record.
 *
 * @param {string} code - Finding code.
 * @param {string} severity - Severity label (callers in this file pass "blocker" or "warning").
 * @param {string} message - Human-readable explanation.
 * @param {object} [detail={}] - Optional structured context.
 * @returns {{code: string, severity: string, message: string, detail: object}}
 */
function finding(code, severity, message, detail = {}) {
  const record = {}
  record.code = code
  record.severity = severity
  record.message = message
  record.detail = detail
  return record
}

/**
 * Tell whether a value is a string with at least one non-whitespace character.
 *
 * @param {*} value - Candidate value.
 * @returns {boolean} False for non-strings, empty strings, and whitespace-only strings.
 */
function hasText(value) {
  if (typeof value !== "string") {
    return false
  }
  return value.trim() !== ""
}

/**
 * Return the distinct entries of an iterable, keeping first-seen order.
 *
 * @param {Iterable<*>} values - Values that may contain duplicates.
 * @returns {Array<*>} New array without duplicates.
 */
function unique(values) {
  const seen = new Set(values)
  return Array.from(seen)
}

/**
 * Decide whether a dictionary variable is treated as a measurement.
 *
 * Every `measurementKind` other than "identifier", "category", "text" and
 * "boolean" counts as a measurement, including a kind that is not set.
 *
 * @param {object} variable - Dictionary entry; only `measurementKind` is read.
 * @returns {boolean} True when the variable is treated as a measurement.
 */
function isMeasurementVariable(variable) {
  switch (variable.measurementKind) {
    case "identifier":
    case "category":
    case "text":
    case "boolean":
      return false
    default:
      return true
  }
}

/**
 * Check dataset-level release metadata and the per-file manifest fields.
 *
 * Blockers: license or access policy not on the policy's approved lists, and
 * files without a checksum. Warnings: thin title/description, missing
 * persistent identifier, incomplete schema.org/DataCite fields, and files
 * without a format.
 *
 * @param {object} dataset - Dataset with a `metadata` object and a `files` array.
 * @param {object} policy - Reads `minDatasetDescriptionLength`,
 *   `approvedLicenses` and `approvedAccessPolicies`.
 * @returns {object[]} Findings in check order: dataset-level first, then per file.
 */
function evaluateMetadata(dataset, policy) {
  const findings = []
  const { metadata } = dataset

  // hasText guards the length comparison so a missing description is reported
  // as a finding instead of throwing on `.length` (same shape as the
  // per-variable description check).
  const descriptionIsThin =
    !hasText(metadata.description) || metadata.description.length < policy.minDatasetDescriptionLength
  if (!hasText(metadata.title) || descriptionIsThin) {
    findings.push(finding("DATASET_DESCRIPTION_THIN", "warning", "Dataset title and description need enough context for reuse."))
  }
  if (!policy.approvedLicenses.includes(metadata.license)) {
    findings.push(finding("LICENSE_NOT_RELEASE_READY", "blocker", "Dataset license is missing or not approved for public release.", {
      license: metadata.license || "missing",
    }))
  }
  if (!policy.approvedAccessPolicies.includes(metadata.accessPolicy)) {
    findings.push(finding("ACCESS_POLICY_MISSING", "blocker", "Dataset release needs a declared access policy.", {
      accessPolicy: metadata.accessPolicy || "missing",
    }))
  }
  if (!hasText(metadata.persistentIdentifier)) {
    findings.push(finding("PERSISTENT_IDENTIFIER_MISSING", "warning", "Release metadata should include a DOI, ARK, or planned persistent identifier."))
  }
  if (!hasText(metadata.schemaOrgType) || !Array.isArray(metadata.dataCiteCreators) || metadata.dataCiteCreators.length === 0) {
    findings.push(finding("DISCOVERY_METADATA_INCOMPLETE", "warning", "schema.org and DataCite metadata are incomplete."))
  }

  for (const file of dataset.files) {
    if (!hasText(file.checksum)) {
      findings.push(finding("FILE_CHECKSUM_MISSING", "blocker", "Every released file needs a stable checksum.", {
        file: file.path,
      }))
    }
    if (!hasText(file.format)) {
      findings.push(finding("FILE_FORMAT_MISSING", "warning", "File format metadata is missing.", {
        file: file.path,
      }))
    }
  }

  return findings
}

/**
 * Compare the columns present in the release files with the data dictionary.
 *
 * Emits a blocker for columns that have no dictionary entry, a warning for
 * dictionary entries that match no column, and a blocker when the files
 * expose no columns at all. Column names are matched across all files.
 *
 * @param {object} dataset - Dataset with `files` (each optionally carrying a
 *   `columns` array) and a `dictionary` array of `{ name }` entries.
 * @returns {object[]} Findings in the order: undocumented, orphaned, no columns.
 */
function evaluateColumnCoverage(dataset) {
  const findings = []

  // A file entry without a `columns` array contributes no columns instead of
  // throwing, so the gate can still report NO_RELEASE_COLUMNS for it.
  const releasedColumns = new Set(
    dataset.files.flatMap((file) => (Array.isArray(file.columns) ? file.columns : [])),
  )
  const documentedNames = new Set(dataset.dictionary.map((variable) => variable.name))

  // Sets keep first-seen order, so the reported lists stay in file/dictionary
  // order while each membership test is a constant-time lookup.
  const undocumented = [...releasedColumns].filter((column) => !documentedNames.has(column))
  const orphaned = [...documentedNames].filter((name) => !releasedColumns.has(name))

  if (undocumented.length > 0) {
    findings.push(finding("UNDOCUMENTED_COLUMNS", "blocker", "Release files contain columns missing from the data dictionary.", {
      columns: undocumented,
    }))
  }
  if (orphaned.length > 0) {
    findings.push(finding("ORPHAN_DICTIONARY_ENTRIES", "warning", "Data dictionary contains entries not present in the release files.", {
      variables: orphaned,
    }))
  }
  if (releasedColumns.size === 0) {
    findings.push(finding("NO_RELEASE_COLUMNS", "blocker", "No tabular release columns were detected."))
  }

  return findings
}

/**
 * Run every per-variable readiness check on one data-dictionary entry.
 *
 * Checks, in order: label/description, data type, measurement unit and unit
 * authority (measurement variables only), categorical allowed values,
 * unexplained missingness, semantic tags, ontology mapping, and sensitive
 * variable handling.
 *
 * @param {object} variable - Dictionary entry to inspect.
 * @param {object} policy - Thresholds and approved lists used by the checks.
 * @returns {object[]} Findings in check order; empty when nothing is flagged.
 */
function evaluateVariable(variable, policy) {
  const collected = []
  const flag = (code, severity, message, detail) => {
    collected.push(finding(code, severity, message, detail))
  }

  // Description: label and description must both be present and long enough.
  const describedWell =
    hasText(variable.label) &&
    hasText(variable.description) &&
    !(variable.description.length < policy.minVariableDescriptionLength)
  if (!describedWell) {
    flag("VARIABLE_DESCRIPTION_THIN", "warning", "Variable description is too short for independent reuse.", {
      variable: variable.name,
    })
  }

  if (!hasText(variable.dataType)) {
    flag("VARIABLE_TYPE_MISSING", "blocker", "Variable is missing a machine-readable data type.", {
      variable: variable.name,
    })
  }

  // Units only apply to measurement variables; an unknown kind has no approved units.
  if (isMeasurementVariable(variable)) {
    const allowedUnits = policy.approvedUnitsByKind[variable.measurementKind] || []
    const unitPresent = hasText(variable.unit)
    if (!unitPresent) {
      flag("MEASUREMENT_UNIT_MISSING", "blocker", "Measurement variable is missing units.", {
        variable: variable.name,
        measurementKind: variable.measurementKind,
      })
    } else if (!allowedUnits.includes(variable.unit)) {
      flag("MEASUREMENT_UNIT_NOT_STANDARD", "blocker", "Measurement unit is not approved for this variable kind.", {
        variable: variable.name,
        unit: variable.unit,
        approvedUnits: allowedUnits,
      })
    }
    if (!hasText(variable.unitStandard)) {
      flag("UNIT_STANDARD_MISSING", "warning", "Measurement variable should cite UCUM, SI, or another unit authority.", {
        variable: variable.name,
      })
    }
  }

  if (variable.dataType === "categorical") {
    const values = variable.allowedValues
    if (!Array.isArray(values) || values.length === 0) {
      flag("CATEGORICAL_VALUES_MISSING", "warning", "Categorical variable lacks allowed values.", {
        variable: variable.name,
      })
    }
  }

  // Unexplained missingness escalates from warning to blocker past the second threshold.
  const rate = variable.missingnessRate
  if (rate > policy.maxMissingnessWithoutReason && !hasText(variable.missingnessReason)) {
    const isBlocking = rate > policy.blockingMissingnessWithoutReason
    flag(
      isBlocking ? "MISSINGNESS_REASON_REQUIRED" : "MISSINGNESS_REASON_RECOMMENDED",
      isBlocking ? "blocker" : "warning",
      "Variable has unexplained missingness above the policy threshold.",
      { variable: variable.name, missingnessRate: rate },
    )
  }

  const tags = variable.semanticTags
  if (!Array.isArray(tags) || tags.length < policy.minSemanticTagsPerVariable) {
    flag("SEMANTIC_TAGS_INCOMPLETE", "warning", "Variable needs richer semantic tags for FAIR discovery.", {
      variable: variable.name,
    })
  }

  if (!hasText(variable.schemaProperty)) {
    flag("SCHEMA_PROPERTY_MISSING", "warning", "Variable lacks a schema.org or domain ontology property mapping.", {
      variable: variable.name,
    })
  }

  // A blocked sensitivity class is acceptable only when excluded or de-identified.
  const isBlockedClass = policy.blockedSensitivityClasses.includes(variable.sensitivity)
  const isProtected = variable.releaseHandling === "excluded" || Boolean(variable.deidentified)
  if (isBlockedClass && !isProtected) {
    flag("SENSITIVE_VARIABLE_RELEASED", "blocker", "Sensitive or identifying variable is marked for release without exclusion or de-identification.", {
      variable: variable.name,
      sensitivity: variable.sensitivity,
    })
  }

  return collected
}

/**
 * Turn a dataset's status and findings into a de-duplicated reviewer to-do list.
 *
 * A "pass" status yields a single approval action. Otherwise each finding is
 * mapped to an action: a few codes have dedicated actions and owners, and all
 * others fall back to `ADDRESS_<code>` owned by the data steward. Actions that
 * share a code are collapsed into one entry, which keeps the position of the
 * first occurrence and the content of the last.
 *
 * @param {string} status - Dataset status ("pass", "revise" or "hold").
 * @param {object[]} findings - Findings with `code` and `message`.
 * @returns {object[]} Actions shaped as `{ code, owner, message }`.
 */
function buildReviewerActions(status, findings) {
  if (status === "pass") {
    return [{ code: "APPROVE_DATA_DICTIONARY_RELEASE", owner: "data-steward", message: "Approve dataset release with the current dictionary and metadata packet." }]
  }

  const actionFor = (item) => {
    switch (item.code) {
      case "UNDOCUMENTED_COLUMNS":
        return { code: "DOCUMENT_ALL_RELEASE_COLUMNS", owner: "data-steward", message: "Add dictionary entries for every released column before DOI/export." }
      case "MEASUREMENT_UNIT_MISSING":
      case "MEASUREMENT_UNIT_NOT_STANDARD":
        return { code: "FIX_MEASUREMENT_UNITS", owner: "domain-reviewer", message: "Add standard units and unit authority for measurement variables." }
      case "SENSITIVE_VARIABLE_RELEASED":
        return { code: "REMOVE_OR_DEIDENTIFY_SENSITIVE_FIELDS", owner: "privacy-reviewer", message: "Exclude or de-identify sensitive variables before public release." }
      case "LICENSE_NOT_RELEASE_READY":
        return { code: "RESOLVE_LICENSE_BEFORE_RELEASE", owner: "project-admin", message: "Select an approved release license before publishing." }
      case "FILE_CHECKSUM_MISSING":
        return { code: "ADD_FILE_CHECKSUMS", owner: "release-engineer", message: "Add stable checksums for every release artifact." }
      default:
        return { code: `ADDRESS_${item.code}`, owner: "data-steward", message: item.message }
    }
  }

  // Map#set keeps the slot of the first action with a given code and
  // overwrites its value with the latest one.
  const byCode = new Map()
  for (const action of findings.map(actionFor)) {
    byCode.set(action.code, action)
  }
  return Array.from(byCode.values())
}

/**
 * Evaluate one dataset against the release policy and assemble its decision.
 *
 * Findings come from the metadata checks, the column-coverage checks and the
 * per-variable checks, in that order. Any blocker yields "hold"; otherwise
 * any warning yields "revise"; otherwise "pass". The audit digest is computed
 * over the decision before the digest field itself is added.
 *
 * @param {object} dataset - Dataset with `id`, `releaseTarget`, `metadata`, `files`, `dictionary`.
 * @param {object} policy - Release policy forwarded to the individual checks.
 * @returns {object} Decision record with manifest, findings, reviewer actions and `auditDigest`.
 */
function evaluateDatasetRelease(dataset, policy) {
  const metadataFindings = evaluateMetadata(dataset, policy)
  const coverageFindings = evaluateColumnCoverage(dataset)
  const variableFindings = dataset.dictionary.flatMap((variable) => evaluateVariable(variable, policy))
  const findings = metadataFindings.concat(coverageFindings, variableFindings)

  const hasSeverity = (severity) => findings.some((item) => item.severity === severity)
  let status = "pass"
  if (hasSeverity("blocker")) {
    status = "hold"
  } else if (hasSeverity("warning")) {
    status = "revise"
  }

  // Columns are counted by distinct name across all release files.
  const distinctColumns = new Set(dataset.files.flatMap((file) => file.columns))
  const { metadata } = dataset

  const decision = {
    id: dataset.id,
    title: metadata.title,
    status,
    releaseTarget: dataset.releaseTarget,
    manifest: {
      files: dataset.files.length,
      columns: distinctColumns.size,
      dictionaryEntries: dataset.dictionary.length,
      license: metadata.license,
      accessPolicy: metadata.accessPolicy,
      persistentIdentifier: metadata.persistentIdentifier || "pending",
    },
    findings,
    reviewerActions: buildReviewerActions(status, findings),
  }

  const auditDigest = digestFor(decision)
  return { ...decision, auditDigest }
}

/**
 * Evaluate every dataset in a portfolio and roll the decisions into a packet.
 *
 * The packet digest covers only `summary` and `decisions`, so it does not
 * change with `generatedAt` or `policy`.
 *
 * @param {object} options
 * @param {object[]} options.datasets - Datasets to evaluate, in output order.
 * @param {object} options.policy - Release policy applied to every dataset and echoed in the packet.
 * @param {string} [options.generatedAt="2026-05-21T18:05:00.000Z"] - Timestamp
 *   recorded in the packet. Defaults to a fixed value so output is identical
 *   between runs; pass a real timestamp when the packet should record one.
 * @returns {object} Packet with `generatedAt`, `policy`, `summary`, `decisions` and `audit`.
 */
function evaluateReleasePortfolio({ datasets, policy, generatedAt = "2026-05-21T18:05:00.000Z" }) {
  const decisions = datasets.map((dataset) => evaluateDatasetRelease(dataset, policy))

  const countStatus = (status) => decisions.filter((decision) => decision.status === status).length
  const countSeverity = (severity) => decisions.reduce(
    (sum, decision) => sum + decision.findings.filter((item) => item.severity === severity).length,
    0,
  )

  const summary = {
    totalDatasets: decisions.length,
    passed: countStatus("pass"),
    revise: countStatus("revise"),
    held: countStatus("hold"),
    blockers: countSeverity("blocker"),
    warnings: countSeverity("warning"),
  }

  return {
    generatedAt,
    policy,
    summary,
    decisions,
    audit: {
      source: "synthetic-data-dictionary-release-gate",
      digest: digestFor({ summary, decisions }),
    },
  }
}

// Public API of the gate: the digest helper plus the per-dataset and
// portfolio-level evaluators. The remaining functions in this file are
// internal helpers and are not exported.
module.exports = {
digestFor,
evaluateDatasetRelease,
evaluateReleasePortfolio,
}
11 changes: 11 additions & 0 deletions data-dictionary-release-gate/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"name": "data-dictionary-release-gate",
"version": "1.0.0",
"private": true,
"type": "commonjs",
"scripts": {
"check": "node --check index.js && node --check sample-data.js && node --check test.js && node --check demo.js",
"test": "node test.js",
"demo": "node demo.js"
}
}
Loading