Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 75 additions & 10 deletions scripts/build-edges.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/**
* Pre-computes cross-reference and domain edges for each dataset.
* Reads all concept JSON files, extracts structured references (gl:references)
* and domains (gl:domain), and writes edges.json and domain-nodes.json
* for each dataset.
*
* Usage: node scripts/build-edges.js
*/
Expand All @@ -13,12 +13,18 @@ const __dirname = dirname(fileURLToPath(import.meta.url));
const ROOT = process.cwd();
const DATA_DIR = join(ROOT, 'public', 'data');

function extractEdgesFromConcept(concept, registerId) {
// --- Normalization ---

/**
 * Turns a human-readable label into a URL path segment.
 *
 * The text is lower-cased, every character that is not an ASCII word
 * character, whitespace, or a hyphen is dropped, and each remaining run of
 * whitespace is collapsed into a single hyphen.
 *
 * NOTE(review): `\w` is ASCII-only without the `u`/`v` flags, so accented
 * letters are removed rather than transliterated. Slashes are also removed
 * by the first pass, which means the `/` in the second pattern can never
 * match. Confirm both are intended before changing them; other code may
 * depend on the exact slugs produced here.
 *
 * @param {string} text Label to convert.
 * @returns {string} The slug.
 */
function slugify(text) {
  const lowered = text.toLowerCase();
  const stripped = lowered.replace(/[^\w\s-]/g, '');
  return stripped.replace(/[\s/]+/g, '-');
}

// --- Extractors (open/closed: add new extractors to EXTRACTORS array) ---

function extractReferences(concept, registerId) {
const edges = [];
const sourceUri = concept['@id'];

for (const [_lang, lc] of Object.entries(concept['gl:localizedConcept'] || {})) {
// Structured cross-references (gl:references array, pre-computed during data generation)
for (const [lang, lc] of Object.entries(concept['gl:localizedConcept'] || {})) {
if (lc['gl:references']) {
for (const ref of lc['gl:references']) {
if (ref['@id'] && ref['@id'] !== sourceUri) {
Expand All @@ -28,15 +34,42 @@ function extractEdgesFromConcept(concept, registerId) {
type: 'references',
label: ref['gl:term'] || undefined,
register: registerId,
lang,
});
}
}
}
}
return edges;
}

/**
 * Builds one `domain` edge per localization of a concept that declares a
 * `gl:domain`.
 *
 * The edge target is a synthetic domain URI derived from the register ID and
 * the slugified domain label; the untouched label is kept on the edge.
 *
 * @param {object} concept Concept JSON document (reads `@id` and `gl:localizedConcept`).
 * @param {string} registerId Register the concept belongs to.
 * @returns {object[]} Domain edges, in localization order; empty when no localization has a domain.
 */
function extractDomains(concept, registerId) {
  const sourceUri = concept['@id'];
  const localizations = Object.entries(concept['gl:localizedConcept'] || {});

  return localizations
    // Localizations without a (truthy) domain contribute no edge.
    .filter(([, localized]) => Boolean(localized['gl:domain']))
    .map(([lang, localized]) => {
      const domain = localized['gl:domain'];
      return {
        source: sourceUri,
        target: `https://glossarist.org/${registerId}/domain/${slugify(domain)}`,
        type: 'domain',
        label: domain,
        register: registerId,
        lang,
      };
    });
}

// Registry of edge extractors. Each entry is called as fn(concept, registerId)
// and its result is merged into the concept's edge list.
const EXTRACTORS = [extractReferences, extractDomains];

/**
 * Runs every registered extractor against a concept and concatenates the
 * results, preserving extractor order.
 *
 * @param {object} concept Concept JSON document.
 * @param {string} registerId Register the concept belongs to.
 * @returns {object[]} All edges produced for the concept.
 */
function extractAllEdges(concept, registerId) {
  let collected = [];
  for (const extract of EXTRACTORS) {
    // concat flattens exactly one level, matching flatMap semantics.
    collected = collected.concat(extract(concept, registerId));
  }
  return collected;
}

// --- Build ---

function buildEdgesForDataset(datasetDir, registerId) {
const conceptsDir = join(datasetDir, 'concepts');
if (!existsSync(conceptsDir)) {
Expand All @@ -48,13 +81,20 @@ function buildEdgesForDataset(datasetDir, registerId) {
console.log(` Processing ${files.length} concepts...`);

const allEdges = [];
const domainConceptCount = new Map();
let processed = 0;

for (const file of files) {
try {
const data = JSON.parse(readFileSync(join(conceptsDir, file), 'utf-8'));
const edges = extractEdgesFromConcept(data, registerId);
const edges = extractAllEdges(data, registerId);
allEdges.push(...edges);

for (const edge of edges) {
if (edge.type === 'domain') {
domainConceptCount.set(edge.target, (domainConceptCount.get(edge.target) || 0) + 1);
}
}
} catch (e) {
console.error(` Error processing ${file}: ${e.message}`);
}
Expand All @@ -64,11 +104,11 @@ function buildEdgesForDataset(datasetDir, registerId) {
}
}

// Deduplicate edges by source+target pair
// Deduplicate edges by source+target+type+lang
const seen = new Set();
const deduped = [];
for (const edge of allEdges) {
const key = `${edge.source}→${edge.target}`;
const key = `${edge.source}→${edge.target}→${edge.type}→${edge.lang || ''}`;
if (!seen.has(key)) {
seen.add(key);
deduped.push(edge);
Expand All @@ -84,6 +124,31 @@ function buildEdgesForDataset(datasetDir, registerId) {
const outputPath = join(datasetDir, 'edges.json');
writeFileSync(outputPath, JSON.stringify(output, null, 2));
console.log(` Written ${deduped.length} edges to edges.json (${(JSON.stringify(output).length / 1024).toFixed(1)} KB)`);

// Build domain-nodes.json
const domainEdgeMap = new Map();
for (const edge of deduped) {
if (edge.type === 'domain') {
const existing = domainEdgeMap.get(edge.target);
if (existing) {
existing.labels.add(edge.label);
} else {
domainEdgeMap.set(edge.target, { uri: edge.target, labels: new Set([edge.label]), registerId });
}
}
}

const domainNodes = [...domainEdgeMap.values()].map(d => ({
uri: d.uri,
label: [...d.labels][0],
registerId: d.registerId,
conceptCount: domainConceptCount.get(d.uri) || 0,
})).sort((a, b) => b.conceptCount - a.conceptCount);

const domainOutput = { registerId, domainNodes };
const domainPath = join(datasetDir, 'domain-nodes.json');
writeFileSync(domainPath, JSON.stringify(domainOutput, null, 2));
console.log(` Written ${domainNodes.length} domain nodes to domain-nodes.json`);
}

// Main
Expand Down
63 changes: 52 additions & 11 deletions scripts/generate-data.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -49,17 +49,46 @@ function writeJson(filePath, data) {
}

/**
 * Converts one term entry of a localized concept into a JSON-LD designation
 * object.
 *
 * Always emits `@type`, `gl:normativeStatus` (defaulting to 'preferred') and
 * `gl:term`. Every other property is copied only when the corresponding
 * field is present on the term.
 *
 * @param {object} term Term entry; reads `type`, `designation`,
 *   `normative_status` and the optional fields handled below.
 * @returns {object} The JSON-LD designation document.
 */
function termToDesignation(term) {
  const typeMap = {
    expression: 'gl:Expression',
    abbreviation: 'gl:Abbreviation',
    symbol: 'gl:Symbol',
    letter_symbol: 'gl:LetterSymbol',
    'graphical symbol': 'gl:GraphicalSymbol',
  };
  // Own-property check: a plain `typeMap[term.type]` lookup would also find
  // inherited members such as `toString` or `constructor` and put a function
  // into '@type' instead of falling back to the generic designation.
  const isKnownType = Object.prototype.hasOwnProperty.call(typeMap, term.type);
  const doc = {
    '@type': isKnownType ? typeMap[term.type] : 'gl:Designation',
    'gl:normativeStatus': term.normative_status || 'preferred',
    'gl:term': term.designation,
  };
  if (term.gender) doc['gl:gender'] = term.gender;
  if (term.plurality) doc['gl:plurality'] = term.plurality;

  if (term.grammar_info && term.grammar_info.length > 0) {
    doc['gl:grammarInfo'] = term.grammar_info.map(gi => {
      const g = {};
      if (gi.gender) g['gl:gender'] = gi.gender;
      if (gi.number) g['gl:number'] = gi.number;
      // Part-of-speech flags are copied under the same name with a gl: prefix.
      for (const pos of ['noun', 'verb', 'adj', 'adverb', 'preposition', 'participle']) {
        if (gi[pos]) g[`gl:${pos}`] = gi[pos];
      }
      return g;
    });
  }

  // Checked against undefined (not truthiness) so an explicit `false` is kept.
  if (term.international !== undefined) doc['gl:international'] = term.international;
  if (term.absent !== undefined) doc['gl:absent'] = term.absent;
  if (term.geographical_area) doc['gl:geographicalArea'] = term.geographical_area;
  if (term.term_type) doc['gl:termType'] = term.term_type;
  if (term.prefix) doc['gl:prefix'] = term.prefix;
  if (term.usage_info) doc['gl:usageInfo'] = term.usage_info;
  if (term.field_of_application) doc['gl:fieldOfApplication'] = term.field_of_application;

  if (term.acronym !== undefined) doc['gl:acronym'] = term.acronym;
  if (term.initialism !== undefined) doc['gl:initialism'] = term.initialism;
  if (term.truncation !== undefined) doc['gl:truncation'] = term.truncation;

  if (term.text) doc['gl:text'] = term.text;
  if (term.image) doc['gl:image'] = term.image;

  return doc;
}

Expand Down Expand Up @@ -210,6 +239,10 @@ function yamlToJsonLd(conceptYaml, register, refMaps) {
};

if (lc.entry_status) lDoc['gl:entryStatus'] = lc.entry_status;
if (lc.classification) lDoc['gl:classification'] = lc.classification;
if (lc.review_type) lDoc['gl:reviewType'] = lc.review_type;
if (lc.script) lDoc['gl:script'] = lc.script;
if (lc.system) lDoc['gl:system'] = lc.system;
if (lc.terms && lc.terms.length > 0) lDoc['gl:designation'] = lc.terms.map(termToDesignation);
if (lc.definition) lDoc['gl:definition'] = defsToJsonLd(lc.definition);
if (lc.notes && lc.notes.length > 0) lDoc['gl:notes'] = defsToJsonLd(lc.notes);
Expand All @@ -223,6 +256,7 @@ function yamlToJsonLd(conceptYaml, register, refMaps) {
if (lc.review_status) lDoc['gl:reviewStatus'] = lc.review_status;
if (lc.review_decision) lDoc['gl:reviewDecision'] = lc.review_decision;
if (lc.review_decision_notes) lDoc['gl:reviewDecisionNotes'] = lc.review_decision_notes;
if (lc.domain) lDoc['gl:domain'] = lc.domain;
if (lc.dates && lc.dates.length > 0) {
lDoc['gl:dates'] = lc.dates.map(d => ({
'gl:dateType': d.type,
Expand Down Expand Up @@ -378,15 +412,22 @@ function conceptJsonToTbx(concept) {
const status = d['gl:normativeStatus'] || '';
const type = d['@type'] || '';
let gramGrp = '';
if (d['gl:gender']) gramGrp = `\n <grammaticalGender>${escapeXml(d['gl:gender'])}</grammaticalGender>`;
let partOfSpeech = '';
if (type.includes('Abbreviation')) partOfSpeech = '\n <partOfSpeech>abbreviation</partOfSpeech>';
if (type.includes('Symbol')) partOfSpeech = '\n <partOfSpeech>symbol</partOfSpeech>';
if (d['gl:grammarInfo'] && d['gl:grammarInfo'].length > 0) {
const gi = d['gl:grammarInfo'][0];
if (gi['gl:gender']) gramGrp = `\n <grammaticalGender>${escapeXml(gi['gl:gender'])}</grammaticalGender>`;
if (gi['gl:number']) gramGrp += `\n <grammaticalNumber>${escapeXml(gi['gl:number'])}</grammaticalNumber>`;
for (const pos of ['noun', 'verb', 'adj', 'adverb', 'preposition', 'participle']) {
if (gi[`gl:${pos}`]) gramGrp += `\n <partOfSpeech>${pos}</partOfSpeech>`;
}
}
let posBlock = '';
if (type.includes('Abbreviation')) posBlock = '\n <partOfSpeech>abbreviation</partOfSpeech>';
if (type.includes('Symbol')) posBlock = '\n <partOfSpeech>symbol</partOfSpeech>';

termEntries.push(` <termEntry>
<langSet xml:lang="${lang}">
<tig>
<term>${escapeXml(term)}</term>${gramGrp}${partOfSpeech}
<term>${escapeXml(term)}</term>${gramGrp}${posBlock}
</tig>
</langSet>
</termEntry>`);
Expand Down
2 changes: 1 addition & 1 deletion src/__tests__/concept-timeline.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ describe('ConceptTimeline', () => {
'gl:reviewStatus': 'final',
'gl:reviewDecision': 'accepted',
'gl:entryStatus': 'valid',
'gl:release': 3,
'gl:release': '3',
});
const wrapper = mountTimeline({ eng: lc });
expect(wrapper.text()).toContain('Review Details');
Expand Down
93 changes: 93 additions & 0 deletions src/__tests__/dataset-adapter.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,24 @@ describe('DatasetAdapter', () => {
expect(edges[0].label).toBe('functional');
});

// Each localization lists a different reference; every resulting edge should
// carry the language key of the localization it came from.
it('tags reference edges with language', () => {
const concept = {
'@id': 'https://glossarist.org/test/concept/1',
'gl:localizedConcept': {
eng: { 'gl:references': [
{ '@id': 'https://glossarist.org/test/concept/2', 'gl:term': 'other' },
]},
fra: { 'gl:references': [
{ '@id': 'https://glossarist.org/test/concept/3', 'gl:term': 'autre' },
]},
},
};
const edges = adapter.extractEdges(concept as any);
// One edge per localization, looked up by its `lang` tag.
expect(edges.length).toBe(2);
expect(edges.find(e => e.lang === 'eng')?.target).toContain('/concept/2');
expect(edges.find(e => e.lang === 'fra')?.target).toContain('/concept/3');
});

it('skips self-references', () => {
const concept = {
'@id': 'https://glossarist.org/test/concept/102-01-01',
Expand Down Expand Up @@ -318,6 +336,81 @@ describe('DatasetAdapter', () => {
});
});

// Covers adapter.extractDomainEdges: one 'domain' edge per localization that
// has a gl:domain value, and none otherwise.
describe('extractDomainEdges', () => {
it('extracts domain edges from gl:domain field per language', () => {
const concept = {
'@id': 'https://glossarist.org/test/concept/3',
'gl:localizedConcept': {
eng: { 'gl:domain': 'geometry' },
fra: { 'gl:domain': 'géométrie' },
},
};
const edges = adapter.extractDomainEdges(concept as any);
expect(edges.length).toBe(2);
expect(edges.every(e => e.type === 'domain')).toBe(true);
expect(edges.find(e => e.lang === 'eng')?.target).toContain('/domain/geometry');
// The expected slug has the accented letters dropped, not transliterated.
// NOTE(review): presumably the adapter slugifies with an ASCII-only pattern — confirm this is intended.
expect(edges.find(e => e.lang === 'fra')?.target).toContain('/domain/gomtrie');
// The label keeps the original, un-slugified domain text.
expect(edges.find(e => e.lang === 'eng')?.label).toBe('geometry');
expect(edges.find(e => e.lang === 'fra')?.label).toBe('géométrie');
});

it('handles same domain across languages', () => {
const concept = {
'@id': 'https://glossarist.org/test/concept/1',
'gl:localizedConcept': {
eng: { 'gl:domain': 'metadata' },
fra: { 'gl:domain': 'metadata' },
},
};
const edges = adapter.extractDomainEdges(concept as any);
// Edges are not merged at extraction time: two edges, both pointing at the same target.
expect(edges.length).toBe(2);
expect(edges[0].target).toBe(edges[1].target);
expect(edges[0].target).toContain('/domain/metadata');
});

it('skips concepts without gl:domain', () => {
const concept = {
'@id': 'https://glossarist.org/test/concept/1',
'gl:localizedConcept': { eng: {} },
};
const edges = adapter.extractDomainEdges(concept as any);
expect(edges.length).toBe(0);
});

it('handles empty localizedConcept', () => {
const concept = {
'@id': 'https://glossarist.org/test/concept/1',
'gl:localizedConcept': {},
};
const edges = adapter.extractDomainEdges(concept as any);
expect(edges.length).toBe(0);
});
});

// Covers adapter.loadDomainNodes: fetching domain-nodes.json and mapping its
// entries to graph nodes, plus the failure path.
describe('loadDomainNodes', () => {
it('loads domain nodes from domain-nodes.json', async () => {
mockFetch.mockReturnValue(mockJsonResponse({
registerId: 'test',
domainNodes: [
{ uri: 'https://glossarist.org/test/domain/iso-19107', label: 'ISO 19107', registerId: 'test', conceptCount: 147 },
],
}));
const nodes = await adapter.loadDomainNodes();
expect(nodes.length).toBe(1);
// Loaded entries are expected to be marked as already-loaded domain nodes,
// with the file's label exposed as the 'eng' designation.
expect(nodes[0].nodeType).toBe('domain');
expect(nodes[0].status).toBe('domain');
expect(nodes[0].loaded).toBe(true);
expect(nodes[0].designations.eng).toBe('ISO 19107');
// NOTE(review): the 'test' path segment presumably comes from the adapter fixture set up outside this block — confirm.
expect(mockFetch).toHaveBeenCalledWith('/data/test/domain-nodes.json');
});

it('returns empty array on fetch failure', async () => {
// A non-OK response (404 here) should yield an empty list.
mockFetch.mockReturnValue(Promise.resolve({ ok: false, status: 404 } as Response));
const nodes = await adapter.loadDomainNodes();
expect(nodes).toEqual([]);
});
});

describe('getLanguages', () => {
it('returns languages from manifest', async () => {
const manifest = {
Expand Down
Loading
Loading