From 491f51bc07520317f31416a68a9a221ccade03f9 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 5 May 2026 17:38:25 +0200 Subject: [PATCH 01/33] minimal changes for direct from 4CAT mapping --- js/lib.js | 16 +++++++++++++++- modules/_loader.js | 6 +++++- popup/interface.js | 2 +- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/js/lib.js b/js/lib.js index 6199d01..1579195 100644 --- a/js/lib.js +++ b/js/lib.js @@ -57,4 +57,18 @@ class MissingMappedField { toString() { return `${this.value}`; } -} \ No newline at end of file +} + +/** + * Wrap a Zeeschuimer stored item to match the shape a 4CAT map_item expects. + * + * 4CAT's importer constructs: + * { ...item.data, __import_meta: { ...everything in item except data } } + * + * Mirroring that here means map_item functions auto-generated from 4CAT + * data sources can run against Zeeschuimer-stored items without translation. + */ +function wrap_for_map_item(stored_item) { + const { data, ...meta } = stored_item; + return { ...data, __import_meta: meta }; +} diff --git a/modules/_loader.js b/modules/_loader.js index 47697ca..afae2d7 100644 --- a/modules/_loader.js +++ b/modules/_loader.js @@ -17,11 +17,15 @@ async function load() { ]; for(const module of imported_modules) { + const mapper = module.map_item + ? (stored_item) => module.map_item(wrap_for_map_item(stored_item)) + : null; + zeeschuimer.register_module( module.MODULE_NAME, module.DOMAIN, module.capture, - module.map_item, + mapper, module.MODULE_ID ? module.MODULE_ID : module.MODULE_DOMAIN, module.overwrite_partial, module.TOOLTIP ? 
module.TOOLTIP : null, diff --git a/popup/interface.js b/popup/interface.js index 5cc7864..1ae60a2 100644 --- a/popup/interface.js +++ b/popup/interface.js @@ -619,7 +619,7 @@ async function get_csv_blob(platform) { let csv = []; const module = background.zeeschuimer.modules[platform]; await iterate_items(platform, function(item) { - item = module.mapper(item.data); + item = module.mapper(item); if(csv.length === 0) { csv.push(Object.keys(item).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n"); } From b06805f711a97fad6e9e3f6615db3a0cf936205e Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 6 May 2026 10:54:13 +0200 Subject: [PATCH 02/33] give me some standard helper functions --- js/lib.js | 54 +++++++++++++++++++++ modules/tiktok.js | 119 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 172 insertions(+), 1 deletion(-) diff --git a/js/lib.js b/js/lib.js index 1579195..3b144d2 100644 --- a/js/lib.js +++ b/js/lib.js @@ -72,3 +72,57 @@ function wrap_for_map_item(stored_item) { const { data, ...meta } = stored_item; return { ...data, __import_meta: meta }; } + +/** + * Ports of 4CAT functions commonly used by `map_item` below + */ + +/** + * Strip HTML tags from a string. + * @param {string} html + * @param {boolean} convertNewlines Convert
and

tags to \n before stripping. + * @returns {string} + */ +function strip_tags(html, convertNewlines = true) { + if (!html) return ""; + if (convertNewlines) { + html = html.replace(//gi, "\n").replace(/<\/p>/gi, "

\n"); + html = html.replace(/\n+/g, "\n"); + } + const doc = new DOMParser().parseFromString(html, "text/html"); + return doc.body.textContent || ""; +} + +/** + * Normalize URL encoding for display and linking. + * Decodes percent-encoded URLs and re-encodes the query string canonically. + * Returns the original URL on parse failure. + * @param {string} url + * @returns {string} + */ +function normalize_url_encoding(url) { + if (!url) return ""; + try { + // Iterative decode handles double-encoded inputs. + let decoded = url; + let prev; + do { + prev = decoded; + try { + decoded = decodeURIComponent(prev); + } catch { + decoded = prev; + break; + } + } while (decoded !== prev); + const parsed = new URL(decoded); + // URL.toString() re-encodes the query/fragment correctly. + return parsed.toString(); + } catch { + return url; + } +} + +function formatUtcTimestamp(unixSeconds) { + return new Date(unixSeconds * 1000).toISOString().replace('T', ' ').slice(0, 19); +} \ No newline at end of file diff --git a/modules/tiktok.js b/modules/tiktok.js index 55e6fbf..ea52532 100644 --- a/modules/tiktok.js +++ b/modules/tiktok.js @@ -1,3 +1,4 @@ + export const MODULE_NAME = 'TikTok (posts)'; export const DOMAIN = 'tiktok.com'; @@ -103,4 +104,120 @@ export function capture(response, source_platform_url, source_url) { } else { return []; } -} \ No newline at end of file +} + +// === auto-generated by 4cat map_item sync — DO NOT EDIT BY HAND === +// (regenerated from datasources/tiktok/search_tiktok.py) +export function map_item(post) { + // Zeeschuimer metadata + const metadata = post.__import_meta || {}; + + const challenges = Array.isArray(post.challenges) + ? post.challenges.map(ch => ch.title).filter(Boolean) + : []; + + const hashtags = Array.isArray(post.textExtra) + ? post.textExtra + .filter(e => e.hasOwnProperty('hashtagName') && e.hashtagName) + .map(e => e.hashtagName) + : []; + + const diversificationLabels = Array.isArray(post.diversificationLabels) + ? 
post.diversificationLabels.join(',') + : ''; + + let user_nickname = ''; + let user_fullname = ''; + let user_thumbnail = ''; + + if (post.author && typeof post.author === 'object') { + user_nickname = post.author.uniqueId || ''; + user_fullname = post.author.nickname || ''; + user_thumbnail = post.author.avatarThumb || ''; + } else if (post.author) { + user_nickname = post.author || ''; + user_fullname = post.nickname || ''; + user_thumbnail = ''; + } + + const thumbnailOptions = []; + + if (post.video && Array.isArray(post.video.shareCover)) { + thumbnailOptions.push(...post.video.shareCover); + } + + if (post.video && post.video.cover) { + thumbnailOptions.push(post.video.cover); + } + + const now = Math.floor(Date.now() / 1000); + + const validThumbnails = thumbnailOptions.filter(url => { + try { + const parsedUrl = new URL(url); + const expires = parseInt(parsedUrl.searchParams.get('x-expires'), 10) || 0; + return expires >= now; + } catch (e) { + return false; + } + }); + + const thumbnail_url = validThumbnails.length ? validThumbnails[validThumbnails.length - 1] : ''; + + return new MappedItem({ + collected_from_url: metadata.source_platform_url + ? normalize_url_encoding(metadata.source_platform_url) + : '', + id: post.id || '', + thread_id: post.id || '', + author: user_nickname, + author_full: user_fullname, + author_followers: post.authorStats?.followerCount ?? '', + author_likes: post.authorStats?.diggCount ?? '', + author_videos: post.authorStats?.videoCount ?? '', + author_avatar: user_thumbnail, + body: post.desc || '', + stickers: Array.isArray(post.stickersOnItem) + ? post.stickersOnItem + .map(s => (Array.isArray(s.stickerText) ? s.stickerText.join(' ') : '')) + .filter(Boolean) + .join('') + : '', + timestamp: post.createTime + ? formatUtcTimestamp(parseInt(post.createTime, 10)) + : '', + unix_timestamp: post.createTime ? 
parseInt(post.createTime, 10) : 0, + is_duet: + post.duetInfo && post.duetInfo.duetFromId && post.duetInfo.duetFromId !== '0' + ? 'yes' + : 'no', + is_ad: post.isAd ? 'yes' : 'no', + is_paid_partnership: post.adAuthorization ? 'yes' : 'no', + is_sensitive: post.maskType === 3 ? 'yes' : 'no', + is_photosensitive: post.maskType === 4 ? 'yes' : 'no', + music_name: post.music?.title ?? '', + music_id: post.music?.id ?? '', + music_url: post.music?.playUrl ?? '', + music_thumbnail: post.music?.coverLarge ?? '', + music_author: post.music?.authorName ?? '', + video_url: post.video?.downloadAddr ?? '', + tiktok_url: `https://www.tiktok.com/@${user_nickname}/video/${post.id}`, + thumbnail_url: thumbnail_url, + likes: post.stats?.diggCount ?? '', + comments: post.stats?.commentCount ?? '', + shares: post.stats?.shareCount ?? '', + plays: post.stats?.playCount ?? '', + hashtags: hashtags.join(','), + challenges: challenges.join(','), + diversification_labels: diversificationLabels, + location_created: post.locationCreated ?? '', + effects: Array.isArray(post.effectStickers) + ? post.effectStickers.map(e => e.name).join(',') + : '', + warning: Array.isArray(post.warnInfo) + ? 
post.warnInfo.map(w => w.text).join(',') + : '', + }); +} +// === end auto-generated === +// === end auto-generated === From f9a2405a0703bcadfdee7492ccd57af12917733e Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 6 May 2026 13:07:43 +0200 Subject: [PATCH 03/33] fix csv export --- popup/interface.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/popup/interface.js b/popup/interface.js index 1ae60a2..8afd1b1 100644 --- a/popup/interface.js +++ b/popup/interface.js @@ -595,7 +595,7 @@ const CSV_ESCAPED = `"${CSV_SEPARATOR}\n`; function csv_escape(value) { value = String(value); let needs_escape = false; - for(const character in CSV_ESCAPED) { + for(const character of CSV_ESCAPED) { if(value.indexOf(character) >= 0) { needs_escape = true; } From 2f084b9352c25a1034429bb05d8390b5961d35ef Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 6 May 2026 15:19:18 +0200 Subject: [PATCH 04/33] another to CSV fix --- popup/interface.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/popup/interface.js b/popup/interface.js index 8afd1b1..94fff77 100644 --- a/popup/interface.js +++ b/popup/interface.js @@ -626,7 +626,7 @@ async function get_csv_blob(platform) { csv.push(Object.values(item).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n"); }) - return new Blob([csv], {type: 'text/csv'}); + return new Blob(csv, {type: 'text/csv'}); } /** From d7870426c7765a6107c47c4fff062f5643725167 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 6 May 2026 15:25:42 +0200 Subject: [PATCH 05/33] revert tiktok (mistaken test result commited) --- modules/tiktok.js | 119 +--------------------------------------------- 1 file changed, 1 insertion(+), 118 deletions(-) diff --git a/modules/tiktok.js b/modules/tiktok.js index ea52532..55e6fbf 100644 --- a/modules/tiktok.js +++ b/modules/tiktok.js @@ -1,4 +1,3 @@ - export const MODULE_NAME = 'TikTok (posts)'; export const DOMAIN = 'tiktok.com'; @@ -104,120 +103,4 @@ export function capture(response, 
source_platform_url, source_url) { } else { return []; } -} - -// === auto-generated by 4cat map_item sync — DO NOT EDIT BY HAND === -// (regenerated from datasources/tiktok/search_tiktok.py) -export function map_item(post) { - // Zeeschuimer metadata - const metadata = post.__import_meta || {}; - - const challenges = Array.isArray(post.challenges) - ? post.challenges.map(ch => ch.title).filter(Boolean) - : []; - - const hashtags = Array.isArray(post.textExtra) - ? post.textExtra - .filter(e => e.hasOwnProperty('hashtagName') && e.hashtagName) - .map(e => e.hashtagName) - : []; - - const diversificationLabels = Array.isArray(post.diversificationLabels) - ? post.diversificationLabels.join(',') - : ''; - - let user_nickname = ''; - let user_fullname = ''; - let user_thumbnail = ''; - - if (post.author && typeof post.author === 'object') { - user_nickname = post.author.uniqueId || ''; - user_fullname = post.author.nickname || ''; - user_thumbnail = post.author.avatarThumb || ''; - } else if (post.author) { - user_nickname = post.author || ''; - user_fullname = post.nickname || ''; - user_thumbnail = ''; - } - - const thumbnailOptions = []; - - if (post.video && Array.isArray(post.video.shareCover)) { - thumbnailOptions.push(...post.video.shareCover); - } - - if (post.video && post.video.cover) { - thumbnailOptions.push(post.video.cover); - } - - const now = Math.floor(Date.now() / 1000); - - const validThumbnails = thumbnailOptions.filter(url => { - try { - const parsedUrl = new URL(url); - const expires = parseInt(parsedUrl.searchParams.get('x-expires'), 10) || 0; - return expires >= now; - } catch (e) { - return false; - } - }); - - const thumbnail_url = validThumbnails.length ? validThumbnails[validThumbnails.length - 1] : ''; - - return new MappedItem({ - collected_from_url: metadata.source_platform_url - ? 
normalize_url_encoding(metadata.source_platform_url) - : '', - id: post.id || '', - thread_id: post.id || '', - author: user_nickname, - author_full: user_fullname, - author_followers: post.authorStats?.followerCount ?? '', - author_likes: post.authorStats?.diggCount ?? '', - author_videos: post.authorStats?.videoCount ?? '', - author_avatar: user_thumbnail, - body: post.desc || '', - stickers: Array.isArray(post.stickersOnItem) - ? post.stickersOnItem - .map(s => (Array.isArray(s.stickerText) ? s.stickerText.join(' ') : '')) - .filter(Boolean) - .join('') - : '', - timestamp: post.createTime - ? formatUtcTimestamp(parseInt(post.createTime, 10)) - : '', - unix_timestamp: post.createTime ? parseInt(post.createTime, 10) : 0, - is_duet: - post.duetInfo && post.duetInfo.duetFromId && post.duetInfo.duetFromId !== '0' - ? 'yes' - : 'no', - is_ad: post.isAd ? 'yes' : 'no', - is_paid_partnership: post.adAuthorization ? 'yes' : 'no', - is_sensitive: post.maskType === 3 ? 'yes' : 'no', - is_photosensitive: post.maskType === 4 ? 'yes' : 'no', - music_name: post.music?.title ?? '', - music_id: post.music?.id ?? '', - music_url: post.music?.playUrl ?? '', - music_thumbnail: post.music?.coverLarge ?? '', - music_author: post.music?.authorName ?? '', - video_url: post.video?.downloadAddr ?? '', - tiktok_url: `https://www.tiktok.com/@${user_nickname}/video/${post.id}`, - thumbnail_url: thumbnail_url, - likes: post.stats?.diggCount ?? '', - comments: post.stats?.commentCount ?? '', - shares: post.stats?.shareCount ?? '', - plays: post.stats?.playCount ?? '', - hashtags: hashtags.join(','), - challenges: challenges.join(','), - diversification_labels: diversificationLabels, - location_created: post.locationCreated ?? '', - effects: Array.isArray(post.effectStickers) - ? post.effectStickers.map(e => e.name).join(',') - : '', - warning: Array.isArray(post.warnInfo) - ? 
post.warnInfo.map(w => w.text).join(',') - : '', - }); -} -// === end auto-generated === -// === end auto-generated === +} \ No newline at end of file From a9fba9a9caee86d8799ee35d11374fbb602c9a41 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 6 May 2026 15:57:45 +0200 Subject: [PATCH 06/33] clean up UI (make download menu button) --- popup/interface.html | 32 +++++++++++++++++++++- popup/interface.js | 63 +++++++++++++++++++++++++++++++++----------- 2 files changed, 78 insertions(+), 17 deletions(-) diff --git a/popup/interface.html b/popup/interface.html index 356f2b5..e9d9b3f 100644 --- a/popup/interface.html +++ b/popup/interface.html @@ -215,10 +215,39 @@ text-indent: 2em; } - td > button:not(:last-child) { + td > button:not(:last-child), + td > .download-menu:not(:last-child) { margin-right: 0.25em; } + /* download chooser: trigger is a regular button (inherits all button + styles); */ + .download-menu { + display: inline-block; + position: relative; + } + + /* :not([hidden]) so the explicit display:flex doesn't override the + [hidden] attribute's default display:none */ + .download-menu > .download-options:not([hidden]) { + position: absolute; + top: calc(100% + 0.25em); + left: 0; + display: flex; + flex-direction: column; + gap: 0.25em; + padding: 0.25em; + background: var(--neutral-contrast-alt); + border: 2px solid var(--neutral-contrast); + border-radius: 0.5em; + z-index: 10; + white-space: nowrap; + } + + .download-menu > .download-options > button { + margin: 0; + } + input:not([type=checkbox]):not([type=radio]), button { background: var(--neutral-contrast-alt); color: var(--accent); @@ -302,6 +331,7 @@ .toggle-switch input { -moz-appearance: none; + appearance: none; opacity: 0; } diff --git a/popup/interface.js b/popup/interface.js index 94fff77..3b8aaa9 100644 --- a/popup/interface.js +++ b/popup/interface.js @@ -119,7 +119,7 @@ async function set_4cat_url(e) { function activate_buttons() { document.querySelectorAll("td 
button").forEach(button => { let current = button.disabled; - let items = parseInt(button.parentNode.parentNode.querySelector('.num-items').innerText); + let items = parseInt(button.closest('tr').querySelector('.num-items').innerText); let new_status = current; if(button.classList.contains('upload-to-4cat') && !is_uploading) { @@ -132,7 +132,7 @@ function activate_buttons() { button.setAttribute('title', ''); } - } else if(button.classList.contains('download-ndjson') || button.classList.contains('reset') || button.classList.contains('download-csv')) { + } else if(button.classList.contains('download-format') || button.classList.contains('download-menu-trigger') || button.classList.contains('reset')) { new_status = !(items > 0); } @@ -234,21 +234,32 @@ async function get_stats() { let actions = createElement("td"); const clear_button = createElement("button", {"data-platform": platform, "class": "reset"}, "Delete"); - const csv_button = createElement("button", {"data-platform": platform, 'class': 'download-csv'}, '.csv'); - const download_button = createElement("button", { - "data-platform": platform, - "class": "download-ndjson" - }, ".ndjson"); + + // Render the download chooser as a button + popover panel, + // (even when only NDJSON is available as visual consistent) + const download_widget = createElement("span", {"class": "download-menu"}); + const trigger = createElement("button", { + "data-platform": platform, "class": "download-menu-trigger" + }, "Download"); + const options = createElement("div", {"class": "download-options", "hidden": ""}); + options.appendChild(createElement("button", { + "data-platform": platform, "data-format": "ndjson", "class": "download-format" + }, ".ndjson (original)")); + if(module.mapper) { + options.appendChild(createElement("button", { + "data-platform": platform, "data-format": "csv", "class": "download-format" + }, ".csv")); + } + download_widget.appendChild(trigger); + download_widget.appendChild(options); + const 
fourcat_button = createElement("button", { "data-platform": platform, "class": "upload-to-4cat", }, "to 4CAT"); actions.appendChild(clear_button); - if(module.mapper) { - actions.appendChild(csv_button); - } - actions.appendChild(download_button); + actions.appendChild(download_widget); actions.appendChild(fourcat_button); row.appendChild(actions); @@ -317,22 +328,38 @@ async function get_stats() { async function button_handler(event) { let status = document.getElementById('upload-status'); - if (event.target.matches('.reset')) { + // Close any open download-format popovers when clicking outside their host. + // Skip if the click is on a trigger or inside an options panel + if(!event.target.matches('.download-menu-trigger') && !event.target.closest('.download-options')) { + document.querySelectorAll('.download-options:not([hidden])').forEach(el => el.hidden = true); + } + + if (event.target.matches('.download-menu-trigger')) { + const widget = event.target.closest('.download-menu'); + const options = widget.querySelector('.download-options'); + const opening = options.hidden; + // close any other menus before opening this one + document.querySelectorAll('.download-options:not([hidden])').forEach(el => { + if(el !== options) el.hidden = true; + }); + options.hidden = !opening; + + } else if (event.target.matches('.reset')) { let platform = event.target.getAttribute('data-platform'); await background.db.items.where("source_platform").equals(platform).delete(); } else if (event.target.matches('.reset-all')) { await background.db.items.clear(); - } else if (event.target.matches('.download-ndjson') || event.target.matches('.download-csv')) { - const blobber = event.target.matches('.download-ndjson') ? get_ndjson_blob : get_csv_blob; - const extension = event.target.matches('.download-ndjson') ? 'ndjson' : 'csv'; + } else if (event.target.matches('.download-format')) { + const format = event.target.getAttribute('data-format'); + const blobber = format === 'csv' ? 
get_csv_blob : get_ndjson_blob; + const extension = format; let platform = event.target.getAttribute('data-platform'); let date = new Date(); event.target.classList.add('loading'); - //let blob = await download_blob(platform, 'zeeschuimer-export-' + platform + '-' + date.toISOString().split(".")[0].replace(/:/g, "") + '.ndjson'); let blob = await blobber(platform); let filename = 'zeeschuimer-export-' + platform + '-' + date.toISOString().split(".")[0].replace(/:/g, "") + '.' + extension; const downloadUrl = window.URL.createObjectURL(blob); @@ -345,6 +372,10 @@ async function button_handler(event) { event.target.classList.remove('loading'); + // collapse the popover menu after the download fires + const widget = event.target.closest('.download-menu'); + if(widget) widget.querySelector('.download-options').hidden = true; + } else if (event.target.matches('.upload-to-4cat')) { let platform = event.target.getAttribute('data-platform'); status.innerText = 'Creating data file for uploading...'; From 0980a56f0ba6872884bfc1e891efc2cb9f4e4c33 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 6 May 2026 16:13:52 +0200 Subject: [PATCH 07/33] testing is hard in JS --- docs/test-plan.md | 162 ++++++++++++++++++++++ modules/package.json | 3 + tests/__pycache__/test.cpython-39.pyc | Bin 0 -> 7345 bytes tests/duplicate-behavior.test.js | 3 +- tests/{jest.config.js => jest.config.cjs} | 3 +- tests/map_item.test.js | 130 +++++++++++++++++ tests/package.json | 5 +- tests/setup-globals.cjs | 41 ++++++ 8 files changed, 343 insertions(+), 4 deletions(-) create mode 100644 docs/test-plan.md create mode 100644 modules/package.json create mode 100644 tests/__pycache__/test.cpython-39.pyc rename tests/{jest.config.js => jest.config.cjs} (64%) create mode 100644 tests/map_item.test.js create mode 100644 tests/setup-globals.cjs diff --git a/docs/test-plan.md b/docs/test-plan.md new file mode 100644 index 0000000..249a7e0 --- /dev/null +++ b/docs/test-plan.md @@ -0,0 +1,162 @@ +# Selenium 
Test Harness — Improvement Plan + +Date: 2026-04-30 + +Overview + +This document captures an actionable plan to improve the Selenium-based integration tests in `tests/test.py` for the Zeeschuimer Firefox extension. The goals are to: + +- Make profile handling reliable and reusable (so logged-in sessions persist across runs). +- Preserve and export captured data per platform for offline analysis and for passing to 4CAT. +- Add optional automated upload to a 4CAT instance for mapping/validation tests. +- Reduce fragility caused by popups and interactive dialogs (pausing/dismissal patterns). +- Improve robustness, error handling, and machine-readable results. + +Scope + +All changes are confined to the test harness and test metadata (`tests/test.py` and `tests/tests.json`) and to this planning document. No changes are required in the extension source for the planned items (the test harness will interact with the extension's UI pages and background DB). + +Phases & Changes + +Phase 1 — Profile management + +- Problem: copying an entire profile can race with a running Firefox and the current ignore rule hides potentially useful session data. +- Changes: + - Detect if the selected profile directory appears locked (presence of `lock` or `.parentlock`) and warn if Firefox is running. + - Replace the naive ignore lambda used in `shutil.copytree` with a function that only excludes `storage`, `extensions`, and `signedInUser.json` at the profile root. + - Add CLI flags: `--profile-name NAME` (choose profile by display name from `profiles.ini`), `--save-profile PATH` (save the temp profile for reuse), and `--no-cleanup` (do not remove `.temp-profile` after run). 
+ +Implementation note (copytree ignore example): + +```python +def _profile_ignore(root, names): + # Only ignore these entries in the root profile dir + if os.path.abspath(root) == os.path.abspath(profile_dir): + return {"storage", "extensions", "signedInUser.json"} + return set() + +shutil.copytree(profile_dir, profile_file, ignore=_profile_ignore) +``` + +Phase 2 — Data preservation & export + +- Problem: `reset-all` wipes the DB before each URL; no artifacts are kept for post-mortem or mapping tests. +- Decision: export a single combined NDJSON file per platform containing items collected while testing that platform. +- Changes: + - Add CLI `--export-dir PATH` (default `./zeeschuimer-exports/{timestamp}/`). + - Before clicking `reset-all` for each URL, read the current DB contents from the extension background page (Dexie) via `execute_async_script` and append those items to a per-platform in-memory list in Python. After all URLs for a platform are done, write `{export-dir}/{platform}.ndjson`. + - Optionally add `--no-reset` to skip the `reset-all` call entirely (default behavior remains to reset before each URL). + +Execute_async_script pattern (example): + +```python +script = ''' +const cb = arguments[0]; +background.db.items.toArray().then(items => cb(JSON.stringify(items))).catch(e => cb(JSON.stringify({error: String(e)}))); +''' +items_json = driver.execute_async_script(script) +items = json.loads(items_json) +``` + +Phase 3 — 4CAT integration (optional) + +- Problem: mapping tests live in 4CAT and need NDJSON input. +- Changes: + - Add CLI flags: `--4cat-url URL` and `--4cat-key KEY` (API key). Require both for upload. + - After writing the per-platform NDJSON, POST it to `{4cat_url.rstrip('/')}/api/import-dataset/` with header `X-Zeeschuimer-Platform: {platform}` and `Authorization: Bearer {key}` (confirm header with your 4CAT instance; alternative is to trigger the extension UI upload button when cookie-based auth is required). 
+ - Do not fail the test run on 4CAT errors — print status and continue. + +Example upload with `requests`: + +```python +import requests +with open(ndjson_path, 'rb') as f: + headers = { + 'X-Zeeschuimer-Platform': platform, + 'Authorization': f'Bearer {fourcat_key}' + } + r = requests.post(f"{fourcat_url.rstrip('/')}/api/import-dataset/", headers=headers, data=f) + # check r.status_code and r.text for details +``` + +Phase 4 — Interactive controls & popup dismissals + +- Problem: cookie banners, paywall prompts, and other popups frequently interfere with automated navigation and can cause false failures. +- Decision: pause by default **once per platform** (not before every URL) so the tester can clear residual prompts; provide opt-out and finer-grained options. +- Changes: + - CLI flags: `--no-interactive` (disable all pauses), `--pause-before-url` (pause before each URL), `--pause-on-fail` (pause on failure), `--extra-wait N` (add N seconds to every wait), `--screenshot-dir PATH` (capture screenshots on fail/warning). + - Add a `dismiss-selectors` optional field in `tests.json` per URL: a list of CSS selectors to click to dismiss known popups. Example: + +```json +"dismiss-selectors": ["button.cookie-accept", ".modal .close"] +``` + + - Add per-URL `timeout` (page load timeout override). + +Phase 5 — Runner robustness & reporting + +- Problem: unhandled exceptions abort the run; final runtime is calculated incorrectly; no machine-readable results. +- Changes: + - Wrap each URL test body in try/except, increment `failed` on exceptions, and continue. + - Move the global `start_time = time.time()` to before the outer platform loop so the final elapsed time is for the full run. + - Add CLI flags: `--results-file PATH` (write JSON summary), `--resume-from PLATFORM` (skip earlier platforms), and `--screenshot-dir PATH` (as noted). + - Fix small test metadata issues (e.g., `more-after-scrolll` typo in `tests.json`). 
+ +tests.json schema additions + +- Per-URL optional fields: + - `dismiss-selectors`: array of CSS selectors to click after page load + - `timeout`: numeric page load timeout seconds for this URL + - `extra-wait`: per-URL additional wait seconds + +CLI flags (summary) + +- `--profiledir PATH` — explicit profile path (existing) +- `--profile-name NAME` — choose Firefox profile by display name +- `--save-profile PATH` — persist the copied profile for reuse +- `--no-cleanup` — keep `.temp-profile` +- `--export-dir PATH` — where to write NDJSON exports +- `--no-reset` — do not click `reset-all` between URLs +- `--4cat-url URL` — base URL for 4CAT server +- `--4cat-key KEY` — API key for 4CAT uploads +- `--4cat-per-url` — upload per URL instead of per platform (optional) +- `--no-interactive` — disable pausing (default is to pause per-platform) +- `--pause-before-url` — pause before each URL +- `--pause-on-fail` — pause when a test fails +- `--extra-wait N` — add N seconds to every URL wait +- `--screenshot-dir PATH` — save screenshots on fail/warning +- `--results-file PATH` — write machine-readable results JSON +- `--resume-from PLATFORM` — resume a run from a platform + +Verification checklist + +1. `python tests/test.py --sources instagram.com --export-dir ./exports` -> `exports/instagram.com.ndjson` exists and contains NDJSON with captured items. +2. `python tests/test.py --save-profile .saved-profile --login` -> create a saved profile that can be reused with `--profiledir .saved-profile`. +3. Run with default interactive behavior and confirm one pause per platform. +4. `python tests/test.py --results-file results.json` -> JSON summary produced with per-URL status and counts. +5. Test 4CAT upload using a local mock server and `--4cat-url http://localhost:8000 --4cat-key KEY`. + +Implementation steps (recommended order) + +1. Docs and small fixes (this document + tests.json typo fix). +2. 
Profile management changes (`--profile-name`, improved copy ignore, `--save-profile`, lock detection). +3. Export behavior: `--export-dir` + `execute_async_script` collection and NDJSON write. +4. Runner robustness: try/except around URL loop, `--results-file`, fix `start_time` placement. +5. Interactive and dismissal features (`dismiss-selectors`, pause flags, screenshots). +6. 4CAT upload integration (optional, requires confirmation of auth header). + +Estimated effort: 6–10 hours of focused work to implement and test everything end-to-end; can be split into 3-4 incremental PRs. + +Open questions / confirmations needed + +- Confirm 4CAT API key header format (currently suggested: `Authorization: Bearer {key}`). If your 4CAT requires cookie-based auth, we should emulate the extension upload button via Selenium instead. +- Confirm desired default for interactive mode. (Current recommendation: pause once per platform by default; provide `--no-interactive` to run fully headless.) + +Next steps + +- I have created a matching TODO list in the session tracker and written this document to `docs/test-plan.md`. +- If you want, I can start implementing Phase 1 (profile management) in `tests/test.py` now and submit incremental changes. 
+ +--- + +Requested file: `docs/test-plan.md` diff --git a/modules/package.json b/modules/package.json new file mode 100644 index 0000000..3dbc1ca --- /dev/null +++ b/modules/package.json @@ -0,0 +1,3 @@ +{ + "type": "module" +} diff --git a/tests/__pycache__/test.cpython-39.pyc b/tests/__pycache__/test.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..745e2b4aaad921a459372bb50b39980c50a68136 GIT binary patch literal 7345 zcmai3-E$k)b>CeqKnNl!lATnGrZeqKUpk()>F?ZK zfYgUo?Cjn9aqc#`iup9_Rb`1mDjmADRio3;Y0oamS9A_{-Zn zI^O|iiy!3Q;nuH=&MRVKL*uV(8SId->aT93{IL20jve)WFJ2Ur;?Rb^p|fd`7tSsMTX) znjhuUpKCnNk3CEQ%k)KU>2-EojCW>4x>FFxxAb~(OJ^nQIW9__6MSY%2S-@>LVIKq zmQDT6@3C5>zBh(m30UiHNgf0MuUxrUvyol_h;E^3h5X>j#U=M10ioE2|%-rdr+^exB}bWd*@ z@=L6qQfsH_)bV+|EM`w9Cpn|O*$s`I6MH-7#e3r91~5Q^$2%AJSw6940N3%(#Vv3k zj*D~U-e*Pih zy4?9dyvg6g{P&4lHFJeuNP1WK#iVzQ&#B%NzodA)+be-L|A4>0 zWz=u5dH#Wzhi?br&a$LC;2bI__ahHBlP7u zTfn=BcS)EX6T1C`e+ZqHx3v1Fd~Q-3*O(1?FK=lVwaxU>_@cq~u}TjXr@z(V;v=%6 zgwGH7^?vUMVqZt_*ZGYtW534d_s1|(!d5(--ZbT}#XfQRuC97hVjs?QGuGXvd8a5U z30-sPFF|*ORYkQw^ApU}m?Pp{x5Nte-V!zFKCQ4ni?bgp&hla(v~+u1;~(wUSiOIS z!|@p+OG&h*V4(rwKp!7NM+VK@*1R`OP3)Gy*k-{)9n^1&QO@`RU##kU>0t^p zpTvImvE~^kG=-;3cD9+8{}P=B71F62&%>Gx))0-3hnSL zN45rQTUV#9ku+Fav^NLjcppom2esGfX@S>OkMKQrZd+UY@ML0>PuS@5xm!sc9`}3p!khawwR?dd#&^N{w-`$T|E1{d{8H__vu$ZyiU9ltk5UPSC52(h`@24E z6I>rkFi?+ZKaC-L3PWwjS+cU;$EO2dUEwoS?@4^ECpfVbMaJ*;PrH^tEGk|Xz288l z>rY58EC28q#XtPNGW)a-Z*K0AxoAtIKgC}x)2I)b@Cm=XNAU&m>34X4qvF)% z&V7|NE_WWN?{-e?#kwi+nOG2uVyOxbP(F4q$+=VfejH83CzBd~0Pp)OsUfELkD#R= ziv{I}#s7=PzEJD>iuUYpN%&<@N5sht1IOy@0y&Lf&;M0$t<1 zHrkw6uZZpCitqYTa4hUv6U$sWcZI|t%k5BvAcB4-Xa}Ka#Cw~z$6JAdF|XTU4YOv| z^_Qc;LZK=B6~`6akf++jadli&!CGwR(x=f)J-Ehq5K|DM=-@+=Hf@ zu`48y2zWx0WOS%U<%9VTuUW*kRq?&hc09+cS}{yceTww}8+R27f!~r95kzm!#~mU| z5Hr}>XxN27G;Il~@l)1v+e%J1qd^Li`JS-EI}l6-EOYzp zJ3#TDcr`hR<|eE%wJO@(0V!brfrwHCYdRX8`nWLFC`@r{>hkQ=)!C`}$jDpKI~Quc zACR3i{2;WXs0fdw0bArsLer2n797tCYr=9J^zCK;uEo 
z)QUk8NPg6(SE(e6P8C(?U+YkZ0d=qe@+`Y5D1L;(3n;uqqk&WP1b^QnMJPv6x`8zH z^{(DlgSgobgi6%csq+FF3M|+ScYVwDnQ_ zjbPTg_aHK6tY|m}fmmhBo3)~`8!b=82)J|{CyLSN6}#nCYNUq5lcT9XR9eyr+eL~k z=oN|0N?7)#gE&@h`tYGPxE_GyC_Z-w{5U8_MzNUJ*{;40BLp>JyI~Ci{+goM*Xz`z z(L@L5ya32Sct={WoB(>aQRhWdu)9_`%f)i53bNH#OE?9&QYoxCK_Q6=)nEP<{r>UY z=whi9Ai^)43*q15%%L4v4=t5S#bT+%!-WtgFr7bh=zT8~(z9Lb)~)w1_Sbndf|yh2 zo5^gc6zwZD{bs9Ka%jhjT@l4v*l<}Yw3nlyoqD0Q_<1q z5P3BU3c|COkrNA1YH_g`z40wHUU@bVkN^!U8nk9-tzgw@Hlf7Gq)~M8*@UN$F1Nza z_lgjs2n#S%kW6?!^^I;tDeUf=YsjN5vY~~q{^qw!EYlX0Pno%lJWXC@hoKCjBUTeW z51IO_5WZK1GQye-@xz2*SR`z+v}mMaHz7t!g0YNS*MOSXT=?pjoqzw`&z7tUF$5B0 z2rUSXh-5uS)q(Hcg{n!(3GQwdfl+OZvdbb|6T-8iw5kQZ(F;*#;gb`Ecb1|fQiLt( zS>Gm|S1G3ig=oK3^jeLA13wEOA;ez&&Xa=3CureOP>8i+J8lb**0f?hQTk(e^Z6fj2g+p_ zYq`v_wydIt34U@zl41wj+nX(K-zq3|$%eXDNRm9u_JHI7hLUQF zZnhV>!SMIP4HA6vf=ZxszUj7l81M<_N`s)AZX$z*T@%BQ?e2l2)?(RY5OhrELE;@g_PIL)_tTH9E+jERIzQFF{%()Mm9qD zI?Gm2&{3~z`c2`nw91Ms<9GztH7VSp7>lnUD1K0jhisT6OhL7*pt~8WVNf}nM73+8 zvJK=>CB~#axgJZQn*xKhJ1jS0KSBzxBG_1~Nw>cpQvut(<2ELb5UPO$75?#{E{$x+ z@gN4*Eki55$ABHD_UHCEaPat|QKoE# z!@5E=+29I-aTy1u%+AsuT|a;GGF$|qo0+Ya!`Yr4^?$h~UA8Mu^5xKHIZv#W<7m!? 
z<8K)Wjk_XX1HDdK-6p$fRTQvH95Gmi+*$B$wihkvwrR^yG29)6twH({7ZBSm2Tp{F z4nELocmXqN638ggkZ+(GMGVCyEhbNQ4mbnkp6#vS9MTO~ig<_fgdxQyG67B}9=RDO z-?`pBo(*1@pD)kPU75RZ^TQi#Pm(YDA{umpGIwApb{ob!am}A^Rcdpw{Q5{B!FH4E z3oao`q~IYBWhwkR&N^VkUO_DbTTYR;q=MEm+l35Tjt@3MU6mI|k4TzqD3tA_5B|Gp zS#Jf{*9g$BxVYAI%{wiq6_^@3s&=~ENxClJvexi<%N1wowivi*&2(1pKgOW`jK|38 zBY4JCn;kH8<9{=$%>U3%n#~&7)Sy00SPW`0gZH43GsdtJk2$92^ju~f*pC?#`lO!G zC$J}r?=a|P4UF~S)PdAk3UGREAWN+=q7Py`sAmjNAS$}4o-FVZ)e*zgjX#>8jTUbP z7)OB7#OjznnHm9>5wr%r6Z(j8BsFXp`Zz}Pj_4Wlm_C~tS9@}Q%%)65i+1&%3Ggti r)7nAsdQd+C-g9`Zeye5-%!0d@)ao#;8W#8)2S0R@tbPE`VLbl_#VpU) literal 0 HcmV?d00001 diff --git a/tests/duplicate-behavior.test.js b/tests/duplicate-behavior.test.js index 031f663..9f0662b 100644 --- a/tests/duplicate-behavior.test.js +++ b/tests/duplicate-behavior.test.js @@ -5,8 +5,9 @@ * update or merge behaviors to duplicates across navigation boundaries. */ +import 'fake-indexeddb/auto'; + let Dexie; -require('fake-indexeddb/auto'); // Mock browser extension APIs global.browser = { diff --git a/tests/jest.config.js b/tests/jest.config.cjs similarity index 64% rename from tests/jest.config.js rename to tests/jest.config.cjs index 7dd5b02..ea72b10 100644 --- a/tests/jest.config.js +++ b/tests/jest.config.cjs @@ -3,6 +3,7 @@ module.exports = { testMatch: ['**/*.test.js'], transform: {}, moduleFileExtensions: ['js', 'json'], - collectCoverageFrom: ['duplicate-behavior.test.js'], + collectCoverageFrom: ['*.test.js'], + setupFiles: ['/setup-globals.cjs'], verbose: true }; diff --git a/tests/map_item.test.js b/tests/map_item.test.js new file mode 100644 index 0000000..9dee6e8 --- /dev/null +++ b/tests/map_item.test.js @@ -0,0 +1,130 @@ +/** + * Auto-discovery test driver for module `map_item` functions. + * + * Convention: + * tests/fixtures//*.ndjson + * + * matches a file in modules/ (e.g. "tiktok" maps to modules/tiktok.js). + * Each .ndjson line is one Zeeschuimer-stored item exported from the popup. 
+ * + * Each item is wrapped via wrap_for_map_item to mirror how 4CAT's importer + * presents items to a map_item function, then run through the module's + * map_item. Tests assert: function returns a non-null object, and any fields + * listed in REQUIRED_NON_EMPTY for that module are present and non-empty. + */ + +import { readdirSync, readFileSync, statSync, existsSync } from 'node:fs'; +import { spawnSync } from 'node:child_process'; +import { join, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; + +/** + * Local mirror of wrap_for_map_item from js/lib.js. + * + * lib.js is loaded by the browser as a plain script (it defines globals + * like traverse_data, MappedItem, wrap_for_map_item) and so cannot be + * imported from Node. The wrap is three trivial lines with no dependencies + * — duplicating it here is cheaper than restructuring lib.js into a module. + * If lib.js's wrap_for_map_item ever gains real logic, this needs to track. + */ +function wrap_for_map_item(stored_item) { + const { data, ...meta } = stored_item; + return { ...data, __import_meta: meta }; +} + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const FIXTURE_ROOT = join(__dirname, 'fixtures'); +const MODULES_ROOT = join(__dirname, '..', 'modules'); + +/** + * Pre-validate module syntax before dynamic import. + * + * `await import()` on a module with a syntax error throws inside V8's module + * linker in a way Jest's experimental-vm-modules can't always recover from + * (worker retry loop or Node process exit). Running `node --check` first + * gives us a clean error string we can fail the test with. 
+ */ +function check_module_syntax(module_name) { + const module_path = join(MODULES_ROOT, `${module_name}.js`); + const result = spawnSync(process.execPath, ['--check', module_path], { + encoding: 'utf8', + }); + if (result.status === 0) return null; + return (result.stderr || result.stdout || `exit code ${result.status}`).trim(); +} + +const REQUIRED_NON_EMPTY = { + tiktok: ['id', 'author', 'unix_timestamp'], +}; + +function list_module_dirs() { + if (!existsSync(FIXTURE_ROOT)) return []; + return readdirSync(FIXTURE_ROOT).filter(name => { + try { return statSync(join(FIXTURE_ROOT, name)).isDirectory(); } + catch { return false; } + }); +} + +const module_dirs = list_module_dirs(); +let total_fixtures = 0; + +for (const module_name of module_dirs) { + const fixture_dir = join(FIXTURE_ROOT, module_name); + const fixture_files = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson')); + + if (fixture_files.length === 0) continue; + total_fixtures += fixture_files.length; + + describe(`map_item: ${module_name}`, () => { + let map_item; + let import_error; + + beforeAll(async () => { + const syntax_error = check_module_syntax(module_name); + if (syntax_error) { + import_error = new Error(`syntax error:\n${syntax_error}`); + return; + } + try { + const mod = await import(`../modules/${module_name}.js`); + map_item = mod.map_item; + if (typeof map_item !== 'function') { + import_error = new Error(`modules/${module_name}.js does not export a map_item function`); + } + } catch (e) { + import_error = e; + } + }); + + for (const fixture_file of fixture_files) { + const lines = readFileSync(join(fixture_dir, fixture_file), 'utf8') + .split('\n') + .filter(line => line.trim().length > 0); + + describe(fixture_file, () => { + lines.forEach((line, i) => { + test(`item ${i} maps without throwing`, () => { + if (import_error) { + throw new Error(`failed to import modules/${module_name}.js: ${import_error.message}`); + } + const stored_item = JSON.parse(line); + const mapped 
= map_item(wrap_for_map_item(stored_item)); + expect(mapped).not.toBeNull(); + expect(typeof mapped).toBe('object'); + for (const field of REQUIRED_NON_EMPTY[module_name] ?? []) { + expect(mapped[field]).toBeDefined(); + expect(mapped[field]).not.toBe(''); + expect(mapped[field]).not.toBeNull(); + } + }); + }); + }); + } + }); +} + +if (total_fixtures === 0) { + describe('map_item', () => { + test.skip('no fixtures found under tests/fixtures//*.ndjson', () => {}); + }); +} diff --git a/tests/package.json b/tests/package.json index dc3654c..6dd35fb 100644 --- a/tests/package.json +++ b/tests/package.json @@ -2,9 +2,10 @@ "name": "zeeschuimer-db-tests", "version": "1.0.0", "description": "Unit tests for Zeeschuimer duplicate handling logic", + "type": "module", "scripts": { - "test": "jest", - "test:watch": "jest --watch" + "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js", + "test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --watch" }, "devDependencies": { "dexie": "^3.2.4", diff --git a/tests/setup-globals.cjs b/tests/setup-globals.cjs new file mode 100644 index 0000000..a19fb09 --- /dev/null +++ b/tests/setup-globals.cjs @@ -0,0 +1,41 @@ +/** + * Make js/lib.js's helpers available as globals inside the Jest test + * environment, mirroring how the browser sees them after the manifest + * loads lib.js as a plain script. + * + * map_item bodies reference these as free identifiers (MappedItem, + * MissingMappedField, strip_tags, normalize_url_encoding, ...). Without this + * shim they'd hit ReferenceError as soon as a test invokes map_item. + * + * Approach: read lib.js, wrap it in a new Function() body that returns the + * named helpers, call the function, and assign the returned object onto + * globalThis. (Earlier attempt with vm.runInThisContext failed because in + * the jsdom env the vm context's global differs from jsdom's window.) + * + * If a new helper is added to lib.js, append its name to EXPOSED_NAMES. 
+ */ + +const fs = require('node:fs'); +const path = require('node:path'); + +const EXPOSED_NAMES = [ + 'traverse_data', + 'MappedItem', + 'MissingMappedField', + 'wrap_for_map_item', + 'strip_tags', + 'normalize_url_encoding', + 'formatUtcTimestamp', +]; + +const lib_source = fs.readFileSync( + path.join(__dirname, '..', 'js', 'lib.js'), + 'utf8', +); + +const factory = new Function(` +${lib_source} +return { ${EXPOSED_NAMES.join(', ')} }; +`); + +Object.assign(globalThis, factory()); From 46b96c77ffd45f465f90880915e1f6d2836bd87e Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 6 May 2026 16:25:56 +0200 Subject: [PATCH 08/33] add fixtures folder and README.md to explain what I did --- tests/fixtures/.gitignore | 5 +++++ tests/fixtures/README.md | 29 +++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 tests/fixtures/.gitignore create mode 100644 tests/fixtures/README.md diff --git a/tests/fixtures/.gitignore b/tests/fixtures/.gitignore new file mode 100644 index 0000000..8e89a83 --- /dev/null +++ b/tests/fixtures/.gitignore @@ -0,0 +1,5 @@ +# Ignore everything in this directory +* +# Except these files +!.gitignore +!README.md \ No newline at end of file diff --git a/tests/fixtures/README.md b/tests/fixtures/README.md new file mode 100644 index 0000000..d24fe06 --- /dev/null +++ b/tests/fixtures/README.md @@ -0,0 +1,29 @@ +# Test fixtures for `map_item` + +Real captured items used to exercise each module's auto-generated `map_item` +function. + +## Layout + +``` +tests/fixtures/ + / + .ndjson + .ndjson +``` + +`` matches the filename in `modules/` without `.js` — +e.g. `tiktok/` → `modules/tiktok.js`, `pinterest/` → `modules/pinterest.js`. +You can drop multiple `.ndjson` files in a module folder; each gets its own +`describe` block and each line becomes its own `test`. + +Filenames are free-form — the auto-export filename from the popup +(`zeeschuimer-export--.ndjson`) is fine. 
+ +## Privacy / committing + +These files contain real captured platform data — usernames, post +content, URLs, sometimes images and other PII. + +If we want to create test exports or anonymize real exports, add them to +.gitignore. \ No newline at end of file From 487b5b618e4a989cbfca7dbfe2b30b1e78dc62ad Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 7 May 2026 15:53:22 +0200 Subject: [PATCH 09/33] add MapItemException --- js/lib.js | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/js/lib.js b/js/lib.js index 3b144d2..e38430e 100644 --- a/js/lib.js +++ b/js/lib.js @@ -59,6 +59,19 @@ class MissingMappedField { } } +/** + * Raised by `map_item` to signal a known mapping failure. + * + * Mirrors 4CAT's MapItemException: callers should catch it, skip the item, + * and warn the user that the platform's format may have shifted. + */ +class MapItemException extends Error { + constructor(message) { + super(message); + this.name = "MapItemException"; + } +} + /** * Wrap a Zeeschuimer stored item to match the shape a 4CAT map_item expects. 
* From b6f487dbfa017a79207726f04f059078aaf4c4b5 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 7 May 2026 15:56:14 +0200 Subject: [PATCH 10/33] make a warning pop up --- popup/interface.html | 42 ++++++++++++++++++++++++++++++ popup/interface.js | 62 +++++++++++++++++++++++++++++++++++++++----- 2 files changed, 97 insertions(+), 7 deletions(-) diff --git a/popup/interface.html b/popup/interface.html index e9d9b3f..0570e40 100644 --- a/popup/interface.html +++ b/popup/interface.html @@ -303,6 +303,42 @@ text-align: center; } + #csv-warning { + position: fixed; + inset: 0; + background: rgba(60, 60, 59, 0.55); + display: flex; + align-items: center; + justify-content: center; + z-index: 1000; + } + + #csv-warning[hidden] { + display: none; + } + + #csv-warning .csv-warning-content { + background: var(--accent); + color: var(--neutral-contrast); + border: 2px solid var(--accent-alt); + border-radius: 6px; + padding: 1.25em 1.25em 1em 1.25em; + max-width: 24em; + text-align: center; + box-shadow: 0 0 20px var(--neutral-contrast); + } + + #csv-warning .csv-warning-content p { + margin: 0 0 1em 0; + line-height: 1.4; + } + + #csv-warning .dismiss-csv-warning { + display: block; + margin: 0 auto; + padding: 0.3em 1.25em; + } + .tooltippable:not(a):not(button) { display: inline-block; background: var(--neutral-contrast); @@ -409,6 +445,12 @@ +

Zeeschuimer

diff --git a/popup/interface.js b/popup/interface.js index 3b8aaa9..c56375a 100644 --- a/popup/interface.js +++ b/popup/interface.js @@ -351,16 +351,29 @@ async function button_handler(event) { } else if (event.target.matches('.reset-all')) { await background.db.items.clear(); + } else if (event.target.matches('.dismiss-csv-warning')) { + const warning = document.getElementById('csv-warning'); + if(warning) warning.hidden = true; + } else if (event.target.matches('.download-format')) { const format = event.target.getAttribute('data-format'); - const blobber = format === 'csv' ? get_csv_blob : get_ndjson_blob; const extension = format; let platform = event.target.getAttribute('data-platform'); let date = new Date(); event.target.classList.add('loading'); - let blob = await blobber(platform); + let blob; + if(format === 'csv') { + const result = await get_csv_blob(platform); + blob = result.blob; + if(result.skipped > 0) { + console.warn(`Zeeschuimer: skipped ${result.skipped} ${platform} item(s) during CSV export. First reason: ${result.firstReason}`); + show_csv_warning(platform, result.skipped); + } + } else { + blob = await get_ndjson_blob(platform); + } let filename = 'zeeschuimer-export-' + platform + '-' + date.toISOString().split(".")[0].replace(/:/g, "") + '.' + extension; const downloadUrl = window.URL.createObjectURL(blob); const downloadId = await browser.downloads.download({ @@ -637,27 +650,62 @@ function csv_escape(value) { return value; } +/** + * Surface a CSV-export skip warning in the popup. + * + * Shown when the platform's `map_item` raised MapItemException for one or + * more items — typically the platform's response shape has shifted and the + * mapper no longer recognises every field. The user is steered to the + * .ndjson export, which is unaffected because it skips the mapper entirely. 
+ */ +function show_csv_warning(platform, skipped) { + const warning = document.getElementById('csv-warning'); + if(!warning) return; + const message = warning.querySelector('p'); + message.innerText = `Skipped ${skipped} ${platform} item${skipped === 1 ? '' : 's'} in the CSV export — the platform's data format may have changed. Use the .ndjson export to get the full dataset until Zeeschuimer is updated.`; + warning.hidden = false; +} + /** * Get a CSV dump of items * * Returns a Blob with all items in it as CSV rows, mapped via the module's * registered mapper function. A header row is included. * + * Items whose mapper raises MapItemException are skipped and counted; any + * other error propagates. Skip count and the first skip reason are returned + * alongside the blob so the caller can warn the user. Just like 4CAT! + * * @param platform - * @returns {Promise} + * @returns {Promise<{blob: Blob, skipped: number, firstReason: string|null}>} */ async function get_csv_blob(platform) { let csv = []; + let skipped = 0; + let firstReason = null; const module = background.zeeschuimer.modules[platform]; await iterate_items(platform, function(item) { - item = module.mapper(item); + let mapped; + try { + mapped = module.mapper(item); + } catch(e) { + // More JS fun: Check tag rather than `instanceof`. + // Actual Exception lives in some other realm (where modules and lib.js live), and cross-realm + // `instanceof` is unreliable under Firefox's wrappers. 
+ if(e && e.name === 'MapItemException') { + skipped++; + if(firstReason === null) firstReason = e.message; + return; + } + throw e; + } if(csv.length === 0) { - csv.push(Object.keys(item).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n"); + csv.push(Object.keys(mapped).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n"); } - csv.push(Object.values(item).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n"); + csv.push(Object.values(mapped).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n"); }) - return new Blob(csv, {type: 'text/csv'}); + return {blob: new Blob(csv, {type: 'text/csv'}), skipped, firstReason}; } /** From f28e310c8893bb49ac535d33cc94089e8d0686b2 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 7 May 2026 16:42:19 +0200 Subject: [PATCH 11/33] add MapItemException --- tests/setup-globals.cjs | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/setup-globals.cjs b/tests/setup-globals.cjs index a19fb09..4f54e34 100644 --- a/tests/setup-globals.cjs +++ b/tests/setup-globals.cjs @@ -22,6 +22,7 @@ const EXPOSED_NAMES = [ 'traverse_data', 'MappedItem', 'MissingMappedField', + 'MapItemException', 'wrap_for_map_item', 'strip_tags', 'normalize_url_encoding', From 5baff31ae49167d215a56cf16ead326b22d975f3 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 15:16:06 +0200 Subject: [PATCH 12/33] add env variables for tests (to connect to 4CAT) --- .gitignore | 2 ++ tests/.env.example | 9 +++++++++ tests/package-lock.json | 14 ++++++++++++++ tests/package.json | 4 +++- 4 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 tests/.env.example diff --git a/.gitignore b/.gitignore index 6cf9326..fea65f3 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ # Testing artefacts .temp-profile +tests/.env +tests/.env.local # logs geckodriver.log diff --git a/tests/.env.example b/tests/.env.example new file mode 100644 index 0000000..2e021bb --- /dev/null +++ b/tests/.env.example @@ -0,0 +1,9 @@ +# 4CAT API config for the map_item 
comparison tests. +# Copy this file to .env in this directory and fill in real values. +# .env is gitignored; .env.example is the committed template. + +# Base URL of the 4CAT instance to hit. No trailing slash. +FOURCAT_URL=http://localhost + +# API key for that 4CAT instance. Get one from the 4CAT UI; tied to your user. +FOURCAT_API_KEY=your-api-key-here diff --git a/tests/package-lock.json b/tests/package-lock.json index cc8f457..d055883 100644 --- a/tests/package-lock.json +++ b/tests/package-lock.json @@ -9,6 +9,7 @@ "version": "1.0.0", "devDependencies": { "dexie": "^3.2.4", + "dotenv": "^16.4.5", "fake-indexeddb": "^5.0.1", "jest": "^29.7.0", "jest-environment-jsdom": "^29.7.0" @@ -1758,6 +1759,19 @@ "node": ">=12" } }, + "node_modules/dotenv": { + "version": "16.6.1", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.6.1.tgz", + "integrity": "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow==", + "dev": true, + "license": "BSD-2-Clause", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://dotenvx.com" + } + }, "node_modules/dunder-proto": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", diff --git a/tests/package.json b/tests/package.json index 6dd35fb..333564a 100644 --- a/tests/package.json +++ b/tests/package.json @@ -5,10 +5,12 @@ "type": "module", "scripts": { "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js", - "test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --watch" + "test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --watch", + "probe": "node probe-4cat.mjs" }, "devDependencies": { "dexie": "^3.2.4", + "dotenv": "^16.4.5", "fake-indexeddb": "^5.0.1", "jest": "^29.7.0", "jest-environment-jsdom": "^29.7.0" From 6a8ce3870f4e0b6c050d68573d8affa4cc46e37b Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 15:16:34 +0200 Subject: [PATCH 13/33] 
mirror 4CAT API missing value --- js/lib.js | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/js/lib.js b/js/lib.js index e38430e..c618a6a 100644 --- a/js/lib.js +++ b/js/lib.js @@ -57,6 +57,12 @@ class MissingMappedField { toString() { return `${this.value}`; } + + // Mirror 4CAT's API serialization so JSON.stringify produces the same + // tagged form on both sides. See docs/4cat-map-item-api.md. + toJSON() { + return { __missing: true, value: this.value }; + } } /** From 0c3140376ebd6e37cb1706fc48a105168d84d089 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 18:41:52 +0200 Subject: [PATCH 14/33] test the 4cat API endpoint --- tests/probe-4cat.mjs | 140 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 tests/probe-4cat.mjs diff --git a/tests/probe-4cat.mjs b/tests/probe-4cat.mjs new file mode 100644 index 0000000..0bf4e4d --- /dev/null +++ b/tests/probe-4cat.mjs @@ -0,0 +1,140 @@ +/** + * Manually exercise 4CAT's /api/map-item/ endpoint against a fixture item. + * + * Usage: + * node probe-4cat.mjs [] [--index N] + * + * is the Zeeschuimer module filename without `.js` (e.g. + * "tiktok", "pinterest"). If is omitted, the first + * .ndjson in tests/fixtures// is used. --index selects which + * line of the fixture to send (default 0). + * + * Requires tests/.env with FOURCAT_URL and FOURCAT_API_KEY. 
+ */ + +import 'dotenv/config'; +import { readFileSync, existsSync, readdirSync } from 'node:fs'; +import { join, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); + +const FOURCAT_URL = process.env.FOURCAT_URL?.replace(/\/$/, ''); +const FOURCAT_API_KEY = process.env.FOURCAT_API_KEY; + +if (!FOURCAT_URL || !FOURCAT_API_KEY || FOURCAT_API_KEY === 'your-api-key-here') { + console.error('error: FOURCAT_URL and FOURCAT_API_KEY must be set in tests/.env'); + console.error(' (copy tests/.env.example to tests/.env and fill in real values)'); + process.exit(1); +} + +const ID_MAP_PATH = join(__dirname, 'zeeschuimer-to-4cat.json'); +const ID_MAP = existsSync(ID_MAP_PATH) + ? JSON.parse(readFileSync(ID_MAP_PATH, 'utf8')) + : {}; + +function auth_headers() { + return { 'Authorization': `${FOURCAT_API_KEY}` }; +} + +async function list_datasources() { + const res = await fetch(`${FOURCAT_URL}/api/datasources/`, { headers: auth_headers() }); + if (!res.ok) { + throw new Error(`GET /api/datasources/ → ${res.status}: ${await res.text()}`); + } + const body = await res.json(); + return body.datasources ?? 
[]; +} + +async function map_item(datasource_id, item) { + const res = await fetch(`${FOURCAT_URL}/api/map-item/${datasource_id}/`, { + method: 'POST', + headers: { ...auth_headers(), 'Content-Type': 'application/json' }, + body: JSON.stringify({ item }), + }); + const text = await res.text(); + let body; + try { body = JSON.parse(text); } catch { body = { raw: text }; } + return { status_code: res.status, body }; +} + +function parse_args(argv) { + const args = { module: null, fixture: null, index: 0 }; + const positional = []; + for (let i = 2; i < argv.length; i++) { + if (argv[i] === '--index') { + args.index = parseInt(argv[++i], 10); + } else if (argv[i].startsWith('--index=')) { + args.index = parseInt(argv[i].split('=')[1], 10); + } else { + positional.push(argv[i]); + } + } + args.module = positional[0]; + args.fixture = positional[1]; + return args; +} + +async function main() { + const args = parse_args(process.argv); + if (!args.module) { + console.error('Usage: node probe-4cat.mjs [] [--index N]'); + process.exit(1); + } + + const datasource_id = ID_MAP[args.module] ?? args.module; + const fixture_dir = join(__dirname, 'fixtures', args.module); + + if (!existsSync(fixture_dir)) { + console.error(`error: no fixture dir at ${fixture_dir}`); + process.exit(1); + } + + const candidates = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson')); + if (candidates.length === 0) { + console.error(`error: no .ndjson fixtures under ${fixture_dir}`); + process.exit(1); + } + const fixture_name = args.fixture ?? 
candidates[0]; + const fixture_path = join(fixture_dir, fixture_name); + if (!existsSync(fixture_path)) { + console.error(`error: fixture ${fixture_path} not found`); + process.exit(1); + } + + const lines = readFileSync(fixture_path, 'utf8').split('\n').filter(l => l.trim().length > 0); + if (args.index >= lines.length) { + console.error(`error: --index ${args.index} but fixture has ${lines.length} items`); + process.exit(1); + } + const item = JSON.parse(lines[args.index]); + + console.log(`Module: ${args.module}`); + console.log(`Datasource id: ${datasource_id}${ID_MAP[args.module] ? ' (mapped via zeeschuimer-to-4cat.json)' : ''}`); + console.log(`URL: ${FOURCAT_URL}/api/map-item/${datasource_id}/`); + console.log(`Fixture: ${fixture_name}, item ${args.index} (item_id=${item.item_id ?? item.id})`); + console.log(''); + + const { status_code, body } = await map_item(datasource_id, item); + console.log(`HTTP ${status_code}`); + console.log(JSON.stringify(body, null, 2)); + + if (status_code === 404) { + console.error(''); + console.error('Hint: datasource id may be wrong. 
Available Zeeschuimer-origin datasources:'); + try { + const datasources = await list_datasources(); + datasources + .filter(d => d.is_from_zeeschuimer && d.has_map_item) + .forEach(d => console.error(` - ${d.id} (${d.name})`)); + } catch (e) { + console.error(` (couldn't fetch list: ${e.message})`); + } + process.exit(2); + } +} + +main().catch(e => { + console.error(`probe failed: ${e.message}`); + process.exit(2); +}); From be2f3087d8dd5af07175101a808903604c84d78b Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 18:43:04 +0200 Subject: [PATCH 15/33] update docs and packages --- docs/test-plan.md | 6 +++--- tests/package-lock.json | 13 ++++++++++++- tests/setup-globals.cjs | 11 +++++++++++ 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/docs/test-plan.md b/docs/test-plan.md index 249a7e0..a4265eb 100644 --- a/docs/test-plan.md +++ b/docs/test-plan.md @@ -63,7 +63,7 @@ Phase 3 — 4CAT integration (optional) - Problem: mapping tests live in 4CAT and need NDJSON input. - Changes: - Add CLI flags: `--4cat-url URL` and `--4cat-key KEY` (API key). Require both for upload. - - After writing the per-platform NDJSON, POST it to `{4cat_url.rstrip('/')}/api/import-dataset/` with header `X-Zeeschuimer-Platform: {platform}` and `Authorization: Bearer {key}` (confirm header with your 4CAT instance; alternative is to trigger the extension UI upload button when cookie-based auth is required). + - After writing the per-platform NDJSON, POST it to `{4cat_url.rstrip('/')}/api/import-dataset/` with header `X-Zeeschuimer-Platform: {platform}` and `Authorization: {key}` (confirm header with your 4CAT instance; alternative is to trigger the extension UI upload button when cookie-based auth is required). - Do not fail the test run on 4CAT errors — print status and continue. 
Example upload with `requests`: @@ -73,7 +73,7 @@ import requests with open(ndjson_path, 'rb') as f: headers = { 'X-Zeeschuimer-Platform': platform, - 'Authorization': f'Bearer {fourcat_key}' + 'Authorization': f'{fourcat_key}' } r = requests.post(f"{fourcat_url.rstrip('/')}/api/import-dataset/", headers=headers, data=f) # check r.status_code and r.text for details @@ -149,7 +149,7 @@ Estimated effort: 6–10 hours of focused work to implement and test everything Open questions / confirmations needed -- Confirm 4CAT API key header format (currently suggested: `Authorization: Bearer {key}`). If your 4CAT requires cookie-based auth, we should emulate the extension upload button via Selenium instead. +- Confirm 4CAT API key header format (currently suggested: `Authorization: {key}`). If your 4CAT requires cookie-based auth, we should emulate the extension upload button via Selenium instead. - Confirm desired default for interactive mode. (Current recommendation: pause once per platform by default; provide `--no-interactive` to run fully headless.) 
Next steps diff --git a/tests/package-lock.json b/tests/package-lock.json index d055883..7758e9f 100644 --- a/tests/package-lock.json +++ b/tests/package-lock.json @@ -12,7 +12,8 @@ "dotenv": "^16.4.5", "fake-indexeddb": "^5.0.1", "jest": "^29.7.0", - "jest-environment-jsdom": "^29.7.0" + "jest-environment-jsdom": "^29.7.0", + "undici": "^6.20.0" } }, "node_modules/@babel/code-frame": { @@ -4197,6 +4198,16 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/undici": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/undici/-/undici-6.26.0.tgz", + "integrity": "sha512-4yqz8a3n5HmGTlsbADNtr/dJlhkh/55Rq798G6ibiULcXbDtaLpTl1pvdqcbFfeoj3iSi52lePFM7h9H21cw/A==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=18.17" + } + }, "node_modules/undici-types": { "version": "7.16.0", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz", diff --git a/tests/setup-globals.cjs b/tests/setup-globals.cjs index 4f54e34..6793cc0 100644 --- a/tests/setup-globals.cjs +++ b/tests/setup-globals.cjs @@ -40,3 +40,14 @@ return { ${EXPOSED_NAMES.join(', ')} }; `); Object.assign(globalThis, factory()); + +// jsdom doesn't expose fetch and Jest's jsdom env shadows Node's global +// fetch, so the comparator can't hit 4CAT without help. Polyfill from +// undici (a Node-friendly HTTP client, separately installable on npm — +// distinct from the undici bundled internally by Node, which isn't +// require()-able by name). +// Note: tests that use fetch (e.g. map_item_compare.test.js) declare +// `@jest-environment node` at the top of the file. Node env has fetch +// natively. Don't try to polyfill into jsdom — undici's internals use +// Node-specific globals that jsdom shadows (clearImmediate, +// markResourceTiming, fast timers), and polyfilling them all is brittle. 
From caf1c7f48a19524282c06b688c08001e534791db Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 18:43:17 +0200 Subject: [PATCH 16/33] some mapping for odd datasource names --- tests/zeeschuimer-to-4cat.json | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 tests/zeeschuimer-to-4cat.json diff --git a/tests/zeeschuimer-to-4cat.json b/tests/zeeschuimer-to-4cat.json new file mode 100644 index 0000000..f7de942 --- /dev/null +++ b/tests/zeeschuimer-to-4cat.json @@ -0,0 +1,7 @@ +{ + "_comment": "Maps Zeeschuimer module filenames (without .js) to 4CAT datasource ids when they differ. Default behavior is identity — only include entries where the two diverge. Discovered via http://localhost/api/datasources/.", + "9gag": "ninegag", + "truth": "truthsocial", + "rednote": "xiaohongshu", + "rednote-comments": "xiaohongshu-comments" +} From f10fc492845051c87b96b75561eb91de2af99d18 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 18:44:05 +0200 Subject: [PATCH 17/33] update existing map_item tests and add helper --- tests/_module-info.js | 45 ++++++++++++++++++ tests/map_item.test.js | 105 +++++++++++++++++++---------------------- 2 files changed, 93 insertions(+), 57 deletions(-) create mode 100644 tests/_module-info.js diff --git a/tests/_module-info.js b/tests/_module-info.js new file mode 100644 index 0000000..e261e4e --- /dev/null +++ b/tests/_module-info.js @@ -0,0 +1,45 @@ +/** + * Shared helper for the map_item test drivers. + * + * Pre-validates a module by: + * 1. Running `node --check` on its file (syntax check; avoids the + * worker-killing experimental-ESM crash when a syntax error reaches + * the dynamic importer). + * 2. Dynamically importing it and checking for a `map_item` export. 
+ * + * Returns one of four states the test driver can branch on: + * { state: 'ok', map_item: <function> } + * { state: 'no_map_item' } + * { state: 'syntax_error', error: <string> } + * { state: 'import_error', error: <Error> } + */ + +import { spawnSync } from 'node:child_process'; +import { join, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const MODULES_ROOT = join(__dirname, '..', 'modules'); + +function check_module_syntax(module_name) { + const module_path = join(MODULES_ROOT, `${module_name}.js`); + const result = spawnSync(process.execPath, ['--check', module_path], { encoding: 'utf8' }); + if (result.status === 0) return null; + return (result.stderr || result.stdout || `exit code ${result.status}`).trim(); +} + +export async function inspect_module(module_name) { + const syntax_error = check_module_syntax(module_name); + if (syntax_error) { + return { state: 'syntax_error', error: syntax_error }; + } + try { + const mod = await import(`../modules/${module_name}.js`); + if (typeof mod.map_item !== 'function') { + return { state: 'no_map_item' }; + } + return { state: 'ok', map_item: mod.map_item }; + } catch (e) { + return { state: 'import_error', error: e }; + } +} diff --git a/tests/map_item.test.js b/tests/map_item.test.js index 9dee6e8..2dc1bb6 100644 --- a/tests/map_item.test.js +++ b/tests/map_item.test.js @@ -1,5 +1,5 @@ /** - * Auto-discovery test driver for module `map_item` functions. + * Smoke test driver for module `map_item` functions. * * Convention: * tests/fixtures/<module>/*.ndjson @@ -11,52 +11,36 @@ * presents items to a map_item function, then run through the module's * map_item. Tests assert: function returns a non-null object, and any fields * listed in REQUIRED_NON_EMPTY for that module are present and non-empty.
+ * + * Module-level state is determined upfront by inspect_module(): + * - 'ok' → register per-item tests + * - 'no_map_item' → register a single skipped test (not applicable) + * - 'syntax_error' → register a single failing test pointing at the line + * - 'import_error' → register a single failing test with the message */ import { readdirSync, readFileSync, statSync, existsSync } from 'node:fs'; -import { spawnSync } from 'node:child_process'; import { join, dirname } from 'node:path'; import { fileURLToPath } from 'node:url'; - -/** - * Local mirror of wrap_for_map_item from js/lib.js. - * - * lib.js is loaded by the browser as a plain script (it defines globals - * like traverse_data, MappedItem, wrap_for_map_item) and so cannot be - * imported from Node. The wrap is three trivial lines with no dependencies - * — duplicating it here is cheaper than restructuring lib.js into a module. - * If lib.js's wrap_for_map_item ever gains real logic, this needs to track. - */ -function wrap_for_map_item(stored_item) { - const { data, ...meta } = stored_item; - return { ...data, __import_meta: meta }; -} +import { inspect_module } from './_module-info.js'; const __dirname = dirname(fileURLToPath(import.meta.url)); const FIXTURE_ROOT = join(__dirname, 'fixtures'); -const MODULES_ROOT = join(__dirname, '..', 'modules'); - -/** - * Pre-validate module syntax before dynamic import. - * - * `await import()` on a module with a syntax error throws inside V8's module - * linker in a way Jest's experimental-vm-modules can't always recover from - * (worker retry loop or Node process exit). Running `node --check` first - * gives us a clean error string we can fail the test with. 
- */ -function check_module_syntax(module_name) { - const module_path = join(MODULES_ROOT, `${module_name}.js`); - const result = spawnSync(process.execPath, ['--check', module_path], { - encoding: 'utf8', - }); - if (result.status === 0) return null; - return (result.stderr || result.stdout || `exit code ${result.status}`).trim(); -} const REQUIRED_NON_EMPTY = { tiktok: ['id', 'author', 'unix_timestamp'], }; +/** + * Local mirror of wrap_for_map_item from js/lib.js. lib.js is loaded by + * the browser as a plain script and so cannot be imported from Node; this + * three-line mirror is cheaper than restructuring lib.js into a module. + */ +function wrap_for_map_item(stored_item) { + const { data, ...meta } = stored_item; + return { ...data, __import_meta: meta }; +} + function list_module_dirs() { if (!existsSync(FIXTURE_ROOT)) return []; return readdirSync(FIXTURE_ROOT).filter(name => { @@ -66,36 +50,46 @@ function list_module_dirs() { } const module_dirs = list_module_dirs(); + +// Pre-pass: synchronously determine each module's state so we can branch +// on it at describe/test registration time. Top-level await is supported +// in Jest's experimental-vm-modules mode. 
+const module_info = {}; +for (const module_name of module_dirs) { + module_info[module_name] = await inspect_module(module_name); +} + let total_fixtures = 0; for (const module_name of module_dirs) { const fixture_dir = join(FIXTURE_ROOT, module_name); const fixture_files = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson')); - if (fixture_files.length === 0) continue; total_fixtures += fixture_files.length; - describe(`map_item: ${module_name}`, () => { - let map_item; - let import_error; - - beforeAll(async () => { - const syntax_error = check_module_syntax(module_name); - if (syntax_error) { - import_error = new Error(`syntax error:\n${syntax_error}`); - return; - } - try { - const mod = await import(`../modules/${module_name}.js`); - map_item = mod.map_item; - if (typeof map_item !== 'function') { - import_error = new Error(`modules/${module_name}.js does not export a map_item function`); - } - } catch (e) { - import_error = e; - } + const info = module_info[module_name]; + + if (info.state === 'no_map_item') { + describe(`map_item: ${module_name}`, () => { + test.skip(`modules/${module_name}.js does not export a map_item function — nothing to smoke test`, () => {}); + }); + continue; + } + + if (info.state === 'syntax_error' || info.state === 'import_error') { + const msg = info.state === 'syntax_error' + ? 
`syntax error:\n${info.error}` + : `import failed: ${info.error.message}`; + describe(`map_item: ${module_name}`, () => { + test(`module loads`, () => { throw new Error(msg); }); }); + continue; + } + + // state === 'ok' — register per-item tests + const map_item = info.map_item; + describe(`map_item: ${module_name}`, () => { for (const fixture_file of fixture_files) { const lines = readFileSync(join(fixture_dir, fixture_file), 'utf8') .split('\n') @@ -104,9 +98,6 @@ for (const module_name of module_dirs) { describe(fixture_file, () => { lines.forEach((line, i) => { test(`item ${i} maps without throwing`, () => { - if (import_error) { - throw new Error(`failed to import modules/${module_name}.js: ${import_error.message}`); - } const stored_item = JSON.parse(line); const mapped = map_item(wrap_for_map_item(stored_item)); expect(mapped).not.toBeNull(); From 3633cde656da3f70880ae49a2909deba3a044953 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 18:44:23 +0200 Subject: [PATCH 18/33] comparison testing for datasources --- tests/map_item_compare.test.js | 283 +++++++++++++++++++++++++++++++++ 1 file changed, 283 insertions(+) create mode 100644 tests/map_item_compare.test.js diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js new file mode 100644 index 0000000..37e3e4c --- /dev/null +++ b/tests/map_item_compare.test.js @@ -0,0 +1,283 @@ +/** + * @jest-environment node + * + * This file runs in Node test environment (not jsdom) because undici's + * fetch implementation uses Node-internal APIs (`clearImmediate`, + * `markResourceTiming`, fast-now timers, etc.) that jsdom shadows or + * doesn't expose. Polyfilling them into jsdom is whack-a-mole; node env + * has them all natively. + * + * Trade-off: no DOMParser in node env. The four modules that use + * `strip_tags` (gab, pinterest, rednote, truth) will need a DOMParser + * polyfill (e.g. via linkedom) before the comparator can run against + * them. 
+ Other modules (including instagram) work as-is. + */ +/** + * Compare JS map_item output against 4CAT's Python map_item via the API. + * + * For every line in every fixture, runs the JS map_item locally AND sends + * the same stored item to 4CAT's /api/map-item/<datasource_id>/ endpoint, then + * diffs the two outputs field-by-field. Each item is its own Jest test — + * failures point at exactly which item and which fields diverge. + * + * Skips itself entirely if FOURCAT_URL / FOURCAT_API_KEY aren't set, so + * `npm test` keeps working without 4CAT configuration. Drop real values in + * tests/.env to enable. + * + * Datasource id mapping: tests/zeeschuimer-to-4cat.json (Zeeschuimer + * module filename → 4CAT datasource id, for the few names that diverge). + * + * Module-level state is determined upfront by inspect_module() (no + * map_item / syntax errors / import errors are handled before tests are + * registered, so they appear once per module, not once per item). + */ + +import 'dotenv/config'; +import { jest } from '@jest/globals'; +import { readdirSync, readFileSync, statSync, existsSync } from 'node:fs'; +import { join, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { inspect_module } from './_module-info.js'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); + +const FOURCAT_URL = process.env.FOURCAT_URL?.replace(/\/$/, ''); +const FOURCAT_API_KEY = process.env.FOURCAT_API_KEY; +const HAS_4CAT = Boolean( + FOURCAT_URL && FOURCAT_API_KEY && FOURCAT_API_KEY !== 'your-api-key-here' +); + +// When true (default), once any item in a module fails, subsequent items +// in that same module skip the HTTP + map_item work and fail fast with a +// "halted" message. Saves time when generator output is broken at the top. +// Set FAIL_FAST=0 in env to run all items regardless. +// Trim because cmd.exe's `set FAIL_FAST=0 && ...` includes the trailing +// space in the variable value, which would otherwise defeat `!== '0'`.
+const FAIL_FAST = (process.env.FAIL_FAST ?? '').trim() !== '0'; +const halted_modules = new Set(); + +const FIXTURE_ROOT = join(__dirname, 'fixtures'); +const ID_MAP_PATH = join(__dirname, 'zeeschuimer-to-4cat.json'); +const ID_MAP = existsSync(ID_MAP_PATH) + ? JSON.parse(readFileSync(ID_MAP_PATH, 'utf8')) + : {}; + +function wrap_for_map_item(stored_item) { + const { data, ...meta } = stored_item; + return { ...data, __import_meta: meta }; +} + +async function call_4cat_map_item(datasource_id, item) { + const res = await fetch(`${FOURCAT_URL}/api/map-item/${datasource_id}/`, { + method: 'POST', + headers: { + // 4CAT accepts the raw key without a `Bearer ` prefix, per probe + 'Authorization': FOURCAT_API_KEY, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ item }), + }); + const text = await res.text(); + if (!res.ok) { + throw new Error(`HTTP ${res.status} from 4CAT: ${text}`); + } + return JSON.parse(text); +} + +// Round-trip a value through JSON so MappedItem, MissingMappedField, etc. +// become plain JSON-compatible objects matching what 4CAT emits. +function normalize(value) { + return JSON.parse(JSON.stringify(value)); +} + +// Recursive structural equality. Doesn't care about object key order, which +// matters for nested values like {__missing: true, value: ""} where JS and +// Python might emit keys in different orders. 
+function deep_equal(a, b) { + if (a === b) return true; + if (a === null || b === null) return a === b; + if (typeof a !== typeof b) return false; + if (typeof a !== 'object') return false; + if (Array.isArray(a) !== Array.isArray(b)) return false; + if (Array.isArray(a)) { + if (a.length !== b.length) return false; + return a.every((v, i) => deep_equal(v, b[i])); + } + const a_keys = Object.keys(a); + const b_keys = Object.keys(b); + if (a_keys.length !== b_keys.length) return false; + return a_keys.every(k => k in b && deep_equal(a[k], b[k])); +} + +function diff_objects(js_obj, py_obj) { + const diffs = []; + const keys = new Set([...Object.keys(js_obj ?? {}), ...Object.keys(py_obj ?? {})]); + for (const key of keys) { + const in_js = js_obj && key in js_obj; + const in_py = py_obj && key in py_obj; + if (!in_js) { + diffs.push({ key, kind: 'only_python', python: py_obj[key] }); + } else if (!in_py) { + diffs.push({ key, kind: 'only_js', js: js_obj[key] }); + } else if (!deep_equal(js_obj[key], py_obj[key])) { + diffs.push({ key, kind: 'mismatch', js: js_obj[key], python: py_obj[key] }); + } + } + return diffs; +} + +function format_diffs(diffs) { + return diffs.map(d => { + if (d.kind === 'only_js') { + return ` + only in JS: ${d.key} = ${JSON.stringify(d.js)}`; + } + if (d.kind === 'only_python') { + return ` - only in Python: ${d.key} = ${JSON.stringify(d.python)}`; + } + return ` ~ ${d.key}\n JS: ${JSON.stringify(d.js)}\n Python: ${JSON.stringify(d.python)}`; + }).join('\n'); +} + +// Pull out the first few module-frame lines from an error's stack so the +// failure message points at where in modules/.js the throw happened. +function format_error_with_location(err) { + if (!err) return String(err); + const message = err.message || String(err); + const stack = err.stack || ''; + const module_frames = stack.split('\n') + .filter(l => l.includes('/modules/') || l.includes('\\modules\\')) + .slice(0, 3) + .map(l => l.trim()); + return module_frames.length + ? 
`${message}\n ${module_frames.join('\n ')}` + : message; +} + +function list_module_dirs() { + if (!existsSync(FIXTURE_ROOT)) return []; + return readdirSync(FIXTURE_ROOT).filter(name => { + try { return statSync(join(FIXTURE_ROOT, name)).isDirectory(); } + catch { return false; } + }); +} + +// Per-test timeout: each test does one HTTP round-trip to 4CAT. Jest's +// default 5s is tight under load. +jest.setTimeout(30000); + +if (!HAS_4CAT) { + describe('map_item compare (JS vs 4CAT Python)', () => { + test.skip('FOURCAT_URL / FOURCAT_API_KEY not configured — set them in tests/.env to enable', () => {}); + }); +} else { + const module_dirs = list_module_dirs(); + + // Pre-pass: synchronously determine each module's state so we can branch + // on it at registration time. + const module_info = {}; + for (const module_name of module_dirs) { + module_info[module_name] = await inspect_module(module_name); + } + + let any_fixtures = false; + + for (const module_name of module_dirs) { + const fixture_dir = join(FIXTURE_ROOT, module_name); + const fixture_files = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson')); + if (fixture_files.length === 0) continue; + any_fixtures = true; + + const datasource_id = ID_MAP[module_name] ?? module_name; + const info = module_info[module_name]; + + if (info.state === 'no_map_item') { + // eslint-disable-next-line no-console + console.log(`[compare] skipping ${module_name}: modules/${module_name}.js does not export a map_item`); + continue; + } + + if (info.state === 'syntax_error' || info.state === 'import_error') { + const msg = info.state === 'syntax_error' + ? 
`syntax error:\n${info.error}` + : `import failed: ${info.error.message}`; + describe(`map_item compare: ${module_name}`, () => { + test(`module loads`, () => { throw new Error(msg); }); + }); + continue; + } + + // state === 'ok' — register per-item comparison tests + const map_item = info.map_item; + + describe(`map_item compare: ${module_name} (4CAT id: ${datasource_id})`, () => { + for (const fixture_file of fixture_files) { + const lines = readFileSync(join(fixture_dir, fixture_file), 'utf8') + .split('\n') + .filter(line => line.trim().length > 0); + + describe(fixture_file, () => { + lines.forEach((line, i) => { + test(`item ${i}`, async () => { + if (FAIL_FAST && halted_modules.has(module_name)) { + throw new Error( + '[halted after prior failure in this module — set FAIL_FAST=0 to run all items]' + ); + } + try { + const stored_item = JSON.parse(line); + + // 4CAT side + const response = await call_4cat_map_item(datasource_id, stored_item); + + // JS side + let js_result; + let js_error; + try { + js_result = map_item(wrap_for_map_item(stored_item)); + } catch (e) { + js_error = e; + } + + if (response.status === 'mapped') { + if (js_error) { + throw new Error( + `4CAT mapped this item but JS threw: ${format_error_with_location(js_error)}` + ); + } + const js_obj = normalize(js_result); + const py_obj = normalize(response.item); + const diffs = diff_objects(js_obj, py_obj); + if (diffs.length > 0) { + throw new Error( + `${diffs.length} field(s) differ between JS and 4CAT:\n${format_diffs(diffs)}` + ); + } + } else if (response.status === 'skipped') { + if (!js_error) { + throw new Error( + `4CAT skipped this item ("${response.reason}") but JS produced a result` + ); + } + // Both rejected — good. Skip reasons may differ in wording. 
+ } else if (response.status === 'error') { + throw new Error(`4CAT errored on this item: ${response.message}`); + } else { + throw new Error(`unexpected 4CAT response status: ${JSON.stringify(response)}`); + } + } catch (e) { + if (FAIL_FAST) halted_modules.add(module_name); + throw e; + } + }); + }); + }); + } + }); + } + + if (!any_fixtures) { + describe('map_item compare (JS vs 4CAT Python)', () => { + test.skip('no fixtures under tests/fixtures//*.ndjson', () => {}); + }); + } +} From 7d97a0fe342e3b7f932c79fe22e9b8c6b3c25bb3 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 18:44:35 +0200 Subject: [PATCH 19/33] list common translation errors --- tests/translation-errors.md | 430 ++++++++++++++++++++++++++++++++++++ 1 file changed, 430 insertions(+) create mode 100644 tests/translation-errors.md diff --git a/tests/translation-errors.md b/tests/translation-errors.md new file mode 100644 index 0000000..fcc160d --- /dev/null +++ b/tests/translation-errors.md @@ -0,0 +1,430 @@ +# Auto-generator translation errors + +Patterns of incorrect Python → JavaScript translation observed in +auto-generated `modules/*.js` files. Each entry has a search pattern so +this doc doubles as a checklist when reviewing a new auto-generator PR. + +When an entry is fixed at the generator level (no longer appears in +fresh output), mark it `[fixed]` and keep the entry around — useful +history when something regresses. + +## How to use + +- Found a new pattern? Add an entry below following the template. +- Reviewing a generator PR? `grep` each `Search pattern` against the + changed module files. Anything that hits is worth a manual look. +- Iterating on the generator prompt? The "Why" lines are the + feedback to add — they describe the exact Python-vs-JS semantic + difference the LLM keeps missing. 
+ +## Template + +```` +### <pattern name> + +**Status:** open | fixed in generator | accepted + +**Why it happens:** <the Python-vs-JS semantic difference> + +**Wrong JS:** +```js +<generated code> +``` + +**Correct JS:** +```js +<fixed code> +``` + +**Example:** `modules/<module>.js:<line>` + +**Search pattern:** `<regex or grep command>` +```` + +--- + +## Observed patterns + +### `in` operator on strings + +**Status:** open + +**Why it happens:** In Python, `"x" in some_string` is a substring check. +In JavaScript, the `in` operator only works on **objects** and checks for +property/key existence; using it with a string on the right-hand side +throws `TypeError: cannot use 'in' operator to search for "x" in <string>`. + +**Wrong JS:** +```js +const is_polaris = '__typename' in item && 'polaris' in item.__typename.toLowerCase(); +``` + +**Correct JS:** +```js +const is_polaris = '__typename' in item && item.__typename.toLowerCase().includes('polaris'); +``` + +**Example:** `modules/instagram.js:513` + +**Search pattern:** `'[^']+' in [a-zA-Z_$][\w$]*\.` — quoted string followed +by `in` followed by a method call. Quick rough check: `grep -E "' in [a-zA-Z]" modules/` + +**Watch out for partial fixes:** seen as `'polaris' in (item.__typename ?? '').toLowerCase()` +— adding `?? ''` guards against `undefined` but the `in` operator itself +still throws on the resulting *string*. The fix is `.includes()`, not just +defaulting the operand. + +--- + +### Python f-string syntax left in single-quoted JS strings + +**Status:** open + +**Why it happens:** Python `f"... {var} ..."` interpolates. JS uses +template literals (backticks) with `${var}`. The auto-generator leaves the +`{var}` notation in a regular single- or double-quoted JS string, which is +just literal text — no interpolation happens.
+ +**Wrong JS:** +```js +throw new MapItemException('Unable to parse item: different user {user.id} and owner {owner.id}'); +``` + +**Correct JS:** +```js +throw new MapItemException(`Unable to parse item: different user ${user.id} and owner ${owner.id}`); +``` + +**Example:** `modules/instagram.js:754` + +**Search pattern:** `'[^']*\{[a-zA-Z_$][\w$.]*\}[^']*'` or `"[^"]*\{[a-zA-Z_$][\w$.]*\}[^"]*"` +— a non-template-literal string containing `{identifier}` or `{identifier.path}`. +Quick check: `grep -nE "['\"][^'\"]*\{[a-zA-Z_][a-zA-Z0-9_.]*\}[^'\"]*['\"]" modules/` + +--- + +### `?? {}` default that defeats subsequent truthy checks + +**Status:** open + +**Why it happens:** When porting Python's `node.get('user') or {}` (which is +intended to make subsequent code safe to call), the generator emits +`node.user ?? {}`. That's a *valid* Python-equivalent, **but** any following +`if (user && owner) { ... }` guard then never short-circuits because both +`{}` references are truthy. The check ends up reading "if user and owner +*objects* exist" when the intent was "if user and owner data exist." +Subsequent property accesses then compare real ids/usernames against +`undefined` on the missing side, often throwing. + +**Wrong JS:** +```js +const user = node.user ?? {}; +const owner = node.owner ?? {}; +if (user && owner) { + if (user.id === owner.id) { /* … */ } + else if (user.username !== owner.username) { + throw new MapItemException('different user and owner'); + } +} +``` + +**Correct JS** (depending on intent — pick one): +```js +// (a) drop the defaults so truthy guard means "both present" +const user = node.user; +const owner = node.owner; +if (user && owner) { /* compare */ } +``` +```js +// (b) check for actual content, not just object identity +const user = node.user ?? {}; +const owner = node.owner ?? 
{}; +if (Object.keys(user).length && Object.keys(owner).length) { /* compare */ } +``` + +**Example:** `modules/instagram.js:748-756` + +**Search pattern:** `\?\?\s*\{\s*\}` — any `?? {}` occurrence is worth a +review of subsequent guards. Quick check: `grep -nE "\?\?\s*\{\s*\}" modules/` + +--- + +### Bare relative path as a statement (junk auto-imports section) + +**Status:** open + +**Why it happens:** The generator emits an "auto-generated imports" marker +block at the top of the module but writes the import target as a bare +relative path on its own line (`../js/lib.js`) instead of a real `import` +statement. JS parses that as `..` then `.` then `/js/lib.js` — syntax error. + +**Wrong JS:** +```js +// === auto-generated imports for map_item — DO NOT EDIT BY HAND === +../js/lib.js +// === end auto-generated imports === +``` + +**Correct JS** (one of): +```js +// === auto-generated imports — DO NOT EDIT BY HAND === +// Provided as globals by js/lib.js (loaded via manifest.json): +// MappedItem, MissingMappedField, MapItemException, traverse_data, +// strip_tags, normalize_url_encoding, formatUtcTimestamp +// === end auto-generated imports === +``` + +Or, if a real import is intended, an ESM import with named bindings: +```js +import { MappedItem, MissingMappedField } from '../js/lib.js'; +``` + +**Example:** seen historically in `modules/tiktok.js:2` + +**Search pattern:** `^\.\./` at the start of a line in module files. +Quick check: `grep -nE "^\.\." modules/*.js` + +--- + +### Key-existence check (`'X' in obj`) used where Python intended value-truthiness (`obj.get('X')`) + +**Status:** open + +**Why it happens:** Python's `if node.get('usertags'):` is a *truthy check on +the value* — returns False if the key is missing **or** if the value is +`None`/empty/falsy. The generator translates this to `if ('usertags' in +node)`, which in JS is a *key-existence check* — returns True even when +the value is `null`. 
Subsequent property accesses on the null value then +throw `Cannot read properties of null`. + +**Wrong JS:** +```js +const usertags = 'usertags' in node ? node.usertags.in.map(...).join(',') : ''; +// node.usertags can be null → .in.map blows up +``` + +**Correct JS:** +```js +const usertags = node.usertags ? node.usertags.in.map(...).join(',') : ''; +``` + +**Example:** `modules/instagram.js:777` + +**Search pattern:** `'[^']+' in [a-zA-Z_$][\w$]*\s*\?` — quoted-string `in` +identifier followed by `?` (ternary). Quick check: +`grep -nE "'[^']+' in [a-zA-Z_]+ \?" modules/` + +--- + +### Datetime serialization format mismatch + +**Status:** open + +**Why it happens:** Python's `datetime.utcfromtimestamp(t).strftime('%Y-%m-%d %H:%M:%S')` +produces `"2026-05-13 21:27:31"` — space-separated, no timezone marker. JS's +`new Date(t * 1000).toISOString()` produces `"2026-05-13T21:27:31.000Z"` — T +separator, milliseconds, Z. The generator emits the JS `.toISOString()` form +instead of using the existing `formatUtcTimestamp` helper from lib.js that +mimics Python's output exactly. + +**Wrong JS:** +```js +collected_at = new Date(node.taken_at * 1000).toISOString(); +``` + +**Correct JS:** +```js +collected_at = formatUtcTimestamp(node.taken_at); +// formatUtcTimestamp is defined in js/lib.js as: +// new Date(unixSeconds * 1000).toISOString().replace('T', ' ').slice(0, 19) +``` + +**Example:** `modules/instagram.js:782` + +**Search pattern:** `new Date\([^)]+\)\.toISOString\(\)` — any use of +`.toISOString()`. The helper should be used instead. Quick check: +`grep -nE "\.toISOString\(\)" modules/` + +--- + +### `re.findall` capture groups vs JS `.match` with /g flag + +**Status:** open + +**Why it happens:** Python's `re.findall(r'#(\w+)', s)` returns the **capture +group contents**: `['lotr', 'woodart']`. JS's `s.match(/#(\w+)/g)` (with the +global flag) returns the **full matches**: `['#lotr', '#woodart']` — capture +groups are ignored. 
The generator translates the regex literally without +adjusting for this semantic difference, so the resulting strings keep +prefixes/wrappers that Python would have stripped. + +**Wrong JS:** +```js +hashtags: caption.match(/#([^\s!@#$%^&*()_+{}:"|<>?;',./`~]+)/g)?.join(',') +// produces "#lotr,#woodart" +``` + +**Correct JS:** +```js +// Option A: strip the literal prefix from each full match +hashtags: caption.match(/#([^\s...]+)/g)?.map(h => h.slice(1)).join(',') ?? '' +// Option B: use matchAll to get capture groups properly +hashtags: [...caption.matchAll(/#([^\s...]+)/g)].map(m => m[1]).join(',') +``` + +**Example:** `modules/instagram.js:812` (also 766, 870 — three copies) + +**Search pattern:** `\.match\(/[^/]*\([^/]*\)[^/]*/g\)` — any `.match()` with +a global-flag regex containing a capture group. Quick check: +`grep -nE "\.match\(/.*\(.*\).*\/g\)" modules/` + +--- + +### `undefined` field values get dropped from JSON, but Python's `None` becomes `null` + +**Status:** open + +**Why it happens:** When `JSON.stringify` encounters an object property whose +value is `undefined`, it **omits the key entirely** from the output. Python's +`json.dumps` serializes `None` as `null`, keeping the key. The generator +writes assignments like `location.city = node.location.city` where the +right-hand side can be `undefined`, producing missing keys in JS output +that show up as `only in Python: <field> = null` diffs against 4CAT. + +**Wrong JS:** +```js +location.city = node.location.city; // undefined if .city missing +// JSON.stringify({location_city: undefined}) → "{}" (key omitted) + +body: caption, // null if no caption — Python returns "" here, not null +``` + +**Correct JS:** +```js +// Whichever fallback Python uses for that specific field: +location.city = node.location.city ?? null; // some fields → null +body: caption ??
'', // other fields → "" +``` + +**Example:** `modules/instagram.js:745, 853` (`null` flavor), +559, 648, 798 (`""` flavor for `body`) + +**Note:** Python's choice of `None` vs `""` is per-field — there's no +universal rule. When the comparator reports `~ X JS: null Python: ""` use +`?? ''`. When it reports `- only in Python: X = null` use `?? null`. The +distinction matters because the JS output should match Python's choice +exactly for that field. + +**Search pattern:** harder to grep automatically — any property assignment +where the RHS could be `undefined`/`null` and the resulting field is +expected to appear in the mapped output. Look at "only in Python: X = null" +and "~ X JS: null Python: \"\"" diffs in the comparator output to find +specific cases. + +--- + +### Object-reference inequality used as type check + +**Status:** open + +**Why it happens:** The generator emits `caption !== new MissingMappedField('')` +to mean "caption is not a missing-marker", but `new MissingMappedField('')` +creates a fresh object every time, and `!==` on objects compares references. +The expression is **always true**, so the conditional never takes the +"missing" branch. Likely originates from Python idioms like `caption != ""` +or `caption is not None`, mistranslated through the MissingMappedField +abstraction. + +**Wrong JS:** +```js +hashtags: caption !== new MissingMappedField('') ? caption.match(...) : '', +// !== between two different object references is always true +``` + +**Correct JS:** +```js +// If the intent was "if caption has content", just truthy-check it: +hashtags: caption ? caption.match(...) : '', +// If the intent was "if caption is not a MissingMappedField instance": +hashtags: !(caption instanceof MissingMappedField) ? caption.match(...) : '', +``` + +**Example:** `modules/instagram.js:812` (and two other copies) + +**Search pattern:** `!== new [A-Z]` or `=== new [A-Z]` — any equality +comparison with a freshly-constructed object. 
Quick check: +`grep -nE "(!==|===) new [A-Z]" modules/` + +--- + +### `.method()` chain on potentially-null result + +**Status:** open + +**Why it happens:** In Python, calling a method on `None` raises +`AttributeError`, which 4CAT sometimes catches. In JS, calling a method on +`null`/`undefined` throws `TypeError: Cannot read properties of null +(reading '<method>')`. The generator emits the same dotted chain without +optional-chaining (`?.`) protection. + +**Wrong JS:** +```js +hashtags: caption !== new MissingMappedField('') + ? caption.match(/#([^\s!@#$%^&*()_+{}:"|<>?;',./`~]+)/g)?.join(',') + : '', +``` +(here `caption` is allowed to be `null`, so `caption.match(...)` blows up +on null caption) + +**Correct JS:** +```js +hashtags: caption + ? caption.match(/#([^\s!@#$%^&*()_+{}:"|<>?;',./`~]+)/g)?.join(',') ?? '' + : '', +``` + +**Example:** `modules/instagram.js:809` + +**Search pattern:** harder to grep — needs reading. Worth manual review of +any field that uses `caption.match`, `something.split`, `something.join` +without `?.` on a value that could be null/undefined. + +--- + +## Generator prompt feedback (running list) + +Concrete things to fold into the generator's prompt over time: + +1. **Python `x in y` where `y` is a string** → use `y.includes(x)` in JS, + never `x in y`. +2. **Python f-strings** → use JS template literals (backticks) with + `${...}` syntax. Never leave `{...}` in single- or double-quoted strings. +3. **`?? {}` after a `.get(...) or {}` translation** → only use this if the + following code does property-access. If the following code does a + truthy guard (`if (x && y)`), drop the default and use just `node.user`. +4. **Method chains on possibly-null values** → use `?.` (optional + chaining) instead of `.` whenever the receiver could be null/undefined. +5. **The auto-imports header block** → emit either real `import { ... }` + statements with valid relative paths, or a comment-only header. + Never emit bare paths as JS statements. +6.
**Python `node.get('X')` truthy check** → in JS, use `node.X` (or + `node.X != null`), not `'X' in node`. The `in` operator checks key + existence, which is True even for explicit-null values. +7. **Datetime serialization** → use the `formatUtcTimestamp` helper from + lib.js (which mimics Python's `strftime('%Y-%m-%d %H:%M:%S')` format), + not `new Date(...).toISOString()` (which has a different output shape: + T separator, milliseconds, Z suffix). +8. **`re.findall` with capture groups** → in JS, `.match(/.../g)` returns + full matches, NOT capture groups. To get capture-group behavior, use + either `[...s.matchAll(/.../g)].map(m => m[1])` or post-process the + full matches with `.map(...)` to strip the literal parts. +9. **Object-reference equality (`!== new X(...)`)** → never. Creating an + object with `new` produces a fresh reference; `===`/`!==` compares + identity. Use `instanceof X` for type checks, or compare values + directly. The MissingMappedField "is this missing?" check should be + `caption instanceof MissingMappedField` or just truthy-check the value. +10. **Python `None` → JSON `null` vs JS `undefined` → omitted** — when a + field's value could be missing and Python returns `null` for it, + JS must explicitly assign `null` (not leave the value as `undefined`). + `JSON.stringify` drops `undefined` keys silently. Use `value ?? null` + when the field is expected to appear in the mapped output. 
From 6ad4c134cf35d0993b2968f3b2dc832e2766794d Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 27 May 2026 18:45:52 +0200 Subject: [PATCH 20/33] package.json fix --- tests/package.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/package.json b/tests/package.json index 333564a..390fdd3 100644 --- a/tests/package.json +++ b/tests/package.json @@ -13,6 +13,7 @@ "dotenv": "^16.4.5", "fake-indexeddb": "^5.0.1", "jest": "^29.7.0", - "jest-environment-jsdom": "^29.7.0" + "jest-environment-jsdom": "^29.7.0", + "undici": "^6.20.0" } } From 11ffffbdea4b853fd88e219d719d6d7947fab6df Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 3 Jun 2026 15:04:51 +0200 Subject: [PATCH 21/33] rm other test doc --- docs/test-plan.md | 162 ---------------------------------------------- 1 file changed, 162 deletions(-) delete mode 100644 docs/test-plan.md diff --git a/docs/test-plan.md b/docs/test-plan.md deleted file mode 100644 index a4265eb..0000000 --- a/docs/test-plan.md +++ /dev/null @@ -1,162 +0,0 @@ -# Selenium Test Harness — Improvement Plan - -Date: 2026-04-30 - -Overview - -This document captures an actionable plan to improve the Selenium-based integration tests in `tests/test.py` for the Zeeschuimer Firefox extension. The goals are to: - -- Make profile handling reliable and reusable (so logged-in sessions persist across runs). -- Preserve and export captured data per platform for offline analysis and for passing to 4CAT. -- Add optional automated upload to a 4CAT instance for mapping/validation tests. -- Reduce fragility caused by popups and interactive dialogs (pausing/dismissal patterns). -- Improve robustness, error handling, and machine-readable results. - -Scope - -All changes are confined to the test harness and test metadata (`tests/test.py` and `tests/tests.json`) and to this planning document. 
No changes are required in the extension source for the planned items (the test harness will interact with the extension's UI pages and background DB). - -Phases & Changes - -Phase 1 — Profile management - -- Problem: copying an entire profile can race with a running Firefox and the current ignore rule hides potentially useful session data. -- Changes: - - Detect if the selected profile directory appears locked (presence of `lock` or `.parentlock`) and warn if Firefox is running. - - Replace the naive ignore lambda used in `shutil.copytree` with a function that only excludes `storage`, `extensions`, and `signedInUser.json` at the profile root. - - Add CLI flags: `--profile-name NAME` (choose profile by display name from `profiles.ini`), `--save-profile PATH` (save the temp profile for reuse), and `--no-cleanup` (do not remove `.temp-profile` after run). - -Implementation note (copytree ignore example): - -```python -def _profile_ignore(root, names): - # Only ignore these entries in the root profile dir - if os.path.abspath(root) == os.path.abspath(profile_dir): - return {"storage", "extensions", "signedInUser.json"} - return set() - -shutil.copytree(profile_dir, profile_file, ignore=_profile_ignore) -``` - -Phase 2 — Data preservation & export - -- Problem: `reset-all` wipes the DB before each URL; no artifacts are kept for post-mortem or mapping tests. -- Decision: export a single combined NDJSON file per platform containing items collected while testing that platform. -- Changes: - - Add CLI `--export-dir PATH` (default `./zeeschuimer-exports/{timestamp}/`). - - Before clicking `reset-all` for each URL, read the current DB contents from the extension background page (Dexie) via `execute_async_script` and append those items to a per-platform in-memory list in Python. After all URLs for a platform are done, write `{export-dir}/{platform}.ndjson`. 
- - Optionally add `--no-reset` to skip the `reset-all` call entirely (default behavior remains to reset before each URL). - -Execute_async_script pattern (example): - -```python -script = ''' -const cb = arguments[0]; -background.db.items.toArray().then(items => cb(JSON.stringify(items))).catch(e => cb(JSON.stringify({error: String(e)}))); -''' -items_json = driver.execute_async_script(script) -items = json.loads(items_json) -``` - -Phase 3 — 4CAT integration (optional) - -- Problem: mapping tests live in 4CAT and need NDJSON input. -- Changes: - - Add CLI flags: `--4cat-url URL` and `--4cat-key KEY` (API key). Require both for upload. - - After writing the per-platform NDJSON, POST it to `{4cat_url.rstrip('/')}/api/import-dataset/` with header `X-Zeeschuimer-Platform: {platform}` and `Authorization: {key}` (confirm header with your 4CAT instance; alternative is to trigger the extension UI upload button when cookie-based auth is required). - - Do not fail the test run on 4CAT errors — print status and continue. - -Example upload with `requests`: - -```python -import requests -with open(ndjson_path, 'rb') as f: - headers = { - 'X-Zeeschuimer-Platform': platform, - 'Authorization': f'{fourcat_key}' - } - r = requests.post(f"{fourcat_url.rstrip('/')}/api/import-dataset/", headers=headers, data=f) - # check r.status_code and r.text for details -``` - -Phase 4 — Interactive controls & popup dismissals - -- Problem: cookie banners, paywall prompts, and other popups frequently interfere with automated navigation and can cause false failures. -- Decision: pause by default **once per platform** (not before every URL) so the tester can clear residual prompts; provide opt-out and finer-grained options. -- Changes: - - CLI flags: `--no-interactive` (disable all pauses), `--pause-before-url` (pause before each URL), `--pause-on-fail` (pause on failure), `--extra-wait N` (add N seconds to every wait), `--screenshot-dir PATH` (capture screenshots on fail/warning). 
- - Add a `dismiss-selectors` optional field in `tests.json` per URL: a list of CSS selectors to click to dismiss known popups. Example: - -```json -"dismiss-selectors": ["button.cookie-accept", ".modal .close"] -``` - - - Add per-URL `timeout` (page load timeout override). - -Phase 5 — Runner robustness & reporting - -- Problem: unhandled exceptions abort the run; final runtime is calculated incorrectly; no machine-readable results. -- Changes: - - Wrap each URL test body in try/except, increment `failed` on exceptions, and continue. - - Move the global `start_time = time.time()` to before the outer platform loop so the final elapsed time is for the full run. - - Add CLI flags: `--results-file PATH` (write JSON summary), `--resume-from PLATFORM` (skip earlier platforms), and `--screenshot-dir PATH` (as noted). - - Fix small test metadata issues (e.g., `more-after-scrolll` typo in `tests.json`). - -tests.json schema additions - -- Per-URL optional fields: - - `dismiss-selectors`: array of CSS selectors to click after page load - - `timeout`: numeric page load timeout seconds for this URL - - `extra-wait`: per-URL additional wait seconds - -CLI flags (summary) - -- `--profiledir PATH` — explicit profile path (existing) -- `--profile-name NAME` — choose Firefox profile by display name -- `--save-profile PATH` — persist the copied profile for reuse -- `--no-cleanup` — keep `.temp-profile` -- `--export-dir PATH` — where to write NDJSON exports -- `--no-reset` — do not click `reset-all` between URLs -- `--4cat-url URL` — base URL for 4CAT server -- `--4cat-key KEY` — API key for 4CAT uploads -- `--4cat-per-url` — upload per URL instead of per platform (optional) -- `--no-interactive` — disable pausing (default is to pause per-platform) -- `--pause-before-url` — pause before each URL -- `--pause-on-fail` — pause when a test fails -- `--extra-wait N` — add N seconds to every URL wait -- `--screenshot-dir PATH` — save screenshots on fail/warning -- `--results-file PATH` — 
write machine-readable results JSON -- `--resume-from PLATFORM` — resume a run from a platform - -Verification checklist - -1. `python tests/test.py --sources instagram.com --export-dir ./exports` -> `exports/instagram.com.ndjson` exists and contains NDJSON with captured items. -2. `python tests/test.py --save-profile .saved-profile --login` -> create a saved profile that can be reused with `--profiledir .saved-profile`. -3. Run with default interactive behavior and confirm one pause per platform. -4. `python tests/test.py --results-file results.json` -> JSON summary produced with per-URL status and counts. -5. Test 4CAT upload using a local mock server and `--4cat-url http://localhost:8000 --4cat-key KEY`. - -Implementation steps (recommended order) - -1. Docs and small fixes (this document + tests.json typo fix). -2. Profile management changes (`--profile-name`, improved copy ignore, `--save-profile`, lock detection). -3. Export behavior: `--export-dir` + `execute_async_script` collection and NDJSON write. -4. Runner robustness: try/except around URL loop, `--results-file`, fix `start_time` placement. -5. Interactive and dismissal features (`dismiss-selectors`, pause flags, screenshots). -6. 4CAT upload integration (optional, requires confirmation of auth header). - -Estimated effort: 6–10 hours of focused work to implement and test everything end-to-end; can be split into 3-4 incremental PRs. - -Open questions / confirmations needed - -- Confirm 4CAT API key header format (currently suggested: `Authorization: {key}`). If your 4CAT requires cookie-based auth, we should emulate the extension upload button via Selenium instead. -- Confirm desired default for interactive mode. (Current recommendation: pause once per platform by default; provide `--no-interactive` to run fully headless.) - -Next steps - -- I have created a matching TODO list in the session tracker and written this document to `docs/test-plan.md`. 
-- If you want, I can start implementing Phase 1 (profile management) in `tests/test.py` now and submit incremental changes. - ---- - -Requested file: `docs/test-plan.md` From 6cc61003e95be381b191baae1486f989a2ed3e71 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 3 Jun 2026 15:05:55 +0200 Subject: [PATCH 22/33] map_item.test.js verify modules import and map_item exists only --- tests/map_item.test.js | 134 ++++++++++------------------------------- 1 file changed, 31 insertions(+), 103 deletions(-) diff --git a/tests/map_item.test.js b/tests/map_item.test.js index 2dc1bb6..774c083 100644 --- a/tests/map_item.test.js +++ b/tests/map_item.test.js @@ -1,121 +1,49 @@ /** - * Smoke test driver for module `map_item` functions. + * Load-only smoke for every module under `modules/*.js`. * - * Convention: - * tests/fixtures//*.ndjson + * For each module file, runs `inspect_module()` and asserts the module: + * - parses (no SyntaxError) + * - imports without throwing + * - either exports a `map_item` function, or doesn't (both are fine here) * - * matches a file in modules/ (e.g. "tiktok" maps to modules/tiktok.js). - * Each .ndjson line is one Zeeschuimer-stored item exported from the popup. + * No data is fed through `map_item`. That work belongs in the comparator + * (Tier 2 — `npm run test:compare`), where real items pulled from a 4CAT + * dataset provide both the input and the expected output. * - * Each item is wrapped via wrap_for_map_item to mirror how 4CAT's importer - * presents items to a map_item function, then run through the module's - * map_item. Tests assert: function returns a non-null object, and any fields - * listed in REQUIRED_NON_EMPTY for that module are present and non-empty. 
- * - * Module-level state is determined upfront by inspect_module(): - * - 'ok' → register per-item tests - * - 'no_map_item' → register a single skipped test (not applicable) - * - 'syntax_error' → register a single failing test pointing at the line - * - 'import_error' → register a single failing test with the message + * Catches: parse errors, import-time throws, broken top-level statements. + * Does NOT catch: anything that requires running `map_item` on real input. */ -import { readdirSync, readFileSync, statSync, existsSync } from 'node:fs'; +import { readdirSync } from 'node:fs'; import { join, dirname } from 'node:path'; import { fileURLToPath } from 'node:url'; import { inspect_module } from './_module-info.js'; const __dirname = dirname(fileURLToPath(import.meta.url)); -const FIXTURE_ROOT = join(__dirname, 'fixtures'); - -const REQUIRED_NON_EMPTY = { - tiktok: ['id', 'author', 'unix_timestamp'], -}; - -/** - * Local mirror of wrap_for_map_item from js/lib.js. lib.js is loaded by - * the browser as a plain script and so cannot be imported from Node; this - * three-line mirror is cheaper than restructuring lib.js into a module. - */ -function wrap_for_map_item(stored_item) { - const { data, ...meta } = stored_item; - return { ...data, __import_meta: meta }; -} - -function list_module_dirs() { - if (!existsSync(FIXTURE_ROOT)) return []; - return readdirSync(FIXTURE_ROOT).filter(name => { - try { return statSync(join(FIXTURE_ROOT, name)).isDirectory(); } - catch { return false; } - }); -} +const MODULES_ROOT = join(__dirname, '..', 'modules'); -const module_dirs = list_module_dirs(); +const module_files = readdirSync(MODULES_ROOT) + .filter(f => f.endsWith('.js') && !f.startsWith('_')); -// Pre-pass: synchronously determine each module's state so we can branch -// on it at describe/test registration time. Top-level await is supported -// in Jest's experimental-vm-modules mode. 
const module_info = {}; -for (const module_name of module_dirs) { - module_info[module_name] = await inspect_module(module_name); +for (const file of module_files) { + const name = file.replace(/\.js$/, ''); + module_info[name] = await inspect_module(name); } -let total_fixtures = 0; - -for (const module_name of module_dirs) { - const fixture_dir = join(FIXTURE_ROOT, module_name); - const fixture_files = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson')); - if (fixture_files.length === 0) continue; - total_fixtures += fixture_files.length; - - const info = module_info[module_name]; - - if (info.state === 'no_map_item') { - describe(`map_item: ${module_name}`, () => { - test.skip(`modules/${module_name}.js does not export a map_item function — nothing to smoke test`, () => {}); +describe('module load smoke', () => { + for (const file of module_files) { + const name = file.replace(/\.js$/, ''); + test(`modules/${file} loads cleanly`, () => { + const info = module_info[name]; + if (info.state === 'syntax_error') { + throw new Error(`syntax error in modules/${file}:\n${info.error}`); + } + if (info.state === 'import_error') { + throw new Error(`import failed for modules/${file}: ${info.error.message}`); + } + // 'ok' or 'no_map_item' — both acceptable at this tier. + expect(['ok', 'no_map_item']).toContain(info.state); }); - continue; } - - if (info.state === 'syntax_error' || info.state === 'import_error') { - const msg = info.state === 'syntax_error' - ? 
`syntax error:\n${info.error}` - : `import failed: ${info.error.message}`; - describe(`map_item: ${module_name}`, () => { - test(`module loads`, () => { throw new Error(msg); }); - }); - continue; - } - - // state === 'ok' — register per-item tests - const map_item = info.map_item; - - describe(`map_item: ${module_name}`, () => { - for (const fixture_file of fixture_files) { - const lines = readFileSync(join(fixture_dir, fixture_file), 'utf8') - .split('\n') - .filter(line => line.trim().length > 0); - - describe(fixture_file, () => { - lines.forEach((line, i) => { - test(`item ${i} maps without throwing`, () => { - const stored_item = JSON.parse(line); - const mapped = map_item(wrap_for_map_item(stored_item)); - expect(mapped).not.toBeNull(); - expect(typeof mapped).toBe('object'); - for (const field of REQUIRED_NON_EMPTY[module_name] ?? []) { - expect(mapped[field]).toBeDefined(); - expect(mapped[field]).not.toBe(''); - expect(mapped[field]).not.toBeNull(); - } - }); - }); - }); - } - }); -} - -if (total_fixtures === 0) { - describe('map_item', () => { - test.skip('no fixtures found under tests/fixtures//*.ndjson', () => {}); - }); -} +}); From a090675c162573b3ae8633584010464d3d264bdc Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 3 Jun 2026 15:06:24 +0200 Subject: [PATCH 23/33] remove old fixtures and 4cat probe --- tests/__pycache__/test.cpython-39.pyc | Bin 7345 -> 0 bytes tests/fixtures/.gitignore | 5 - tests/fixtures/README.md | 29 ------ tests/probe-4cat.mjs | 140 -------------------------- 4 files changed, 174 deletions(-) delete mode 100644 tests/__pycache__/test.cpython-39.pyc delete mode 100644 tests/fixtures/.gitignore delete mode 100644 tests/fixtures/README.md delete mode 100644 tests/probe-4cat.mjs diff --git a/tests/__pycache__/test.cpython-39.pyc b/tests/__pycache__/test.cpython-39.pyc deleted file mode 100644 index 745e2b4aaad921a459372bb50b39980c50a68136..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 
literal 7345 zcmai3-E$k)b>CeqKnNl!lATnGrZeqKUpk()>F?ZK zfYgUo?Cjn9aqc#`iup9_Rb`1mDjmADRio3;Y0oamS9A_{-Zn zI^O|iiy!3Q;nuH=&MRVKL*uV(8SId->aT93{IL20jve)WFJ2Ur;?Rb^p|fd`7tSsMTX) znjhuUpKCnNk3CEQ%k)KU>2-EojCW>4x>FFxxAb~(OJ^nQIW9__6MSY%2S-@>LVIKq zmQDT6@3C5>zBh(m30UiHNgf0MuUxrUvyol_h;E^3h5X>j#U=M10ioE2|%-rdr+^exB}bWd*@ z@=L6qQfsH_)bV+|EM`w9Cpn|O*$s`I6MH-7#e3r91~5Q^$2%AJSw6940N3%(#Vv3k zj*D~U-e*Pih zy4?9dyvg6g{P&4lHFJeuNP1WK#iVzQ&#B%NzodA)+be-L|A4>0 zWz=u5dH#Wzhi?br&a$LC;2bI__ahHBlP7u zTfn=BcS)EX6T1C`e+ZqHx3v1Fd~Q-3*O(1?FK=lVwaxU>_@cq~u}TjXr@z(V;v=%6 zgwGH7^?vUMVqZt_*ZGYtW534d_s1|(!d5(--ZbT}#XfQRuC97hVjs?QGuGXvd8a5U z30-sPFF|*ORYkQw^ApU}m?Pp{x5Nte-V!zFKCQ4ni?bgp&hla(v~+u1;~(wUSiOIS z!|@p+OG&h*V4(rwKp!7NM+VK@*1R`OP3)Gy*k-{)9n^1&QO@`RU##kU>0t^p zpTvImvE~^kG=-;3cD9+8{}P=B71F62&%>Gx))0-3hnSL zN45rQTUV#9ku+Fav^NLjcppom2esGfX@S>OkMKQrZd+UY@ML0>PuS@5xm!sc9`}3p!khawwR?dd#&^N{w-`$T|E1{d{8H__vu$ZyiU9ltk5UPSC52(h`@24E z6I>rkFi?+ZKaC-L3PWwjS+cU;$EO2dUEwoS?@4^ECpfVbMaJ*;PrH^tEGk|Xz288l z>rY58EC28q#XtPNGW)a-Z*K0AxoAtIKgC}x)2I)b@Cm=XNAU&m>34X4qvF)% z&V7|NE_WWN?{-e?#kwi+nOG2uVyOxbP(F4q$+=VfejH83CzBd~0Pp)OsUfELkD#R= ziv{I}#s7=PzEJD>iuUYpN%&<@N5sht1IOy@0y&Lf&;M0$t<1 zHrkw6uZZpCitqYTa4hUv6U$sWcZI|t%k5BvAcB4-Xa}Ka#Cw~z$6JAdF|XTU4YOv| z^_Qc;LZK=B6~`6akf++jadli&!CGwR(x=f)J-Ehq5K|DM=-@+=Hf@ zu`48y2zWx0WOS%U<%9VTuUW*kRq?&hc09+cS}{yceTww}8+R27f!~r95kzm!#~mU| z5Hr}>XxN27G;Il~@l)1v+e%J1qd^Li`JS-EI}l6-EOYzp zJ3#TDcr`hR<|eE%wJO@(0V!brfrwHCYdRX8`nWLFC`@r{>hkQ=)!C`}$jDpKI~Quc zACR3i{2;WXs0fdw0bArsLer2n797tCYr=9J^zCK;uEo z)QUk8NPg6(SE(e6P8C(?U+YkZ0d=qe@+`Y5D1L;(3n;uqqk&WP1b^QnMJPv6x`8zH z^{(DlgSgobgi6%csq+FF3M|+ScYVwDnQ_ zjbPTg_aHK6tY|m}fmmhBo3)~`8!b=82)J|{CyLSN6}#nCYNUq5lcT9XR9eyr+eL~k z=oN|0N?7)#gE&@h`tYGPxE_GyC_Z-w{5U8_MzNUJ*{;40BLp>JyI~Ci{+goM*Xz`z z(L@L5ya32Sct={WoB(>aQRhWdu)9_`%f)i53bNH#OE?9&QYoxCK_Q6=)nEP<{r>UY z=whi9Ai^)43*q15%%L4v4=t5S#bT+%!-WtgFr7bh=zT8~(z9Lb)~)w1_Sbndf|yh2 zo5^gc6zwZD{bs9Ka%jhjT@l4v*l<}Yw3nlyoqD0Q_<1q z5P3BU3c|COkrNA1YH_g`z40wHUU@bVkN^!U8nk9-tzgw@Hlf7Gq)~M8*@UN$F1Nza 
z_lgjs2n#S%kW6?!^^I;tDeUf=YsjN5vY~~q{^qw!EYlX0Pno%lJWXC@hoKCjBUTeW z51IO_5WZK1GQye-@xz2*SR`z+v}mMaHz7t!g0YNS*MOSXT=?pjoqzw`&z7tUF$5B0 z2rUSXh-5uS)q(Hcg{n!(3GQwdfl+OZvdbb|6T-8iw5kQZ(F;*#;gb`Ecb1|fQiLt( zS>Gm|S1G3ig=oK3^jeLA13wEOA;ez&&Xa=3CureOP>8i+J8lb**0f?hQTk(e^Z6fj2g+p_ zYq`v_wydIt34U@zl41wj+nX(K-zq3|$%eXDNRm9u_JHI7hLUQF zZnhV>!SMIP4HA6vf=ZxszUj7l81M<_N`s)AZX$z*T@%BQ?e2l2)?(RY5OhrELE;@g_PIL)_tTH9E+jERIzQFF{%()Mm9qD zI?Gm2&{3~z`c2`nw91Ms<9GztH7VSp7>lnUD1K0jhisT6OhL7*pt~8WVNf}nM73+8 zvJK=>CB~#axgJZQn*xKhJ1jS0KSBzxBG_1~Nw>cpQvut(<2ELb5UPO$75?#{E{$x+ z@gN4*Eki55$ABHD_UHCEaPat|QKoE# z!@5E=+29I-aTy1u%+AsuT|a;GGF$|qo0+Ya!`Yr4^?$h~UA8Mu^5xKHIZv#W<7m!? z<8K)Wjk_XX1HDdK-6p$fRTQvH95Gmi+*$B$wihkvwrR^yG29)6twH({7ZBSm2Tp{F z4nELocmXqN638ggkZ+(GMGVCyEhbNQ4mbnkp6#vS9MTO~ig<_fgdxQyG67B}9=RDO z-?`pBo(*1@pD)kPU75RZ^TQi#Pm(YDA{umpGIwApb{ob!am}A^Rcdpw{Q5{B!FH4E z3oao`q~IYBWhwkR&N^VkUO_DbTTYR;q=MEm+l35Tjt@3MU6mI|k4TzqD3tA_5B|Gp zS#Jf{*9g$BxVYAI%{wiq6_^@3s&=~ENxClJvexi<%N1wowivi*&2(1pKgOW`jK|38 zBY4JCn;kH8<9{=$%>U3%n#~&7)Sy00SPW`0gZH43GsdtJk2$92^ju~f*pC?#`lO!G zC$J}r?=a|P4UF~S)PdAk3UGREAWN+=q7Py`sAmjNAS$}4o-FVZ)e*zgjX#>8jTUbP z7)OB7#OjznnHm9>5wr%r6Z(j8BsFXp`Zz}Pj_4Wlm_C~tS9@}Q%%)65i+1&%3Ggti r)7nAsdQd+C-g9`Zeye5-%!0d@)ao#;8W#8)2S0R@tbPE`VLbl_#VpU) diff --git a/tests/fixtures/.gitignore b/tests/fixtures/.gitignore deleted file mode 100644 index 8e89a83..0000000 --- a/tests/fixtures/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -# Ignore everything in this directory -* -# Except these files -!.gitignore -!README.md \ No newline at end of file diff --git a/tests/fixtures/README.md b/tests/fixtures/README.md deleted file mode 100644 index d24fe06..0000000 --- a/tests/fixtures/README.md +++ /dev/null @@ -1,29 +0,0 @@ -# Test fixtures for `map_item` - -Real captured items used to exercise each module's auto-generated `map_item` -function. - -## Layout - -``` -tests/fixtures/ - / - .ndjson - .ndjson -``` - -`` matches the filename in `modules/` without `.js` — -e.g. 
`tiktok/` → `modules/tiktok.js`, `pinterest/` → `modules/pinterest.js`. -You can drop multiple `.ndjson` files in a module folder; each gets its own -`describe` block and each line becomes its own `test`. - -Filenames are free-form — the auto-export filename from the popup -(`zeeschuimer-export--.ndjson`) is fine. - -## Privacy / committing - -These files contain real captured platform data — usernames, post -content, URLs, sometimes images and other PII. - -If we want to create test exports or annonomize real exports, add them to -.gitignore. \ No newline at end of file diff --git a/tests/probe-4cat.mjs b/tests/probe-4cat.mjs deleted file mode 100644 index 0bf4e4d..0000000 --- a/tests/probe-4cat.mjs +++ /dev/null @@ -1,140 +0,0 @@ -/** - * Manually exercise 4CAT's /api/map-item/ endpoint against a fixture item. - * - * Usage: - * node probe-4cat.mjs [] [--index N] - * - * is the Zeeschuimer module filename without `.js` (e.g. - * "tiktok", "pinterest"). If is omitted, the first - * .ndjson in tests/fixtures// is used. --index selects which - * line of the fixture to send (default 0). - * - * Requires tests/.env with FOURCAT_URL and FOURCAT_API_KEY. - */ - -import 'dotenv/config'; -import { readFileSync, existsSync, readdirSync } from 'node:fs'; -import { join, dirname } from 'node:path'; -import { fileURLToPath } from 'node:url'; - -const __dirname = dirname(fileURLToPath(import.meta.url)); - -const FOURCAT_URL = process.env.FOURCAT_URL?.replace(/\/$/, ''); -const FOURCAT_API_KEY = process.env.FOURCAT_API_KEY; - -if (!FOURCAT_URL || !FOURCAT_API_KEY || FOURCAT_API_KEY === 'your-api-key-here') { - console.error('error: FOURCAT_URL and FOURCAT_API_KEY must be set in tests/.env'); - console.error(' (copy tests/.env.example to tests/.env and fill in real values)'); - process.exit(1); -} - -const ID_MAP_PATH = join(__dirname, 'zeeschuimer-to-4cat.json'); -const ID_MAP = existsSync(ID_MAP_PATH) - ? 
JSON.parse(readFileSync(ID_MAP_PATH, 'utf8')) - : {}; - -function auth_headers() { - return { 'Authorization': `${FOURCAT_API_KEY}` }; -} - -async function list_datasources() { - const res = await fetch(`${FOURCAT_URL}/api/datasources/`, { headers: auth_headers() }); - if (!res.ok) { - throw new Error(`GET /api/datasources/ → ${res.status}: ${await res.text()}`); - } - const body = await res.json(); - return body.datasources ?? []; -} - -async function map_item(datasource_id, item) { - const res = await fetch(`${FOURCAT_URL}/api/map-item/${datasource_id}/`, { - method: 'POST', - headers: { ...auth_headers(), 'Content-Type': 'application/json' }, - body: JSON.stringify({ item }), - }); - const text = await res.text(); - let body; - try { body = JSON.parse(text); } catch { body = { raw: text }; } - return { status_code: res.status, body }; -} - -function parse_args(argv) { - const args = { module: null, fixture: null, index: 0 }; - const positional = []; - for (let i = 2; i < argv.length; i++) { - if (argv[i] === '--index') { - args.index = parseInt(argv[++i], 10); - } else if (argv[i].startsWith('--index=')) { - args.index = parseInt(argv[i].split('=')[1], 10); - } else { - positional.push(argv[i]); - } - } - args.module = positional[0]; - args.fixture = positional[1]; - return args; -} - -async function main() { - const args = parse_args(process.argv); - if (!args.module) { - console.error('Usage: node probe-4cat.mjs [] [--index N]'); - process.exit(1); - } - - const datasource_id = ID_MAP[args.module] ?? args.module; - const fixture_dir = join(__dirname, 'fixtures', args.module); - - if (!existsSync(fixture_dir)) { - console.error(`error: no fixture dir at ${fixture_dir}`); - process.exit(1); - } - - const candidates = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson')); - if (candidates.length === 0) { - console.error(`error: no .ndjson fixtures under ${fixture_dir}`); - process.exit(1); - } - const fixture_name = args.fixture ?? 
candidates[0]; - const fixture_path = join(fixture_dir, fixture_name); - if (!existsSync(fixture_path)) { - console.error(`error: fixture ${fixture_path} not found`); - process.exit(1); - } - - const lines = readFileSync(fixture_path, 'utf8').split('\n').filter(l => l.trim().length > 0); - if (args.index >= lines.length) { - console.error(`error: --index ${args.index} but fixture has ${lines.length} items`); - process.exit(1); - } - const item = JSON.parse(lines[args.index]); - - console.log(`Module: ${args.module}`); - console.log(`Datasource id: ${datasource_id}${ID_MAP[args.module] ? ' (mapped via zeeschuimer-to-4cat.json)' : ''}`); - console.log(`URL: ${FOURCAT_URL}/api/map-item/${datasource_id}/`); - console.log(`Fixture: ${fixture_name}, item ${args.index} (item_id=${item.item_id ?? item.id})`); - console.log(''); - - const { status_code, body } = await map_item(datasource_id, item); - console.log(`HTTP ${status_code}`); - console.log(JSON.stringify(body, null, 2)); - - if (status_code === 404) { - console.error(''); - console.error('Hint: datasource id may be wrong. 
Available Zeeschuimer-origin datasources:'); - try { - const datasources = await list_datasources(); - datasources - .filter(d => d.is_from_zeeschuimer && d.has_map_item) - .forEach(d => console.error(` - ${d.id} (${d.name})`)); - } catch (e) { - console.error(` (couldn't fetch list: ${e.message})`); - } - process.exit(2); - } -} - -main().catch(e => { - console.error(`probe failed: ${e.message}`); - process.exit(2); -}); From c62a7e796db9bc3e1f7cb12f78fc50cbfa37e60c Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 3 Jun 2026 15:06:47 +0200 Subject: [PATCH 24/33] update lib.js note on new endpoint --- js/lib.js | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/js/lib.js b/js/lib.js index c618a6a..518a6fa 100644 --- a/js/lib.js +++ b/js/lib.js @@ -59,7 +59,11 @@ class MissingMappedField { } // Mirror 4CAT's API serialization so JSON.stringify produces the same - // tagged form on both sides. See docs/4cat-map-item-api.md. + // tagged form on both sides: 4CAT's /api/dataset/<key>/items/ endpoint, + // when called with `missing_fields=keep`, emits missing values as + // `{ __missing: true, value: <value> }`. Matching that shape here + // lets the map_item comparator deep-equal both sides without special + // handling. toJSON() { return { __missing: true, value: this.value }; } From 234f1ce4377ceedf64777054b303e01d84293a2c Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 3 Jun 2026 15:07:21 +0200 Subject: [PATCH 25/33] update tests/.env.example (comments and dataset keys) --- tests/.env.example | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/tests/.env.example b/tests/.env.example index 2e021bb..137a52b 100644 --- a/tests/.env.example +++ b/tests/.env.example @@ -1,9 +1,23 @@ -# 4CAT API config for the map_item comparison tests. +# 4CAT API config for the map_item comparator (`npm run test:compare`). # Copy this file to .env in this directory and fill in real values. 
# .env is gitignored; .env.example is the committed template. -# Base URL of the 4CAT instance to hit. No trailing slash. +# Base URL of the 4CAT instance to hit. No trailing slash. Default ports: +# :80 for nginx (production) +# :4000 for the Flask dev server FOURCAT_URL=http://localhost -# API key for that 4CAT instance. Get one from the 4CAT UI; tied to your user. +# API key for that 4CAT instance. Get one from the 4CAT UI; tied to your +# user. 4CAT accepts the raw key as the Authorization header value (no +# `Bearer ` prefix). FOURCAT_API_KEY=your-api-key-here + +# Comma-separated list of dataset keys (the 32-char ids from 4CAT dataset +# URLs) to compare. The comparator pulls inputs from /download/ and +# expected outputs from +# /api/dataset/<key>/items/?annotations=no&missing_fields=keep&stream=true +# for each. Datasource is read from each dataset's metadata. +# +# `npm run test:compare -- <key>` narrows a single run to one key; the key +# must still be listed here. +FOURCAT_DATASETS=key1,key2,key3 From e0d0fb834983456aafadf4f1f9708855aa502b1c Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 3 Jun 2026 15:08:31 +0200 Subject: [PATCH 26/33] note on _loader.js for `wrap_for_map_item` --- modules/_loader.js | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules/_loader.js b/modules/_loader.js index afae2d7..ceb0080 100644 --- a/modules/_loader.js +++ b/modules/_loader.js @@ -1,3 +1,8 @@ +// Load-order dependency: `wrap_for_map_item` (used below) is a free global +// defined in js/lib.js, which manifest.json loads as a plain background +// script before this module. There is no import for it here on purpose — +// MV2 background scripts share one global scope. If lib.js stops being +// loaded first, the mapper wrapper below will ReferenceError. 
async function load() { const imported_modules = [ await import("./tiktok.js"), From f2341d6e798a39f777d13e5c60af81d360ae6714 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 3 Jun 2026 15:09:51 +0200 Subject: [PATCH 27/33] fix my test environment; scripts vs libraries --- .gitignore | 2 + tests/_module-info.js | 36 ++++++++++++------ tests/jest.compare.config.cjs | 20 ++++++++++ tests/jest.config.cjs | 3 ++ tests/package-lock.json | 70 +++++++++++++++++++++++++++++------ tests/package.json | 12 +++--- tests/run-compare.mjs | 43 +++++++++++++++++++++ tests/setup-globals.cjs | 52 +++++++++++--------------- 8 files changed, 179 insertions(+), 59 deletions(-) create mode 100644 tests/jest.compare.config.cjs create mode 100644 tests/run-compare.mjs diff --git a/.gitignore b/.gitignore index fea65f3..4d495c9 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,8 @@ .temp-profile tests/.env tests/.env.local +__pycache__/ +*.pyc # logs geckodriver.log diff --git a/tests/_module-info.js b/tests/_module-info.js index e261e4e..e6866a3 100644 --- a/tests/_module-info.js +++ b/tests/_module-info.js @@ -7,6 +7,9 @@ * the dynamic importer). * 2. Dynamically importing it and checking for a `map_item` export. * + * Results are cached per module name so test files that load this helper + * via separate Jest workers/files don't pay the spawnSync cost twice. 
+ * * Returns one of four states the test driver can branch on: * { state: 'ok', map_item: } * { state: 'no_map_item' } @@ -21,25 +24,36 @@ import { fileURLToPath } from 'node:url'; const __dirname = dirname(fileURLToPath(import.meta.url)); const MODULES_ROOT = join(__dirname, '..', 'modules'); +const syntax_cache = new Map(); +const inspect_cache = new Map(); + function check_module_syntax(module_name) { + if (syntax_cache.has(module_name)) return syntax_cache.get(module_name); const module_path = join(MODULES_ROOT, `${module_name}.js`); const result = spawnSync(process.execPath, ['--check', module_path], { encoding: 'utf8' }); - if (result.status === 0) return null; - return (result.stderr || result.stdout || `exit code ${result.status}`).trim(); + const out = result.status === 0 + ? null + : (result.stderr || result.stdout || `exit code ${result.status}`).trim(); + syntax_cache.set(module_name, out); + return out; } export async function inspect_module(module_name) { + if (inspect_cache.has(module_name)) return inspect_cache.get(module_name); const syntax_error = check_module_syntax(module_name); + let result; if (syntax_error) { - return { state: 'syntax_error', error: syntax_error }; - } - try { - const mod = await import(`../modules/${module_name}.js`); - if (typeof mod.map_item !== 'function') { - return { state: 'no_map_item' }; + result = { state: 'syntax_error', error: syntax_error }; + } else { + try { + const mod = await import(`../modules/${module_name}.js`); + result = typeof mod.map_item === 'function' + ? 
{ state: 'ok', map_item: mod.map_item } + : { state: 'no_map_item' }; + } catch (e) { + result = { state: 'import_error', error: e }; } - return { state: 'ok', map_item: mod.map_item }; - } catch (e) { - return { state: 'import_error', error: e }; } + inspect_cache.set(module_name, result); + return result; } diff --git a/tests/jest.compare.config.cjs b/tests/jest.compare.config.cjs new file mode 100644 index 0000000..070e2ff --- /dev/null +++ b/tests/jest.compare.config.cjs @@ -0,0 +1,20 @@ +// Tier 2 — live comparator against a 4CAT instance. +// +// Runs only `map_item_compare.test.js`. Requires FOURCAT_URL, +// FOURCAT_API_KEY, and FOURCAT_DATASETS to be set in tests/.env. Hard-errors +// rather than silently skipping if env is missing. +// +// Env is jsdom so that the four modules using `strip_tags` (gab, pinterest, +// rednote, truth) have a native DOMParser. The comparator uses cross-fetch +// to provide a jsdom-friendly fetch (jsdom doesn't ship fetch and undici +// crashes inside jsdom). +module.exports = { + testEnvironment: 'jsdom', + testMatch: ['**/map_item_compare.test.js'], + testPathIgnorePatterns: ['/node_modules/'], + transform: {}, + moduleFileExtensions: ['js', 'json'], + setupFiles: ['/setup-globals.cjs'], + testTimeout: 30000, + verbose: true +}; diff --git a/tests/jest.config.cjs b/tests/jest.config.cjs index ea72b10..239abbc 100644 --- a/tests/jest.config.cjs +++ b/tests/jest.config.cjs @@ -1,6 +1,9 @@ +// Default Jest config — Tier 1 only (duplicate-behavior + load-only smoke). +// The comparator is excluded; invoke it via `npm run test:compare`. 
module.exports = { testEnvironment: 'jsdom', testMatch: ['**/*.test.js'], + testPathIgnorePatterns: ['/node_modules/', 'map_item_compare\\.test\\.js$'], transform: {}, moduleFileExtensions: ['js', 'json'], collectCoverageFrom: ['*.test.js'], diff --git a/tests/package-lock.json b/tests/package-lock.json index 7758e9f..ada8011 100644 --- a/tests/package-lock.json +++ b/tests/package-lock.json @@ -8,12 +8,12 @@ "name": "zeeschuimer-db-tests", "version": "1.0.0", "devDependencies": { + "cross-fetch": "^4.0.0", "dexie": "^3.2.4", "dotenv": "^16.4.5", "fake-indexeddb": "^5.0.1", "jest": "^29.7.0", - "jest-environment-jsdom": "^29.7.0", - "undici": "^6.20.0" + "jest-environment-jsdom": "^29.7.0" } }, "node_modules/@babel/code-frame": { @@ -1599,6 +1599,16 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, + "node_modules/cross-fetch": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/cross-fetch/-/cross-fetch-4.1.0.tgz", + "integrity": "sha512-uKm5PU+MHTootlWEY+mZ4vvXoCn4fLQxT9dSc1sXVMSFkINTJVN8cAQROpwcKm8bJ/c7rgZVIBWzH5T78sNZZw==", + "dev": true, + "license": "MIT", + "dependencies": { + "node-fetch": "^2.7.0" + } + }, "node_modules/cross-spawn": { "version": "7.0.6", "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz", @@ -3481,6 +3491,52 @@ "dev": true, "license": "MIT" }, + "node_modules/node-fetch": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", + "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", + "dev": true, + "license": "MIT", + "dependencies": { + "whatwg-url": "^5.0.0" + }, + "engines": { + "node": "4.x || >=6.0.0" + }, + "peerDependencies": { + "encoding": "^0.1.0" + }, + "peerDependenciesMeta": { + "encoding": { + "optional": true + } + } + }, + "node_modules/node-fetch/node_modules/tr46": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", + "integrity": 
"sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==", + "dev": true, + "license": "MIT" + }, + "node_modules/node-fetch/node_modules/webidl-conversions": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", + "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==", + "dev": true, + "license": "BSD-2-Clause" + }, + "node_modules/node-fetch/node_modules/whatwg-url": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", + "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", + "dev": true, + "license": "MIT", + "dependencies": { + "tr46": "~0.0.3", + "webidl-conversions": "^3.0.0" + } + }, "node_modules/node-int64": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/node-int64/-/node-int64-0.4.0.tgz", @@ -4198,16 +4254,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/undici": { - "version": "6.26.0", - "resolved": "https://registry.npmjs.org/undici/-/undici-6.26.0.tgz", - "integrity": "sha512-4yqz8a3n5HmGTlsbADNtr/dJlhkh/55Rq798G6ibiULcXbDtaLpTl1pvdqcbFfeoj3iSi52lePFM7h9H21cw/A==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18.17" - } - }, "node_modules/undici-types": { "version": "7.16.0", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz", diff --git a/tests/package.json b/tests/package.json index 390fdd3..763321c 100644 --- a/tests/package.json +++ b/tests/package.json @@ -1,19 +1,19 @@ { "name": "zeeschuimer-db-tests", "version": "1.0.0", - "description": "Unit tests for Zeeschuimer duplicate handling logic", + "description": "Unit tests for Zeeschuimer duplicate handling logic and map_item generator output", "type": "module", "scripts": { - "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js", 
- "test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --watch", - "probe": "node probe-4cat.mjs" + "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --config jest.config.cjs", + "test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --config jest.config.cjs --watch", + "test:compare": "node run-compare.mjs" }, "devDependencies": { + "cross-fetch": "^4.0.0", "dexie": "^3.2.4", "dotenv": "^16.4.5", "fake-indexeddb": "^5.0.1", "jest": "^29.7.0", - "jest-environment-jsdom": "^29.7.0", - "undici": "^6.20.0" + "jest-environment-jsdom": "^29.7.0" } } diff --git a/tests/run-compare.mjs b/tests/run-compare.mjs new file mode 100644 index 0000000..69240ab --- /dev/null +++ b/tests/run-compare.mjs @@ -0,0 +1,43 @@ +/** + * Launcher for the Tier 2 map_item comparator (`npm run test:compare`). + * + * npm run test:compare -> compares every key in FOURCAT_DATASETS + * npm run test:compare -- -> narrows the run to a single key + * npm run test:compare -- -t "id=123" -> key + forwarded jest flags + * + * Why this exists instead of invoking jest directly: jest treats any bare + * positional argument as a test-path-pattern filter. A 4CAT dataset key + * (`5daeba72a2dfbb5ed8c855f824a61570`) matches no test file path, so + * `jest ` silently discovers zero tests and exits "green" having run + * nothing. This launcher intercepts the first non-flag argument, hands it to + * the comparator through the COMPARE_DATASET env var, and forwards only the + * remaining flags to jest — so the key never reaches jest's argv. + */ + +import { spawn } from 'node:child_process'; +import { fileURLToPath } from 'node:url'; +import { dirname, join } from 'node:path'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const args = process.argv.slice(2); + +// First non-flag arg (if any) is the dataset key to narrow to. Everything +// that looks like a flag is forwarded to jest verbatim. 
+const dataset_key = args.find(a => !a.startsWith('-')); +const jest_flags = args.filter(a => a !== dataset_key); + +const env = { ...process.env }; +if (dataset_key) env.COMPARE_DATASET = dataset_key; + +const jest_bin = join(__dirname, 'node_modules', 'jest', 'bin', 'jest.js'); +const child = spawn( + process.execPath, + ['--experimental-vm-modules', jest_bin, '--config', 'jest.compare.config.cjs', ...jest_flags], + { stdio: 'inherit', cwd: __dirname, env }, +); + +child.on('exit', code => process.exit(code ?? 1)); +child.on('error', err => { + console.error(`failed to launch jest: ${err.message}`); + process.exit(1); +}); diff --git a/tests/setup-globals.cjs b/tests/setup-globals.cjs index 6793cc0..b55e659 100644 --- a/tests/setup-globals.cjs +++ b/tests/setup-globals.cjs @@ -4,50 +4,42 @@ * loads lib.js as a plain script. * * map_item bodies reference these as free identifiers (MappedItem, - * MissingMappedField, strip_tags, normalize_url_encoding, ...). Without this - * shim they'd hit ReferenceError as soon as a test invokes map_item. + * MissingMappedField, strip_tags, normalize_url_encoding, ...). Without + * this shim they'd hit ReferenceError as soon as a test invokes map_item. * - * Approach: read lib.js, wrap it in a new Function() body that returns the - * named helpers, call the function, and assign the returned object onto - * globalThis. (Earlier attempt with vm.runInThisContext failed because in - * the jsdom env the vm context's global differs from jsdom's window.) - * - * If a new helper is added to lib.js, append its name to EXPOSED_NAMES. + * Names are auto-discovered from lib.js by regex-matching top-level + * `function name(...)` and `class Name ...` declarations. Adding a helper + * to lib.js makes it available to tests without touching this file. 
*/ const fs = require('node:fs'); const path = require('node:path'); -const EXPOSED_NAMES = [ - 'traverse_data', - 'MappedItem', - 'MissingMappedField', - 'MapItemException', - 'wrap_for_map_item', - 'strip_tags', - 'normalize_url_encoding', - 'formatUtcTimestamp', -]; - const lib_source = fs.readFileSync( path.join(__dirname, '..', 'js', 'lib.js'), 'utf8', ); +// Match `function name(` and `class Name {` / `class Name extends` at +// column 0 of a line. lib.js is a classic script with all top-level +// declarations unindented; requiring column 0 keeps nested helpers (like +// the `_traverse_data` IIFE inside `traverse_data`) from being exposed. +const NAME_PATTERN = /^(?:function|class)\s+([A-Za-z_$][A-Za-z0-9_$]*)\b/gm; +const EXPOSED_NAMES = Array.from( + lib_source.matchAll(NAME_PATTERN), + m => m[1], +); + +if (EXPOSED_NAMES.length === 0) { + throw new Error( + 'setup-globals.cjs: no top-level function/class declarations found in js/lib.js — ' + + 'auto-discovery regex may be broken. Tests will ReferenceError if not fixed.' + ); +} + const factory = new Function(` ${lib_source} return { ${EXPOSED_NAMES.join(', ')} }; `); Object.assign(globalThis, factory()); - -// jsdom doesn't expose fetch and Jest's jsdom env shadows Node's global -// fetch, so the comparator can't hit 4CAT without help. Polyfill from -// undici (a Node-friendly HTTP client, separately installable on npm — -// distinct from the undici bundled internally by Node, which isn't -// require()-able by name). -// Note: tests that use fetch (e.g. map_item_compare.test.js) declare -// `@jest-environment node` at the top of the file. Node env has fetch -// natively. Don't try to polyfill into jsdom — undici's internals use -// Node-specific globals that jsdom shadows (clearImmediate, -// markResourceTiming, fast timers), and polyfilling them all is brittle. 
From e39ad4276e93b7792d852a55c83ce2cbf9c805d4 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 3 Jun 2026 15:10:12 +0200 Subject: [PATCH 28/33] update map_item_compare.test.js for new 4CAT endpoints --- tests/README.md | 193 ++++++++++--- tests/map_item_compare.test.js | 505 +++++++++++++++++++++------------ 2 files changed, 478 insertions(+), 220 deletions(-) diff --git a/tests/README.md b/tests/README.md index f1188e2..cd35e0a 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,31 +1,42 @@ ## Tests for Zeeschuimer -This folder contains **testing** code for Zeeschuimer. +This folder contains testing code for Zeeschuimer. There are four suites, +each with a different purpose and a different runtime environment: -### Integration Tests (Selenium) +| Suite | Tests | Environment | When it runs | Needs | +|----------------------------------|-----------------------------------------------------------|--------------------|---------------------------------|----------------------------------------| +| Selenium integration | Page captures real items from each supported platform | Real Firefox | Reviewer-supervised, manual | Firefox profile, sometimes a human | +| Duplicate-behavior unit (Jest) | DB merge / keep / update semantics in isolation | jsdom + fake-IDB | `npm test` (every push) | None | +| Module load smoke (Jest, Tier 1) | Each `modules/*.js` parses and imports cleanly | jsdom | `npm test` (every push) | None | +| `map_item` comparator (Jest, Tier 2) | JS `map_item` output matches 4CAT's Python mapping per item | jsdom + cross-fetch | `npm run test:compare` (on demand) | Live 4CAT, API key, dataset key(s) | -The Python + Selenium tests visit pages on supported platforms -and see how many items are captured. If the amount of items captured is -unexpectedly low or high, this is flagged and may indicate that Zeeschuimer no -longer properly captures data from the platform. +Hermetic suites (no external dependencies) live in `npm test`. 
Anything that +requires a real browser, a 4CAT server, or a human in the loop is opt-in. -These tests are **supervised** i.e. they require monitoring by a human and +### Integration tests (Selenium) + +The Python + Selenium tests visit pages on supported platforms and see how +many items are captured. If the amount of items captured is unexpectedly +low or high, this is flagged and may indicate that Zeeschuimer no longer +properly captures data from the platform. + +These tests are **supervised** — they require monitoring by a human and cannot run fully autonomously, since some platforms (TikTok in particular) occasionally show CAPTCHAs that need to be completed for a test to run successfully. This is also why Selenium does not run a headless Firefox. -The amount of items returned per page is somewhat variable for most platforms, -so if the number is slightly lower or higher than expected this is not -necessarily a problem (but worth checking). +The amount of items returned per page is somewhat variable for most +platforms, so if the number is slightly lower or higher than expected this +is not necessarily a problem (but worth checking). -Additionally, most platforms require logging in before (full) access to the UI -is available. The testing script borrows a Firefox profile directory from -elsewhere on the system to do this. It will try to find one automatically but -you can also pass one with the `--profiledir` argument. The idea is that you -log in to the various sites (Instagram, etc) in your 'normal' Firefox, and the -tests then borrow that login to interface with the website. +Most platforms require logging in before (full) access to the UI is +available. The testing script borrows a Firefox profile directory from +elsewhere on the system to do this. It will try to find one automatically +but you can also pass one with the `--profiledir` argument. Log in to the +various sites (Instagram, etc) in your 'normal' Firefox, and the tests then +borrow that login. 
-Run `test.py` to run tests. Required non-standard libraries are in +Run `test.py` to run tests. Required non-standard libraries are in `requirements.txt`. Tests are defined in `tests.json` with the following structure: @@ -35,49 +46,139 @@ Tests are defined in `tests.json` with the following structure: "platform id as in zeeschuimer (e.g. 'tiktok.com')": { "test case (e.g. 'Home feed')": { "url": { - "expected": 0, # amount of items expected to be captured on this page - "more-after-scroll": false, # whether scrolling is supposed to load more items (currently unsupported) - "wait": 10 # wait time before checking number of items (optional, default 5) - } # more URLS can be added per test case + "expected": 0, + "more-after-scroll": false, + "wait": 10 + } } } } ``` -### Unit Tests (Jest) - -The JavaScript unit tests verify duplicate-handling logic in isolation using -a mocked Dexie database. These tests ensure that when the duplicate behavior -setting is changed, the correct existing record is selected for updates. +### Jest suites **Prerequisites** -- Node.js (v18 or later) and npm must be installed +- Node.js (v18 or later) and npm +- `cd tests && npm install` + +**Recommended: develop the tests inside Docker.** On Windows the global +permission model can make `npm install` / `npm test` awkward to run from +an arbitrary shell, and an agentic assistant working in auto-mode will +hit deny-rules before it can do a `cross-fetch`-style dependency spike. +Any minimal `node:20`-or-newer image with this repo mounted in is +enough — install what you need, run `npm install`, run `npm test` and +`npm run test:compare`. The host's `tests/.env` is picked up via the +mount, and `FOURCAT_URL` can point at a 4CAT reachable from the +container (`host.docker.internal` on Windows/Mac, the host IP on +Linux). + +#### Duplicate-behavior unit tests + +Verify duplicate-handling logic in isolation using a mocked Dexie database. 
+Ensures that when the duplicate behavior setting is changed, the correct +existing record is selected for updates. + +Coverage: +- Schema upgrade backfills `last_updated` from `timestamp_collected` +- Compound index correctly selects most recent item by `last_updated` +- Forward-looking behavior: "keep" → "update" targets newest record +- Forward-looking behavior: "update" → "keep" creates new records +- Merge: shallow merge preserves fields from both records +- Skip: no modifications occur when duplicate found +- Platform isolation: same `item_id` on different platforms are independent +- Tie-breaker: when `last_updated` is equal, prefer higher `id` + +#### Module load smoke (Tier 1) + +For every file under `modules/*.js`, `tests/map_item.test.js` asserts the +module parses and imports without throwing. Modules with a `map_item` +export and modules without one both pass this tier — the goal is purely to +catch a generator that emits a syntax error or an import-time throw. + +No data is run through `map_item` here; that work belongs in the +comparator. + +#### `map_item` comparator (Tier 2) + +For every 4CAT dataset key listed in `FOURCAT_DATASETS`, +`tests/map_item_compare.test.js`: -**Setup** +1. fetches `/api/dataset//metadata/` to learn the datasource id +2. translates that id to a Zeeschuimer module name via + `zeeschuimer-to-4cat.json` (used in reverse) +3. fetches `/download/` (NDJSON inputs, already wrapped via + `wrap_for_map_item` by Zeeschuimer pre-upload) and + `/api/dataset//items/?annotations=no&missing_fields=keep&stream=true` + (expected outputs from 4CAT's Python `map_item`, as NDJSON — `stream=true` + avoids the JSON form's `limit=100` pagination) +4. 
pairs items by `id` (or by index with a warning if `id` is missing on + either side), runs each input through the local `map_item`, and + field-by-field diffs against the expected output (4CAT's API-only + aggregate `missing_fields` key is excluded; per-field `{__missing:true}` + markers are still compared) -1. Install Node.js dependencies: - ```bash - cd tests - npm install - ``` +The comparator does **not** exercise `wrap_for_map_item` itself — Zeeschuimer +applies it pre-storage and `/download/` returns post-wrap items. This +is an accepted gap; see `docs/map-item-test-plan.md`. -**Running tests** +**Configuration:** copy `tests/.env.example` to `tests/.env` and set: +- `FOURCAT_URL` — base URL of the 4CAT instance (no trailing slash) +- `FOURCAT_API_KEY` — raw API key (no `Bearer ` prefix) +- `FOURCAT_DATASETS` — comma-separated list of dataset keys + +The comparator hard-errors at startup if any of these are missing. + +**Optional knob:** `FAIL_FAST=0` (or `FAIL_FAST=false`) runs every item in +every dataset; default is to halt subsequent items in a dataset once one +has failed. 
+ +### Running ```bash +# everything that's hermetic — duplicate-behavior unit + module load smoke npm test -``` -For watch mode during development: -```bash +# watch mode for the same npm run test:watch + +# the comparator — every dataset key in FOURCAT_DATASETS +npm run test:compare + +# the comparator narrowed to one dataset key (must still appear in +# FOURCAT_DATASETS — protects against typos) +npm run test:compare -- ``` -**Test coverage** -- Schema upgrade backfills `last_updated` from `timestamp_collected` -- Compound index correctly selects most recent item by `last_updated` -- Forward-looking behavior: switching from "keep" to "update" targets newest record -- Forward-looking behavior: switching from "update" to "keep" creates new records -- Merge behavior: shallow merge preserves fields from both records -- Skip behavior: no modifications occur when duplicate found -- Platform isolation: same `item_id` on different platforms are independent -- Tie-breaker: when `last_updated` is equal, prefer higher `id` +### Where does a new test go? + +- **Pure data transformation, no live external state, runs anywhere.** + Duplicate-behavior unit suite (DB logic) or the Tier 1 smoke + (`map_item` static checks). +- **Field-by-field correctness against 4CAT's Python `map_item`.** Tier 2 + comparator. Add a dataset to `FOURCAT_DATASETS` that covers the case; + the comparator will pick it up. +- **End-to-end user flow in the extension.** Selenium. + +### Why the environments differ + +The two Jest tiers run in **jsdom** rather than node env. The reasoning: + +- `map_item` bodies are pure data transformation, but four of them + (`gab`, `pinterest`, `rednote`, `truth`) call `strip_tags`, which + invokes `new DOMParser()`. jsdom provides a spec-compliant native + `DOMParser`; node env doesn't. +- jsdom doesn't ship `fetch`. 
The standard workaround + (`undici`) crashes inside jsdom because it pokes at + `clearImmediate` / `markResourceTiming` / fast-now timers that jsdom + shadows. `cross-fetch` wraps `node-fetch` v2 internally and doesn't + hit those Node internals, so it works in jsdom — the comparator + imports `cross-fetch/polyfill` to assign `globalThis.fetch`. + +The tradeoff is parser parity. `cross-fetch`-via-`node-fetch` and +jsdom's `DOMParser` are not byte-equal to Firefox's Gecko `DOMParser`, +which is what runs in production. Whitespace handling around `<br>` and +block elements is the usual suspect. If the comparator emits false- +positive diffs on text fields for the four `strip_tags` modules, the +right fix is to normalise whitespace in the comparator's `deep_equal` +rather than chase parser parity. The Selenium tier sits above and +provides the real-Gecko fidelity check. diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js index 37e3e4c..86ab707 100644 --- a/tests/map_item_compare.test.js +++ b/tests/map_item_compare.test.js @@ -1,40 +1,60 @@ /** - * @jest-environment node + * Compare JS map_item output against 4CAT's Python map_item via dataset keys. * - * This file runs in Node test environment (not jsdom) because undici's - * fetch implementation uses Node-internal APIs (`clearImmediate`, - * `markResourceTiming`, fast-now timers, etc.) that jsdom shadows or - * doesn't expose. Polyfilling them into jsdom is whack-a-mole; node env - * has them all natively. + * For each 4CAT dataset key in FOURCAT_DATASETS, this test: + * 1. fetches /api/dataset/<key>/metadata/ to learn the datasource id + * 2. translates that id back to a Zeeschuimer module name via + * zeeschuimer-to-4cat.json (used in reverse) + * 3. inspects the local module (must export map_item) + * 4. fetches in parallel, both as NDJSON: + * /download/<key> -> INPUTS (post-wrap) + * /api/dataset/<key>/items/?annotations=no&missing_fields=keep&stream=true + * -> mapped EXPECTED OUTPUTS + * 5. 
pairs items by `id`, runs each input through the local map_item, and + * deep-equals the result against the corresponding expected output. * - * Trade-off: no DOMParser in node env. The four modules that use - * `strip_tags` (gab, pinterest, rednote, truth) will need a DOMParser - * polyfill (e.g. via linkedom) before the comparator can run against - * them. Other modules (including instagram) work as-is. - */ -/** - * Compare JS map_item output against 4CAT's Python map_item via the API. 
+ * The items endpoint is fetched with `stream=true` (NDJSON): its JSON-array + * form paginates at `limit=100`, silently dropping rows on larger datasets. + * `annotations=no` drops processor-added fields; `missing_fields=keep` keeps + * unmapped fields as `{ __missing: true, value: "" }` markers (matching the JS + * side) and additionally adds a comma-joined `missing_fields` summary key. + * That summary is API-only — the JS map_item never emits it — so it is + * excluded from the diff (see API_ONLY_FIELDS); the per-field markers it + * summarizes are still compared. * - * For every line in every fixture, runs the JS map_item locally AND sends - * the same stored item to 4CAT's /api/map-item// endpoint, then - * diffs the two outputs field-by-field. Each item is its own Jest test — - * failures point at exactly which item and which fields diverge. + * Items from /download/ already have `wrap_for_map_item` applied by + * Zeeschuimer pre-upload, so they're fed to map_item directly without + * re-wrapping. The trade-off is that this comparator does not exercise + * `wrap_for_map_item` itself — see docs/map-item-test-plan.md for the + * accepted-gap rationale. * - * Skips itself entirely if FOURCAT_URL / FOURCAT_API_KEY aren't set, so - * `npm test` keeps working without 4CAT configuration. Drop real values in - * tests/.env to enable. + * Environment notes (fetch + DOMParser): + * - jsdom env so `strip_tags` (used by gab/pinterest/rednote/truth) has + * a native DOMParser. + * - jsdom doesn't ship `fetch`. Spiked three candidates on 2026-06-03 + * under node:20-alpine: + * * `undici` — crashes at import in jsdom (pokes at + * clearImmediate/markResourceTiming/fast-now + * timers that jsdom shadows). + * * `node-fetch` v3 — imports clean but `res.text()` throws + * `ReferenceError: TextDecoder is not defined` + * (jsdom doesn't expose TextDecoder as a global). + * * `cross-fetch/polyfill` — clean import + working round-trip. 
+ * So this file imports `cross-fetch/polyfill`, which assigns + * `globalThis.fetch` when undefined. * - * Datasource id mapping: tests/zeeschuimer-to-4cat.json (Zeeschuimer - * module filename → 4CAT datasource id, for the few names that diverge). + * Invocation: + * npm run test:compare # runs every key in FOURCAT_DATASETS + * npm run test:compare -- # narrows to one key (must be in + * # FOURCAT_DATASETS to avoid typos) * - * Module-level state is determined upfront by inspect_module() (no - * map_item / syntax errors / import errors are handled before tests are - * registered, so they appear once per module, not once per item). + * Hard-errors at registration time if FOURCAT_URL, FOURCAT_API_KEY, or + * FOURCAT_DATASETS is missing — by Tier 2 contract these are required. */ +import 'cross-fetch/polyfill'; import 'dotenv/config'; -import { jest } from '@jest/globals'; -import { readdirSync, readFileSync, statSync, existsSync } from 'node:fs'; +import { readFileSync, existsSync } from 'node:fs'; import { join, dirname } from 'node:path'; import { fileURLToPath } from 'node:url'; import { inspect_module } from './_module-info.js'; @@ -43,56 +63,100 @@ const __dirname = dirname(fileURLToPath(import.meta.url)); const FOURCAT_URL = process.env.FOURCAT_URL?.replace(/\/$/, ''); const FOURCAT_API_KEY = process.env.FOURCAT_API_KEY; -const HAS_4CAT = Boolean( - FOURCAT_URL && FOURCAT_API_KEY && FOURCAT_API_KEY !== 'your-api-key-here' -); -// When true (default), once any item in a module fails, subsequent items -// in that same module skip the HTTP + map_item work and fail fast with a -// "halted" message. Saves time when generator output is broken at the top. -// Set FAIL_FAST=0 in env to run all items regardless. -// Trim because cmd.exe's `set FAIL_FAST=0 && ...` includes the trailing -// space in the variable value, which would otherwise defeat `!== '0'`. -const FAIL_FAST = (process.env.FAIL_FAST ?? 
'').trim() !== '0'; -const halted_modules = new Set(); +// Hard-fail if env is missing — Tier 2 contract. +function require_env(name, value, placeholder_values = []) { + if (!value || placeholder_values.includes(value)) { + throw new Error( + `${name} is not configured. Set it in tests/.env (see tests/.env.example).` + ); + } + return value; +} +require_env('FOURCAT_URL', FOURCAT_URL); +require_env('FOURCAT_API_KEY', FOURCAT_API_KEY, ['your-api-key-here']); + +const FOURCAT_DATASETS = require_env( + 'FOURCAT_DATASETS', + process.env.FOURCAT_DATASETS, + ['key1,key2,key3'], +) + .split(',') + .map(k => k.trim()) + .filter(k => k.length > 0); + +if (FOURCAT_DATASETS.length === 0) { + throw new Error('FOURCAT_DATASETS parsed as empty. Set a comma-separated list of dataset keys in tests/.env.'); +} + +// Optional narrowing to a single dataset key. The `npm run test:compare -- +// ` form is handled by run-compare.mjs, which sets COMPARE_DATASET; jest +// itself would mis-read a bare key as a test-path-pattern filter and silently +// run nothing. A narrowed key must still be declared in FOURCAT_DATASETS — +// erroring on an unlisted key catches typos and keeps the dataset list the +// single source of truth. +const COMPARE_DATASET = process.env.COMPARE_DATASET?.trim() || undefined; +if (COMPARE_DATASET && !FOURCAT_DATASETS.includes(COMPARE_DATASET)) { + throw new Error( + `COMPARE_DATASET=${COMPARE_DATASET} is not listed in FOURCAT_DATASETS. ` + + `Add it to tests/.env before narrowing the run to it.` + ); +} + +const DATASET_KEYS_TO_RUN = COMPARE_DATASET ? [COMPARE_DATASET] : FOURCAT_DATASETS; -const FIXTURE_ROOT = join(__dirname, 'fixtures'); +// 4CAT datasource id -> Zeeschuimer module name. The on-disk map is +// authored in the natural direction (zeeschuimer -> 4cat); flip here. const ID_MAP_PATH = join(__dirname, 'zeeschuimer-to-4cat.json'); -const ID_MAP = existsSync(ID_MAP_PATH) +const ZEESCHUIMER_TO_4CAT = existsSync(ID_MAP_PATH) ? 
JSON.parse(readFileSync(ID_MAP_PATH, 'utf8')) : {}; +const FOURCAT_TO_ZEESCHUIMER = Object.fromEntries( + Object.entries(ZEESCHUIMER_TO_4CAT) + .filter(([k]) => !k.startsWith('_')) + .map(([z, f]) => [f, z]) +); + +// When true (default), comparison of a dataset stops at its first failing +// item; the remaining items are reported as a single skipped "halted" +// placeholder rather than one failure each. Trim because `set FAIL_FAST=0 && +// ...` in cmd.exe includes the trailing space; treat both '0' and 'false' +// (case-insensitive) as off. +const FAIL_FAST_RAW = (process.env.FAIL_FAST ?? '').trim().toLowerCase(); +const FAIL_FAST = FAIL_FAST_RAW !== '0' && FAIL_FAST_RAW !== 'false'; -function wrap_for_map_item(stored_item) { - const { data, ...meta } = stored_item; - return { ...data, __import_meta: meta }; +function auth_headers(extra = {}) { + return { + // 4CAT accepts the raw key without a `Bearer ` prefix. + 'Authorization': FOURCAT_API_KEY, + ...extra, + }; } -async function call_4cat_map_item(datasource_id, item) { - const res = await fetch(`${FOURCAT_URL}/api/map-item/${datasource_id}/`, { - method: 'POST', - headers: { - // 4CAT accepts the raw key without a `Bearer ` prefix, per probe - 'Authorization': FOURCAT_API_KEY, - 'Content-Type': 'application/json', - }, - body: JSON.stringify({ item }), - }); +async function fetch_json(url) { + const res = await fetch(url, { headers: auth_headers() }); const text = await res.text(); - if (!res.ok) { - throw new Error(`HTTP ${res.status} from 4CAT: ${text}`); - } + if (!res.ok) throw new Error(`HTTP ${res.status} from ${url}: ${text}`); return JSON.parse(text); } -// Round-trip a value through JSON so MappedItem, MissingMappedField, etc. -// become plain JSON-compatible objects matching what 4CAT emits. 
+async function fetch_ndjson(url) { + const res = await fetch(url, { headers: auth_headers() }); + const text = await res.text(); + if (!res.ok) throw new Error(`HTTP ${res.status} from ${url}: ${text}`); + return text + .split('\n') + .filter(line => line.trim().length > 0) + .map((line, i) => { + try { return JSON.parse(line); } + catch (e) { throw new Error(`bad NDJSON at line ${i} of ${url}: ${e.message}`); } + }); +} + function normalize(value) { return JSON.parse(JSON.stringify(value)); } -// Recursive structural equality. Doesn't care about object key order, which -// matters for nested values like {__missing: true, value: ""} where JS and -// Python might emit keys in different orders. function deep_equal(a, b) { if (a === b) return true; if (a === null || b === null) return a === b; @@ -138,8 +202,6 @@ function format_diffs(diffs) { }).join('\n'); } -// Pull out the first few module-frame lines from an error's stack so the -// failure message points at where in modules/.js the throw happened. function format_error_with_location(err) { if (!err) return String(err); const message = err.message || String(err); @@ -153,131 +215,226 @@ function format_error_with_location(err) { : message; } -function list_module_dirs() { - if (!existsSync(FIXTURE_ROOT)) return []; - return readdirSync(FIXTURE_ROOT).filter(name => { - try { return statSync(join(FIXTURE_ROOT, name)).isDirectory(); } - catch { return false; } - }); -} +// Pair inputs and expected outputs by `id`. Falls back to index pairing +// (with a logged warning) if either side is missing the field on its +// first item. +function pair_items(inputs, outputs, dataset_key) { + const probe_in = inputs[0]; + const probe_out = outputs[0]; + const has_id_in = probe_in && 'id' in probe_in && probe_in.id != null; + const has_id_out = probe_out && 'id' in probe_out && probe_out.id != null; -// Per-test timeout: each test does one HTTP round-trip to 4CAT. Jest's -// default 5s is tight under load. 
-jest.setTimeout(30000); + if (!has_id_in || !has_id_out) { + // eslint-disable-next-line no-console + console.warn( + `[compare] ${dataset_key}: no usable 'id' on ${!has_id_in ? '/download' : '/items'} ` + + `side — falling back to index pairing for this dataset.` + ); + const n = Math.min(inputs.length, outputs.length); + return { + mode: 'index', + pairs: Array.from({ length: n }, (_, i) => ({ input: inputs[i], expected: outputs[i], id: i })), + input_count: inputs.length, + output_count: outputs.length, + unmatched_inputs: [], + unmatched_outputs: [], + }; + } -if (!HAS_4CAT) { - describe('map_item compare (JS vs 4CAT Python)', () => { - test.skip('FOURCAT_URL / FOURCAT_API_KEY not configured — set them in tests/.env to enable', () => {}); - }); -} else { - const module_dirs = list_module_dirs(); - - // Pre-pass: synchronously determine each module's state so we can branch - // on it at registration time. - const module_info = {}; - for (const module_name of module_dirs) { - module_info[module_name] = await inspect_module(module_name); + const by_id_out = new Map(); + for (const item of outputs) by_id_out.set(String(item.id), item); + + const pairs = []; + const unmatched_inputs = []; + for (const input of inputs) { + const expected = by_id_out.get(String(input.id)); + if (expected) { + pairs.push({ input, expected, id: input.id }); + by_id_out.delete(String(input.id)); + } else { + unmatched_inputs.push(input.id); + } } + return { + mode: 'id', + pairs, + input_count: inputs.length, + output_count: outputs.length, + unmatched_inputs, + unmatched_outputs: Array.from(by_id_out.keys()), + }; +} + +// 4CAT exposes the datasource via `metadata.type`, which is the datasource +// id with a `-search` or `-import` suffix appended (e.g. `tiktok-search`, +// `xiaohongshu-comments-import`). Strip the trailing suffix to get the bare +// id, which we then translate to a Zeeschuimer module via +// FOURCAT_TO_ZEESCHUIMER. 
Datasource ids themselves may contain hyphens +// (e.g. `xiaohongshu-comments`), so the strip is anchored to end-of-string. +function extract_datasource_id(metadata) { + const type = metadata?.type; + if (!type) return null; + return type.replace(/-(search|import)$/, ''); +} - let any_fixtures = false; +// Fields 4CAT's API attaches to every mapped item that the JS map_item never +// produces, so they would otherwise diff as spurious "only_python" entries. +// `missing_fields` is a comma-joined summary of which fields came back as +// MissingMappedField — redundant with the per-field `{__missing:true}` +// markers, which ARE compared. +const API_ONLY_FIELDS = new Set(['missing_fields']); - for (const module_name of module_dirs) { - const fixture_dir = join(FIXTURE_ROOT, module_name); - const fixture_files = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson')); - if (fixture_files.length === 0) continue; - any_fixtures = true; +function strip_api_fields(obj) { + if (!obj || typeof obj !== 'object' || Array.isArray(obj)) return obj; + const out = {}; + for (const k of Object.keys(obj)) { + if (!API_ONLY_FIELDS.has(k)) out[k] = obj[k]; + } + return out; +} - const datasource_id = ID_MAP[module_name] ?? module_name; - const info = module_info[module_name]; +// Run each paired input through the local map_item and diff the result +// against 4CAT's expected output. With FAIL_FAST on (default), stop at the +// first failing item and record how many were left unchecked — so one bad +// item yields a single failure plus one skipped "halted" placeholder, not N +// failures. 
+function compare_pairs(pairs, map_item) { + const results = []; + let halted_count = 0; + for (let i = 0; i < pairs.length; i++) { + const { input, expected, id } = pairs[i]; + let message = null; + try { + let js_result; + try { + js_result = map_item(input); + } catch (e) { + throw new Error(`JS map_item threw: ${format_error_with_location(e)}`); + } + const diffs = diff_objects( + strip_api_fields(normalize(js_result)), + strip_api_fields(normalize(expected)), + ); + if (diffs.length > 0) { + message = `${diffs.length} field(s) differ between JS and 4CAT:\n${format_diffs(diffs)}`; + } + } catch (e) { + message = e.message; + } + results.push({ id, ok: message === null, message }); + if (message !== null && FAIL_FAST) { + halted_count = pairs.length - (i + 1); + break; + } + } + return { results, halted_count }; +} - if (info.state === 'no_map_item') { - // eslint-disable-next-line no-console - console.log(`[compare] skipping ${module_name}: modules/${module_name}.js does not export a map_item`); - continue; +// Pre-pass: for each dataset, fetch metadata + items and run the comparison +// up front, so tests register with knowable counts and a deterministic +// pass/fail per item. Fetch/setup failures become a single "setup" failure +// inside that dataset's describe. +const dataset_state = {}; +for (const key of DATASET_KEYS_TO_RUN) { + try { + const metadata = await fetch_json(`${FOURCAT_URL}/api/dataset/${key}/metadata/`); + const datasource_id = extract_datasource_id(metadata); + if (!datasource_id) { + throw new Error( + `metadata for ${key} has no datasource id (checked parameters.datasource, datasource, type)` + ); } + const module_name = FOURCAT_TO_ZEESCHUIMER[datasource_id] ?? datasource_id; + const module_state = await inspect_module(module_name); - if (info.state === 'syntax_error' || info.state === 'import_error') { - const msg = info.state === 'syntax_error' - ? 
`syntax error:\n${info.error}` - : `import failed: ${info.error.message}`; - describe(`map_item compare: ${module_name}`, () => { - test(`module loads`, () => { throw new Error(msg); }); - }); - continue; + if (module_state.state === 'ok') { + // Both sides as NDJSON. `stream=true` on the items endpoint avoids + // the JSON-array form's default `limit=100` pagination, which would + // silently drop rows (and break id-pairing) on larger datasets. + const [inputs, outputs] = await Promise.all([ + fetch_ndjson(`${FOURCAT_URL}/download/${key}`), + fetch_ndjson(`${FOURCAT_URL}/api/dataset/${key}/items/?annotations=no&missing_fields=keep&stream=true`), + ]); + const pairing = pair_items(inputs, outputs, key); + const comparison = compare_pairs(pairing.pairs, module_state.map_item); + dataset_state[key] = { metadata, datasource_id, module_name, module_state, pairing, comparison }; + } else { + dataset_state[key] = { metadata, datasource_id, module_name, module_state }; } + } catch (e) { + dataset_state[key] = { error: e }; + } +} - // state === 'ok' — register per-item comparison tests - const map_item = info.map_item; - - describe(`map_item compare: ${module_name} (4CAT id: ${datasource_id})`, () => { - for (const fixture_file of fixture_files) { - const lines = readFileSync(join(fixture_dir, fixture_file), 'utf8') - .split('\n') - .filter(line => line.trim().length > 0); - - describe(fixture_file, () => { - lines.forEach((line, i) => { - test(`item ${i}`, async () => { - if (FAIL_FAST && halted_modules.has(module_name)) { - throw new Error( - '[halted after prior failure in this module — set FAIL_FAST=0 to run all items]' - ); - } - try { - const stored_item = JSON.parse(line); - - // 4CAT side - const response = await call_4cat_map_item(datasource_id, stored_item); - - // JS side - let js_result; - let js_error; - try { - js_result = map_item(wrap_for_map_item(stored_item)); - } catch (e) { - js_error = e; - } - - if (response.status === 'mapped') { - if (js_error) { - 
throw new Error( - `4CAT mapped this item but JS threw: ${format_error_with_location(js_error)}` - ); - } - const js_obj = normalize(js_result); - const py_obj = normalize(response.item); - const diffs = diff_objects(js_obj, py_obj); - if (diffs.length > 0) { - throw new Error( - `${diffs.length} field(s) differ between JS and 4CAT:\n${format_diffs(diffs)}` - ); - } - } else if (response.status === 'skipped') { - if (!js_error) { - throw new Error( - `4CAT skipped this item ("${response.reason}") but JS produced a result` - ); - } - // Both rejected — good. Skip reasons may differ in wording. - } else if (response.status === 'error') { - throw new Error(`4CAT errored on this item: ${response.message}`); - } else { - throw new Error(`unexpected 4CAT response status: ${JSON.stringify(response)}`); - } - } catch (e) { - if (FAIL_FAST) halted_modules.add(module_name); - throw e; - } - }); - }); - }); - } +for (const dataset_key of DATASET_KEYS_TO_RUN) { + const info = dataset_state[dataset_key]; + + if (info.error) { + describe(`map_item compare: dataset ${dataset_key}`, () => { + test('setup', () => { throw info.error; }); }); + continue; } - if (!any_fixtures) { - describe('map_item compare (JS vs 4CAT Python)', () => { - test.skip('no fixtures under tests/fixtures//*.ndjson', () => {}); + const { datasource_id, module_name, module_state, pairing, comparison } = info; + const label = `${dataset_key} (datasource: ${datasource_id}, module: ${module_name})`; + + if (module_state.state === 'no_map_item') { + describe(`map_item compare: ${label}`, () => { + test.skip(`modules/${module_name}.js has no map_item — nothing to compare`, () => {}); }); + continue; } + if (module_state.state === 'syntax_error' || module_state.state === 'import_error') { + const msg = module_state.state === 'syntax_error' + ? 
`syntax error:\n${module_state.error}` + : `import failed: ${module_state.error.message}`; + describe(`map_item compare: ${label}`, () => { + test('module loads', () => { throw new Error(msg); }); + }); + continue; + } + + describe(`map_item compare: ${label}`, () => { + test('pairing', () => { + const messages = []; + if (pairing.input_count !== pairing.output_count) { + messages.push( + `input count ${pairing.input_count} != output count ${pairing.output_count}` + ); + } + if (pairing.unmatched_inputs.length) { + const shown = pairing.unmatched_inputs.slice(0, 5).join(', '); + const extra = pairing.unmatched_inputs.length > 5 + ? ` (+${pairing.unmatched_inputs.length - 5} more)` + : ''; + messages.push(`unmatched input ids: ${shown}${extra}`); + } + if (pairing.unmatched_outputs.length) { + const shown = pairing.unmatched_outputs.slice(0, 5).join(', '); + const extra = pairing.unmatched_outputs.length > 5 + ? ` (+${pairing.unmatched_outputs.length - 5} more)` + : ''; + messages.push(`unmatched output ids: ${shown}${extra}`); + } + if (pairing.mode === 'index') { + messages.push(`paired by index (no usable 'id' field) — diffs may be misaligned`); + } + if (messages.length) throw new Error(messages.join('\n')); + }); + + comparison.results.forEach(({ id, ok, message }, i) => { + test(`item ${i} (id=${id})`, () => { + if (!ok) throw new Error(message); + }); + }); + + if (comparison.halted_count > 0) { + test.skip( + `halted after first failure — ${comparison.halted_count} later item(s) not compared ` + + `(set FAIL_FAST=0 to compare all)`, + () => {}, + ); + } + }); } From d7fcb4c72deb18de311d6056e521b156be299457 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 3 Jun 2026 15:48:12 +0200 Subject: [PATCH 29/33] fast_fail OR --all for tests --- tests/README.md | 18 +++++++++++++++--- tests/map_item_compare.test.js | 9 +++++---- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/tests/README.md b/tests/README.md index cd35e0a..beaee44 100644 --- 
a/tests/README.md +++ b/tests/README.md @@ -128,9 +128,18 @@ is an accepted gap; see `docs/map-item-test-plan.md`. The comparator hard-errors at startup if any of these are missing. -**Optional knob:** `FAIL_FAST=0` (or `FAIL_FAST=false`) runs every item in -every dataset; default is to halt subsequent items in a dataset once one -has failed. +**Optional knob:** by default the comparator halts a dataset at its first +failing item (reporting the rest as one skipped "halted" placeholder). To +compare *every* item, pass `--all`: + +```bash +npm run test:compare -- --all +``` + +`FAIL_FAST=0` (or `FAIL_FAST=false`) does the same, but prefer `--all`: an +inline `FAIL_FAST=0 npm run …` does not reliably reach node when npm/node is +the Windows binary run through WSL interop, and isn't env syntax in cmd.exe. +A CLI flag crosses every shell. ### Running @@ -147,6 +156,9 @@ npm run test:compare # the comparator narrowed to one dataset key (must still appear in # FOURCAT_DATASETS — protects against typos) npm run test:compare -- + +# compare every item instead of halting at the first failure +npm run test:compare -- --all ``` ### Where does a new test go? diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js index 86ab707..2ca1d27 100644 --- a/tests/map_item_compare.test.js +++ b/tests/map_item_compare.test.js @@ -119,9 +119,10 @@ const FOURCAT_TO_ZEESCHUIMER = Object.fromEntries( // When true (default), comparison of a dataset stops at its first failing // item; the remaining items are reported as a single skipped "halted" -// placeholder rather than one failure each. Trim because `set FAIL_FAST=0 && -// ...` in cmd.exe includes the trailing space; treat both '0' and 'false' -// (case-insensitive) as off. +// placeholder rather than one failure each. Disable it with the `--all` +// launcher flag (preferred — crosses every shell) or FAIL_FAST=0. 
Trim +// because `set FAIL_FAST=0 && ...` in cmd.exe includes the trailing space; +// treat both '0' and 'false' (case-insensitive) as off. const FAIL_FAST_RAW = (process.env.FAIL_FAST ?? '').trim().toLowerCase(); const FAIL_FAST = FAIL_FAST_RAW !== '0' && FAIL_FAST_RAW !== 'false'; @@ -432,7 +433,7 @@ for (const dataset_key of DATASET_KEYS_TO_RUN) { if (comparison.halted_count > 0) { test.skip( `halted after first failure — ${comparison.halted_count} later item(s) not compared ` + - `(set FAIL_FAST=0 to compare all)`, + `(pass --all, or set FAIL_FAST=0, to compare every item)`, () => {}, ); } From 4f9e69c3dc8e38ed98b4d0fe17f8f413a0b7c40a Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 3 Jun 2026 16:34:30 +0200 Subject: [PATCH 30/33] use headers for datasource --- tests/README.md | 3 +- tests/map_item_compare.test.js | 63 ++++++++++++++++++---------------- 2 files changed, 36 insertions(+), 30 deletions(-) diff --git a/tests/README.md b/tests/README.md index beaee44..f203b60 100644 --- a/tests/README.md +++ b/tests/README.md @@ -103,7 +103,8 @@ comparator. For every 4CAT dataset key listed in `FOURCAT_DATASETS`, `tests/map_item_compare.test.js`: -1. fetches `/api/dataset//metadata/` to learn the datasource id +1. sends a HEAD to the items endpoint and reads the datasource id from its + `X-4CAT-Dataset-Datasource` response header (no metadata-endpoint call) 2. translates that id to a Zeeschuimer module name via `zeeschuimer-to-4cat.json` (used in reverse) 3. fetches `/download/` (NDJSON inputs, already wrapped via diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js index 2ca1d27..681076c 100644 --- a/tests/map_item_compare.test.js +++ b/tests/map_item_compare.test.js @@ -2,7 +2,8 @@ * Compare JS map_item output against 4CAT's Python map_item via dataset keys. * * For each 4CAT dataset key in FOURCAT_DATASETS, this test: - * 1. fetches /api/dataset//metadata/ to learn the datasource id + * 1. 
HEADs the items endpoint to read the datasource id from the + * `X-4CAT-Dataset-*` response headers (no metadata-endpoint dependency) * 2. translates that id back to a Zeeschuimer module name via * zeeschuimer-to-4cat.json (used in reverse) * 3. inspects the local module (must export map_item) @@ -134,11 +135,10 @@ function auth_headers(extra = {}) { }; } -async function fetch_json(url) { - const res = await fetch(url, { headers: auth_headers() }); - const text = await res.text(); - if (!res.ok) throw new Error(`HTTP ${res.status} from ${url}: ${text}`); - return JSON.parse(text); +async function fetch_headers(url) { + const res = await fetch(url, { method: 'HEAD', headers: auth_headers() }); + if (!res.ok) throw new Error(`HTTP ${res.status} from HEAD ${url}`); + return res.headers; } async function fetch_ndjson(url) { @@ -266,16 +266,18 @@ function pair_items(inputs, outputs, dataset_key) { }; } -// 4CAT exposes the datasource via `metadata.type`, which is the datasource -// id with a `-search` or `-import` suffix appended (e.g. `tiktok-search`, -// `xiaohongshu-comments-import`). Strip the trailing suffix to get the bare -// id, which we then translate to a Zeeschuimer module via -// FOURCAT_TO_ZEESCHUIMER. Datasource ids themselves may contain hyphens -// (e.g. `xiaohongshu-comments`), so the strip is anchored to end-of-string. -function extract_datasource_id(metadata) { - const type = metadata?.type; - if (!type) return null; - return type.replace(/-(search|import)$/, ''); +// Recover the datasource id from a dataset's response headers. 4CAT exposes it +// directly as `X-4CAT-Dataset-Datasource`. Older responses may only carry +// `X-4CAT-Dataset-Type` (the datasource id with a `-search`/`-import` suffix), +// so fall back to stripping that — anchored to end-of-string because +// datasource ids can themselves contain hyphens (e.g. `xiaohongshu-comments`). +// The result is translated to a Zeeschuimer module via FOURCAT_TO_ZEESCHUIMER. 
+function datasource_id_from_headers(headers) { + const datasource = headers.get('x-4cat-dataset-datasource'); + if (datasource) return datasource.trim(); + const type = headers.get('x-4cat-dataset-type'); + if (type) return type.trim().replace(/-(search|import)$/, ''); + return null; } // Fields 4CAT's API attaches to every mapped item that the JS map_item never @@ -331,36 +333,39 @@ function compare_pairs(pairs, map_item) { return { results, halted_count }; } -// Pre-pass: for each dataset, fetch metadata + items and run the comparison -// up front, so tests register with knowable counts and a deterministic -// pass/fail per item. Fetch/setup failures become a single "setup" failure -// inside that dataset's describe. +// Pre-pass: for each dataset, resolve the datasource (HEAD), fetch items, and +// run the comparison up front, so tests register with knowable counts and a +// deterministic pass/fail per item. Fetch/setup failures become a single +// "setup" failure inside that dataset's describe. const dataset_state = {}; for (const key of DATASET_KEYS_TO_RUN) { try { - const metadata = await fetch_json(`${FOURCAT_URL}/api/dataset/${key}/metadata/`); - const datasource_id = extract_datasource_id(metadata); + // The same items URL serves double duty: a HEAD reveals the datasource + // (via X-4CAT-Dataset-* headers) with no body; the GET pulls the mapped + // rows. `stream=true` avoids the JSON form's limit=100 pagination, which + // would silently drop rows (and break id-pairing) on larger datasets. 
+ const items_url = `${FOURCAT_URL}/api/dataset/${key}/items/?annotations=no&missing_fields=keep&stream=true`; + const headers = await fetch_headers(items_url); + const datasource_id = datasource_id_from_headers(headers); if (!datasource_id) { throw new Error( - `metadata for ${key} has no datasource id (checked parameters.datasource, datasource, type)` + `no datasource id in response headers for ${key} ` + + `(looked for X-4CAT-Dataset-Datasource / X-4CAT-Dataset-Type)` ); } const module_name = FOURCAT_TO_ZEESCHUIMER[datasource_id] ?? datasource_id; const module_state = await inspect_module(module_name); if (module_state.state === 'ok') { - // Both sides as NDJSON. `stream=true` on the items endpoint avoids - // the JSON-array form's default `limit=100` pagination, which would - // silently drop rows (and break id-pairing) on larger datasets. const [inputs, outputs] = await Promise.all([ fetch_ndjson(`${FOURCAT_URL}/download/${key}`), - fetch_ndjson(`${FOURCAT_URL}/api/dataset/${key}/items/?annotations=no&missing_fields=keep&stream=true`), + fetch_ndjson(items_url), ]); const pairing = pair_items(inputs, outputs, key); const comparison = compare_pairs(pairing.pairs, module_state.map_item); - dataset_state[key] = { metadata, datasource_id, module_name, module_state, pairing, comparison }; + dataset_state[key] = { datasource_id, module_name, module_state, pairing, comparison }; } else { - dataset_state[key] = { metadata, datasource_id, module_name, module_state }; + dataset_state[key] = { datasource_id, module_name, module_state }; } } catch (e) { dataset_state[key] = { error: e }; From 8b918d46ba99f2939610a5f0e34fbf0e3aa434bd Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 3 Jun 2026 16:35:08 +0200 Subject: [PATCH 31/33] add the --all instead of just fail_fail --- tests/run-compare.mjs | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tests/run-compare.mjs b/tests/run-compare.mjs index 69240ab..57efb66 100644 --- 
a/tests/run-compare.mjs +++ b/tests/run-compare.mjs @@ -3,6 +3,7 @@ * * npm run test:compare -> compares every key in FOURCAT_DATASETS * npm run test:compare -- <key> -> narrows the run to a single key + * npm run test:compare -- --all -> compare every item (no fail-fast) * npm run test:compare -- <key> -t "id=123" -> key + forwarded jest flags * * Why this exists instead of invoking jest directly: jest treats any bare @@ -21,13 +22,22 @@ import { dirname, join } from 'node:path'; const __dirname = dirname(fileURLToPath(import.meta.url)); const args = process.argv.slice(2); -// First non-flag arg (if any) is the dataset key to narrow to. Everything -// that looks like a flag is forwarded to jest verbatim. +// First non-flag arg (if any) is the dataset key to narrow to. const dataset_key = args.find(a => !a.startsWith('-')); -const jest_flags = args.filter(a => a !== dataset_key); +const flags = args.filter(a => a !== dataset_key); + +// `--all` (alias `--no-fail-fast`) compares every item instead of halting at +// the first failure. It's offered as a flag, not only via the FAIL_FAST env +// var, because `FAIL_FAST=0 npm run ...` does not reliably reach node when +// npm/node is the Windows binary invoked through WSL interop, and isn't env +// syntax at all in cmd.exe. A CLI flag crosses every shell; the env var still +// works where it propagates.
+const disable_fail_fast = flags.includes('--all') || flags.includes('--no-fail-fast'); +const jest_flags = flags.filter(f => f !== '--all' && f !== '--no-fail-fast'); const env = { ...process.env }; if (dataset_key) env.COMPARE_DATASET = dataset_key; +if (disable_fail_fast) env.FAIL_FAST = '0'; const jest_bin = join(__dirname, 'node_modules', 'jest', 'bin', 'jest.js'); const child = spawn( From 00f0369d12804e397202a7206d25b9b864414c82 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 4 Jun 2026 16:46:51 +0200 Subject: [PATCH 32/33] map_item_compare.test.js: compare based on mapped `id` field not raw `id` --- tests/map_item_compare.test.js | 98 ++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 35 deletions(-) diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js index 681076c..2d1403b 100644 --- a/tests/map_item_compare.test.js +++ b/tests/map_item_compare.test.js @@ -11,8 +11,11 @@ * /download/ -> INPUTS (post-wrap) * /api/dataset//items/?annotations=no&missing_fields=keep&stream=true * -> mapped EXPECTED OUTPUTS - * 5. pairs items by `id`, runs each input through the local map_item, and - * deep-equals the result against the corresponding expected output. + * 5. runs each input through the local map_item, then pairs by the + * resulting MAPPED `id` — which can differ from the raw input id (e.g. + * instagram maps to the post shortcode, not the numeric pk) — and + * deep-equals each mapped result against the corresponding expected + * output. * * The items endpoint is fetched with `stream=true` (NDJSON): its JSON-array * form paginates at `limit=100`, silently dropping rows on larger datasets. @@ -216,25 +219,49 @@ function format_error_with_location(err) { : message; } -// Pair inputs and expected outputs by `id`. Falls back to index pairing -// (with a logged warning) if either side is missing the field on its -// first item. 
-function pair_items(inputs, outputs, dataset_key) { - const probe_in = inputs[0]; +// Map each input through the local map_item, then pair the mapped result +// against the expected output by `id`. Pairing MUST key on the mapped id: +// some modules emit an `id` that differs from the raw input id — instagram, +// for instance, maps to the post shortcode (`node.code`), not the numeric pk +// — so pairing raw input ids against the API's already-mapped ids would match +// nothing. Falls back to index pairing (with a logged warning) if either side +// lacks a usable id. A throw inside map_item is captured per-item and surfaced +// later as that item's failure. +function map_and_pair(inputs, outputs, map_item, dataset_key) { + // Map every input up front so pairing can key on the mapped id. + const mapped = inputs.map(input => { + try { + return { input, js_result: map_item(input), error: null }; + } catch (e) { + return { + input, + js_result: null, + error: new Error(`JS map_item threw: ${format_error_with_location(e)}`), + }; + } + }); + + const probe_mapped = mapped.find(m => m.js_result)?.js_result; const probe_out = outputs[0]; - const has_id_in = probe_in && 'id' in probe_in && probe_in.id != null; + const has_id_mapped = probe_mapped && 'id' in probe_mapped && probe_mapped.id != null; const has_id_out = probe_out && 'id' in probe_out && probe_out.id != null; - if (!has_id_in || !has_id_out) { + if (!has_id_mapped || !has_id_out) { // eslint-disable-next-line no-console console.warn( - `[compare] ${dataset_key}: no usable 'id' on ${!has_id_in ? '/download' : '/items'} ` + + `[compare] ${dataset_key}: no usable 'id' on ${!has_id_mapped ? 
'map_item output' : '/items'} ` + `side — falling back to index pairing for this dataset.` ); - const n = Math.min(inputs.length, outputs.length); + const n = Math.min(mapped.length, outputs.length); return { mode: 'index', - pairs: Array.from({ length: n }, (_, i) => ({ input: inputs[i], expected: outputs[i], id: i })), + pairs: Array.from({ length: n }, (_, i) => ({ + input: mapped[i].input, + js_result: mapped[i].js_result, + error: mapped[i].error, + expected: outputs[i], + id: i, + })), input_count: inputs.length, output_count: outputs.length, unmatched_inputs: [], @@ -247,13 +274,19 @@ function pair_items(inputs, outputs, dataset_key) { const pairs = []; const unmatched_inputs = []; - for (const input of inputs) { - const expected = by_id_out.get(String(input.id)); + for (const m of mapped) { + // Key on the mapped id when mapping succeeded; for a throw (no mapped + // id available) fall back to the raw input id so a pass-through-id + // module still surfaces the failure against its expected output. + const lookup_id = m.js_result && m.js_result.id != null + ? String(m.js_result.id) + : (m.input && m.input.id != null ? String(m.input.id) : null); + const expected = lookup_id != null ? by_id_out.get(lookup_id) : undefined; if (expected) { - pairs.push({ input, expected, id: input.id }); - by_id_out.delete(String(input.id)); + pairs.push({ input: m.input, js_result: m.js_result, error: m.error, expected, id: lookup_id }); + by_id_out.delete(lookup_id); } else { - unmatched_inputs.push(input.id); + unmatched_inputs.push(lookup_id); } } return { @@ -296,24 +329,21 @@ function strip_api_fields(obj) { return out; } -// Run each paired input through the local map_item and diff the result -// against 4CAT's expected output. With FAIL_FAST on (default), stop at the -// first failing item and record how many were left unchecked — so one bad -// item yields a single failure plus one skipped "halted" placeholder, not N -// failures. 
-function compare_pairs(pairs, map_item) { +// Diff each paired (already-mapped) JS result against 4CAT's expected output. +// map_item was run up front during pairing — so we could key on the mapped id +// — so here we only diff, or report an input whose map_item threw. With +// FAIL_FAST on (default), stop at the first failing item and record how many +// were left unchecked — so one bad item yields a single failure plus one +// skipped "halted" placeholder, not N failures. +function compare_pairs(pairs) { const results = []; let halted_count = 0; for (let i = 0; i < pairs.length; i++) { - const { input, expected, id } = pairs[i]; + const { id, js_result, error, expected } = pairs[i]; let message = null; - try { - let js_result; - try { - js_result = map_item(input); - } catch (e) { - throw new Error(`JS map_item threw: ${format_error_with_location(e)}`); - } + if (error) { + message = error.message; + } else { const diffs = diff_objects( strip_api_fields(normalize(js_result)), strip_api_fields(normalize(expected)), @@ -321,8 +351,6 @@ function compare_pairs(pairs, map_item) { if (diffs.length > 0) { message = `${diffs.length} field(s) differ between JS and 4CAT:\n${format_diffs(diffs)}`; } - } catch (e) { - message = e.message; } results.push({ id, ok: message === null, message }); if (message !== null && FAIL_FAST) { @@ -361,8 +389,8 @@ for (const key of DATASET_KEYS_TO_RUN) { fetch_ndjson(`${FOURCAT_URL}/download/${key}`), fetch_ndjson(items_url), ]); - const pairing = pair_items(inputs, outputs, key); - const comparison = compare_pairs(pairing.pairs, module_state.map_item); + const pairing = map_and_pair(inputs, outputs, module_state.map_item, key); + const comparison = compare_pairs(pairing.pairs); dataset_state[key] = { datasource_id, module_name, module_state, pairing, comparison }; } else { dataset_state[key] = { datasource_id, module_name, module_state }; From c7bb9ac9b2c7e046ef25d15b1ea07217e3fbeabc Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 4 
Jun 2026 17:15:05 +0200 Subject: [PATCH 33/33] map_item_compare.test.js: still show errors on failed `id` matches --- tests/map_item_compare.test.js | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js index 2d1403b..8e06979 100644 --- a/tests/map_item_compare.test.js +++ b/tests/map_item_compare.test.js @@ -275,15 +275,21 @@ function map_and_pair(inputs, outputs, map_item, dataset_key) { const pairs = []; const unmatched_inputs = []; for (const m of mapped) { - // Key on the mapped id when mapping succeeded; for a throw (no mapped - // id available) fall back to the raw input id so a pass-through-id - // module still surfaces the failure against its expected output. - const lookup_id = m.js_result && m.js_result.id != null - ? String(m.js_result.id) - : (m.input && m.input.id != null ? String(m.input.id) : null); + // A throw produces no mapped id to pair on. Surface it as its own + // failing item (labelled with the raw input id) rather than burying it + // in the unmatched-id list — otherwise an id-transforming module hides + // the actual map_item error behind a generic "unmatched input" report. + if (m.error) { + const label = m.input && m.input.id != null ? String(m.input.id) : '(no id)'; + pairs.push({ input: m.input, js_result: null, error: m.error, expected: null, id: label }); + continue; + } + // Key on the mapped id; a successful map whose id matches no output is + // a genuine pairing miss and goes to unmatched_inputs. + const lookup_id = m.js_result && m.js_result.id != null ? String(m.js_result.id) : null; const expected = lookup_id != null ? 
by_id_out.get(lookup_id) : undefined; if (expected) { - pairs.push({ input: m.input, js_result: m.js_result, error: m.error, expected, id: lookup_id }); + pairs.push({ input: m.input, js_result: m.js_result, error: null, expected, id: lookup_id }); by_id_out.delete(lookup_id); } else { unmatched_inputs.push(lookup_id);