From 491f51bc07520317f31416a68a9a221ccade03f9 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Tue, 5 May 2026 17:38:25 +0200
Subject: [PATCH 01/33] minimal changes for direct from 4CAT mapping

---
 js/lib.js          | 16 +++++++++++++++-
 modules/_loader.js |  6 +++++-
 popup/interface.js |  2 +-
 3 files changed, 21 insertions(+), 3 deletions(-)
diff --git a/js/lib.js b/js/lib.js
index 6199d01..1579195 100644
--- a/js/lib.js
+++ b/js/lib.js
@@ -57,4 +57,18 @@ class MissingMappedField {
     toString() {
         return `${this.value}`;
     }
-}
\ No newline at end of file
+}
+
+/**
+ * Wrap a Zeeschuimer stored item to match the shape a 4CAT map_item expects.
+ *
+ * 4CAT's importer constructs:
+ *   { ...item.data, __import_meta: { ...everything in item except data } }
+ *
+ * Mirroring that here means map_item functions auto-generated from 4CAT
+ * data sources can run against Zeeschuimer-stored items without translation.
+ */
+function wrap_for_map_item(stored_item) {
+    const { data, ...meta } = stored_item;
+    return { ...data, __import_meta: meta };
+}
diff --git a/modules/_loader.js b/modules/_loader.js
index 47697ca..afae2d7 100644
--- a/modules/_loader.js
+++ b/modules/_loader.js
@@ -17,11 +17,15 @@ async function load() {
     ];
 
     for(const module of imported_modules) {
+        const mapper = module.map_item
+            ? (stored_item) => module.map_item(wrap_for_map_item(stored_item))
+            : null;
+
         zeeschuimer.register_module(
             module.MODULE_NAME,
             module.DOMAIN,
             module.capture,
-            module.map_item,
+            mapper,
             module.MODULE_ID ? module.MODULE_ID : module.MODULE_DOMAIN,
             module.overwrite_partial,
             module.TOOLTIP ? module.TOOLTIP : null,
diff --git a/popup/interface.js b/popup/interface.js
index 5cc7864..1ae60a2 100644
--- a/popup/interface.js
+++ b/popup/interface.js
@@ -619,7 +619,7 @@ async function get_csv_blob(platform) {
     let csv = [];
     const module = background.zeeschuimer.modules[platform];
     await iterate_items(platform, function(item) {
-        item = module.mapper(item.data);
+        item = module.mapper(item);
         if(csv.length === 0) {
             csv.push(Object.keys(item).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n");
         }

From b06805f711a97fad6e9e3f6615db3a0cf936205e Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 6 May 2026 10:54:13 +0200
Subject: [PATCH 02/33] give me some standard helper functions

---
 js/lib.js         |  54 +++++++++++++++++++++
 modules/tiktok.js | 119 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 172 insertions(+), 1 deletion(-)

diff --git a/js/lib.js b/js/lib.js
index 1579195..3b144d2 100644
--- a/js/lib.js
+++ b/js/lib.js
@@ -72,3 +72,57 @@ function wrap_for_map_item(stored_item) {
     const { data, ...meta } = stored_item;
     return { ...data, __import_meta: meta };
 }
+
+/**
+ * Ports of 4CAT helper functions commonly used by the modules' `map_item` functions
+ */
+
+/**
+ * Strip HTML tags from a string.
+ * @param {string} html
+ * @param {boolean} convertNewlines  Convert <br> and </p> tags to \n before stripping.
+ * @returns {string}
+ */
+function strip_tags(html, convertNewlines = true) {
+    if (!html) return "";
+    if (convertNewlines) {
+        html = html.replace(/<br\s*\/?>/gi, "\n").replace(/<\/p>/gi, "</p>\n");
+        html = html.replace(/\n+/g, "\n");
+    }
+    const doc = new DOMParser().parseFromString(html, "text/html");
+    return doc.body.textContent || "";
+}
+
+/**
+ * Normalize URL encoding for display and linking.
+ * Decodes percent-encoded URLs and re-encodes the query string canonically.
+ * Returns the original URL on parse failure.
+ * @param {string} url
+ * @returns {string}
+ */
+function normalize_url_encoding(url) {
+    if (!url) return "";
+    try {
+        // Iterative decode handles double-encoded inputs.
+        let decoded = url;
+        let prev;
+        do {
+            prev = decoded;
+            try {
+                decoded = decodeURIComponent(prev);
+            } catch {
+                decoded = prev;
+                break;
+            }
+        } while (decoded !== prev);
+        const parsed = new URL(decoded);
+        // URL.toString() re-encodes the query/fragment correctly.
+        return parsed.toString();
+    } catch {
+        return url;
+    }
+}
+
+function formatUtcTimestamp(unixSeconds) {
+    return new Date(unixSeconds * 1000).toISOString().replace('T', ' ').slice(0, 19);
+}
\ No newline at end of file
diff --git a/modules/tiktok.js b/modules/tiktok.js
index 55e6fbf..ea52532 100644
--- a/modules/tiktok.js
+++ b/modules/tiktok.js
@@ -1,3 +1,4 @@
+
 export const MODULE_NAME = 'TikTok (posts)';
 export const DOMAIN = 'tiktok.com';
 
@@ -103,4 +104,120 @@ export function capture(response, source_platform_url, source_url) {
     } else {
         return [];
     }
-}
\ No newline at end of file
+}
+
+// === auto-generated by 4cat map_item sync — DO NOT EDIT BY HAND ===
+// (regenerated from datasources/tiktok/search_tiktok.py)
+export function map_item(post) {
+  // Zeeschuimer metadata
+  const metadata = post.__import_meta || {};
+
+  const challenges = Array.isArray(post.challenges)
+    ? post.challenges.map(ch => ch.title).filter(Boolean)
+    : [];
+
+  const hashtags = Array.isArray(post.textExtra)
+    ? post.textExtra
+        .filter(e => e.hasOwnProperty('hashtagName') && e.hashtagName)
+        .map(e => e.hashtagName)
+    : [];
+
+  const diversificationLabels = Array.isArray(post.diversificationLabels)
+    ? post.diversificationLabels.join(',')
+    : '';
+
+  let user_nickname = '';
+  let user_fullname = '';
+  let user_thumbnail = '';
+
+  if (post.author && typeof post.author === 'object') {
+    user_nickname = post.author.uniqueId || '';
+    user_fullname = post.author.nickname || '';
+    user_thumbnail = post.author.avatarThumb || '';
+  } else if (post.author) {
+    user_nickname = post.author || '';
+    user_fullname = post.nickname || '';
+    user_thumbnail = '';
+  }
+
+  const thumbnailOptions = [];
+
+  if (post.video && Array.isArray(post.video.shareCover)) {
+    thumbnailOptions.push(...post.video.shareCover);
+  }
+
+  if (post.video && post.video.cover) {
+    thumbnailOptions.push(post.video.cover);
+  }
+
+  const now = Math.floor(Date.now() / 1000);
+
+  const validThumbnails = thumbnailOptions.filter(url => {
+    try {
+      const parsedUrl = new URL(url);
+      const expires = parseInt(parsedUrl.searchParams.get('x-expires'), 10) || 0;
+      return expires >= now;
+    } catch (e) {
+      return false;
+    }
+  });
+
+  const thumbnail_url = validThumbnails.length ? validThumbnails[validThumbnails.length - 1] : '';
+
+  return new MappedItem({
+    collected_from_url: metadata.source_platform_url
+      ? normalize_url_encoding(metadata.source_platform_url)
+      : '',
+    id: post.id || '',
+    thread_id: post.id || '',
+    author: user_nickname,
+    author_full: user_fullname,
+    author_followers: post.authorStats?.followerCount ?? '',
+    author_likes: post.authorStats?.diggCount ?? '',
+    author_videos: post.authorStats?.videoCount ?? '',
+    author_avatar: user_thumbnail,
+    body: post.desc || '',
+    stickers: Array.isArray(post.stickersOnItem)
+      ? post.stickersOnItem
+          .map(s => (Array.isArray(s.stickerText) ? s.stickerText.join(' ') : ''))
+          .filter(Boolean)
+          .join('')
+      : '',
+    timestamp: post.createTime
+      ? formatUtcTimestamp(parseInt(post.createTime, 10))
+      : '',
+    unix_timestamp: post.createTime ? parseInt(post.createTime, 10) : 0,
+    is_duet:
+      post.duetInfo && post.duetInfo.duetFromId && post.duetInfo.duetFromId !== '0'
+        ? 'yes'
+        : 'no',
+    is_ad: post.isAd ? 'yes' : 'no',
+    is_paid_partnership: post.adAuthorization ? 'yes' : 'no',
+    is_sensitive: post.maskType === 3 ? 'yes' : 'no',
+    is_photosensitive: post.maskType === 4 ? 'yes' : 'no',
+    music_name: post.music?.title ?? '',
+    music_id: post.music?.id ?? '',
+    music_url: post.music?.playUrl ?? '',
+    music_thumbnail: post.music?.coverLarge ?? '',
+    music_author: post.music?.authorName ?? '',
+    video_url: post.video?.downloadAddr ?? '',
+    tiktok_url: `https://www.tiktok.com/@${user_nickname}/video/${post.id}`,
+    thumbnail_url: thumbnail_url,
+    likes: post.stats?.diggCount ?? '',
+    comments: post.stats?.commentCount ?? '',
+    shares: post.stats?.shareCount ?? '',
+    plays: post.stats?.playCount ?? '',
+    hashtags: hashtags.join(','),
+    challenges: challenges.join(','),
+    diversification_labels: diversificationLabels,
+    location_created: post.locationCreated ?? '',
+    effects: Array.isArray(post.effectStickers)
+      ? post.effectStickers.map(e => e.name).join(',')
+      : '',
+    warning: Array.isArray(post.warnInfo)
+      ? post.warnInfo.map(w => w.text).join(',')
+      : '',
+  });
+}
+// === end auto-generated ===
+// === end auto-generated ===

From f9a2405a0703bcadfdee7492ccd57af12917733e Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 6 May 2026 13:07:43 +0200
Subject: [PATCH 03/33] fix csv export

---
 popup/interface.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/popup/interface.js b/popup/interface.js
index 1ae60a2..8afd1b1 100644
--- a/popup/interface.js
+++ b/popup/interface.js
@@ -595,7 +595,7 @@ const CSV_ESCAPED = `"${CSV_SEPARATOR}\n`;
 function csv_escape(value) {
     value = String(value);
     let needs_escape = false;
-    for(const character in CSV_ESCAPED) {
+    for(const character of CSV_ESCAPED) {
         if(value.indexOf(character) >= 0) {
             needs_escape = true;
         }

From 2f084b9352c25a1034429bb05d8390b5961d35ef Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 6 May 2026 15:19:18 +0200
Subject: [PATCH 04/33] another to CSV fix

---
 popup/interface.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/popup/interface.js b/popup/interface.js
index 8afd1b1..94fff77 100644
--- a/popup/interface.js
+++ b/popup/interface.js
@@ -626,7 +626,7 @@ async function get_csv_blob(platform) {
         csv.push(Object.values(item).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n");
     })
 
-    return new Blob([csv], {type: 'text/csv'});
+    return new Blob(csv, {type: 'text/csv'});
 }
 
 /**

From d7870426c7765a6107c47c4fff062f5643725167 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 6 May 2026 15:25:42 +0200
Subject: [PATCH 05/33] revert tiktok (mistaken test result committed)

---
 modules/tiktok.js | 119 +---------------------------------------------
 1 file changed, 1 insertion(+), 118 deletions(-)

diff --git a/modules/tiktok.js b/modules/tiktok.js
index ea52532..55e6fbf 100644
--- a/modules/tiktok.js
+++ b/modules/tiktok.js
@@ -1,4 +1,3 @@
-
 export const MODULE_NAME = 'TikTok (posts)';
 export const DOMAIN = 'tiktok.com';
 
@@ -104,120 +103,4 @@ export function capture(response, source_platform_url, source_url) {
     } else {
         return [];
     }
-}
-
-// === auto-generated by 4cat map_item sync — DO NOT EDIT BY HAND ===
-// (regenerated from datasources/tiktok/search_tiktok.py)
-export function map_item(post) {
-  // Zeeschuimer metadata
-  const metadata = post.__import_meta || {};
-
-  const challenges = Array.isArray(post.challenges)
-    ? post.challenges.map(ch => ch.title).filter(Boolean)
-    : [];
-
-  const hashtags = Array.isArray(post.textExtra)
-    ? post.textExtra
-        .filter(e => e.hasOwnProperty('hashtagName') && e.hashtagName)
-        .map(e => e.hashtagName)
-    : [];
-
-  const diversificationLabels = Array.isArray(post.diversificationLabels)
-    ? post.diversificationLabels.join(',')
-    : '';
-
-  let user_nickname = '';
-  let user_fullname = '';
-  let user_thumbnail = '';
-
-  if (post.author && typeof post.author === 'object') {
-    user_nickname = post.author.uniqueId || '';
-    user_fullname = post.author.nickname || '';
-    user_thumbnail = post.author.avatarThumb || '';
-  } else if (post.author) {
-    user_nickname = post.author || '';
-    user_fullname = post.nickname || '';
-    user_thumbnail = '';
-  }
-
-  const thumbnailOptions = [];
-
-  if (post.video && Array.isArray(post.video.shareCover)) {
-    thumbnailOptions.push(...post.video.shareCover);
-  }
-
-  if (post.video && post.video.cover) {
-    thumbnailOptions.push(post.video.cover);
-  }
-
-  const now = Math.floor(Date.now() / 1000);
-
-  const validThumbnails = thumbnailOptions.filter(url => {
-    try {
-      const parsedUrl = new URL(url);
-      const expires = parseInt(parsedUrl.searchParams.get('x-expires'), 10) || 0;
-      return expires >= now;
-    } catch (e) {
-      return false;
-    }
-  });
-
-  const thumbnail_url = validThumbnails.length ? validThumbnails[validThumbnails.length - 1] : '';
-
-  return new MappedItem({
-    collected_from_url: metadata.source_platform_url
-      ? normalize_url_encoding(metadata.source_platform_url)
-      : '',
-    id: post.id || '',
-    thread_id: post.id || '',
-    author: user_nickname,
-    author_full: user_fullname,
-    author_followers: post.authorStats?.followerCount ?? '',
-    author_likes: post.authorStats?.diggCount ?? '',
-    author_videos: post.authorStats?.videoCount ?? '',
-    author_avatar: user_thumbnail,
-    body: post.desc || '',
-    stickers: Array.isArray(post.stickersOnItem)
-      ? post.stickersOnItem
-          .map(s => (Array.isArray(s.stickerText) ? s.stickerText.join(' ') : ''))
-          .filter(Boolean)
-          .join('')
-      : '',
-    timestamp: post.createTime
-      ? formatUtcTimestamp(parseInt(post.createTime, 10))
-      : '',
-    unix_timestamp: post.createTime ? parseInt(post.createTime, 10) : 0,
-    is_duet:
-      post.duetInfo && post.duetInfo.duetFromId && post.duetInfo.duetFromId !== '0'
-        ? 'yes'
-        : 'no',
-    is_ad: post.isAd ? 'yes' : 'no',
-    is_paid_partnership: post.adAuthorization ? 'yes' : 'no',
-    is_sensitive: post.maskType === 3 ? 'yes' : 'no',
-    is_photosensitive: post.maskType === 4 ? 'yes' : 'no',
-    music_name: post.music?.title ?? '',
-    music_id: post.music?.id ?? '',
-    music_url: post.music?.playUrl ?? '',
-    music_thumbnail: post.music?.coverLarge ?? '',
-    music_author: post.music?.authorName ?? '',
-    video_url: post.video?.downloadAddr ?? '',
-    tiktok_url: `https://www.tiktok.com/@${user_nickname}/video/${post.id}`,
-    thumbnail_url: thumbnail_url,
-    likes: post.stats?.diggCount ?? '',
-    comments: post.stats?.commentCount ?? '',
-    shares: post.stats?.shareCount ?? '',
-    plays: post.stats?.playCount ?? '',
-    hashtags: hashtags.join(','),
-    challenges: challenges.join(','),
-    diversification_labels: diversificationLabels,
-    location_created: post.locationCreated ?? '',
-    effects: Array.isArray(post.effectStickers)
-      ? post.effectStickers.map(e => e.name).join(',')
-      : '',
-    warning: Array.isArray(post.warnInfo)
-      ? post.warnInfo.map(w => w.text).join(',')
-      : '',
-  });
-}
-// === end auto-generated ===
-// === end auto-generated ===
+}
\ No newline at end of file

From a9fba9a9caee86d8799ee35d11374fbb602c9a41 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 6 May 2026 15:57:45 +0200
Subject: [PATCH 06/33] clean up UI (make download menu button)

---
 popup/interface.html | 32 +++++++++++++++++++++-
 popup/interface.js   | 63 +++++++++++++++++++++++++++++++++-----------
 2 files changed, 78 insertions(+), 17 deletions(-)

diff --git a/popup/interface.html b/popup/interface.html
index 356f2b5..e9d9b3f 100644
--- a/popup/interface.html
+++ b/popup/interface.html
@@ -215,10 +215,39 @@
             text-indent: 2em;
         }
 
-        td > button:not(:last-child) {
+        td > button:not(:last-child),
+        td > .download-menu:not(:last-child) {
             margin-right: 0.25em;
         }
 
+        /* download chooser: the trigger is a regular button (inherits all
+           button styles); the options panel drops down beneath it */
+        .download-menu {
+            display: inline-block;
+            position: relative;
+        }
+
+        /* :not([hidden]) so the explicit display:flex doesn't override the
+           [hidden] attribute's default display:none */
+        .download-menu > .download-options:not([hidden]) {
+            position: absolute;
+            top: calc(100% + 0.25em);
+            left: 0;
+            display: flex;
+            flex-direction: column;
+            gap: 0.25em;
+            padding: 0.25em;
+            background: var(--neutral-contrast-alt);
+            border: 2px solid var(--neutral-contrast);
+            border-radius: 0.5em;
+            z-index: 10;
+            white-space: nowrap;
+        }
+
+        .download-menu > .download-options > button {
+            margin: 0;
+        }
+
         input:not([type=checkbox]):not([type=radio]), button {
             background: var(--neutral-contrast-alt);
             color: var(--accent);
@@ -302,6 +331,7 @@
 
         .toggle-switch input {
             -moz-appearance: none;
+            appearance: none;
             opacity: 0;
         }
 
diff --git a/popup/interface.js b/popup/interface.js
index 94fff77..3b8aaa9 100644
--- a/popup/interface.js
+++ b/popup/interface.js
@@ -119,7 +119,7 @@ async function set_4cat_url(e) {
 function activate_buttons() {
     document.querySelectorAll("td button").forEach(button => {
         let current = button.disabled;
-        let items = parseInt(button.parentNode.parentNode.querySelector('.num-items').innerText);
+        let items = parseInt(button.closest('tr').querySelector('.num-items').innerText);
         let new_status = current;
 
         if(button.classList.contains('upload-to-4cat') && !is_uploading) {
@@ -132,7 +132,7 @@ function activate_buttons() {
                 button.setAttribute('title', '');
             }
 
-        } else if(button.classList.contains('download-ndjson') || button.classList.contains('reset') || button.classList.contains('download-csv')) {
+        } else if(button.classList.contains('download-format') || button.classList.contains('download-menu-trigger') || button.classList.contains('reset')) {
             new_status = !(items > 0);
         }
 
@@ -234,21 +234,32 @@ async function get_stats() {
 
             let actions = createElement("td");
             const clear_button = createElement("button", {"data-platform": platform, "class": "reset"}, "Delete");
-            const csv_button = createElement("button", {"data-platform": platform, 'class': 'download-csv'}, '.csv');
-            const download_button = createElement("button", {
-                "data-platform": platform,
-                "class": "download-ndjson"
-            }, ".ndjson");
+
+            // Render the download chooser as a button + popover panel
+            // (even when only NDJSON is available, for visual consistency)
+            const download_widget = createElement("span", {"class": "download-menu"});
+            const trigger = createElement("button", {
+                "data-platform": platform, "class": "download-menu-trigger"
+            }, "Download");
+            const options = createElement("div", {"class": "download-options", "hidden": ""});
+            options.appendChild(createElement("button", {
+                "data-platform": platform, "data-format": "ndjson", "class": "download-format"
+            }, ".ndjson (original)"));
+            if(module.mapper) {
+                options.appendChild(createElement("button", {
+                    "data-platform": platform, "data-format": "csv", "class": "download-format"
+                }, ".csv"));
+            }
+            download_widget.appendChild(trigger);
+            download_widget.appendChild(options);
+
             const fourcat_button = createElement("button", {
                 "data-platform": platform,
                 "class": "upload-to-4cat",
             }, "to 4CAT");
 
             actions.appendChild(clear_button);
-            if(module.mapper) {
-                actions.appendChild(csv_button);
-            }
-            actions.appendChild(download_button);
+            actions.appendChild(download_widget);
             actions.appendChild(fourcat_button);
 
             row.appendChild(actions);
@@ -317,22 +328,38 @@ async function get_stats() {
 async function button_handler(event) {
     let status = document.getElementById('upload-status');
 
-    if (event.target.matches('.reset')) {
+    // Close any open download-format popovers when clicking outside their host.
+    // Skip if the click is on a trigger or inside an options panel 
+    if(!event.target.matches('.download-menu-trigger') && !event.target.closest('.download-options')) {
+        document.querySelectorAll('.download-options:not([hidden])').forEach(el => el.hidden = true);
+    }
+
+    if (event.target.matches('.download-menu-trigger')) {
+        const widget = event.target.closest('.download-menu');
+        const options = widget.querySelector('.download-options');
+        const opening = options.hidden;
+        // close any other menus before opening this one
+        document.querySelectorAll('.download-options:not([hidden])').forEach(el => {
+            if(el !== options) el.hidden = true;
+        });
+        options.hidden = !opening;
+
+    } else if (event.target.matches('.reset')) {
         let platform = event.target.getAttribute('data-platform');
         await background.db.items.where("source_platform").equals(platform).delete();
 
     } else if (event.target.matches('.reset-all')) {
         await background.db.items.clear();
 
-    } else if (event.target.matches('.download-ndjson') || event.target.matches('.download-csv')) {
-        const blobber = event.target.matches('.download-ndjson') ? get_ndjson_blob : get_csv_blob;
-        const extension = event.target.matches('.download-ndjson') ? 'ndjson' : 'csv';
+    } else if (event.target.matches('.download-format')) {
+        const format = event.target.getAttribute('data-format');
+        const blobber = format === 'csv' ? get_csv_blob : get_ndjson_blob;
+        const extension = format;
 
         let platform = event.target.getAttribute('data-platform');
         let date = new Date();
         event.target.classList.add('loading');
 
-        //let blob = await download_blob(platform, 'zeeschuimer-export-' + platform + '-' + date.toISOString().split(".")[0].replace(/:/g, "") + '.ndjson');
         let blob = await blobber(platform);
         let filename = 'zeeschuimer-export-' + platform + '-' + date.toISOString().split(".")[0].replace(/:/g, "") + '.' + extension;
         const downloadUrl = window.URL.createObjectURL(blob);
@@ -345,6 +372,10 @@ async function button_handler(event) {
 
         event.target.classList.remove('loading');
 
+        // collapse the popover menu after the download fires
+        const widget = event.target.closest('.download-menu');
+        if(widget) widget.querySelector('.download-options').hidden = true;
+
     } else if (event.target.matches('.upload-to-4cat')) {
         let platform = event.target.getAttribute('data-platform');
         status.innerText = 'Creating data file for uploading...';

From 0980a56f0ba6872884bfc1e891efc2cb9f4e4c33 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 6 May 2026 16:13:52 +0200
Subject: [PATCH 07/33] testing is hard in JS

---
 docs/test-plan.md                         | 162 ++++++++++++++++++++++
 modules/package.json                      |   3 +
 tests/__pycache__/test.cpython-39.pyc     | Bin 0 -> 7345 bytes
 tests/duplicate-behavior.test.js          |   3 +-
 tests/{jest.config.js => jest.config.cjs} |   3 +-
 tests/map_item.test.js                    | 130 +++++++++++++++++
 tests/package.json                        |   5 +-
 tests/setup-globals.cjs                   |  41 ++++++
 8 files changed, 343 insertions(+), 4 deletions(-)
 create mode 100644 docs/test-plan.md
 create mode 100644 modules/package.json
 create mode 100644 tests/__pycache__/test.cpython-39.pyc
 rename tests/{jest.config.js => jest.config.cjs} (64%)
 create mode 100644 tests/map_item.test.js
 create mode 100644 tests/setup-globals.cjs

diff --git a/docs/test-plan.md b/docs/test-plan.md
new file mode 100644
index 0000000..249a7e0
--- /dev/null
+++ b/docs/test-plan.md
@@ -0,0 +1,162 @@
+# Selenium Test Harness — Improvement Plan
+
+Date: 2026-04-30
+
+Overview
+
+This document captures an actionable plan to improve the Selenium-based integration tests in `tests/test.py` for the Zeeschuimer Firefox extension. The goals are to:
+
+- Make profile handling reliable and reusable (so logged-in sessions persist across runs).
+- Preserve and export captured data per platform for offline analysis and for passing to 4CAT.
+- Add optional automated upload to a 4CAT instance for mapping/validation tests.
+- Reduce fragility caused by popups and interactive dialogs (pausing/dismissal patterns).
+- Improve robustness, error handling, and machine-readable results.
+
+Scope
+
+All changes are confined to the test harness and test metadata (`tests/test.py` and `tests/tests.json`) and to this planning document. No changes are required in the extension source for the planned items (the test harness will interact with the extension's UI pages and background DB).
+
+Phases & Changes
+
+Phase 1 — Profile management
+
+- Problem: copying an entire profile can race with a running Firefox and the current ignore rule hides potentially useful session data.
+- Changes:
+  - Detect if the selected profile directory appears locked (presence of `lock` or `.parentlock`) and warn if Firefox is running.
+  - Replace the naive ignore lambda used in `shutil.copytree` with a function that only excludes `storage`, `extensions`, and `signedInUser.json` at the profile root.
+  - Add CLI flags: `--profile-name NAME` (choose profile by display name from `profiles.ini`), `--save-profile PATH` (save the temp profile for reuse), and `--no-cleanup` (do not remove `.temp-profile` after run).
+
+Implementation note (copytree ignore example):
+
+```python
+def _profile_ignore(root, names):
+    # Only ignore these entries in the root profile dir
+    if os.path.abspath(root) == os.path.abspath(profile_dir):
+        return {"storage", "extensions", "signedInUser.json"}
+    return set()
+
+shutil.copytree(profile_dir, profile_file, ignore=_profile_ignore)
+```
+
+Phase 2 — Data preservation & export
+
+- Problem: `reset-all` wipes the DB before each URL; no artifacts are kept for post-mortem or mapping tests.
+- Decision: export a single combined NDJSON file per platform containing items collected while testing that platform.
+- Changes:
+  - Add CLI `--export-dir PATH` (default `./zeeschuimer-exports/{timestamp}/`).
+  - Before clicking `reset-all` for each URL, read the current DB contents from the extension background page (Dexie) via `execute_async_script` and append those items to a per-platform in-memory list in Python. After all URLs for a platform are done, write `{export-dir}/{platform}.ndjson`.
+  - Optionally add `--no-reset` to skip the `reset-all` call entirely (default behavior remains to reset before each URL).
+
+`execute_async_script` pattern (example):
+
+```python
+script = '''
+const cb = arguments[0];
+background.db.items.toArray().then(items => cb(JSON.stringify(items))).catch(e => cb(JSON.stringify({error: String(e)})));
+'''
+items_json = driver.execute_async_script(script)
+items = json.loads(items_json)
+```
+
+Phase 3 — 4CAT integration (optional)
+
+- Problem: mapping tests live in 4CAT and need NDJSON input.
+- Changes:
+  - Add CLI flags: `--4cat-url URL` and `--4cat-key KEY` (API key). Require both for upload.
+  - After writing the per-platform NDJSON, POST it to `{4cat_url.rstrip('/')}/api/import-dataset/` with header `X-Zeeschuimer-Platform: {platform}` and `Authorization: Bearer {key}` (confirm header with your 4CAT instance; alternative is to trigger the extension UI upload button when cookie-based auth is required).
+  - Do not fail the test run on 4CAT errors — print status and continue.
+
+Example upload with `requests`:
+
+```python
+import requests
+with open(ndjson_path, 'rb') as f:
+    headers = {
+        'X-Zeeschuimer-Platform': platform,
+        'Authorization': f'Bearer {fourcat_key}'
+    }
+    r = requests.post(f"{fourcat_url.rstrip('/')}/api/import-dataset/", headers=headers, data=f)
+    # check r.status_code and r.text for details
+```
+
+Phase 4 — Interactive controls & popup dismissals
+
+- Problem: cookie banners, paywall prompts, and other popups frequently interfere with automated navigation and can cause false failures.
+- Decision: pause by default **once per platform** (not before every URL) so the tester can clear residual prompts; provide opt-out and finer-grained options.
+- Changes:
+  - CLI flags: `--no-interactive` (disable all pauses), `--pause-before-url` (pause before each URL), `--pause-on-fail` (pause on failure), `--extra-wait N` (add N seconds to every wait), `--screenshot-dir PATH` (capture screenshots on fail/warning).
+  - Add a `dismiss-selectors` optional field in `tests.json` per URL: a list of CSS selectors to click to dismiss known popups. Example:
+
+```json
+"dismiss-selectors": ["button.cookie-accept", ".modal .close"]
+```
+
+  - Add per-URL `timeout` (page load timeout override).
+
+Phase 5 — Runner robustness & reporting
+
+- Problem: unhandled exceptions abort the run; final runtime is calculated incorrectly; no machine-readable results.
+- Changes:
+  - Wrap each URL test body in try/except, increment `failed` on exceptions, and continue.
+  - Move the global `start_time = time.time()` to before the outer platform loop so the final elapsed time is for the full run.
+  - Add CLI flags: `--results-file PATH` (write JSON summary), `--resume-from PLATFORM` (skip earlier platforms), and `--screenshot-dir PATH` (as noted).
+  - Fix small test metadata issues (e.g., `more-after-scrolll` typo in `tests.json`).
+
+tests.json schema additions
+
+- Per-URL optional fields:
+  - `dismiss-selectors`: array of CSS selectors to click after page load
+  - `timeout`: numeric page load timeout seconds for this URL
+  - `extra-wait`: per-URL additional wait seconds
+
+CLI flags (summary)
+
+- `--profiledir PATH` — explicit profile path (existing)
+- `--profile-name NAME` — choose Firefox profile by display name
+- `--save-profile PATH` — persist the copied profile for reuse
+- `--no-cleanup` — keep `.temp-profile`
+- `--export-dir PATH` — where to write NDJSON exports
+- `--no-reset` — do not click `reset-all` between URLs
+- `--4cat-url URL` — base URL for 4CAT server
+- `--4cat-key KEY` — API key for 4CAT uploads
+- `--4cat-per-url` — upload per URL instead of per platform (optional)
+- `--no-interactive` — disable pausing (default is to pause per-platform)
+- `--pause-before-url` — pause before each URL
+- `--pause-on-fail` — pause when a test fails
+- `--extra-wait N` — add N seconds to every URL wait
+- `--screenshot-dir PATH` — save screenshots on fail/warning
+- `--results-file PATH` — write machine-readable results JSON
+- `--resume-from PLATFORM` — resume a run from a platform
+
+Verification checklist
+
+1. `python tests/test.py --sources instagram.com --export-dir ./exports` -> `exports/instagram.com.ndjson` exists and contains NDJSON with captured items.
+2. `python tests/test.py --save-profile .saved-profile --login` -> create a saved profile that can be reused with `--profiledir .saved-profile`.
+3. Run with default interactive behavior and confirm one pause per platform.
+4. `python tests/test.py --results-file results.json` -> JSON summary produced with per-URL status and counts.
+5. Test 4CAT upload using a local mock server and `--4cat-url http://localhost:8000 --4cat-key KEY`.
+
+Implementation steps (recommended order)
+
+1. Docs and small fixes (this document + tests.json typo fix).
+2. Profile management changes (`--profile-name`, improved copy ignore, `--save-profile`, lock detection).
+3. Export behavior: `--export-dir` + `execute_async_script` collection and NDJSON write.
+4. Runner robustness: try/except around URL loop, `--results-file`, fix `start_time` placement.
+5. Interactive and dismissal features (`dismiss-selectors`, pause flags, screenshots).
+6. 4CAT upload integration (optional, requires confirmation of auth header).
+
+Estimated effort: 6–10 hours of focused work to implement and test everything end-to-end; can be split into 3-4 incremental PRs.
+
+Open questions / confirmations needed
+
+- Confirm 4CAT API key header format (currently suggested: `Authorization: Bearer {key}`). If your 4CAT requires cookie-based auth, we should emulate the extension upload button via Selenium instead.
+- Confirm desired default for interactive mode. (Current recommendation: pause once per platform by default; provide `--no-interactive` to run fully headless.)
+
+Next steps
+
+- A matching TODO list has been created in the session tracker, and this document has been written to `docs/test-plan.md`.
+- Implementation can start with Phase 1 (profile management) in `tests/test.py`, submitted as incremental changes.
+
+---
+
+Requested file: `docs/test-plan.md`
diff --git a/modules/package.json b/modules/package.json
new file mode 100644
index 0000000..3dbc1ca
--- /dev/null
+++ b/modules/package.json
@@ -0,0 +1,3 @@
+{
+  "type": "module"
+}
diff --git a/tests/__pycache__/test.cpython-39.pyc b/tests/__pycache__/test.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..745e2b4aaad921a459372bb50b39980c50a68136
GIT binary patch
literal 7345
zcmai3-E$k)b>CeqKnNl!lA<VDww1LkN`yoJvK4<(AC@dpksV1^L@BZ_NyAIrOJKpp
zE_8Px5-Z6`L$@!DCQSO$#!&|BL#FLZUOLlsI+MSk(}z6b$>TnGrZeqKUpk()>F?ZK
zfYgUo?Cjn9aqc<ip6^TALqj<Yzkm9NhfcbvY5z)toxfoW&fyIP0mn6_ah)5k&UE!P
zn4!KYmQr7nnd+NnY4y#p48AFMppj)+o%WjUu11dKR6p(RZVa+P)z7#?jbXM&^#|OM
z#wZ(A{j59I*vs~+{w{aCG0w(SKj-diOt1;n-|g;iOtMMUA9P=69AF1jf5?5Y@gjRs
z*M#1AX-i`-i<j#M`7qzZNBHQ&6y|9>#`iup9_Rb`1mDjmADRio3;Y0oamS9A_{-Zn
zI^O|iiy!3Q;nuH=&MRVKL*uV(8SId->aT9<vL+5tZ$s}K-qJSp;8QWh)!G*tfAvvn
zLkE>3{IL20jve)WFJ2Ur;?Rb^p|fd`7t<S%_NoC~zrfz9$9BZ4hgVIk{x?>SsMTX)
znjhuUpKCnNk3CEQ%k)KU>2-EojCW>4x>FFxxAb~(OJ^nQIW9__6MSY%2S-@>LVIKq
zmQDT6@3C5>zBh(m30U<PG^`tniW;$_SUI7_Q~WjcJ-}bbX<y?fxAe~Ikj_bQQk~t9
zSA@~|uE0FbY}DTnN5x70UB%}cN*`}RQg3bOKRdwQ7OBoV;%#vF)@_X&re<mz(3m&&
znzm^KKRu!8>iHNgf0MuUxrUvyol_h;E^3h5X>j#U=M10ioE2|%-rdr+^exB}bWd*@
z@=L6qQfsH_)bV+|EM`w9Cpn|O*$s`I6MH-7#e3r91~5Q^$2%AJSw6940N3%(#Vv3k
zj*D~U<KSR!GbK-m9Q8JI&)^2TB=*+7CoYL|{N0BJyDTo(-v_P({2U|%*+8>-e*Pih
zy4?9dyvg6g{P&4lHFJeuNP1WK#iVzQ&#B%NzodA)+<Aq6PtBN}5BcQ;>be-L|A4>0
zWz=u5dH#Wzhi<?BD8+7q&MV?(3{}6?`60iO^l!6|aQ>?br&a$LC;2bI__ahHBlP7u
zTfn=BcS)EX6T1C`e+ZqHx3v1Fd~Q-3*O(1?FK=lVwaxU>_@cq~u}TjXr@z(V;v=%6
zgwGH7^?vUMVqZt_*ZGYtW534d_s1|(!d5(--ZbT}#XfQRuC97hVjs?QGuGXvd8a5U
z30-sPFF|*ORYkQw^ApU}m?Pp{x5Nte-V!zFKCQ4ni?bgp&hla(v~+u1;~(wUSiOIS
z!|@p+OG&h*V4(rwKp!7NM+VK@*1R`OP3)<!Zl>Gy*k-{)9n^1&QO@`RU##kU>0t^p
zpTvImvE~^kG=-;3cD9+8{}P=B71F62&%>Gx))0-3hnSL<e-E6a3hSqj48E`t`zPCg
zZGwXd#eokF{<-Hhib7L3+<pNzDDJd$VEK=D9Jf`&%fvk<g(Quk#hF)An;Ch8FUNEI
z?JrDTiC3uq$n<isZv_u83=)lwY#Goj=lHU)FTtJ_v_z(JmkT~ZKEPK*2w7AgrEo?>
zN45rQTUV#9ku+Fav^NLjcppom2esGfX@S>OkMKQrZd+UY@ML0>PuS@5xm!<k`Ccc&
zxz}JRtNivBY14gVvU}h)D}O5PLEACJ`VNT3qtsg(VAf)Jt)_9VU#k(xSy!WVv4+@A
z^Y>sc9`}3p!khawwR?dd#&^N{w-`$T|E1{d{8H__vu$ZyiU9ltk5UPSC52(h`@24E
z6I>rkFi?+ZKaC-L3PWwjS+cU;$EO2dUEwoS?@4^ECpfVbMaJ*;PrH^tEGk|Xz288l
z>rY58EC28q#XtPNGW)a-Z*K0Axo<gV`>AtIKgC}x)2I)b@Cm=XNAU&m>34X4qvF)%
z&V7|NE_WWN?{-e?#kwi+nOG2uVyOxbP(F4q$+=VfejH83CzBd~0Pp)OsUfELkD#R=
ziv{I}#<ry(mj20pcsVjk4D%DQl)&_(4`4PPgZW%6CNO;+0On6)7;emK`9F(}<Zg-}
z46GGjT1<$bQfoO4A#=HTA*>s7=PzEJD>iuUYpN%&<@N5sht1IOy@0y&Lf&;M0$t<1
zHrkw6uZZpCitqYTa4hUv6U$sWcZI|t%k5BvAcB4-Xa}Ka#Cw~z$6JAdF|XTU4YOv|
z^_Qc;LZK=B6~`6ak<r8@M~W4H-Ad-by32*^YI&n;)`Z*a4sfwzx7;uqDio@svg*gQ
zqrHUH3Vmzm<Upa|`c=n^Ub<~Np#>f++jadli&!CGwR(x=f)J-Ehq5K|DM=-@+=Hf@
zu`48y2zWx0WOS%U<%9VTuUW*kRq?&hc09+cS}{yceTww}8+R27f!~r95kzm!#~mU|
z5Hr}>XxN27G;Il~@l)1v+e%J1qd<t+Xy&uh`DXK?9onTEzTE(w(p5ilT-Poo!YW;l
z1sIf$MJGNhU2&GBE!(uqbt*O;z?yG0o4yR6u_JG=f!hw^)>^<a7j3x)X^nqYDmD@t
zE3ra=dQU<o(TNr9wmd(?sO9nE*E-}Cy?xE!p*fnjuG>Li`JS-EI<ez~ZrfUOpsJmE
zFGiDBp;M;`c_x}Cs|AAD3L)<OsGq`n0dFvfCe%9meWZmt8N7ZipXwTZ&^3e54!hYP
zEH@Hc)cN0PasnH^IeB4r@fNfnEOOfwi#StN+6@xxVzlidi%N&8EjHWT>}l6-EOYzp
zJ3#TDcr`hR<|eE%wJO@(0V!brfrwHCYdRX8`nWLFC`@r{>hkQ=)!C`}$jDpKI~Quc
zACR3i{2;WXs0fdw0bArsLer2n797tCYr=9J^zCK;uE<Bjx75BwG*<L3PVkrj$*6V+
zxI&Z!?A-?@X~DBZ1s2{WFp$ed$1eEIHnGw}i3J-r*MOD4cHkGW;S`I-u3n6WinP>o
z)QUk8NPg6(SE(e6P8C(?U+YkZ0d=qe@+`Y5D1L;(3n;uqqk&WP1b^QnMJPv6x`8zH
z^{(DlgSgobgi6%csq+FF3M|+ScYVw<hy@d`VuN&3)$VIcBM<cZdPm#Rq<LTexpBvw
zkF5Ym(m^PlCP*5^6J34f37Xy9X_7xYw0ZUxuqo2=FnZtUGttX8lduW{g_%mL>DnQ_
zjbPTg_aHK6tY|m}fmmhBo3)~`8!b=82)J|{CyLSN6}#nCYNUq5lcT9XR9eyr+eL~k
z=oN|0N?7)#gE&@h`tYGPxE_GyC_Z-w{5U8_MzNUJ*{;40BLp>JyI~Ci{+goM*Xz`z
z(L@L5ya32Sct={WoB(>aQRhWdu)9_`%f)i53bNH#OE?9&QYoxCK_Q6=)nEP<{r>UY
z=whi9Ai^)43*q15%%L4v4=t5S#bT+%!-WtgFr7bh=zT8~(z9Lb)~)w1_Sbndf|yh2
zo5^gc6zwZD{bs9Ka%jhjT@l4v*l<}Yw3nlyo<x<aMwhN5!w1$JL0a|-wQav;)$F@)
zCBIsQkC9^`_QJRN`cg1!wiouI@;CaKsKWh9s{!XM-f0Qho)@n2p7X9deY6+?mn#YZ
zVpydHl+9LnG+%`B=j=*t8h;J2&YZRGS<cF|C{}7nP)L#(V*t&qPUo!$r>qD0Q_<1q
z5P3BU3c|COkrNA1YH_g`z40wHUU@bVkN^!U8nk9-tzgw@Hlf7Gq)~M8*@UN$F1Nza
z_lgjs2n#S%kW6?!^^I;tDeUf=YsjN5vY~~q{^qw!EYlX0Pno%lJWXC@hoKCjBUTeW
z51IO_5WZK1GQye-@xz2*SR`z+v}mMaHz7t!g0YNS*MOSXT=?pjoqzw`&z7tUF$5B0
z2rUSXh-5uS)q(Hcg{n!(3GQwdfl+OZvdbb|6T-8iw5kQZ(F;*#;gb`Ecb1|fQiLt(
zS>Gm|S1G3ig=oK3^jeLA13wEOA;<HCya}U?MjMDp1!Z}Kpdx*w?sG_{NUX(p_+ESZ
z#LTziC&LE581%Ams<kkGstRe~_vO1^Q*4WJmLjXvIGPxFyn&_?wfyQYmRPzfaZIS+
z4k#_C(<jS+{@W$s=}lVE)C?*nWN|9AXDrwNvXeyWP#zQ`C)+t3gVEY^pb9$)#&Dhw
zC#gtbQ$BheDNCV(jApi(d5ZE=D542V{eZR4n-a%x3|d7&LRg|b+hcXKXf%$xRCJ-r
zAVZ3F_hK}xG)fg;Ux~67;;--VATkP8D+8G?d<A)0(EtoOK%P$KAVe$rz*@5<#q_|M
zR-DJmQHI`=?~H})(DHgiB>ez&&Xa=3CureOP>8i+J8lb**0f?hQTk(e^Z6fj2g+p_
zYq`v_wydIt3<S#rwN~i3%%tM8Ym&RkF`S}nNWl(~lX*_7QS4Xj$TF(jE+$DtQW||O
zV3KxZiU41F21Y7s5k;2Ul1!HE>4U@zl41wj+nX(K-zq3|$%eXDNRm9u_JHI7hLUQF
zZnhV><t_pl@PlF#b-$#)k#`5E>!SMIP4HA6vf=ZxszUj7l8<C5L^L*Nb6$qb)B?*X
z(a{)%qiR!d2sW5hMrFFC0H}XQ>1M<_N`s)AZX$z*T@%BQ?e2l2)?(RY5Oh<JI4(jc
zy#;3j$ut{Uk-n--5mOpXxtG`-rxT>rELE;@g_PIL)_tTH9E+jERIzQFF{%()Mm9qD
zI?Gm2&{3~z`c2`nw91Ms<9GztH7VSp7>lnUD1K0jhisT6OhL7*pt~8WVNf}nM73+8
zvJK=>CB~#axgJZQn*xKhJ1jS0KSBzxBG_1~Nw>cpQvut(<2ELb5UPO$75?#{E{$x+
z@gN4*Eki55$A<A&h)N4+;NMOYydZRNwolPQPAStt%1It!V>BHD_UHCEaPat|QKoE#
z!@5E=+29I-aTy1u%+AsuT|a;GGF$|qo0+Ya!`Yr4^?$h~UA8Mu^5xKHIZv#W<7m!?
z<8K)Wjk_XX1HDdK-6p$fRTQvH95Gmi+*$B$wihkvwrR^yG29)6twH({7ZBSm2Tp{F
z4nELocmXqN638ggkZ+(GMGVCyEhbNQ4mbnkp6#vS9MTO~ig<_fgdxQyG67B}9=RDO
z-?`pBo(*1@pD)kPU75RZ^TQi#Pm(YDA{umpGIwApb{ob!am}A^Rcdpw{Q5{B!FH4E
z3oao`q~IYBWhwkR&N^VkUO_DbTTYR;q=MEm+l35Tjt@3MU6mI|k4TzqD3tA_5B|Gp
zS#Jf{*9g$BxVYAI%{wiq6_^@3s&=~ENxClJvexi<%N1wowivi*&2(1pKgOW`jK|38
zBY4JCn;kH8<9{=$%>U3%n#~&7)Sy00SPW`0gZH43GsdtJk2$92^ju~f*pC?#`lO!G
zC$J}r?=a|P4UF~S)PdAk3UGREAWN+=q7Py`sAmjNAS$}4o-FVZ)e*zgjX#>8jTUbP
z7)OB7#OjznnHm9>5wr%r6Z(j8BsFXp`Zz}Pj_4Wlm_C~tS9@}Q%%)65i+1&%3Ggti
r)7nAsdQd+C-g9`Zeye5-%!0d@)ao#;8W#8)2S0R@tbPE`VLbl_#VpU)

literal 0
HcmV?d00001

diff --git a/tests/duplicate-behavior.test.js b/tests/duplicate-behavior.test.js
index 031f663..9f0662b 100644
--- a/tests/duplicate-behavior.test.js
+++ b/tests/duplicate-behavior.test.js
@@ -5,8 +5,9 @@
  * update or merge behaviors to duplicates across navigation boundaries.
  */
 
+import 'fake-indexeddb/auto';
+
 let Dexie;
-require('fake-indexeddb/auto');
 
 // Mock browser extension APIs
 global.browser = {
diff --git a/tests/jest.config.js b/tests/jest.config.cjs
similarity index 64%
rename from tests/jest.config.js
rename to tests/jest.config.cjs
index 7dd5b02..ea72b10 100644
--- a/tests/jest.config.js
+++ b/tests/jest.config.cjs
@@ -3,6 +3,7 @@ module.exports = {
   testMatch: ['**/*.test.js'],
   transform: {},
   moduleFileExtensions: ['js', 'json'],
-  collectCoverageFrom: ['duplicate-behavior.test.js'],
+  collectCoverageFrom: ['*.test.js'],
+  setupFiles: ['<rootDir>/setup-globals.cjs'],
   verbose: true
 };
diff --git a/tests/map_item.test.js b/tests/map_item.test.js
new file mode 100644
index 0000000..9dee6e8
--- /dev/null
+++ b/tests/map_item.test.js
@@ -0,0 +1,130 @@
+/**
+ * Auto-discovery test driver for module `map_item` functions.
+ *
+ * Convention:
+ *   tests/fixtures/<module_name>/*.ndjson
+ *
+ * <module_name> matches a file in modules/ (e.g. "tiktok" maps to modules/tiktok.js).
+ * Each .ndjson line is one Zeeschuimer-stored item exported from the popup.
+ *
+ * Each item is wrapped via wrap_for_map_item to mirror how 4CAT's importer
+ * presents items to a map_item function, then run through the module's
+ * map_item. Tests assert: function returns a non-null object, and any fields
+ * listed in REQUIRED_NON_EMPTY for that module are present and non-empty.
+ */
+
+import { readdirSync, readFileSync, statSync, existsSync } from 'node:fs';
+import { spawnSync } from 'node:child_process';
+import { join, dirname } from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+/**
+ * Local mirror of wrap_for_map_item from js/lib.js.
+ *
+ * lib.js is loaded by the browser as a plain script (it defines globals
+ * like traverse_data, MappedItem, wrap_for_map_item) and so cannot be
+ * imported from Node. The wrap is three trivial lines with no dependencies
+ * — duplicating it here is cheaper than restructuring lib.js into a module.
+ * If lib.js's wrap_for_map_item ever gains real logic, this needs to track.
+ */
+function wrap_for_map_item(stored_item) {
+    const { data, ...meta } = stored_item;
+    return { ...data, __import_meta: meta };
+}
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+const FIXTURE_ROOT = join(__dirname, 'fixtures');
+const MODULES_ROOT = join(__dirname, '..', 'modules');
+
+/**
+ * Pre-validate module syntax before dynamic import.
+ *
+ * `await import()` on a module with a syntax error throws inside V8's module
+ * linker in a way Jest's experimental-vm-modules can't always recover from
+ * (worker retry loop or Node process exit). Running `node --check` first
+ * gives us a clean error string we can fail the test with.
+ */
+function check_module_syntax(module_name) {
+    const module_path = join(MODULES_ROOT, `${module_name}.js`);
+    const result = spawnSync(process.execPath, ['--check', module_path], {
+        encoding: 'utf8',
+    });
+    if (result.status === 0) return null;
+    return (result.stderr || result.stdout || `exit code ${result.status}`).trim();
+}
+
+const REQUIRED_NON_EMPTY = {
+    tiktok: ['id', 'author', 'unix_timestamp'],
+};
+
+function list_module_dirs() {
+    if (!existsSync(FIXTURE_ROOT)) return [];
+    return readdirSync(FIXTURE_ROOT).filter(name => {
+        try { return statSync(join(FIXTURE_ROOT, name)).isDirectory(); }
+        catch { return false; }
+    });
+}
+
+const module_dirs = list_module_dirs();
+let total_fixtures = 0;
+
+for (const module_name of module_dirs) {
+    const fixture_dir = join(FIXTURE_ROOT, module_name);
+    const fixture_files = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson'));
+
+    if (fixture_files.length === 0) continue;
+    total_fixtures += fixture_files.length;
+
+    describe(`map_item: ${module_name}`, () => {
+        let map_item;
+        let import_error;
+
+        beforeAll(async () => {
+            const syntax_error = check_module_syntax(module_name);
+            if (syntax_error) {
+                import_error = new Error(`syntax error:\n${syntax_error}`);
+                return;
+            }
+            try {
+                const mod = await import(`../modules/${module_name}.js`);
+                map_item = mod.map_item;
+                if (typeof map_item !== 'function') {
+                    import_error = new Error(`modules/${module_name}.js does not export a map_item function`);
+                }
+            } catch (e) {
+                import_error = e;
+            }
+        });
+
+        for (const fixture_file of fixture_files) {
+            const lines = readFileSync(join(fixture_dir, fixture_file), 'utf8')
+                .split('\n')
+                .filter(line => line.trim().length > 0);
+
+            describe(fixture_file, () => {
+                lines.forEach((line, i) => {
+                    test(`item ${i} maps without throwing`, () => {
+                        if (import_error) {
+                            throw new Error(`failed to import modules/${module_name}.js: ${import_error.message}`);
+                        }
+                        const stored_item = JSON.parse(line);
+                        const mapped = map_item(wrap_for_map_item(stored_item));
+                        expect(mapped).not.toBeNull();
+                        expect(typeof mapped).toBe('object');
+                        for (const field of REQUIRED_NON_EMPTY[module_name] ?? []) {
+                            expect(mapped[field]).toBeDefined();
+                            expect(mapped[field]).not.toBe('');
+                            expect(mapped[field]).not.toBeNull();
+                        }
+                    });
+                });
+            });
+        }
+    });
+}
+
+if (total_fixtures === 0) {
+    describe('map_item', () => {
+        test.skip('no fixtures found under tests/fixtures/<module_name>/*.ndjson', () => {});
+    });
+}
diff --git a/tests/package.json b/tests/package.json
index dc3654c..6dd35fb 100644
--- a/tests/package.json
+++ b/tests/package.json
@@ -2,9 +2,10 @@
   "name": "zeeschuimer-db-tests",
   "version": "1.0.0",
   "description": "Unit tests for Zeeschuimer duplicate handling logic",
+  "type": "module",
   "scripts": {
-    "test": "jest",
-    "test:watch": "jest --watch"
+    "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js",
+    "test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --watch"
   },
   "devDependencies": {
     "dexie": "^3.2.4",
diff --git a/tests/setup-globals.cjs b/tests/setup-globals.cjs
new file mode 100644
index 0000000..a19fb09
--- /dev/null
+++ b/tests/setup-globals.cjs
@@ -0,0 +1,41 @@
+/**
+ * Make js/lib.js's helpers available as globals inside the Jest test
+ * environment, mirroring how the browser sees them after the manifest
+ * loads lib.js as a plain script.
+ *
+ * map_item bodies reference these as free identifiers (MappedItem,
+ * MissingMappedField, strip_tags, normalize_url_encoding, ...). Without this
+ * shim they'd hit ReferenceError as soon as a test invokes map_item.
+ *
+ * Approach: read lib.js, wrap it in a new Function() body that returns the
+ * named helpers, call the function, and assign the returned object onto
+ * globalThis. (Earlier attempt with vm.runInThisContext failed because in
+ * the jsdom env the vm context's global differs from jsdom's window.)
+ *
+ * If a new helper is added to lib.js, append its name to EXPOSED_NAMES.
+ */
+
+const fs = require('node:fs');
+const path = require('node:path');
+
+const EXPOSED_NAMES = [
+    'traverse_data',
+    'MappedItem',
+    'MissingMappedField',
+    'wrap_for_map_item',
+    'strip_tags',
+    'normalize_url_encoding',
+    'formatUtcTimestamp',
+];
+
+const lib_source = fs.readFileSync(
+    path.join(__dirname, '..', 'js', 'lib.js'),
+    'utf8',
+);
+
+const factory = new Function(`
+${lib_source}
+return { ${EXPOSED_NAMES.join(', ')} };
+`);
+
+Object.assign(globalThis, factory());

From 46b96c77ffd45f465f90880915e1f6d2836bd87e Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 6 May 2026 16:25:56 +0200
Subject: [PATCH 08/33] add fixtures folder and README.md to explain what I did

---
 tests/fixtures/.gitignore |  5 +++++
 tests/fixtures/README.md  | 29 +++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)
 create mode 100644 tests/fixtures/.gitignore
 create mode 100644 tests/fixtures/README.md

diff --git a/tests/fixtures/.gitignore b/tests/fixtures/.gitignore
new file mode 100644
index 0000000..8e89a83
--- /dev/null
+++ b/tests/fixtures/.gitignore
@@ -0,0 +1,5 @@
+# Ignore everything in this directory
+*
+# Except these files
+!.gitignore
+!README.md
\ No newline at end of file
diff --git a/tests/fixtures/README.md b/tests/fixtures/README.md
new file mode 100644
index 0000000..d24fe06
--- /dev/null
+++ b/tests/fixtures/README.md
@@ -0,0 +1,29 @@
+# Test fixtures for `map_item`
+
+Real captured items used to exercise each module's auto-generated `map_item`
+function.
+
+## Layout
+
+```
+tests/fixtures/
+  <module_name>/
+    <whatever>.ndjson
+    <whatever-else>.ndjson
+```
+
+`<module_name>` matches the filename in `modules/` without `.js` —
+e.g. `tiktok/` → `modules/tiktok.js`, `pinterest/` → `modules/pinterest.js`.
+You can drop multiple `.ndjson` files in a module folder; each gets its own
+`describe` block and each line becomes its own `test`.
+
+Filenames are free-form — the auto-export filename from the popup
+(`zeeschuimer-export-<platform>-<timestamp>.ndjson`) is fine.
+
+## Privacy / committing
+
+These files contain real captured platform data — usernames, post content,
+URLs, sometimes images and other PII — so they are gitignored by default.
+
+To commit synthetic test exports or anonymized real exports, add a `!`
+exception for them to the `.gitignore` in this directory.
\ No newline at end of file

From 487b5b618e4a989cbfca7dbfe2b30b1e78dc62ad Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Thu, 7 May 2026 15:53:22 +0200
Subject: [PATCH 09/33] add MapItemException

---
 js/lib.js | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/js/lib.js b/js/lib.js
index 3b144d2..e38430e 100644
--- a/js/lib.js
+++ b/js/lib.js
@@ -59,6 +59,19 @@ class MissingMappedField {
     }
 }
 
+/**
+ * Raised by `map_item` to signal a known mapping failure.
+ *
+ * Mirrors 4CAT's MapItemException: callers should catch it, skip the item,
+ * and warn the user that the platform's format may have shifted.
+ */
+class MapItemException extends Error {
+    constructor(message) {
+        super(message);
+        this.name = "MapItemException";
+    }
+}
+
 /**
  * Wrap a Zeeschuimer stored item to match the shape a 4CAT map_item expects.
  *

From b6f487dbfa017a79207726f04f059078aaf4c4b5 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Thu, 7 May 2026 15:56:14 +0200
Subject: [PATCH 10/33] make a warning pop up

---
 popup/interface.html | 42 ++++++++++++++++++++++++++++++
 popup/interface.js   | 62 +++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 97 insertions(+), 7 deletions(-)

diff --git a/popup/interface.html b/popup/interface.html
index e9d9b3f..0570e40 100644
--- a/popup/interface.html
+++ b/popup/interface.html
@@ -303,6 +303,42 @@
             text-align: center;
         }
 
+        #csv-warning {
+            position: fixed;
+            inset: 0;
+            background: rgba(60, 60, 59, 0.55);
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            z-index: 1000;
+        }
+
+        #csv-warning[hidden] {
+            display: none;
+        }
+
+        #csv-warning .csv-warning-content {
+            background: var(--accent);
+            color: var(--neutral-contrast);
+            border: 2px solid var(--accent-alt);
+            border-radius: 6px;
+            padding: 1.25em 1.25em 1em 1.25em;
+            max-width: 24em;
+            text-align: center;
+            box-shadow: 0 0 20px var(--neutral-contrast);
+        }
+
+        #csv-warning .csv-warning-content p {
+            margin: 0 0 1em 0;
+            line-height: 1.4;
+        }
+
+        #csv-warning .dismiss-csv-warning {
+            display: block;
+            margin: 0 auto;
+            padding: 0.3em 1.25em;
+        }
+
         .tooltippable:not(a):not(button) {
             display: inline-block;
             background: var(--neutral-contrast);
@@ -409,6 +445,12 @@
     <link rel="shortcut icon" href="/images/zeeschuimer-96.png">
 </head>
 <body>
+<div id="csv-warning" hidden>
+    <div class="csv-warning-content">
+        <p></p>
+        <button class="dismiss-csv-warning">Dismiss</button>
+    </div>
+</div>
 <article>
     <header>
         <h1>Zeeschuimer</h1>
diff --git a/popup/interface.js b/popup/interface.js
index 3b8aaa9..c56375a 100644
--- a/popup/interface.js
+++ b/popup/interface.js
@@ -351,16 +351,29 @@ async function button_handler(event) {
     } else if (event.target.matches('.reset-all')) {
         await background.db.items.clear();
 
+    } else if (event.target.matches('.dismiss-csv-warning')) {
+        const warning = document.getElementById('csv-warning');
+        if(warning) warning.hidden = true;
+
     } else if (event.target.matches('.download-format')) {
         const format = event.target.getAttribute('data-format');
-        const blobber = format === 'csv' ? get_csv_blob : get_ndjson_blob;
         const extension = format;
 
         let platform = event.target.getAttribute('data-platform');
         let date = new Date();
         event.target.classList.add('loading');
 
-        let blob = await blobber(platform);
+        let blob;
+        if(format === 'csv') {
+            const result = await get_csv_blob(platform);
+            blob = result.blob;
+            if(result.skipped > 0) {
+                console.warn(`Zeeschuimer: skipped ${result.skipped} ${platform} item(s) during CSV export. First reason: ${result.firstReason}`);
+                show_csv_warning(platform, result.skipped);
+            }
+        } else {
+            blob = await get_ndjson_blob(platform);
+        }
         let filename = 'zeeschuimer-export-' + platform + '-' + date.toISOString().split(".")[0].replace(/:/g, "") + '.' + extension;
         const downloadUrl = window.URL.createObjectURL(blob);
         const downloadId = await browser.downloads.download({
@@ -637,27 +650,62 @@ function csv_escape(value) {
     return value;
 }
 
+/**
+ * Surface a CSV-export skip warning in the popup.
+ *
+ * Shown when the platform's `map_item` raised MapItemException for one or
+ * more items — typically the platform's response shape has shifted and the
+ * mapper no longer recognises every field. The user is steered to the
+ * .ndjson export, which is unaffected because it skips the mapper entirely.
+ */
+function show_csv_warning(platform, skipped) {
+    const warning = document.getElementById('csv-warning');
+    if(!warning) return;
+    const message = warning.querySelector('p');
+    message.innerText = `Skipped ${skipped} ${platform} item${skipped === 1 ? '' : 's'} in the CSV export — the platform's data format may have changed. Use the .ndjson export to get the full dataset until Zeeschuimer is updated.`;
+    warning.hidden = false;
+}
+
 /**
  * Get a CSV dump of items
  *
  * Returns a Blob with all items in it as CSV rows, mapped via the module's
  * registered mapper function. A header row is included.
  *
+ * Items whose mapper raises MapItemException are skipped and counted; any
+ * other error propagates. Skip count and the first skip reason are returned
+ * alongside the blob so the caller can warn the user. Just like 4CAT!
+ *
  * @param platform
- * @returns {Promise<Blob>}
+ * @returns {Promise<{blob: Blob, skipped: number, firstReason: string|null}>}
  */
 async function get_csv_blob(platform) {
     let csv = [];
+    let skipped = 0;
+    let firstReason = null;
     const module = background.zeeschuimer.modules[platform];
     await iterate_items(platform, function(item) {
-        item = module.mapper(item);
+        let mapped;
+        try {
+            mapped = module.mapper(item);
+        } catch(e) {
+            // More JS fun: Check tag rather than `instanceof`.
+            // Actual Exception lives in some other realm (where modules and lib.js live), and cross-realm
+            // `instanceof` is unreliable under Firefox's wrappers.
+            if(e && e.name === 'MapItemException') {
+                skipped++;
+                if(firstReason === null) firstReason = e.message;
+                return;
+            }
+            throw e;
+        }
         if(csv.length === 0) {
-            csv.push(Object.keys(item).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n");
+            csv.push(Object.keys(mapped).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n");
         }
-        csv.push(Object.values(item).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n");
+        csv.push(Object.values(mapped).map(v => csv_escape(v)).join(CSV_SEPARATOR) + "\n");
     })
 
-    return new Blob(csv, {type: 'text/csv'});
+    return {blob: new Blob(csv, {type: 'text/csv'}), skipped, firstReason};
 }
 
 /**

From f28e310c8893bb49ac535d33cc94089e8d0686b2 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Thu, 7 May 2026 16:42:19 +0200
Subject: [PATCH 11/33] add MapItemException

---
 tests/setup-globals.cjs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/setup-globals.cjs b/tests/setup-globals.cjs
index a19fb09..4f54e34 100644
--- a/tests/setup-globals.cjs
+++ b/tests/setup-globals.cjs
@@ -22,6 +22,7 @@ const EXPOSED_NAMES = [
     'traverse_data',
     'MappedItem',
     'MissingMappedField',
+    'MapItemException',
     'wrap_for_map_item',
     'strip_tags',
     'normalize_url_encoding',

From 5baff31ae49167d215a56cf16ead326b22d975f3 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 27 May 2026 15:16:06 +0200
Subject: [PATCH 12/33] add env variables for tests (to connect to 4CAT)

---
 .gitignore              |  2 ++
 tests/.env.example      |  9 +++++++++
 tests/package-lock.json | 14 ++++++++++++++
 tests/package.json      |  4 +++-
 4 files changed, 28 insertions(+), 1 deletion(-)
 create mode 100644 tests/.env.example

diff --git a/.gitignore b/.gitignore
index 6cf9326..fea65f3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,8 @@
 
 # Testing artefacts
 .temp-profile
+tests/.env
+tests/.env.local
 
 # logs
 geckodriver.log
diff --git a/tests/.env.example b/tests/.env.example
new file mode 100644
index 0000000..2e021bb
--- /dev/null
+++ b/tests/.env.example
@@ -0,0 +1,9 @@
+# 4CAT API config for the map_item comparison tests.
+# Copy this file to .env in this directory and fill in real values.
+# .env is gitignored; .env.example is the committed template.
+
+# Base URL of the 4CAT instance to hit. No trailing slash.
+FOURCAT_URL=http://localhost
+
+# API key for that 4CAT instance. Get one from the 4CAT UI; tied to your user.
+FOURCAT_API_KEY=your-api-key-here
diff --git a/tests/package-lock.json b/tests/package-lock.json
index cc8f457..d055883 100644
--- a/tests/package-lock.json
+++ b/tests/package-lock.json
@@ -9,6 +9,7 @@
       "version": "1.0.0",
       "devDependencies": {
         "dexie": "^3.2.4",
+        "dotenv": "^16.4.5",
         "fake-indexeddb": "^5.0.1",
         "jest": "^29.7.0",
         "jest-environment-jsdom": "^29.7.0"
@@ -1758,6 +1759,19 @@
         "node": ">=12"
       }
     },
+    "node_modules/dotenv": {
+      "version": "16.6.1",
+      "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.6.1.tgz",
+      "integrity": "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow==",
+      "dev": true,
+      "license": "BSD-2-Clause",
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://dotenvx.com"
+      }
+    },
     "node_modules/dunder-proto": {
       "version": "1.0.1",
       "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz",
diff --git a/tests/package.json b/tests/package.json
index 6dd35fb..333564a 100644
--- a/tests/package.json
+++ b/tests/package.json
@@ -5,10 +5,12 @@
   "type": "module",
   "scripts": {
     "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js",
-    "test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --watch"
+    "test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --watch",
+    "probe": "node probe-4cat.mjs"
   },
   "devDependencies": {
     "dexie": "^3.2.4",
+    "dotenv": "^16.4.5",
     "fake-indexeddb": "^5.0.1",
     "jest": "^29.7.0",
     "jest-environment-jsdom": "^29.7.0"

From 6a8ce3870f4e0b6c050d68573d8affa4cc46e37b Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 27 May 2026 15:16:34 +0200
Subject: [PATCH 13/33] mirror 4CAT API missing value

---
 js/lib.js | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/js/lib.js b/js/lib.js
index e38430e..c618a6a 100644
--- a/js/lib.js
+++ b/js/lib.js
@@ -57,6 +57,12 @@ class MissingMappedField {
     toString() {
         return `${this.value}`;
     }
+
+    // Mirror 4CAT's API serialization so JSON.stringify produces the same
+    // tagged form on both sides. See docs/4cat-map-item-api.md.
+    toJSON() {
+        return { __missing: true, value: this.value };
+    }
 }
 
 /**

From 0c3140376ebd6e37cb1706fc48a105168d84d089 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 27 May 2026 18:41:52 +0200
Subject: [PATCH 14/33] test the 4cat API endpoint

---
 tests/probe-4cat.mjs | 140 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 140 insertions(+)
 create mode 100644 tests/probe-4cat.mjs

diff --git a/tests/probe-4cat.mjs b/tests/probe-4cat.mjs
new file mode 100644
index 0000000..0bf4e4d
--- /dev/null
+++ b/tests/probe-4cat.mjs
@@ -0,0 +1,140 @@
+/**
+ * Manually exercise 4CAT's /api/map-item/ endpoint against a fixture item.
+ *
+ * Usage:
+ *   node probe-4cat.mjs <module_name> [<fixture_filename>] [--index N]
+ *
+ * <module_name> is the Zeeschuimer module filename without `.js` (e.g.
+ *   "tiktok", "pinterest"). If <fixture_filename> is omitted, the first
+ *   .ndjson in tests/fixtures/<module_name>/ is used. --index selects which
+ *   line of the fixture to send (default 0).
+ *
+ * Requires tests/.env with FOURCAT_URL and FOURCAT_API_KEY.
+ */
+
+import 'dotenv/config';
+import { readFileSync, existsSync, readdirSync } from 'node:fs';
+import { join, dirname } from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+
+const FOURCAT_URL = process.env.FOURCAT_URL?.replace(/\/$/, '');
+const FOURCAT_API_KEY = process.env.FOURCAT_API_KEY;
+
+if (!FOURCAT_URL || !FOURCAT_API_KEY || FOURCAT_API_KEY === 'your-api-key-here') {
+    console.error('error: FOURCAT_URL and FOURCAT_API_KEY must be set in tests/.env');
+    console.error('       (copy tests/.env.example to tests/.env and fill in real values)');
+    process.exit(1);
+}
+
+const ID_MAP_PATH = join(__dirname, 'zeeschuimer-to-4cat.json');
+const ID_MAP = existsSync(ID_MAP_PATH)
+    ? JSON.parse(readFileSync(ID_MAP_PATH, 'utf8'))
+    : {};
+
+function auth_headers() {
+    return { 'Authorization': `${FOURCAT_API_KEY}` };
+}
+
+async function list_datasources() {
+    const res = await fetch(`${FOURCAT_URL}/api/datasources/`, { headers: auth_headers() });
+    if (!res.ok) {
+        throw new Error(`GET /api/datasources/ → ${res.status}: ${await res.text()}`);
+    }
+    const body = await res.json();
+    return body.datasources ?? [];
+}
+
+async function map_item(datasource_id, item) {
+    const res = await fetch(`${FOURCAT_URL}/api/map-item/${datasource_id}/`, {
+        method: 'POST',
+        headers: { ...auth_headers(), 'Content-Type': 'application/json' },
+        body: JSON.stringify({ item }),
+    });
+    const text = await res.text();
+    let body;
+    try { body = JSON.parse(text); } catch { body = { raw: text }; }
+    return { status_code: res.status, body };
+}
+
+function parse_args(argv) {
+    const args = { module: null, fixture: null, index: 0 };
+    const positional = [];
+    for (let i = 2; i < argv.length; i++) {
+        if (argv[i] === '--index') {
+            args.index = parseInt(argv[++i], 10);
+        } else if (argv[i].startsWith('--index=')) {
+            args.index = parseInt(argv[i].split('=')[1], 10);
+        } else {
+            positional.push(argv[i]);
+        }
+    }
+    args.module = positional[0];
+    args.fixture = positional[1];
+    return args;
+}
+
+async function main() {
+    const args = parse_args(process.argv);
+    if (!args.module) {
+        console.error('Usage: node probe-4cat.mjs <module_name> [<fixture_filename>] [--index N]');
+        process.exit(1);
+    }
+
+    const datasource_id = ID_MAP[args.module] ?? args.module;
+    const fixture_dir = join(__dirname, 'fixtures', args.module);
+
+    if (!existsSync(fixture_dir)) {
+        console.error(`error: no fixture dir at ${fixture_dir}`);
+        process.exit(1);
+    }
+
+    const candidates = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson'));
+    if (candidates.length === 0) {
+        console.error(`error: no .ndjson fixtures under ${fixture_dir}`);
+        process.exit(1);
+    }
+    const fixture_name = args.fixture ?? candidates[0];
+    const fixture_path = join(fixture_dir, fixture_name);
+    if (!existsSync(fixture_path)) {
+        console.error(`error: fixture ${fixture_path} not found`);
+        process.exit(1);
+    }
+
+    const lines = readFileSync(fixture_path, 'utf8').split('\n').filter(l => l.trim().length > 0);
+    if (args.index >= lines.length) {
+        console.error(`error: --index ${args.index} but fixture has ${lines.length} items`);
+        process.exit(1);
+    }
+    const item = JSON.parse(lines[args.index]);
+
+    console.log(`Module:        ${args.module}`);
+    console.log(`Datasource id: ${datasource_id}${ID_MAP[args.module] ? ' (mapped via zeeschuimer-to-4cat.json)' : ''}`);
+    console.log(`URL:           ${FOURCAT_URL}/api/map-item/${datasource_id}/`);
+    console.log(`Fixture:       ${fixture_name}, item ${args.index} (item_id=${item.item_id ?? item.id})`);
+    console.log('');
+
+    const { status_code, body } = await map_item(datasource_id, item);
+    console.log(`HTTP ${status_code}`);
+    console.log(JSON.stringify(body, null, 2));
+
+    if (status_code === 404) {
+        console.error('');
+        console.error('Hint: datasource id may be wrong. Available Zeeschuimer-origin datasources:');
+        try {
+            const datasources = await list_datasources();
+            datasources
+                .filter(d => d.is_from_zeeschuimer && d.has_map_item)
+                .forEach(d => console.error(`  - ${d.id}  (${d.name})`));
+        } catch (e) {
+            console.error(`  (couldn't fetch list: ${e.message})`);
+        }
+        process.exit(2);
+    }
+}
+
+main().catch(e => {
+    console.error(`probe failed: ${e.message}`);
+    process.exit(2);
+});

From be2f3087d8dd5af07175101a808903604c84d78b Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 27 May 2026 18:43:04 +0200
Subject: [PATCH 15/33] update docs and packages

---
 docs/test-plan.md       |  6 +++---
 tests/package-lock.json | 13 ++++++++++++-
 tests/setup-globals.cjs | 11 +++++++++++
 3 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/docs/test-plan.md b/docs/test-plan.md
index 249a7e0..a4265eb 100644
--- a/docs/test-plan.md
+++ b/docs/test-plan.md
@@ -63,7 +63,7 @@ Phase 3 — 4CAT integration (optional)
 - Problem: mapping tests live in 4CAT and need NDJSON input.
 - Changes:
   - Add CLI flags: `--4cat-url URL` and `--4cat-key KEY` (API key). Require both for upload.
-  - After writing the per-platform NDJSON, POST it to `{4cat_url.rstrip('/')}/api/import-dataset/` with header `X-Zeeschuimer-Platform: {platform}` and `Authorization: Bearer {key}` (confirm header with your 4CAT instance; alternative is to trigger the extension UI upload button when cookie-based auth is required).
+  - After writing the per-platform NDJSON, POST it to `{4cat_url.rstrip('/')}/api/import-dataset/` with header `X-Zeeschuimer-Platform: {platform}` and `Authorization: {key}` (confirm header with your 4CAT instance; alternative is to trigger the extension UI upload button when cookie-based auth is required).
   - Do not fail the test run on 4CAT errors — print status and continue.
 
 Example upload with `requests`:
@@ -73,7 +73,7 @@ import requests
 with open(ndjson_path, 'rb') as f:
     headers = {
         'X-Zeeschuimer-Platform': platform,
-        'Authorization': f'Bearer {fourcat_key}'
+        'Authorization': f'{fourcat_key}'
     }
     r = requests.post(f"{fourcat_url.rstrip('/')}/api/import-dataset/", headers=headers, data=f)
     # check r.status_code and r.text for details
@@ -149,7 +149,7 @@ Estimated effort: 6–10 hours of focused work to implement and test everything
 
 Open questions / confirmations needed
 
-- Confirm 4CAT API key header format (currently suggested: `Authorization: Bearer {key}`). If your 4CAT requires cookie-based auth, we should emulate the extension upload button via Selenium instead.
+- Confirm 4CAT API key header format (currently suggested: `Authorization: {key}`). If your 4CAT requires cookie-based auth, we should emulate the extension upload button via Selenium instead.
 - Confirm desired default for interactive mode. (Current recommendation: pause once per platform by default; provide `--no-interactive` to run fully headless.)
 
 Next steps
diff --git a/tests/package-lock.json b/tests/package-lock.json
index d055883..7758e9f 100644
--- a/tests/package-lock.json
+++ b/tests/package-lock.json
@@ -12,7 +12,8 @@
         "dotenv": "^16.4.5",
         "fake-indexeddb": "^5.0.1",
         "jest": "^29.7.0",
-        "jest-environment-jsdom": "^29.7.0"
+        "jest-environment-jsdom": "^29.7.0",
+        "undici": "^6.20.0"
       }
     },
     "node_modules/@babel/code-frame": {
@@ -4197,6 +4198,16 @@
         "url": "https://github.com/sponsors/sindresorhus"
       }
     },
+    "node_modules/undici": {
+      "version": "6.26.0",
+      "resolved": "https://registry.npmjs.org/undici/-/undici-6.26.0.tgz",
+      "integrity": "sha512-4yqz8a3n5HmGTlsbADNtr/dJlhkh/55Rq798G6ibiULcXbDtaLpTl1pvdqcbFfeoj3iSi52lePFM7h9H21cw/A==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=18.17"
+      }
+    },
     "node_modules/undici-types": {
       "version": "7.16.0",
       "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz",
diff --git a/tests/setup-globals.cjs b/tests/setup-globals.cjs
index 4f54e34..6793cc0 100644
--- a/tests/setup-globals.cjs
+++ b/tests/setup-globals.cjs
@@ -40,3 +40,14 @@ return { ${EXPOSED_NAMES.join(', ')} };
 `);
 
 Object.assign(globalThis, factory());
+
+// fetch is deliberately NOT polyfilled here. jsdom doesn't expose fetch and
+// Jest's jsdom env shadows Node's global fetch, so tests that need to reach
+// 4CAT (e.g. map_item_compare.test.js) instead declare
+// `@jest-environment node` at the top of the file; the node env has fetch
+// natively.
+// Don't try to polyfill fetch into jsdom from the `undici` npm package
+// (distinct from the undici bundled internally by Node, which isn't
+// require()-able by name): undici's internals use Node-specific globals
+// that jsdom shadows (clearImmediate, markResourceTiming, fast timers),
+// and polyfilling them all is brittle.

From caf1c7f48a19524282c06b688c08001e534791db Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 27 May 2026 18:43:17 +0200
Subject: [PATCH 16/33] some mapping for odd datasource names

---
 tests/zeeschuimer-to-4cat.json | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 tests/zeeschuimer-to-4cat.json

diff --git a/tests/zeeschuimer-to-4cat.json b/tests/zeeschuimer-to-4cat.json
new file mode 100644
index 0000000..f7de942
--- /dev/null
+++ b/tests/zeeschuimer-to-4cat.json
@@ -0,0 +1,7 @@
+{
+  "_comment": "Maps Zeeschuimer module filenames (without .js) to 4CAT datasource ids when they differ. Default behavior is identity — only include entries where the two diverge. Discovered via http://localhost/api/datasources/.",
+  "9gag": "ninegag",
+  "truth": "truthsocial",
+  "rednote": "xiaohongshu",
+  "rednote-comments": "xiaohongshu-comments"
+}

From f10fc492845051c87b96b75561eb91de2af99d18 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 27 May 2026 18:44:05 +0200
Subject: [PATCH 17/33] update existing map_item tests and add helper

---
 tests/_module-info.js  |  45 ++++++++++++++++++
 tests/map_item.test.js | 105 +++++++++++++++++++----------------------
 2 files changed, 93 insertions(+), 57 deletions(-)
 create mode 100644 tests/_module-info.js

diff --git a/tests/_module-info.js b/tests/_module-info.js
new file mode 100644
index 0000000..e261e4e
--- /dev/null
+++ b/tests/_module-info.js
@@ -0,0 +1,45 @@
+/**
+ * Shared helper for the map_item test drivers.
+ *
+ * Pre-validates a module by:
+ *   1. Running `node --check` on its file (syntax check; avoids the
+ *      worker-killing experimental-ESM crash when a syntax error reaches
+ *      the dynamic importer).
+ *   2. Dynamically importing it and checking for a `map_item` export.
+ *
+ * Returns one of four states the test driver can branch on:
+ *   { state: 'ok',           map_item: <fn> }
+ *   { state: 'no_map_item' }
+ *   { state: 'syntax_error', error: <string> }
+ *   { state: 'import_error', error: <Error> }
+ */
+
+import { spawnSync } from 'node:child_process';
+import { join, dirname } from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+const MODULES_ROOT = join(__dirname, '..', 'modules');
+
+function check_module_syntax(module_name) {
+    const result = spawnSync(process.execPath, ['--check', join(MODULES_ROOT, `${module_name}.js`)], { encoding: 'utf8' });
+    if (result.status === 0) return null;
+    // result.error is set when the child failed to spawn or timed out; prefer it over a bare exit code.
+    return (result.stderr || result.stdout || result.error?.message || `exit code ${result.status}`).trim();
+}
+
+export async function inspect_module(module_name) {
+    const syntax_error = check_module_syntax(module_name);
+    if (syntax_error) {
+        return { state: 'syntax_error', error: syntax_error };
+    }
+    try {
+        const mod = await import(`../modules/${module_name}.js`);
+        if (typeof mod.map_item !== 'function') {
+            return { state: 'no_map_item' };
+        }
+        return { state: 'ok', map_item: mod.map_item };
+    } catch (e) {
+        return { state: 'import_error', error: e };
+    }
+}
diff --git a/tests/map_item.test.js b/tests/map_item.test.js
index 9dee6e8..2dc1bb6 100644
--- a/tests/map_item.test.js
+++ b/tests/map_item.test.js
@@ -1,5 +1,5 @@
 /**
- * Auto-discovery test driver for module `map_item` functions.
+ * Smoke test driver for module `map_item` functions.
  *
  * Convention:
  *   tests/fixtures/<module_name>/*.ndjson
@@ -11,52 +11,36 @@
  * presents items to a map_item function, then run through the module's
  * map_item. Tests assert: function returns a non-null object, and any fields
  * listed in REQUIRED_NON_EMPTY for that module are present and non-empty.
+ *
+ * Module-level state is determined upfront by inspect_module():
+ *   - 'ok'            → register per-item tests
+ *   - 'no_map_item'   → register a single skipped test (not applicable)
+ *   - 'syntax_error'  → register a single failing test pointing at the line
+ *   - 'import_error'  → register a single failing test with the message
  */
 
 import { readdirSync, readFileSync, statSync, existsSync } from 'node:fs';
-import { spawnSync } from 'node:child_process';
 import { join, dirname } from 'node:path';
 import { fileURLToPath } from 'node:url';
-
-/**
- * Local mirror of wrap_for_map_item from js/lib.js.
- *
- * lib.js is loaded by the browser as a plain script (it defines globals
- * like traverse_data, MappedItem, wrap_for_map_item) and so cannot be
- * imported from Node. The wrap is three trivial lines with no dependencies
- * — duplicating it here is cheaper than restructuring lib.js into a module.
- * If lib.js's wrap_for_map_item ever gains real logic, this needs to track.
- */
-function wrap_for_map_item(stored_item) {
-    const { data, ...meta } = stored_item;
-    return { ...data, __import_meta: meta };
-}
+import { inspect_module } from './_module-info.js';
 
 const __dirname = dirname(fileURLToPath(import.meta.url));
 const FIXTURE_ROOT = join(__dirname, 'fixtures');
-const MODULES_ROOT = join(__dirname, '..', 'modules');
-
-/**
- * Pre-validate module syntax before dynamic import.
- *
- * `await import()` on a module with a syntax error throws inside V8's module
- * linker in a way Jest's experimental-vm-modules can't always recover from
- * (worker retry loop or Node process exit). Running `node --check` first
- * gives us a clean error string we can fail the test with.
- */
-function check_module_syntax(module_name) {
-    const module_path = join(MODULES_ROOT, `${module_name}.js`);
-    const result = spawnSync(process.execPath, ['--check', module_path], {
-        encoding: 'utf8',
-    });
-    if (result.status === 0) return null;
-    return (result.stderr || result.stdout || `exit code ${result.status}`).trim();
-}
 
 const REQUIRED_NON_EMPTY = {
     tiktok: ['id', 'author', 'unix_timestamp'],
 };
 
+/**
+ * Local mirror of wrap_for_map_item from js/lib.js. lib.js is loaded by
+ * the browser as a plain script and so cannot be imported from Node; this
+ * three-line mirror is cheaper than restructuring lib.js into a module.
+ */
+function wrap_for_map_item(stored_item) {
+    const { data, ...meta } = stored_item;
+    return { ...data, __import_meta: meta };
+}
+
 function list_module_dirs() {
     if (!existsSync(FIXTURE_ROOT)) return [];
     return readdirSync(FIXTURE_ROOT).filter(name => {
@@ -66,36 +50,46 @@ function list_module_dirs() {
 }
 
 const module_dirs = list_module_dirs();
+
+// Pre-pass: resolve each module's state up front (awaited one at a time) so
+// we can branch on it at describe/test registration time. Top-level await
+// is supported in Jest's experimental-vm-modules mode.
+const module_info = {};
+for (const module_name of module_dirs) {
+    module_info[module_name] = await inspect_module(module_name);
+}
+
 let total_fixtures = 0;
 
 for (const module_name of module_dirs) {
     const fixture_dir = join(FIXTURE_ROOT, module_name);
     const fixture_files = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson'));
-
     if (fixture_files.length === 0) continue;
     total_fixtures += fixture_files.length;
 
-    describe(`map_item: ${module_name}`, () => {
-        let map_item;
-        let import_error;
-
-        beforeAll(async () => {
-            const syntax_error = check_module_syntax(module_name);
-            if (syntax_error) {
-                import_error = new Error(`syntax error:\n${syntax_error}`);
-                return;
-            }
-            try {
-                const mod = await import(`../modules/${module_name}.js`);
-                map_item = mod.map_item;
-                if (typeof map_item !== 'function') {
-                    import_error = new Error(`modules/${module_name}.js does not export a map_item function`);
-                }
-            } catch (e) {
-                import_error = e;
-            }
+    const info = module_info[module_name];
+
+    if (info.state === 'no_map_item') {
+        describe(`map_item: ${module_name}`, () => {
+            test.skip(`modules/${module_name}.js does not export a map_item function — nothing to smoke test`, () => {});
+        });
+        continue;
+    }
+
+    if (info.state === 'syntax_error' || info.state === 'import_error') {
+        const msg = info.state === 'syntax_error'
+            ? `syntax error:\n${info.error}`
+            : `import failed: ${info.error.message}`;
+        describe(`map_item: ${module_name}`, () => {
+            test(`module loads`, () => { throw new Error(msg); });
         });
+        continue;
+    }
+
+    // state === 'ok' — register per-item tests
+    const map_item = info.map_item;
 
+    describe(`map_item: ${module_name}`, () => {
         for (const fixture_file of fixture_files) {
             const lines = readFileSync(join(fixture_dir, fixture_file), 'utf8')
                 .split('\n')
@@ -104,9 +98,6 @@ for (const module_name of module_dirs) {
             describe(fixture_file, () => {
                 lines.forEach((line, i) => {
                     test(`item ${i} maps without throwing`, () => {
-                        if (import_error) {
-                            throw new Error(`failed to import modules/${module_name}.js: ${import_error.message}`);
-                        }
                         const stored_item = JSON.parse(line);
                         const mapped = map_item(wrap_for_map_item(stored_item));
                         expect(mapped).not.toBeNull();

From 3633cde656da3f70880ae49a2909deba3a044953 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 27 May 2026 18:44:23 +0200
Subject: [PATCH 18/33] comparison testing for datasources

---
 tests/map_item_compare.test.js | 283 +++++++++++++++++++++++++++++++++
 1 file changed, 283 insertions(+)
 create mode 100644 tests/map_item_compare.test.js

diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js
new file mode 100644
index 0000000..37e3e4c
--- /dev/null
+++ b/tests/map_item_compare.test.js
@@ -0,0 +1,283 @@
+/**
+ * @jest-environment node
+ *
+ * This file runs in Node test environment (not jsdom) because undici's
+ * fetch implementation uses Node-internal APIs (`clearImmediate`,
+ * `markResourceTiming`, fast-now timers, etc.) that jsdom shadows or
+ * doesn't expose. Polyfilling them into jsdom is whack-a-mole; node env
+ * has them all natively.
+ *
+ * Trade-off: no DOMParser in node env. The four modules that use
+ * `strip_tags` (gab, pinterest, rednote, truth) will need a DOMParser
+ * polyfill (e.g. via linkedom) before the comparator can run against
+ * them. Other modules (including instagram) work as-is.
+ */
+/**
+ * Compare JS map_item output against 4CAT's Python map_item via the API.
+ *
+ * For every line in every fixture, runs the JS map_item locally AND sends
+ * the same stored item to 4CAT's /api/map-item/<datasource>/ endpoint, then
+ * diffs the two outputs field-by-field. Each item is its own Jest test —
+ * failures point at exactly which item and which fields diverge.
+ *
+ * Skips itself entirely if FOURCAT_URL / FOURCAT_API_KEY aren't set, so
+ * `npm test` keeps working without 4CAT configuration. Drop real values in
+ * tests/.env to enable.
+ *
+ * Datasource id mapping: tests/zeeschuimer-to-4cat.json (Zeeschuimer
+ * module filename → 4CAT datasource id, for the few names that diverge).
+ *
+ * Module-level state is determined upfront by inspect_module() (no
+ * map_item / syntax errors / import errors are handled before tests are
+ * registered, so they appear once per module, not once per item).
+ */
+
+import 'dotenv/config';
+import { jest } from '@jest/globals';
+import { readdirSync, readFileSync, statSync, existsSync } from 'node:fs';
+import { join, dirname } from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { inspect_module } from './_module-info.js';
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+
+const FOURCAT_URL = process.env.FOURCAT_URL?.replace(/\/$/, '');
+const FOURCAT_API_KEY = process.env.FOURCAT_API_KEY;
+const HAS_4CAT = Boolean(
+    FOURCAT_URL && FOURCAT_API_KEY && FOURCAT_API_KEY !== 'your-api-key-here'
+);
+
+// When true (default), once any item in a module fails, subsequent items
+// in that same module skip the HTTP + map_item work and fail fast with a
+// "halted" message. Saves time when generator output is broken at the top.
+// Set FAIL_FAST=0 in env to run all items regardless.
+// Trim because cmd.exe's `set FAIL_FAST=0 && ...` includes the trailing
+// space in the variable value, which would otherwise defeat `!== '0'`.
+const FAIL_FAST = (process.env.FAIL_FAST ?? '').trim() !== '0';
+const halted_modules = new Set();
+
+const FIXTURE_ROOT = join(__dirname, 'fixtures');
+const ID_MAP_PATH = join(__dirname, 'zeeschuimer-to-4cat.json');
+const ID_MAP = existsSync(ID_MAP_PATH)
+    ? JSON.parse(readFileSync(ID_MAP_PATH, 'utf8'))
+    : {};
+
+function wrap_for_map_item(stored_item) {
+    const { data, ...meta } = stored_item;
+    return { ...data, __import_meta: meta };
+}
+
+async function call_4cat_map_item(datasource_id, item) {
+    const res = await fetch(`${FOURCAT_URL}/api/map-item/${datasource_id}/`, {
+        method: 'POST',
+        headers: {
+            // 4CAT accepts the raw key without a `Bearer ` prefix, per probe
+            'Authorization': FOURCAT_API_KEY,
+            'Content-Type': 'application/json',
+        },
+        body: JSON.stringify({ item }),
+    });
+    const text = await res.text();
+    if (!res.ok) {
+        throw new Error(`HTTP ${res.status} from 4CAT: ${text}`);
+    }
+    return JSON.parse(text);
+}
+
+// Round-trip a value through JSON so MappedItem, MissingMappedField, etc. become
+// plain objects like 4CAT emits; undefined (not serializable) is treated as null.
+function normalize(value) {
+    return JSON.parse(JSON.stringify(value) ?? 'null');
+}
+
+// Recursive structural equality. Doesn't care about object key order, which
+// matters for nested values like {__missing: true, value: ""} where JS and
+// Python might emit keys in different orders. Keys must be *own* properties
+// of both sides, so inherited names (`__proto__`, `toString`) never match.
+function deep_equal(a, b) {
+    if (a === b) return true;
+    if (a === null || b === null || typeof a !== typeof b) return false;
+    if (typeof a !== 'object') return false;
+    if (Array.isArray(a) !== Array.isArray(b)) return false;
+    if (Array.isArray(a)) {
+        if (a.length !== b.length) return false;
+        return a.every((v, i) => deep_equal(v, b[i]));
+    }
+    const a_keys = Object.keys(a);
+    const b_keys = Object.keys(b);
+    if (a_keys.length !== b_keys.length) return false;
+    return a_keys.every(k => Object.hasOwn(b, k) && deep_equal(a[k], b[k]));
+}
+
+// Field-level diff of two mapped items. Membership is tested with own-property
+// checks: `in` would see inherited names and throws on primitive operands.
+function diff_objects(js_obj, py_obj) {
+    const has = (obj, key) => obj != null && Object.hasOwn(obj, key);
+    const diffs = [];
+    for (const key of new Set([...Object.keys(js_obj ?? {}), ...Object.keys(py_obj ?? {})])) {
+        if (!has(js_obj, key)) {
+            diffs.push({ key, kind: 'only_python', python: py_obj[key] });
+        } else if (!has(py_obj, key)) {
+            diffs.push({ key, kind: 'only_js', js: js_obj[key] });
+        } else if (!deep_equal(js_obj[key], py_obj[key])) {
+            diffs.push({ key, kind: 'mismatch', js: js_obj[key], python: py_obj[key] });
+        }
+    }
+    return diffs;
+}
+
+function format_diffs(diffs) {
+    return diffs.map(d => {
+        if (d.kind === 'only_js') {
+            return `  + only in JS:     ${d.key} = ${JSON.stringify(d.js)}`;
+        }
+        if (d.kind === 'only_python') {
+            return `  - only in Python: ${d.key} = ${JSON.stringify(d.python)}`;
+        }
+        return `  ~ ${d.key}\n      JS:     ${JSON.stringify(d.js)}\n      Python: ${JSON.stringify(d.python)}`;
+    }).join('\n');
+}
+
+// Pull out the first few module-frame lines from an error's stack so the
+// failure message points at where in modules/<name>.js the throw happened.
+function format_error_with_location(err) {
+    if (!err) return String(err);
+    const message = err.message || String(err);
+    const stack = err.stack || '';
+    const module_frames = stack.split('\n')
+        .filter(l => l.includes('/modules/') || l.includes('\\modules\\'))
+        .slice(0, 3)
+        .map(l => l.trim());
+    return module_frames.length
+        ? `${message}\n  ${module_frames.join('\n  ')}`
+        : message;
+}
+
+function list_module_dirs() {
+    if (!existsSync(FIXTURE_ROOT)) return [];
+    return readdirSync(FIXTURE_ROOT).filter(name => {
+        try { return statSync(join(FIXTURE_ROOT, name)).isDirectory(); }
+        catch { return false; }
+    });
+}
+
+// Per-test timeout: each test does one HTTP round-trip to 4CAT. Jest's
+// default 5s is tight under load.
+jest.setTimeout(30000);
+
+if (!HAS_4CAT) {
+    describe('map_item compare (JS vs 4CAT Python)', () => {
+        test.skip('FOURCAT_URL / FOURCAT_API_KEY not configured — set them in tests/.env to enable', () => {});
+    });
+} else {
+    const module_dirs = list_module_dirs();
+
+    // Pre-pass: resolve each module's state up front (awaited one at a time)
+    // so we can branch on it at registration time.
+    const module_info = {};
+    for (const module_name of module_dirs) {
+        module_info[module_name] = await inspect_module(module_name);
+    }
+
+    let any_fixtures = false;
+
+    for (const module_name of module_dirs) {
+        const fixture_dir = join(FIXTURE_ROOT, module_name);
+        const fixture_files = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson'));
+        if (fixture_files.length === 0) continue;
+        any_fixtures = true;
+
+        const datasource_id = ID_MAP[module_name] ?? module_name;
+        const info = module_info[module_name];
+
+        if (info.state === 'no_map_item') {
+            // eslint-disable-next-line no-console
+            console.log(`[compare] skipping ${module_name}: modules/${module_name}.js does not export a map_item`);
+            continue;
+        }
+
+        if (info.state === 'syntax_error' || info.state === 'import_error') {
+            const msg = info.state === 'syntax_error'
+                ? `syntax error:\n${info.error}`
+                : `import failed: ${info.error.message}`;
+            describe(`map_item compare: ${module_name}`, () => {
+                test(`module loads`, () => { throw new Error(msg); });
+            });
+            continue;
+        }
+
+        // state === 'ok' — register per-item comparison tests
+        const map_item = info.map_item;
+
+        describe(`map_item compare: ${module_name} (4CAT id: ${datasource_id})`, () => {
+            for (const fixture_file of fixture_files) {
+                const lines = readFileSync(join(fixture_dir, fixture_file), 'utf8')
+                    .split('\n')
+                    .filter(line => line.trim().length > 0);
+
+                describe(fixture_file, () => {
+                    lines.forEach((line, i) => {
+                        test(`item ${i}`, async () => {
+                            if (FAIL_FAST && halted_modules.has(module_name)) {
+                                throw new Error(
+                                    '[halted after prior failure in this module — set FAIL_FAST=0 to run all items]'
+                                );
+                            }
+                            try {
+                                const stored_item = JSON.parse(line);
+
+                                // 4CAT side
+                                const response = await call_4cat_map_item(datasource_id, stored_item);
+
+                                // JS side
+                                let js_result;
+                                let js_error;
+                                try {
+                                    js_result = map_item(wrap_for_map_item(stored_item));
+                                } catch (e) {
+                                    js_error = e;
+                                }
+
+                                if (response.status === 'mapped') {
+                                    if (js_error) {
+                                        throw new Error(
+                                            `4CAT mapped this item but JS threw: ${format_error_with_location(js_error)}`
+                                        );
+                                    }
+                                    const js_obj = normalize(js_result);
+                                    const py_obj = normalize(response.item);
+                                    const diffs = diff_objects(js_obj, py_obj);
+                                    if (diffs.length > 0) {
+                                        throw new Error(
+                                            `${diffs.length} field(s) differ between JS and 4CAT:\n${format_diffs(diffs)}`
+                                        );
+                                    }
+                                } else if (response.status === 'skipped') {
+                                    if (!js_error) {
+                                        throw new Error(
+                                            `4CAT skipped this item ("${response.reason}") but JS produced a result`
+                                        );
+                                    }
+                                    // Both rejected — good. Skip reasons may differ in wording.
+                                } else if (response.status === 'error') {
+                                    throw new Error(`4CAT errored on this item: ${response.message}`);
+                                } else {
+                                    throw new Error(`unexpected 4CAT response status: ${JSON.stringify(response)}`);
+                                }
+                            } catch (e) {
+                                if (FAIL_FAST) halted_modules.add(module_name);
+                                throw e;
+                            }
+                        });
+                    });
+                });
+            }
+        });
+    }
+
+    if (!any_fixtures) {
+        describe('map_item compare (JS vs 4CAT Python)', () => {
+            test.skip('no fixtures under tests/fixtures/<module>/*.ndjson', () => {});
+        });
+    }
+}

From 7d97a0fe342e3b7f932c79fe22e9b8c6b3c25bb3 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 27 May 2026 18:44:35 +0200
Subject: [PATCH 19/33] list common translation errors

---
 tests/translation-errors.md | 430 ++++++++++++++++++++++++++++++++++++
 1 file changed, 430 insertions(+)
 create mode 100644 tests/translation-errors.md

diff --git a/tests/translation-errors.md b/tests/translation-errors.md
new file mode 100644
index 0000000..fcc160d
--- /dev/null
+++ b/tests/translation-errors.md
@@ -0,0 +1,430 @@
+# Auto-generator translation errors
+
+Patterns of incorrect Python → JavaScript translation observed in
+auto-generated `modules/*.js` files. Each entry has a search pattern so
+this doc doubles as a checklist when reviewing a new auto-generator PR.
+
+When an entry is fixed at the generator level (no longer appears in
+fresh output), mark it `[fixed]` and keep the entry around — useful
+history when something regresses.
+
+## How to use
+
+- Found a new pattern? Add an entry below following the template.
+- Reviewing a generator PR? `grep` each `Search pattern` against the
+  changed module files. Anything that hits is worth a manual look.
+- Iterating on the generator prompt? The "Why" lines are the
+  feedback to add — they describe the exact Python-vs-JS semantic
+  difference the LLM keeps missing.
+
+## Template
+
+````
+### <short-name>
+
+**Status:** open | fixed in generator | accepted
+
+**Why it happens:** <one-line description of the Python-vs-JS difference>
+
+**Wrong JS:**
+```js
+<the broken pattern>
+```
+
+**Correct JS:**
+```js
+<what it should look like>
+```
+
+**Example:** `modules/<file>.js:<line>`
+
+**Search pattern:** `<grep-able regex>`
+````
+
+---
+
+## Observed patterns
+
+### `in` operator on strings
+
+**Status:** open
+
+**Why it happens:** In Python, `"x" in some_string` is a substring check.
+In JavaScript, the `in` operator only works on **objects** and checks for
+property/key existence; using it with a string on the right-hand side
+throws `TypeError: cannot use 'in' operator to search for "x" in <string>`.
+
+**Wrong JS:**
+```js
+const is_polaris = '__typename' in item && 'polaris' in item.__typename.toLowerCase();
+```
+
+**Correct JS:**
+```js
+const is_polaris = '__typename' in item && item.__typename.toLowerCase().includes('polaris');
+```
+
+**Example:** `modules/instagram.js:513`
+
+**Search pattern:** `'[^']+' in [a-zA-Z_$][\w$]*\.` — quoted string followed
+by `in` followed by a method call. Quick rough check: `grep -E "' in [a-zA-Z]" modules/`
+
+**Watch out for partial fixes:** seen as `'polaris' in (item.__typename ?? '').toLowerCase()`
+— adding `?? ''` guards against `undefined` but the `in` operator itself
+still throws on the resulting *string*. The fix is `.includes()`, not just
+defaulting the operand.
+
+---
+
+### Python f-string syntax left in single-quoted JS strings
+
+**Status:** open
+
+**Why it happens:** Python `f"... {var} ..."` interpolates. JS uses
+template literals (backticks) with `${var}`. The auto-generator leaves the
+`{var}` notation in a regular single- or double-quoted JS string, which is
+just literal text — no interpolation happens.
+
+**Wrong JS:**
+```js
+throw new MapItemException('Unable to parse item: different user {user.id} and owner {owner.id}');
+```
+
+**Correct JS:**
+```js
+throw new MapItemException(`Unable to parse item: different user ${user.id} and owner ${owner.id}`);
+```
+
+**Example:** `modules/instagram.js:754`
+
+**Search pattern:** `'[^']*\{[a-zA-Z_$][\w$.]*\}[^']*'` or `"[^"]*\{[a-zA-Z_$][\w$.]*\}[^"]*"`
+— a non-template-literal string containing `{identifier}` or `{identifier.path}`.
+Quick check: `grep -nE "['\"][^'\"]*\{[a-zA-Z_][a-zA-Z0-9_.]*\}[^'\"]*['\"]" modules/`
+
+---
+
+### `?? {}` default that defeats subsequent truthy checks
+
+**Status:** open
+
+**Why it happens:** When porting Python's `node.get('user') or {}` (which is
+intended to make subsequent code safe to call), the generator emits
+`node.user ?? {}`. The default is a fair port, **but** Python treats an empty
+dict as falsy while JS treats `{}` as truthy, so a following
+`if (user && owner) { ... }` guard never short-circuits. The check ends up
+reading "if user and owner *objects* exist" when the intent was "if user and
+owner data exist." Real ids/usernames are then compared against `undefined`
+on the missing side, and the mismatch branch throws.
+
+**Wrong JS:**
+```js
+const user  = node.user  ?? {};
+const owner = node.owner ?? {};
+if (user && owner) {
+    if (user.id === owner.id) { /* … */ }
+    else if (user.username !== owner.username) {
+        throw new MapItemException('different user and owner');
+    }
+}
+```
+
+**Correct JS** (depending on intent — pick one):
+```js
+// (a) drop the defaults so truthy guard means "both present"
+const user  = node.user;
+const owner = node.owner;
+if (user && owner) { /* compare */ }
+```
+```js
+// (b) check for actual content, not just object identity
+const user  = node.user  ?? {};
+const owner = node.owner ?? {};
+if (Object.keys(user).length && Object.keys(owner).length) { /* compare */ }
+```
+
+**Example:** `modules/instagram.js:748-756`
+
+**Search pattern:** `\?\?\s*\{\s*\}` — any `?? {}` occurrence is worth a
+review of subsequent guards. Quick check: `grep -nE "\?\?\s*\{\s*\}" modules/`
+
+---
+
+### Bare relative path as a statement (junk auto-imports section)
+
+**Status:** open
+
+**Why it happens:** The generator emits an "auto-generated imports" marker
+block at the top of the module but writes the import target as a bare
+relative path on its own line (`../js/lib.js`) instead of a real `import`
+statement. `../js/lib.js` is not a valid JS expression, so the module fails to parse (SyntaxError).
+
+**Wrong JS:**
+```js
+// === auto-generated imports for map_item — DO NOT EDIT BY HAND ===
+../js/lib.js
+// === end auto-generated imports ===
+```
+
+**Correct JS** (one of):
+```js
+// === auto-generated imports — DO NOT EDIT BY HAND ===
+// Provided as globals by js/lib.js (loaded via manifest.json):
+//   MappedItem, MissingMappedField, MapItemException, traverse_data,
+//   strip_tags, normalize_url_encoding, formatUtcTimestamp
+// === end auto-generated imports ===
+```
+
+Or, if a real import is intended, an ESM import with named bindings:
+```js
+import { MappedItem, MissingMappedField } from '../js/lib.js';
+```
+
+**Example:** seen historically in `modules/tiktok.js:2`
+
+**Search pattern:** `^\.\./` at the start of a line in module files.
+Quick check: `grep -nE "^\.\." modules/*.js`
+
+---
+
+### Key-existence check (`'X' in obj`) used where Python intended value-truthiness (`obj.get('X')`)
+
+**Status:** open
+
+**Why it happens:** Python's `if node.get('usertags'):` is a *truthy check on
+the value* — returns False if the key is missing **or** if the value is
+`None`/empty/falsy. The generator translates this to `if ('usertags' in
+node)`, which in JS is a *key-existence check* — returns True even when
+the value is `null`. Subsequent property accesses on the null value then
+throw `Cannot read properties of null`.
+
+**Wrong JS:**
+```js
+const usertags = 'usertags' in node ? node.usertags.in.map(...).join(',') : '';
+// node.usertags can be null → .in.map blows up
+```
+
+**Correct JS:**
+```js
+const usertags = node.usertags ? node.usertags.in.map(...).join(',') : '';
+```
+
+**Example:** `modules/instagram.js:777`
+
+**Search pattern:** `'[^']+' in [a-zA-Z_$][\w$]*\s*\?` — quoted-string `in`
+identifier followed by `?` (ternary). Quick check:
+`grep -rnE "'[^']+' in [a-zA-Z_]+ \?" modules/`
+
+---
+
+### Datetime serialization format mismatch
+
+**Status:** open
+
+**Why it happens:** Python's `datetime.utcfromtimestamp(t).strftime('%Y-%m-%d %H:%M:%S')`
+produces `"2026-05-13 21:27:31"` — space-separated, no timezone marker. JS's
+`new Date(t * 1000).toISOString()` produces `"2026-05-13T21:27:31.000Z"` — T
+separator, milliseconds, Z. The generator emits the JS `.toISOString()` form
+instead of using the existing `formatUtcTimestamp` helper from lib.js that
+mimics Python's output exactly.
+
+**Wrong JS:**
+```js
+collected_at = new Date(node.taken_at * 1000).toISOString();
+```
+
+**Correct JS:**
+```js
+collected_at = formatUtcTimestamp(node.taken_at);
+// formatUtcTimestamp is defined in js/lib.js as:
+//   new Date(unixSeconds * 1000).toISOString().replace('T', ' ').slice(0, 19)
+```
+
+**Example:** `modules/instagram.js:782`
+
+**Search pattern:** `new Date\([^)]+\)\.toISOString\(\)` — any use of
+`.toISOString()`. The helper should be used instead. Quick check:
+`grep -rnE "\.toISOString\(\)" modules/`
+
+---
+
+### `re.findall` capture groups vs JS `.match` with /g flag
+
+**Status:** open
+
+**Why it happens:** Python's `re.findall(r'#(\w+)', s)` returns the **capture
+group contents**: `['lotr', 'woodart']`. JS's `s.match(/#(\w+)/g)` (with the
+global flag) returns the **full matches**: `['#lotr', '#woodart']` — capture
+groups are ignored. The generator translates the regex literally without
+adjusting for this semantic difference, so the resulting strings keep
+prefixes/wrappers that Python would have stripped.
+
+**Wrong JS:**
+```js
+hashtags: caption.match(/#([^\s!@#$%^&*()_+{}:"|<>?;',./`~]+)/g)?.join(',')
+// produces "#lotr,#woodart"
+```
+
+**Correct JS:**
+```js
+// Option A: strip the literal prefix from each full match
+hashtags: caption.match(/#([^\s...]+)/g)?.map(h => h.slice(1)).join(',') ?? ''
+// Option B: use matchAll to get capture groups (join always returns a string)
+hashtags: [...caption.matchAll(/#([^\s...]+)/g)].map(m => m[1]).join(',')
+```
+
+**Example:** `modules/instagram.js:812` (also 766, 870 — three copies)
+
+**Search pattern:** `\.match\(/[^/]*\([^/]*\)[^/]*/g\)` — any `.match()` with
+a global-flag regex containing a capture group. Quick check:
+`grep -rnE "\.match\(/.*\(.*\).*\/g\)" modules/`
+
+---
+
+### `undefined` field values get dropped from JSON, but Python's `None` becomes `null`
+
+**Status:** open
+
+**Why it happens:** When `JSON.stringify` encounters an object property whose
+value is `undefined`, it **omits the key entirely** from the output. Python's
+`json.dumps` serializes `None` as `null`, keeping the key. The generator
+writes assignments like `location.city = node.location.city` where the
+right-hand side can be `undefined`, producing missing keys in JS output
+that show up as `only in Python: <field> = null` diffs against 4CAT.
+
+**Wrong JS:**
+```js
+location.city = node.location.city;  // undefined if .city missing
+// JSON.stringify({location_city: undefined}) → "{}" (key omitted)
+
+body: caption,  // null if no caption — Python returns "" here, not null
+```
+
+**Correct JS:**
+```js
+// Whichever fallback Python uses for that specific field:
+location.city = node.location.city ?? null;   // some fields → null
+body: caption ?? '',                          // other fields → ""
+```
+
+**Example:** `modules/instagram.js:745, 853` (`null` flavor),
+559, 648, 798 (`""` flavor for `body`)
+
+**Note:** Python's choice of `None` vs `""` is per-field — there's no
+universal rule. When the comparator reports `~ X  JS: null  Python: ""` use
+`?? ''`. When it reports `- only in Python: X = null` use `?? null`. The
+distinction matters because the JS output should match Python's choice
+exactly for that field.
+
+**Search pattern:** harder to grep automatically — any property assignment
+where the RHS could be `undefined`/`null` and the resulting field is
+expected to appear in the mapped output. Look at "only in Python: X = null"
+and "~ X  JS: null  Python: \"\"" diffs in the comparator output to find
+specific cases.
+
+---
+
+### Object-reference inequality used as type check
+
+**Status:** open
+
+**Why it happens:** The generator emits `caption !== new MissingMappedField('')`
+to mean "caption is not a missing-marker", but `new MissingMappedField('')`
+creates a fresh object every time, and `!==` on objects compares references.
+The expression is **always true**, so the conditional never takes the
+"missing" branch. Likely originates from Python idioms like `caption != ""`
+or `caption is not None`, mistranslated through the MissingMappedField
+abstraction.
+
+**Wrong JS:**
+```js
+hashtags: caption !== new MissingMappedField('') ? caption.match(...) : '',
+// !== between two different object references is always true
+```
+
+**Correct JS:**
+```js
+// If the intent was "if caption has content", just truthy-check it:
+hashtags: caption ? caption.match(...) : '',
+// If the intent was "if caption is not a MissingMappedField instance":
+hashtags: !(caption instanceof MissingMappedField) ? caption.match(...) : '',
+```
+
+**Example:** `modules/instagram.js:812` (and two other copies)
+
+**Search pattern:** `!== new [A-Z]` or `=== new [A-Z]` — any equality
+comparison with a freshly-constructed object. Quick check:
+`grep -rnE "(!==|===) new [A-Z]" modules/`
+
+---
+
+### `.method()` chain on potentially-null result
+
+**Status:** open
+
+**Why it happens:** In Python, calling a method on `None` raises
+`AttributeError`, which 4CAT sometimes catches. In JS, calling a method on
+`null`/`undefined` throws `TypeError: Cannot read properties of null
+(reading '<method>')`. The generator emits the same dotted chain without
+optional-chaining (`?.`) protection.
+
+**Wrong JS:**
+```js
+hashtags: caption !== new MissingMappedField('')
+    ? caption.match(/#([^\s!@#$%^&*()_+{}:"|<>?;',./`~]+)/g)?.join(',')
+    : '',
+```
+(here `caption` is allowed to be `null`, so `caption.match(...)` blows up
+on null caption)
+
+**Correct JS:**
+```js
+hashtags: caption
+    ? caption.match(/#([^\s!@#$%^&*()_+{}:"|<>?;',./`~]+)/g)?.join(',') ?? ''
+    : '',
+```
+
+**Example:** `modules/instagram.js:809`
+
+**Search pattern:** harder to grep — needs reading. Worth manual review of
+any field that uses `caption.match`, `something.split`, `something.join`
+without `?.` on a value that could be null/undefined.
+
+---
+
+## Generator prompt feedback (running list)
+
+Concrete things to fold into the generator's prompt over time:
+
+1. **Python `x in y` where `y` is a string** → use `y.includes(x)` in JS,
+   never `x in y`.
+2. **Python f-strings** → use JS template literals (backticks) with
+   `${...}` syntax. Never leave `{...}` in single- or double-quoted strings.
+3. **`?? {}` after a `.get(...) or {}` translation** → only use this if the
+   following code does property-access. If the following code does a
+   truthy guard (`if (x && y)`), drop the default and use just `node.user`.
+4. **Method chains on possibly-null values** → use `?.` (optional
+   chaining) instead of `.` whenever the receiver could be null/undefined.
+5. **The auto-imports header block** → emit either real `import { ... }`
+   statements with valid relative paths, or a comment-only header.
+   Never emit bare paths as JS statements.
+6. **Python `node.get('X')` truthy check** → in JS, use `node.X` (or
+   `node.X != null`), not `'X' in node`. The `in` operator checks key
+   existence, which is True even for explicit-null values.
+7. **Datetime serialization** → use the `formatUtcTimestamp` helper from
+   lib.js (which mimics Python's `strftime('%Y-%m-%d %H:%M:%S')` format),
+   not `new Date(...).toISOString()` (which has a different output shape:
+   T separator, milliseconds, Z suffix).
+8. **`re.findall` with capture groups** → in JS, `.match(/.../g)` returns
+   full matches, NOT capture groups. To get capture-group behavior, use
+   either `[...s.matchAll(/.../g)].map(m => m[1])` or post-process the
+   full matches with `.map(...)` to strip the literal parts.
+9. **Object-reference equality (`!== new X(...)`)** → never. Creating an
+   object with `new` produces a fresh reference; `===`/`!==` compares
+   identity. Use `instanceof X` for type checks, or compare values
+   directly. The MissingMappedField "is this missing?" check should be
+   `caption instanceof MissingMappedField` or just truthy-check the value.
+10. **Python `None` → JSON `null` vs JS `undefined` → omitted** — when a
+    field's value could be missing and Python emits `None` (JSON `null`),
+    JS must explicitly assign `null` (not leave the value as `undefined`).
+    `JSON.stringify` drops `undefined` keys silently. Use `value ?? null`
+    when the field is expected to appear in the mapped output.

From 6ad4c134cf35d0993b2968f3b2dc832e2766794d Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 27 May 2026 18:45:52 +0200
Subject: [PATCH 20/33] package.json fix

---
 tests/package.json | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/package.json b/tests/package.json
index 333564a..390fdd3 100644
--- a/tests/package.json
+++ b/tests/package.json
@@ -13,6 +13,7 @@
     "dotenv": "^16.4.5",
     "fake-indexeddb": "^5.0.1",
     "jest": "^29.7.0",
-    "jest-environment-jsdom": "^29.7.0"
+    "jest-environment-jsdom": "^29.7.0",
+    "undici": "^6.20.0"
   }
 }

From 11ffffbdea4b853fd88e219d719d6d7947fab6df Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 3 Jun 2026 15:04:51 +0200
Subject: [PATCH 21/33] rm other test doc

---
 docs/test-plan.md | 162 ----------------------------------------------
 1 file changed, 162 deletions(-)
 delete mode 100644 docs/test-plan.md

diff --git a/docs/test-plan.md b/docs/test-plan.md
deleted file mode 100644
index a4265eb..0000000
--- a/docs/test-plan.md
+++ /dev/null
@@ -1,162 +0,0 @@
-# Selenium Test Harness — Improvement Plan
-
-Date: 2026-04-30
-
-Overview
-
-This document captures an actionable plan to improve the Selenium-based integration tests in `tests/test.py` for the Zeeschuimer Firefox extension. The goals are to:
-
-- Make profile handling reliable and reusable (so logged-in sessions persist across runs).
-- Preserve and export captured data per platform for offline analysis and for passing to 4CAT.
-- Add optional automated upload to a 4CAT instance for mapping/validation tests.
-- Reduce fragility caused by popups and interactive dialogs (pausing/dismissal patterns).
-- Improve robustness, error handling, and machine-readable results.
-
-Scope
-
-All changes are confined to the test harness and test metadata (`tests/test.py` and `tests/tests.json`) and to this planning document. No changes are required in the extension source for the planned items (the test harness will interact with the extension's UI pages and background DB).
-
-Phases & Changes
-
-Phase 1 — Profile management
-
-- Problem: copying an entire profile can race with a running Firefox and the current ignore rule hides potentially useful session data.
-- Changes:
-  - Detect if the selected profile directory appears locked (presence of `lock` or `.parentlock`) and warn if Firefox is running.
-  - Replace the naive ignore lambda used in `shutil.copytree` with a function that only excludes `storage`, `extensions`, and `signedInUser.json` at the profile root.
-  - Add CLI flags: `--profile-name NAME` (choose profile by display name from `profiles.ini`), `--save-profile PATH` (save the temp profile for reuse), and `--no-cleanup` (do not remove `.temp-profile` after run).
-
-Implementation note (copytree ignore example):
-
-```python
-def _profile_ignore(root, names):
-    # Only ignore these entries in the root profile dir
-    if os.path.abspath(root) == os.path.abspath(profile_dir):
-        return {"storage", "extensions", "signedInUser.json"}
-    return set()
-
-shutil.copytree(profile_dir, profile_file, ignore=_profile_ignore)
-```
-
-Phase 2 — Data preservation & export
-
-- Problem: `reset-all` wipes the DB before each URL; no artifacts are kept for post-mortem or mapping tests.
-- Decision: export a single combined NDJSON file per platform containing items collected while testing that platform.
-- Changes:
-  - Add CLI `--export-dir PATH` (default `./zeeschuimer-exports/{timestamp}/`).
-  - Before clicking `reset-all` for each URL, read the current DB contents from the extension background page (Dexie) via `execute_async_script` and append those items to a per-platform in-memory list in Python. After all URLs for a platform are done, write `{export-dir}/{platform}.ndjson`.
-  - Optionally add `--no-reset` to skip the `reset-all` call entirely (default behavior remains to reset before each URL).
-
-Execute_async_script pattern (example):
-
-```python
-script = '''
-const cb = arguments[0];
-background.db.items.toArray().then(items => cb(JSON.stringify(items))).catch(e => cb(JSON.stringify({error: String(e)})));
-'''
-items_json = driver.execute_async_script(script)
-items = json.loads(items_json)
-```
-
-Phase 3 — 4CAT integration (optional)
-
-- Problem: mapping tests live in 4CAT and need NDJSON input.
-- Changes:
-  - Add CLI flags: `--4cat-url URL` and `--4cat-key KEY` (API key). Require both for upload.
-  - After writing the per-platform NDJSON, POST it to `{4cat_url.rstrip('/')}/api/import-dataset/` with header `X-Zeeschuimer-Platform: {platform}` and `Authorization: {key}` (confirm header with your 4CAT instance; alternative is to trigger the extension UI upload button when cookie-based auth is required).
-  - Do not fail the test run on 4CAT errors — print status and continue.
-
-Example upload with `requests`:
-
-```python
-import requests
-with open(ndjson_path, 'rb') as f:
-    headers = {
-        'X-Zeeschuimer-Platform': platform,
-        'Authorization': f'{fourcat_key}'
-    }
-    r = requests.post(f"{fourcat_url.rstrip('/')}/api/import-dataset/", headers=headers, data=f)
-    # check r.status_code and r.text for details
-```
-
-Phase 4 — Interactive controls & popup dismissals
-
-- Problem: cookie banners, paywall prompts, and other popups frequently interfere with automated navigation and can cause false failures.
-- Decision: pause by default **once per platform** (not before every URL) so the tester can clear residual prompts; provide opt-out and finer-grained options.
-- Changes:
-  - CLI flags: `--no-interactive` (disable all pauses), `--pause-before-url` (pause before each URL), `--pause-on-fail` (pause on failure), `--extra-wait N` (add N seconds to every wait), `--screenshot-dir PATH` (capture screenshots on fail/warning).
-  - Add a `dismiss-selectors` optional field in `tests.json` per URL: a list of CSS selectors to click to dismiss known popups. Example:
-
-```json
-"dismiss-selectors": ["button.cookie-accept", ".modal .close"]
-```
-
-  - Add per-URL `timeout` (page load timeout override).
-
-Phase 5 — Runner robustness & reporting
-
-- Problem: unhandled exceptions abort the run; final runtime is calculated incorrectly; no machine-readable results.
-- Changes:
-  - Wrap each URL test body in try/except, increment `failed` on exceptions, and continue.
-  - Move the global `start_time = time.time()` to before the outer platform loop so the final elapsed time is for the full run.
-  - Add CLI flags: `--results-file PATH` (write JSON summary), `--resume-from PLATFORM` (skip earlier platforms), and `--screenshot-dir PATH` (as noted).
-  - Fix small test metadata issues (e.g., `more-after-scrolll` typo in `tests.json`).
-
-tests.json schema additions
-
-- Per-URL optional fields:
-  - `dismiss-selectors`: array of CSS selectors to click after page load
-  - `timeout`: numeric page load timeout seconds for this URL
-  - `extra-wait`: per-URL additional wait seconds
-
-CLI flags (summary)
-
-- `--profiledir PATH` — explicit profile path (existing)
-- `--profile-name NAME` — choose Firefox profile by display name
-- `--save-profile PATH` — persist the copied profile for reuse
-- `--no-cleanup` — keep `.temp-profile`
-- `--export-dir PATH` — where to write NDJSON exports
-- `--no-reset` — do not click `reset-all` between URLs
-- `--4cat-url URL` — base URL for 4CAT server
-- `--4cat-key KEY` — API key for 4CAT uploads
-- `--4cat-per-url` — upload per URL instead of per platform (optional)
-- `--no-interactive` — disable pausing (default is to pause per-platform)
-- `--pause-before-url` — pause before each URL
-- `--pause-on-fail` — pause when a test fails
-- `--extra-wait N` — add N seconds to every URL wait
-- `--screenshot-dir PATH` — save screenshots on fail/warning
-- `--results-file PATH` — write machine-readable results JSON
-- `--resume-from PLATFORM` — resume a run from a platform
-
-Verification checklist
-
-1. `python tests/test.py --sources instagram.com --export-dir ./exports` -> `exports/instagram.com.ndjson` exists and contains NDJSON with captured items.
-2. `python tests/test.py --save-profile .saved-profile --login` -> create a saved profile that can be reused with `--profiledir .saved-profile`.
-3. Run with default interactive behavior and confirm one pause per platform.
-4. `python tests/test.py --results-file results.json` -> JSON summary produced with per-URL status and counts.
-5. Test 4CAT upload using a local mock server and `--4cat-url http://localhost:8000 --4cat-key KEY`.
-
-Implementation steps (recommended order)
-
-1. Docs and small fixes (this document + tests.json typo fix).
-2. Profile management changes (`--profile-name`, improved copy ignore, `--save-profile`, lock detection).
-3. Export behavior: `--export-dir` + `execute_async_script` collection and NDJSON write.
-4. Runner robustness: try/except around URL loop, `--results-file`, fix `start_time` placement.
-5. Interactive and dismissal features (`dismiss-selectors`, pause flags, screenshots).
-6. 4CAT upload integration (optional, requires confirmation of auth header).
-
-Estimated effort: 6–10 hours of focused work to implement and test everything end-to-end; can be split into 3-4 incremental PRs.
-
-Open questions / confirmations needed
-
-- Confirm 4CAT API key header format (currently suggested: `Authorization: {key}`). If your 4CAT requires cookie-based auth, we should emulate the extension upload button via Selenium instead.
-- Confirm desired default for interactive mode. (Current recommendation: pause once per platform by default; provide `--no-interactive` to run fully headless.)
-
-Next steps
-
-- I have created a matching TODO list in the session tracker and written this document to `docs/test-plan.md`.
-- If you want, I can start implementing Phase 1 (profile management) in `tests/test.py` now and submit incremental changes.
-
----
-
-Requested file: `docs/test-plan.md`

From 6cc61003e95be381b191baae1486f989a2ed3e71 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 3 Jun 2026 15:05:55 +0200
Subject: [PATCH 22/33] map_item.test.js verify modules import and map_item
 exists only

---
 tests/map_item.test.js | 134 ++++++++++-------------------------------
 1 file changed, 31 insertions(+), 103 deletions(-)

diff --git a/tests/map_item.test.js b/tests/map_item.test.js
index 2dc1bb6..774c083 100644
--- a/tests/map_item.test.js
+++ b/tests/map_item.test.js
@@ -1,121 +1,49 @@
 /**
- * Smoke test driver for module `map_item` functions.
+ * Load-only smoke for every module under `modules/*.js`.
  *
- * Convention:
- *   tests/fixtures/<module_name>/*.ndjson
+ * For each module file, runs `inspect_module()` and asserts the module:
+ *   - parses (no SyntaxError)
+ *   - imports without throwing
+ *   - either exports a `map_item` function, or doesn't (both are fine here)
  *
- * <module_name> matches a file in modules/ (e.g. "tiktok" maps to modules/tiktok.js).
- * Each .ndjson line is one Zeeschuimer-stored item exported from the popup.
+ * No data is fed through `map_item`. That work belongs in the comparator
+ * (Tier 2 — `npm run test:compare`), where real items pulled from a 4CAT
+ * dataset provide both the input and the expected output.
  *
- * Each item is wrapped via wrap_for_map_item to mirror how 4CAT's importer
- * presents items to a map_item function, then run through the module's
- * map_item. Tests assert: function returns a non-null object, and any fields
- * listed in REQUIRED_NON_EMPTY for that module are present and non-empty.
- *
- * Module-level state is determined upfront by inspect_module():
- *   - 'ok'            → register per-item tests
- *   - 'no_map_item'   → register a single skipped test (not applicable)
- *   - 'syntax_error'  → register a single failing test pointing at the line
- *   - 'import_error'  → register a single failing test with the message
+ * Catches: parse errors, import-time throws, broken top-level statements.
+ * Does NOT catch: anything that requires running `map_item` on real input.
  */
 
-import { readdirSync, readFileSync, statSync, existsSync } from 'node:fs';
+import { readdirSync } from 'node:fs';
 import { join, dirname } from 'node:path';
 import { fileURLToPath } from 'node:url';
 import { inspect_module } from './_module-info.js';
 
 const __dirname = dirname(fileURLToPath(import.meta.url));
-const FIXTURE_ROOT = join(__dirname, 'fixtures');
-
-const REQUIRED_NON_EMPTY = {
-    tiktok: ['id', 'author', 'unix_timestamp'],
-};
-
-/**
- * Local mirror of wrap_for_map_item from js/lib.js. lib.js is loaded by
- * the browser as a plain script and so cannot be imported from Node; this
- * three-line mirror is cheaper than restructuring lib.js into a module.
- */
-function wrap_for_map_item(stored_item) {
-    const { data, ...meta } = stored_item;
-    return { ...data, __import_meta: meta };
-}
-
-function list_module_dirs() {
-    if (!existsSync(FIXTURE_ROOT)) return [];
-    return readdirSync(FIXTURE_ROOT).filter(name => {
-        try { return statSync(join(FIXTURE_ROOT, name)).isDirectory(); }
-        catch { return false; }
-    });
-}
+const MODULES_ROOT = join(__dirname, '..', 'modules');
 
-const module_dirs = list_module_dirs();
+const module_files = readdirSync(MODULES_ROOT)
+    .filter(f => f.endsWith('.js') && !f.startsWith('_'));
 
-// Pre-pass: synchronously determine each module's state so we can branch
-// on it at describe/test registration time. Top-level await is supported
-// in Jest's experimental-vm-modules mode.
 const module_info = {};
-for (const module_name of module_dirs) {
-    module_info[module_name] = await inspect_module(module_name);
+for (const file of module_files) {
+    const name = file.replace(/\.js$/, '');
+    module_info[name] = await inspect_module(name);
 }
 
-let total_fixtures = 0;
-
-for (const module_name of module_dirs) {
-    const fixture_dir = join(FIXTURE_ROOT, module_name);
-    const fixture_files = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson'));
-    if (fixture_files.length === 0) continue;
-    total_fixtures += fixture_files.length;
-
-    const info = module_info[module_name];
-
-    if (info.state === 'no_map_item') {
-        describe(`map_item: ${module_name}`, () => {
-            test.skip(`modules/${module_name}.js does not export a map_item function — nothing to smoke test`, () => {});
+describe('module load smoke', () => {
+    for (const file of module_files) {
+        const name = file.replace(/\.js$/, '');
+        test(`modules/${file} loads cleanly`, () => {
+            const info = module_info[name];
+            if (info.state === 'syntax_error') {
+                throw new Error(`syntax error in modules/${file}:\n${info.error}`);
+            }
+            if (info.state === 'import_error') {
+                throw new Error(`import failed for modules/${file}: ${info.error.message}`);
+            }
+            // 'ok' or 'no_map_item' — both acceptable at this tier.
+            expect(['ok', 'no_map_item']).toContain(info.state);
         });
-        continue;
     }
-
-    if (info.state === 'syntax_error' || info.state === 'import_error') {
-        const msg = info.state === 'syntax_error'
-            ? `syntax error:\n${info.error}`
-            : `import failed: ${info.error.message}`;
-        describe(`map_item: ${module_name}`, () => {
-            test(`module loads`, () => { throw new Error(msg); });
-        });
-        continue;
-    }
-
-    // state === 'ok' — register per-item tests
-    const map_item = info.map_item;
-
-    describe(`map_item: ${module_name}`, () => {
-        for (const fixture_file of fixture_files) {
-            const lines = readFileSync(join(fixture_dir, fixture_file), 'utf8')
-                .split('\n')
-                .filter(line => line.trim().length > 0);
-
-            describe(fixture_file, () => {
-                lines.forEach((line, i) => {
-                    test(`item ${i} maps without throwing`, () => {
-                        const stored_item = JSON.parse(line);
-                        const mapped = map_item(wrap_for_map_item(stored_item));
-                        expect(mapped).not.toBeNull();
-                        expect(typeof mapped).toBe('object');
-                        for (const field of REQUIRED_NON_EMPTY[module_name] ?? []) {
-                            expect(mapped[field]).toBeDefined();
-                            expect(mapped[field]).not.toBe('');
-                            expect(mapped[field]).not.toBeNull();
-                        }
-                    });
-                });
-            });
-        }
-    });
-}
-
-if (total_fixtures === 0) {
-    describe('map_item', () => {
-        test.skip('no fixtures found under tests/fixtures/<module_name>/*.ndjson', () => {});
-    });
-}
+});

From a090675c162573b3ae8633584010464d3d264bdc Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 3 Jun 2026 15:06:24 +0200
Subject: [PATCH 23/33] remove old fixtures and 4cat probe

---
 tests/__pycache__/test.cpython-39.pyc | Bin 7345 -> 0 bytes
 tests/fixtures/.gitignore             |   5 -
 tests/fixtures/README.md              |  29 ------
 tests/probe-4cat.mjs                  | 140 --------------------------
 4 files changed, 174 deletions(-)
 delete mode 100644 tests/__pycache__/test.cpython-39.pyc
 delete mode 100644 tests/fixtures/.gitignore
 delete mode 100644 tests/fixtures/README.md
 delete mode 100644 tests/probe-4cat.mjs

diff --git a/tests/__pycache__/test.cpython-39.pyc b/tests/__pycache__/test.cpython-39.pyc
deleted file mode 100644
index 745e2b4aaad921a459372bb50b39980c50a68136..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 7345
zcmai3-E$k)b>CeqKnNl!lA<VDww1LkN`yoJvK4<(AC@dpksV1^L@BZ_NyAIrOJKpp
zE_8Px5-Z6`L$@!DCQSO$#!&|BL#FLZUOLlsI+MSk(}z6b$>TnGrZeqKUpk()>F?ZK
zfYgUo?Cjn9aqc<ip6^TALqj<Yzkm9NhfcbvY5z)toxfoW&fyIP0mn6_ah)5k&UE!P
zn4!KYmQr7nnd+NnY4y#p48AFMppj)+o%WjUu11dKR6p(RZVa+P)z7#?jbXM&^#|OM
z#wZ(A{j59I*vs~+{w{aCG0w(SKj-diOt1;n-|g;iOtMMUA9P=69AF1jf5?5Y@gjRs
z*M#1AX-i`-i<j#M`7qzZNBHQ&6y|9>#`iup9_Rb`1mDjmADRio3;Y0oamS9A_{-Zn
zI^O|iiy!3Q;nuH=&MRVKL*uV(8SId->aT9<vL+5tZ$s}K-qJSp;8QWh)!G*tfAvvn
zLkE>3{IL20jve)WFJ2Ur;?Rb^p|fd`7t<S%_NoC~zrfz9$9BZ4hgVIk{x?>SsMTX)
znjhuUpKCnNk3CEQ%k)KU>2-EojCW>4x>FFxxAb~(OJ^nQIW9__6MSY%2S-@>LVIKq
zmQDT6@3C5>zBh(m30U<PG^`tniW;$_SUI7_Q~WjcJ-}bbX<y?fxAe~Ikj_bQQk~t9
zSA@~|uE0FbY}DTnN5x70UB%}cN*`}RQg3bOKRdwQ7OBoV;%#vF)@_X&re<mz(3m&&
znzm^KKRu!8>iHNgf0MuUxrUvyol_h;E^3h5X>j#U=M10ioE2|%-rdr+^exB}bWd*@
z@=L6qQfsH_)bV+|EM`w9Cpn|O*$s`I6MH-7#e3r91~5Q^$2%AJSw6940N3%(#Vv3k
zj*D~U<KSR!GbK-m9Q8JI&)^2TB=*+7CoYL|{N0BJyDTo(-v_P({2U|%*+8>-e*Pih
zy4?9dyvg6g{P&4lHFJeuNP1WK#iVzQ&#B%NzodA)+<Aq6PtBN}5BcQ;>be-L|A4>0
zWz=u5dH#Wzhi<?BD8+7q&MV?(3{}6?`60iO^l!6|aQ>?br&a$LC;2bI__ahHBlP7u
zTfn=BcS)EX6T1C`e+ZqHx3v1Fd~Q-3*O(1?FK=lVwaxU>_@cq~u}TjXr@z(V;v=%6
zgwGH7^?vUMVqZt_*ZGYtW534d_s1|(!d5(--ZbT}#XfQRuC97hVjs?QGuGXvd8a5U
z30-sPFF|*ORYkQw^ApU}m?Pp{x5Nte-V!zFKCQ4ni?bgp&hla(v~+u1;~(wUSiOIS
z!|@p+OG&h*V4(rwKp!7NM+VK@*1R`OP3)<!Zl>Gy*k-{)9n^1&QO@`RU##kU>0t^p
zpTvImvE~^kG=-;3cD9+8{}P=B71F62&%>Gx))0-3hnSL<e-E6a3hSqj48E`t`zPCg
zZGwXd#eokF{<-Hhib7L3+<pNzDDJd$VEK=D9Jf`&%fvk<g(Quk#hF)An;Ch8FUNEI
z?JrDTiC3uq$n<isZv_u83=)lwY#Goj=lHU)FTtJ_v_z(JmkT~ZKEPK*2w7AgrEo?>
zN45rQTUV#9ku+Fav^NLjcppom2esGfX@S>OkMKQrZd+UY@ML0>PuS@5xm!<k`Ccc&
zxz}JRtNivBY14gVvU}h)D}O5PLEACJ`VNT3qtsg(VAf)Jt)_9VU#k(xSy!WVv4+@A
z^Y>sc9`}3p!khawwR?dd#&^N{w-`$T|E1{d{8H__vu$ZyiU9ltk5UPSC52(h`@24E
z6I>rkFi?+ZKaC-L3PWwjS+cU;$EO2dUEwoS?@4^ECpfVbMaJ*;PrH^tEGk|Xz288l
z>rY58EC28q#XtPNGW)a-Z*K0Axo<gV`>AtIKgC}x)2I)b@Cm=XNAU&m>34X4qvF)%
z&V7|NE_WWN?{-e?#kwi+nOG2uVyOxbP(F4q$+=VfejH83CzBd~0Pp)OsUfELkD#R=
ziv{I}#<ry(mj20pcsVjk4D%DQl)&_(4`4PPgZW%6CNO;+0On6)7;emK`9F(}<Zg-}
z46GGjT1<$bQfoO4A#=HTA*>s7=PzEJD>iuUYpN%&<@N5sht1IOy@0y&Lf&;M0$t<1
zHrkw6uZZpCitqYTa4hUv6U$sWcZI|t%k5BvAcB4-Xa}Ka#Cw~z$6JAdF|XTU4YOv|
z^_Qc;LZK=B6~`6ak<r8@M~W4H-Ad-by32*^YI&n;)`Z*a4sfwzx7;uqDio@svg*gQ
zqrHUH3Vmzm<Upa|`c=n^Ub<~Np#>f++jadli&!CGwR(x=f)J-Ehq5K|DM=-@+=Hf@
zu`48y2zWx0WOS%U<%9VTuUW*kRq?&hc09+cS}{yceTww}8+R27f!~r95kzm!#~mU|
z5Hr}>XxN27G;Il~@l)1v+e%J1qd<t+Xy&uh`DXK?9onTEzTE(w(p5ilT-Poo!YW;l
z1sIf$MJGNhU2&GBE!(uqbt*O;z?yG0o4yR6u_JG=f!hw^)>^<a7j3x)X^nqYDmD@t
zE3ra=dQU<o(TNr9wmd(?sO9nE*E-}Cy?xE!p*fnjuG>Li`JS-EI<ez~ZrfUOpsJmE
zFGiDBp;M;`c_x}Cs|AAD3L)<OsGq`n0dFvfCe%9meWZmt8N7ZipXwTZ&^3e54!hYP
zEH@Hc)cN0PasnH^IeB4r@fNfnEOOfwi#StN+6@xxVzlidi%N&8EjHWT>}l6-EOYzp
zJ3#TDcr`hR<|eE%wJO@(0V!brfrwHCYdRX8`nWLFC`@r{>hkQ=)!C`}$jDpKI~Quc
zACR3i{2;WXs0fdw0bArsLer2n797tCYr=9J^zCK;uE<Bjx75BwG*<L3PVkrj$*6V+
zxI&Z!?A-?@X~DBZ1s2{WFp$ed$1eEIHnGw}i3J-r*MOD4cHkGW;S`I-u3n6WinP>o
z)QUk8NPg6(SE(e6P8C(?U+YkZ0d=qe@+`Y5D1L;(3n;uqqk&WP1b^QnMJPv6x`8zH
z^{(DlgSgobgi6%csq+FF3M|+ScYVw<hy@d`VuN&3)$VIcBM<cZdPm#Rq<LTexpBvw
zkF5Ym(m^PlCP*5^6J34f37Xy9X_7xYw0ZUxuqo2=FnZtUGttX8lduW{g_%mL>DnQ_
zjbPTg_aHK6tY|m}fmmhBo3)~`8!b=82)J|{CyLSN6}#nCYNUq5lcT9XR9eyr+eL~k
z=oN|0N?7)#gE&@h`tYGPxE_GyC_Z-w{5U8_MzNUJ*{;40BLp>JyI~Ci{+goM*Xz`z
z(L@L5ya32Sct={WoB(>aQRhWdu)9_`%f)i53bNH#OE?9&QYoxCK_Q6=)nEP<{r>UY
z=whi9Ai^)43*q15%%L4v4=t5S#bT+%!-WtgFr7bh=zT8~(z9Lb)~)w1_Sbndf|yh2
zo5^gc6zwZD{bs9Ka%jhjT@l4v*l<}Yw3nlyo<x<aMwhN5!w1$JL0a|-wQav;)$F@)
zCBIsQkC9^`_QJRN`cg1!wiouI@;CaKsKWh9s{!XM-f0Qho)@n2p7X9deY6+?mn#YZ
zVpydHl+9LnG+%`B=j=*t8h;J2&YZRGS<cF|C{}7nP)L#(V*t&qPUo!$r>qD0Q_<1q
z5P3BU3c|COkrNA1YH_g`z40wHUU@bVkN^!U8nk9-tzgw@Hlf7Gq)~M8*@UN$F1Nza
z_lgjs2n#S%kW6?!^^I;tDeUf=YsjN5vY~~q{^qw!EYlX0Pno%lJWXC@hoKCjBUTeW
z51IO_5WZK1GQye-@xz2*SR`z+v}mMaHz7t!g0YNS*MOSXT=?pjoqzw`&z7tUF$5B0
z2rUSXh-5uS)q(Hcg{n!(3GQwdfl+OZvdbb|6T-8iw5kQZ(F;*#;gb`Ecb1|fQiLt(
zS>Gm|S1G3ig=oK3^jeLA13wEOA;<HCya}U?MjMDp1!Z}Kpdx*w?sG_{NUX(p_+ESZ
z#LTziC&LE581%Ams<kkGstRe~_vO1^Q*4WJmLjXvIGPxFyn&_?wfyQYmRPzfaZIS+
z4k#_C(<jS+{@W$s=}lVE)C?*nWN|9AXDrwNvXeyWP#zQ`C)+t3gVEY^pb9$)#&Dhw
zC#gtbQ$BheDNCV(jApi(d5ZE=D542V{eZR4n-a%x3|d7&LRg|b+hcXKXf%$xRCJ-r
zAVZ3F_hK}xG)fg;Ux~67;;--VATkP8D+8G?d<A)0(EtoOK%P$KAVe$rz*@5<#q_|M
zR-DJmQHI`=?~H})(DHgiB>ez&&Xa=3CureOP>8i+J8lb**0f?hQTk(e^Z6fj2g+p_
zYq`v_wydIt3<S#rwN~i3%%tM8Ym&RkF`S}nNWl(~lX*_7QS4Xj$TF(jE+$DtQW||O
zV3KxZiU41F21Y7s5k;2Ul1!HE>4U@zl41wj+nX(K-zq3|$%eXDNRm9u_JHI7hLUQF
zZnhV><t_pl@PlF#b-$#)k#`5E>!SMIP4HA6vf=ZxszUj7l8<C5L^L*Nb6$qb)B?*X
z(a{)%qiR!d2sW5hMrFFC0H}XQ>1M<_N`s)AZX$z*T@%BQ?e2l2)?(RY5Oh<JI4(jc
zy#;3j$ut{Uk-n--5mOpXxtG`-rxT>rELE;@g_PIL)_tTH9E+jERIzQFF{%()Mm9qD
zI?Gm2&{3~z`c2`nw91Ms<9GztH7VSp7>lnUD1K0jhisT6OhL7*pt~8WVNf}nM73+8
zvJK=>CB~#axgJZQn*xKhJ1jS0KSBzxBG_1~Nw>cpQvut(<2ELb5UPO$75?#{E{$x+
z@gN4*Eki55$A<A&h)N4+;NMOYydZRNwolPQPAStt%1It!V>BHD_UHCEaPat|QKoE#
z!@5E=+29I-aTy1u%+AsuT|a;GGF$|qo0+Ya!`Yr4^?$h~UA8Mu^5xKHIZv#W<7m!?
z<8K)Wjk_XX1HDdK-6p$fRTQvH95Gmi+*$B$wihkvwrR^yG29)6twH({7ZBSm2Tp{F
z4nELocmXqN638ggkZ+(GMGVCyEhbNQ4mbnkp6#vS9MTO~ig<_fgdxQyG67B}9=RDO
z-?`pBo(*1@pD)kPU75RZ^TQi#Pm(YDA{umpGIwApb{ob!am}A^Rcdpw{Q5{B!FH4E
z3oao`q~IYBWhwkR&N^VkUO_DbTTYR;q=MEm+l35Tjt@3MU6mI|k4TzqD3tA_5B|Gp
zS#Jf{*9g$BxVYAI%{wiq6_^@3s&=~ENxClJvexi<%N1wowivi*&2(1pKgOW`jK|38
zBY4JCn;kH8<9{=$%>U3%n#~&7)Sy00SPW`0gZH43GsdtJk2$92^ju~f*pC?#`lO!G
zC$J}r?=a|P4UF~S)PdAk3UGREAWN+=q7Py`sAmjNAS$}4o-FVZ)e*zgjX#>8jTUbP
z7)OB7#OjznnHm9>5wr%r6Z(j8BsFXp`Zz}Pj_4Wlm_C~tS9@}Q%%)65i+1&%3Ggti
r)7nAsdQd+C-g9`Zeye5-%!0d@)ao#;8W#8)2S0R@tbPE`VLbl_#VpU)

diff --git a/tests/fixtures/.gitignore b/tests/fixtures/.gitignore
deleted file mode 100644
index 8e89a83..0000000
--- a/tests/fixtures/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-# Ignore everything in this directory
-*
-# Except these files
-!.gitignore
-!README.md
\ No newline at end of file
diff --git a/tests/fixtures/README.md b/tests/fixtures/README.md
deleted file mode 100644
index d24fe06..0000000
--- a/tests/fixtures/README.md
+++ /dev/null
@@ -1,29 +0,0 @@
-# Test fixtures for `map_item`
-
-Real captured items used to exercise each module's auto-generated `map_item`
-function.
-
-## Layout
-
-```
-tests/fixtures/
-  <module_name>/
-    <whatever>.ndjson
-    <whatever-else>.ndjson
-```
-
-`<module_name>` matches the filename in `modules/` without `.js` —
-e.g. `tiktok/` → `modules/tiktok.js`, `pinterest/` → `modules/pinterest.js`.
-You can drop multiple `.ndjson` files in a module folder; each gets its own
-`describe` block and each line becomes its own `test`.
-
-Filenames are free-form — the auto-export filename from the popup
-(`zeeschuimer-export-<platform>-<timestamp>.ndjson`) is fine.
-
-## Privacy / committing
-
-These files contain real captured platform data — usernames, post
-content, URLs, sometimes images and other PII. 
-
-If we want to create test exports or annonomize real exports, add them to 
-.gitignore.
\ No newline at end of file
diff --git a/tests/probe-4cat.mjs b/tests/probe-4cat.mjs
deleted file mode 100644
index 0bf4e4d..0000000
--- a/tests/probe-4cat.mjs
+++ /dev/null
@@ -1,140 +0,0 @@
-/**
- * Manually exercise 4CAT's /api/map-item/ endpoint against a fixture item.
- *
- * Usage:
- *   node probe-4cat.mjs <module_name> [<fixture_filename>] [--index N]
- *
- * <module_name> is the Zeeschuimer module filename without `.js` (e.g.
- *   "tiktok", "pinterest"). If <fixture_filename> is omitted, the first
- *   .ndjson in tests/fixtures/<module_name>/ is used. --index selects which
- *   line of the fixture to send (default 0).
- *
- * Requires tests/.env with FOURCAT_URL and FOURCAT_API_KEY.
- */
-
-import 'dotenv/config';
-import { readFileSync, existsSync, readdirSync } from 'node:fs';
-import { join, dirname } from 'node:path';
-import { fileURLToPath } from 'node:url';
-
-const __dirname = dirname(fileURLToPath(import.meta.url));
-
-const FOURCAT_URL = process.env.FOURCAT_URL?.replace(/\/$/, '');
-const FOURCAT_API_KEY = process.env.FOURCAT_API_KEY;
-
-if (!FOURCAT_URL || !FOURCAT_API_KEY || FOURCAT_API_KEY === 'your-api-key-here') {
-    console.error('error: FOURCAT_URL and FOURCAT_API_KEY must be set in tests/.env');
-    console.error('       (copy tests/.env.example to tests/.env and fill in real values)');
-    process.exit(1);
-}
-
-const ID_MAP_PATH = join(__dirname, 'zeeschuimer-to-4cat.json');
-const ID_MAP = existsSync(ID_MAP_PATH)
-    ? JSON.parse(readFileSync(ID_MAP_PATH, 'utf8'))
-    : {};
-
-function auth_headers() {
-    return { 'Authorization': `${FOURCAT_API_KEY}` };
-}
-
-async function list_datasources() {
-    const res = await fetch(`${FOURCAT_URL}/api/datasources/`, { headers: auth_headers() });
-    if (!res.ok) {
-        throw new Error(`GET /api/datasources/ → ${res.status}: ${await res.text()}`);
-    }
-    const body = await res.json();
-    return body.datasources ?? [];
-}
-
-async function map_item(datasource_id, item) {
-    const res = await fetch(`${FOURCAT_URL}/api/map-item/${datasource_id}/`, {
-        method: 'POST',
-        headers: { ...auth_headers(), 'Content-Type': 'application/json' },
-        body: JSON.stringify({ item }),
-    });
-    const text = await res.text();
-    let body;
-    try { body = JSON.parse(text); } catch { body = { raw: text }; }
-    return { status_code: res.status, body };
-}
-
-function parse_args(argv) {
-    const args = { module: null, fixture: null, index: 0 };
-    const positional = [];
-    for (let i = 2; i < argv.length; i++) {
-        if (argv[i] === '--index') {
-            args.index = parseInt(argv[++i], 10);
-        } else if (argv[i].startsWith('--index=')) {
-            args.index = parseInt(argv[i].split('=')[1], 10);
-        } else {
-            positional.push(argv[i]);
-        }
-    }
-    args.module = positional[0];
-    args.fixture = positional[1];
-    return args;
-}
-
-async function main() {
-    const args = parse_args(process.argv);
-    if (!args.module) {
-        console.error('Usage: node probe-4cat.mjs <module_name> [<fixture_filename>] [--index N]');
-        process.exit(1);
-    }
-
-    const datasource_id = ID_MAP[args.module] ?? args.module;
-    const fixture_dir = join(__dirname, 'fixtures', args.module);
-
-    if (!existsSync(fixture_dir)) {
-        console.error(`error: no fixture dir at ${fixture_dir}`);
-        process.exit(1);
-    }
-
-    const candidates = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson'));
-    if (candidates.length === 0) {
-        console.error(`error: no .ndjson fixtures under ${fixture_dir}`);
-        process.exit(1);
-    }
-    const fixture_name = args.fixture ?? candidates[0];
-    const fixture_path = join(fixture_dir, fixture_name);
-    if (!existsSync(fixture_path)) {
-        console.error(`error: fixture ${fixture_path} not found`);
-        process.exit(1);
-    }
-
-    const lines = readFileSync(fixture_path, 'utf8').split('\n').filter(l => l.trim().length > 0);
-    if (args.index >= lines.length) {
-        console.error(`error: --index ${args.index} but fixture has ${lines.length} items`);
-        process.exit(1);
-    }
-    const item = JSON.parse(lines[args.index]);
-
-    console.log(`Module:        ${args.module}`);
-    console.log(`Datasource id: ${datasource_id}${ID_MAP[args.module] ? ' (mapped via zeeschuimer-to-4cat.json)' : ''}`);
-    console.log(`URL:           ${FOURCAT_URL}/api/map-item/${datasource_id}/`);
-    console.log(`Fixture:       ${fixture_name}, item ${args.index} (item_id=${item.item_id ?? item.id})`);
-    console.log('');
-
-    const { status_code, body } = await map_item(datasource_id, item);
-    console.log(`HTTP ${status_code}`);
-    console.log(JSON.stringify(body, null, 2));
-
-    if (status_code === 404) {
-        console.error('');
-        console.error('Hint: datasource id may be wrong. Available Zeeschuimer-origin datasources:');
-        try {
-            const datasources = await list_datasources();
-            datasources
-                .filter(d => d.is_from_zeeschuimer && d.has_map_item)
-                .forEach(d => console.error(`  - ${d.id}  (${d.name})`));
-        } catch (e) {
-            console.error(`  (couldn't fetch list: ${e.message})`);
-        }
-        process.exit(2);
-    }
-}
-
-main().catch(e => {
-    console.error(`probe failed: ${e.message}`);
-    process.exit(2);
-});

From c62a7e796db9bc3e1f7cb12f78fc50cbfa37e60c Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 3 Jun 2026 15:06:47 +0200
Subject: [PATCH 24/33] update lib.js note on new endpoint

---
 js/lib.js | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/js/lib.js b/js/lib.js
index c618a6a..518a6fa 100644
--- a/js/lib.js
+++ b/js/lib.js
@@ -59,7 +59,11 @@ class MissingMappedField {
     }
 
     // Mirror 4CAT's API serialization so JSON.stringify produces the same
-    // tagged form on both sides. See docs/4cat-map-item-api.md.
+    // tagged form on both sides: 4CAT's /api/dataset/<key>/items/ endpoint,
+    // when called with `missing_fields=keep`, emits missing values as
+    // `{ __missing: true, value: <fallback> }`. Matching that shape here
+    // lets the map_item comparator deep-equal both sides without special
+    // handling.
     toJSON() {
         return { __missing: true, value: this.value };
     }

From 234f1ce4377ceedf64777054b303e01d84293a2c Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 3 Jun 2026 15:07:21 +0200
Subject: [PATCH 25/33] update tests/.env.example (comments and dataset keys)

---
 tests/.env.example | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/tests/.env.example b/tests/.env.example
index 2e021bb..137a52b 100644
--- a/tests/.env.example
+++ b/tests/.env.example
@@ -1,9 +1,23 @@
-# 4CAT API config for the map_item comparison tests.
+# 4CAT API config for the map_item comparator (`npm run test:compare`).
 # Copy this file to .env in this directory and fill in real values.
 # .env is gitignored; .env.example is the committed template.
 
-# Base URL of the 4CAT instance to hit. No trailing slash.
+# Base URL of the 4CAT instance to hit. No trailing slash. Default ports:
+#   :80   for nginx (production)
+#   :4000 for the Flask dev server
 FOURCAT_URL=http://localhost
 
-# API key for that 4CAT instance. Get one from the 4CAT UI; tied to your user.
+# API key for that 4CAT instance. Get one from the 4CAT UI; tied to your
+# user. 4CAT accepts the raw key as the Authorization header value (no
+# `Bearer ` prefix).
 FOURCAT_API_KEY=your-api-key-here
+
+# Comma-separated list of dataset keys (the 32-char ids from 4CAT dataset
+# URLs) to compare. The comparator pulls inputs from /download/<key> and
+# expected outputs from
+# /api/dataset/<key>/items/?annotations=no&missing_fields=keep&stream=true
+# for each. Datasource is read from each dataset's metadata.
+#
+# `npm run test:compare -- <key>` narrows a single run to one key; the key
+# must still be listed here.
+FOURCAT_DATASETS=key1,key2,key3

From e0d0fb834983456aafadf4f1f9708855aa502b1c Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 3 Jun 2026 15:08:31 +0200
Subject: [PATCH 26/33] note on _loader.js for `wrap_for_map_item`

---
 modules/_loader.js | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/modules/_loader.js b/modules/_loader.js
index afae2d7..ceb0080 100644
--- a/modules/_loader.js
+++ b/modules/_loader.js
@@ -1,3 +1,8 @@
+// Load-order dependency: `wrap_for_map_item` (used below) is a free global
+// defined in js/lib.js, which manifest.json loads as a plain background
+// script before this module. There is no import for it here on purpose —
+// MV2 background scripts share one global scope. If lib.js stops being
+// loaded first, the mapper wrapper below will ReferenceError.
 async function load() {
     const imported_modules = [
         await import("./tiktok.js"),

From f2341d6e798a39f777d13e5c60af81d360ae6714 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 3 Jun 2026 15:09:51 +0200
Subject: [PATCH 27/33] fix my test environment; scripts vs libraries

---
 .gitignore                    |  2 +
 tests/_module-info.js         | 36 ++++++++++++------
 tests/jest.compare.config.cjs | 20 ++++++++++
 tests/jest.config.cjs         |  3 ++
 tests/package-lock.json       | 70 +++++++++++++++++++++++++++++------
 tests/package.json            | 12 +++---
 tests/run-compare.mjs         | 43 +++++++++++++++++++++
 tests/setup-globals.cjs       | 52 +++++++++++---------------
 8 files changed, 179 insertions(+), 59 deletions(-)
 create mode 100644 tests/jest.compare.config.cjs
 create mode 100644 tests/run-compare.mjs

diff --git a/.gitignore b/.gitignore
index fea65f3..4d495c9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,8 @@
 .temp-profile
 tests/.env
 tests/.env.local
+__pycache__/
+*.pyc
 
 # logs
 geckodriver.log
diff --git a/tests/_module-info.js b/tests/_module-info.js
index e261e4e..e6866a3 100644
--- a/tests/_module-info.js
+++ b/tests/_module-info.js
@@ -7,6 +7,9 @@
  *      the dynamic importer).
  *   2. Dynamically importing it and checking for a `map_item` export.
  *
+ * Results are cached per module name so repeated lookups from the same
+ * loaded copy of this helper don't pay the spawnSync cost twice.
+ *
  * Returns one of four states the test driver can branch on:
  *   { state: 'ok',           map_item: <fn> }
  *   { state: 'no_map_item' }
@@ -21,25 +24,36 @@ import { fileURLToPath } from 'node:url';
 const __dirname = dirname(fileURLToPath(import.meta.url));
 const MODULES_ROOT = join(__dirname, '..', 'modules');
 
+const syntax_cache = new Map();
+const inspect_cache = new Map();
+
 function check_module_syntax(module_name) {
+    if (syntax_cache.has(module_name)) return syntax_cache.get(module_name);
     const module_path = join(MODULES_ROOT, `${module_name}.js`);
     const result = spawnSync(process.execPath, ['--check', module_path], { encoding: 'utf8' });
-    if (result.status === 0) return null;
-    return (result.stderr || result.stdout || `exit code ${result.status}`).trim();
+    const out = result.status === 0
+        ? null
+        : (result.stderr || result.stdout || `exit code ${result.status}`).trim();
+    syntax_cache.set(module_name, out);
+    return out;
 }
 
 export async function inspect_module(module_name) {
+    if (inspect_cache.has(module_name)) return inspect_cache.get(module_name);
     const syntax_error = check_module_syntax(module_name);
+    let result;
     if (syntax_error) {
-        return { state: 'syntax_error', error: syntax_error };
-    }
-    try {
-        const mod = await import(`../modules/${module_name}.js`);
-        if (typeof mod.map_item !== 'function') {
-            return { state: 'no_map_item' };
+        result = { state: 'syntax_error', error: syntax_error };
+    } else {
+        try {
+            const mod = await import(`../modules/${module_name}.js`);
+            result = typeof mod.map_item === 'function'
+                ? { state: 'ok', map_item: mod.map_item }
+                : { state: 'no_map_item' };
+        } catch (e) {
+            result = { state: 'import_error', error: e };
         }
-        return { state: 'ok', map_item: mod.map_item };
-    } catch (e) {
-        return { state: 'import_error', error: e };
     }
+    inspect_cache.set(module_name, result);
+    return result;
 }
diff --git a/tests/jest.compare.config.cjs b/tests/jest.compare.config.cjs
new file mode 100644
index 0000000..070e2ff
--- /dev/null
+++ b/tests/jest.compare.config.cjs
@@ -0,0 +1,20 @@
+// Tier 2 — live comparator against a 4CAT instance.
+//
+// Runs only `map_item_compare.test.js`. Requires FOURCAT_URL,
+// FOURCAT_API_KEY, and FOURCAT_DATASETS to be set in tests/.env. Hard-errors
+// rather than silently skipping if env is missing.
+//
+// Env is jsdom so that the four modules using `strip_tags` (gab, pinterest,
+// rednote, truth) have a native DOMParser. The comparator uses cross-fetch
+// to provide a jsdom-friendly fetch (jsdom doesn't ship fetch and undici
+// crashes inside jsdom).
+module.exports = {
+  testEnvironment: 'jsdom',
+  testMatch: ['**/map_item_compare.test.js'],
+  testPathIgnorePatterns: ['/node_modules/'],
+  transform: {},
+  moduleFileExtensions: ['js', 'json'],
+  setupFiles: ['<rootDir>/setup-globals.cjs'],
+  testTimeout: 30000,
+  verbose: true
+};
diff --git a/tests/jest.config.cjs b/tests/jest.config.cjs
index ea72b10..239abbc 100644
--- a/tests/jest.config.cjs
+++ b/tests/jest.config.cjs
@@ -1,6 +1,9 @@
+// Default Jest config — Tier 1 only (duplicate-behavior + load-only smoke).
+// The comparator is excluded; invoke it via `npm run test:compare`.
 module.exports = {
   testEnvironment: 'jsdom',
   testMatch: ['**/*.test.js'],
+  testPathIgnorePatterns: ['/node_modules/', 'map_item_compare\\.test\\.js$'],
   transform: {},
   moduleFileExtensions: ['js', 'json'],
   collectCoverageFrom: ['*.test.js'],
diff --git a/tests/package-lock.json b/tests/package-lock.json
index 7758e9f..ada8011 100644
--- a/tests/package-lock.json
+++ b/tests/package-lock.json
@@ -8,12 +8,12 @@
       "name": "zeeschuimer-db-tests",
       "version": "1.0.0",
       "devDependencies": {
+        "cross-fetch": "^4.0.0",
         "dexie": "^3.2.4",
         "dotenv": "^16.4.5",
         "fake-indexeddb": "^5.0.1",
         "jest": "^29.7.0",
-        "jest-environment-jsdom": "^29.7.0",
-        "undici": "^6.20.0"
+        "jest-environment-jsdom": "^29.7.0"
       }
     },
     "node_modules/@babel/code-frame": {
@@ -1599,6 +1599,16 @@
         "node": "^14.15.0 || ^16.10.0 || >=18.0.0"
       }
     },
+    "node_modules/cross-fetch": {
+      "version": "4.1.0",
+      "resolved": "https://registry.npmjs.org/cross-fetch/-/cross-fetch-4.1.0.tgz",
+      "integrity": "sha512-uKm5PU+MHTootlWEY+mZ4vvXoCn4fLQxT9dSc1sXVMSFkINTJVN8cAQROpwcKm8bJ/c7rgZVIBWzH5T78sNZZw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "node-fetch": "^2.7.0"
+      }
+    },
     "node_modules/cross-spawn": {
       "version": "7.0.6",
       "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
@@ -3481,6 +3491,52 @@
       "dev": true,
       "license": "MIT"
     },
+    "node_modules/node-fetch": {
+      "version": "2.7.0",
+      "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz",
+      "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "whatwg-url": "^5.0.0"
+      },
+      "engines": {
+        "node": "4.x || >=6.0.0"
+      },
+      "peerDependencies": {
+        "encoding": "^0.1.0"
+      },
+      "peerDependenciesMeta": {
+        "encoding": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/node-fetch/node_modules/tr46": {
+      "version": "0.0.3",
+      "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz",
+      "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/node-fetch/node_modules/webidl-conversions": {
+      "version": "3.0.1",
+      "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz",
+      "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==",
+      "dev": true,
+      "license": "BSD-2-Clause"
+    },
+    "node_modules/node-fetch/node_modules/whatwg-url": {
+      "version": "5.0.0",
+      "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz",
+      "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "tr46": "~0.0.3",
+        "webidl-conversions": "^3.0.0"
+      }
+    },
     "node_modules/node-int64": {
       "version": "0.4.0",
       "resolved": "https://registry.npmjs.org/node-int64/-/node-int64-0.4.0.tgz",
@@ -4198,16 +4254,6 @@
         "url": "https://github.com/sponsors/sindresorhus"
       }
     },
-    "node_modules/undici": {
-      "version": "6.26.0",
-      "resolved": "https://registry.npmjs.org/undici/-/undici-6.26.0.tgz",
-      "integrity": "sha512-4yqz8a3n5HmGTlsbADNtr/dJlhkh/55Rq798G6ibiULcXbDtaLpTl1pvdqcbFfeoj3iSi52lePFM7h9H21cw/A==",
-      "dev": true,
-      "license": "MIT",
-      "engines": {
-        "node": ">=18.17"
-      }
-    },
     "node_modules/undici-types": {
       "version": "7.16.0",
       "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz",
diff --git a/tests/package.json b/tests/package.json
index 390fdd3..763321c 100644
--- a/tests/package.json
+++ b/tests/package.json
@@ -1,19 +1,19 @@
 {
   "name": "zeeschuimer-db-tests",
   "version": "1.0.0",
-  "description": "Unit tests for Zeeschuimer duplicate handling logic",
+  "description": "Unit tests for Zeeschuimer duplicate handling logic and map_item generator output",
   "type": "module",
   "scripts": {
-    "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js",
-    "test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --watch",
-    "probe": "node probe-4cat.mjs"
+    "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --config jest.config.cjs",
+    "test:watch": "node --experimental-vm-modules node_modules/jest/bin/jest.js --config jest.config.cjs --watch",
+    "test:compare": "node run-compare.mjs"
   },
   "devDependencies": {
+    "cross-fetch": "^4.0.0",
     "dexie": "^3.2.4",
     "dotenv": "^16.4.5",
     "fake-indexeddb": "^5.0.1",
     "jest": "^29.7.0",
-    "jest-environment-jsdom": "^29.7.0",
-    "undici": "^6.20.0"
+    "jest-environment-jsdom": "^29.7.0"
   }
 }
diff --git a/tests/run-compare.mjs b/tests/run-compare.mjs
new file mode 100644
index 0000000..69240ab
--- /dev/null
+++ b/tests/run-compare.mjs
@@ -0,0 +1,43 @@
+/**
+ * Launcher for the Tier 2 map_item comparator (`npm run test:compare`).
+ *
+ *   npm run test:compare              -> compares every key in FOURCAT_DATASETS
+ *   npm run test:compare -- <key>     -> narrows the run to a single key
+ *   npm run test:compare -- <key> -t "id=123"   -> key + forwarded jest flags
+ *
+ * Why this exists instead of invoking jest directly: jest treats any bare
+ * positional argument as a test-path-pattern filter. A 4CAT dataset key
+ * (`5daeba72a2dfbb5ed8c855f824a61570`) matches no test file path, so
+ * `jest <key>` silently discovers zero tests and exits "green" having run
+ * nothing. This launcher intercepts the first non-flag argument, hands it to
+ * the comparator through the COMPARE_DATASET env var, and forwards only the
+ * remaining flags to jest — so the key never reaches jest's argv.
+ */
+
+import { spawn } from 'node:child_process';
+import { fileURLToPath } from 'node:url';
+import { dirname, join } from 'node:path';
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+const args = process.argv.slice(2);
+
+// First non-flag arg (if any) is taken as the dataset key, so a flag value
+// (`-t x`) must come after the key. All other args go to jest verbatim.
+const dataset_key = args.find(a => !a.startsWith('-'));
+const jest_flags = args.filter(a => a !== dataset_key);
+
+const env = { ...process.env };
+if (dataset_key) env.COMPARE_DATASET = dataset_key;
+
+const jest_bin = join(__dirname, 'node_modules', 'jest', 'bin', 'jest.js');
+const child = spawn(
+    process.execPath,
+    ['--experimental-vm-modules', jest_bin, '--config', 'jest.compare.config.cjs', ...jest_flags],
+    { stdio: 'inherit', cwd: __dirname, env },
+);
+
+child.on('exit', code => process.exit(code ?? 1));
+child.on('error', err => {
+    console.error(`failed to launch jest: ${err.message}`);
+    process.exit(1);
+});
diff --git a/tests/setup-globals.cjs b/tests/setup-globals.cjs
index 6793cc0..b55e659 100644
--- a/tests/setup-globals.cjs
+++ b/tests/setup-globals.cjs
@@ -4,50 +4,42 @@
  * loads lib.js as a plain script.
  *
  * map_item bodies reference these as free identifiers (MappedItem,
- * MissingMappedField, strip_tags, normalize_url_encoding, ...). Without this
- * shim they'd hit ReferenceError as soon as a test invokes map_item.
+ * MissingMappedField, strip_tags, normalize_url_encoding, ...). Without
+ * this shim they'd hit ReferenceError as soon as a test invokes map_item.
  *
- * Approach: read lib.js, wrap it in a new Function() body that returns the
- * named helpers, call the function, and assign the returned object onto
- * globalThis. (Earlier attempt with vm.runInThisContext failed because in
- * the jsdom env the vm context's global differs from jsdom's window.)
- *
- * If a new helper is added to lib.js, append its name to EXPOSED_NAMES.
+ * Names are auto-discovered from lib.js by regex-matching top-level
+ * `function name(...)` and `class Name ...` declarations, so a new plain
+ * function/class helper (not `async function` or `const`) needs no edit here.
  */
 
 const fs = require('node:fs');
 const path = require('node:path');
 
-const EXPOSED_NAMES = [
-    'traverse_data',
-    'MappedItem',
-    'MissingMappedField',
-    'MapItemException',
-    'wrap_for_map_item',
-    'strip_tags',
-    'normalize_url_encoding',
-    'formatUtcTimestamp',
-];
-
 const lib_source = fs.readFileSync(
     path.join(__dirname, '..', 'js', 'lib.js'),
     'utf8',
 );
 
+// Match `function name(` and `class Name {` / `class Name extends` at
+// column 0 of a line. lib.js is a classic script with all top-level
+// declarations unindented; requiring column 0 keeps nested helpers (like
+// the `_traverse_data` IIFE inside `traverse_data`) from being exposed.
+const NAME_PATTERN = /^(?:function|class)\s+([A-Za-z_$][A-Za-z0-9_$]*)\b/gm;
+const EXPOSED_NAMES = Array.from(
+    lib_source.matchAll(NAME_PATTERN),
+    m => m[1],
+);
+
+if (EXPOSED_NAMES.length === 0) {
+    throw new Error(
+        'setup-globals.cjs: no top-level function/class declarations found in js/lib.js — ' +
+        'auto-discovery regex may be broken. Tests will ReferenceError if not fixed.'
+    );
+}
+
 const factory = new Function(`
 ${lib_source}
 return { ${EXPOSED_NAMES.join(', ')} };
 `);
 
 Object.assign(globalThis, factory());
-
-// jsdom doesn't expose fetch and Jest's jsdom env shadows Node's global
-// fetch, so the comparator can't hit 4CAT without help. Polyfill from
-// undici (a Node-friendly HTTP client, separately installable on npm —
-// distinct from the undici bundled internally by Node, which isn't
-// require()-able by name).
-// Note: tests that use fetch (e.g. map_item_compare.test.js) declare
-// `@jest-environment node` at the top of the file. Node env has fetch
-// natively. Don't try to polyfill into jsdom — undici's internals use
-// Node-specific globals that jsdom shadows (clearImmediate,
-// markResourceTiming, fast timers), and polyfilling them all is brittle.

From e39ad4276e93b7792d852a55c83ce2cbf9c805d4 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 3 Jun 2026 15:10:12 +0200
Subject: [PATCH 28/33] update map_item_compare.test.js for new 4CAT endpoints

---
 tests/README.md                | 193 ++++++++++---
 tests/map_item_compare.test.js | 505 +++++++++++++++++++++------------
 2 files changed, 478 insertions(+), 220 deletions(-)

diff --git a/tests/README.md b/tests/README.md
index f1188e2..cd35e0a 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -1,31 +1,42 @@
 ## Tests for Zeeschuimer
 
-This folder contains **testing** code for Zeeschuimer.
+This folder contains testing code for Zeeschuimer. There are four suites,
+each with a different purpose and a different runtime environment:
 
-### Integration Tests (Selenium)
+| Suite                            | Tests                                                     | Environment        | When it runs                    | Needs                                  |
+|----------------------------------|-----------------------------------------------------------|--------------------|---------------------------------|----------------------------------------|
+| Selenium integration             | Page captures real items from each supported platform     | Real Firefox       | Reviewer-supervised, manual     | Firefox profile, sometimes a human     |
+| Duplicate-behavior unit (Jest)   | DB merge / keep / update semantics in isolation           | jsdom + fake-IDB   | `npm test` (every push)         | None                                   |
+| Module load smoke (Jest, Tier 1) | Each `modules/*.js` parses and imports cleanly            | jsdom              | `npm test` (every push)         | None                                   |
+| `map_item` comparator (Jest, Tier 2) | JS `map_item` output matches 4CAT's Python mapping per item | jsdom + cross-fetch | `npm run test:compare` (on demand) | Live 4CAT, API key, dataset key(s) |
 
-The Python + Selenium tests visit pages on supported platforms
-and see how many items are captured. If the amount of items captured is 
-unexpectedly low or high, this is flagged and may indicate that Zeeschuimer no
-longer properly captures data from the platform.
+Hermetic suites (no external dependencies) live in `npm test`. Anything that
+requires a real browser, a 4CAT server, or a human in the loop is opt-in.
 
-These tests are **supervised** i.e. they require monitoring by a human and 
+### Integration tests (Selenium)
+
+The Python + Selenium tests visit pages on supported platforms and see how
+many items are captured. If the amount of items captured is unexpectedly
+low or high, this is flagged and may indicate that Zeeschuimer no longer
+properly captures data from the platform.
+
+These tests are **supervised** — they require monitoring by a human and
 cannot run fully autonomously, since some platforms (TikTok in particular)
 occasionally show CAPTCHAs that need to be completed for a test to run
 successfully. This is also why Selenium does not run a headless Firefox.
 
-The amount of items returned per page is somewhat variable for most platforms,
-so if the number is slightly lower or higher than expected this is not 
-necessarily a problem (but worth checking).
+The amount of items returned per page is somewhat variable for most
+platforms, so if the number is slightly lower or higher than expected this
+is not necessarily a problem (but worth checking).
 
-Additionally, most platforms require logging in before (full) access to the UI
-is available. The testing script borrows a Firefox profile directory from 
-elsewhere on the system to do this. It will try to find one automatically but
-you can also pass one with the `--profiledir` argument. The idea is that you
-log in to the various sites (Instagram, etc) in your 'normal' Firefox, and the
-tests then borrow that login to interface with the website.
+Most platforms require logging in before (full) access to the UI is
+available. The testing script borrows a Firefox profile directory from
+elsewhere on the system to do this. It will try to find one automatically
+but you can also pass one with the `--profiledir` argument. Log in to the
+various sites (Instagram, etc) in your 'normal' Firefox, and the tests then
+borrow that login.
 
-Run `test.py` to run tests. Required non-standard libraries are in 
+Run `test.py` to run tests. Required non-standard libraries are in
 `requirements.txt`.
 
 Tests are defined in `tests.json` with the following structure:
@@ -35,49 +46,139 @@ Tests are defined in `tests.json` with the following structure:
   "platform id as in zeeschuimer (e.g. 'tiktok.com')": {
     "test case (e.g. 'Home feed')": {
       "url": {
-        "expected": 0,  # amount of items expected to be captured on this page
-        "more-after-scroll": false,  # whether scrolling is supposed to load more items (currently unsupported)
-        "wait": 10  # wait time before checking number of items (optional, default 5)
-      } # more URLS can be added per test case
+        "expected": 0,
+        "more-after-scroll": false,
+        "wait": 10
+      }
     }
   }
 }
 ```
 
-### Unit Tests (Jest)
-
-The JavaScript unit tests verify duplicate-handling logic in isolation using 
-a mocked Dexie database. These tests ensure that when the duplicate behavior 
-setting is changed, the correct existing record is selected for updates.
+### Jest suites
 
 **Prerequisites**
-- Node.js (v18 or later) and npm must be installed
+- Node.js (v18 or later) and npm
+- `cd tests && npm install`
+
+**Recommended: develop the tests inside Docker.** On Windows the global
+permission model can make `npm install` / `npm test` awkward to run from
+an arbitrary shell, and an agentic assistant working in auto-mode will
+hit deny-rules before it can do a `cross-fetch`-style dependency spike.
+Any minimal `node:20`-or-newer image with this repo mounted in is
+enough — install what you need, run `npm install`, run `npm test` and
+`npm run test:compare`. The host's `tests/.env` is picked up via the
+mount, and `FOURCAT_URL` can point at a 4CAT reachable from the
+container (`host.docker.internal` on Windows/Mac, the host IP on
+Linux).
+
+#### Duplicate-behavior unit tests
+
+Verify duplicate-handling logic in isolation using a mocked Dexie database.
+Ensures that when the duplicate behavior setting is changed, the correct
+existing record is selected for updates.
+
+Coverage:
+- Schema upgrade backfills `last_updated` from `timestamp_collected`
+- Compound index correctly selects most recent item by `last_updated`
+- Forward-looking behavior: "keep" → "update" targets newest record
+- Forward-looking behavior: "update" → "keep" creates new records
+- Merge: shallow merge preserves fields from both records
+- Skip: no modifications occur when duplicate found
+- Platform isolation: same `item_id` on different platforms are independent
+- Tie-breaker: when `last_updated` is equal, prefer higher `id`
+
+#### Module load smoke (Tier 1)
+
+For every file under `modules/*.js`, `tests/map_item.test.js` asserts the
+module parses and imports without throwing. Modules with a `map_item`
+export and modules without one both pass this tier — the goal is purely to
+catch a generator that emits a syntax error or an import-time throw.
+
+No data is run through `map_item` here; that work belongs in the
+comparator.
+
+#### `map_item` comparator (Tier 2)
+
+For every 4CAT dataset key listed in `FOURCAT_DATASETS`,
+`tests/map_item_compare.test.js`:
 
-**Setup**
+1. fetches `/api/dataset/<key>/metadata/` to learn the datasource id
+2. translates that id to a Zeeschuimer module name via
+   `zeeschuimer-to-4cat.json` (used in reverse)
+3. fetches `/download/<key>` (NDJSON inputs, already wrapped via
+   `wrap_for_map_item` by Zeeschuimer pre-upload) and
+   `/api/dataset/<key>/items/?annotations=no&missing_fields=keep&stream=true`
+   (expected outputs from 4CAT's Python `map_item`, as NDJSON — `stream=true`
+   avoids the JSON form's `limit=100` pagination)
+4. pairs items by `id` (or by index with a warning if `id` is missing on
+   either side), runs each input through the local `map_item`, and
+   field-by-field diffs against the expected output (4CAT's API-only
+   aggregate `missing_fields` key is excluded; per-field `{__missing:true}`
+   markers are still compared)
 
-1. Install Node.js dependencies:
-   ```bash
-   cd tests
-   npm install
-   ```
+The comparator does **not** exercise `wrap_for_map_item` itself — Zeeschuimer
+applies it pre-upload and `/download/<key>` returns post-wrap items. This
+is an accepted gap; see `docs/map-item-test-plan.md`.
 
-**Running tests**
+**Configuration:** copy `tests/.env.example` to `tests/.env` and set:
+- `FOURCAT_URL` — base URL of the 4CAT instance (no trailing slash)
+- `FOURCAT_API_KEY` — raw API key (no `Bearer ` prefix)
+- `FOURCAT_DATASETS` — comma-separated list of dataset keys
+
+The comparator hard-errors at startup if any of these are missing.
+
+**Optional knob:** `FAIL_FAST=0` (or `FAIL_FAST=false`) runs every item in
+every dataset; default is to halt subsequent items in a dataset once one
+has failed.
+
+### Running
 
 ```bash
+# everything that's hermetic — duplicate-behavior unit + module load smoke
 npm test
-```
 
-For watch mode during development:
-```bash
+# watch mode for the same
 npm run test:watch
+
+# the comparator — every dataset key in FOURCAT_DATASETS
+npm run test:compare
+
+# the comparator narrowed to one dataset key (must still appear in
+# FOURCAT_DATASETS — protects against typos)
+npm run test:compare -- <dataset_key>
 ```
 
-**Test coverage**
-- Schema upgrade backfills `last_updated` from `timestamp_collected`
-- Compound index correctly selects most recent item by `last_updated`
-- Forward-looking behavior: switching from "keep" to "update" targets newest record
-- Forward-looking behavior: switching from "update" to "keep" creates new records
-- Merge behavior: shallow merge preserves fields from both records
-- Skip behavior: no modifications occur when duplicate found
-- Platform isolation: same `item_id` on different platforms are independent
-- Tie-breaker: when `last_updated` is equal, prefer higher `id`
+### Where does a new test go?
+
+- **Pure data transformation, no live external state, runs anywhere.**
+  Duplicate-behavior unit suite (DB logic) or the Tier 1 smoke
+  (`map_item` static checks).
+- **Field-by-field correctness against 4CAT's Python `map_item`.** Tier 2
+  comparator. Add a dataset to `FOURCAT_DATASETS` that covers the case;
+  the comparator will pick it up.
+- **End-to-end user flow in the extension.** Selenium.
+
+### Why the environments differ
+
+The two Jest tiers run in **jsdom** rather than node env. The reasoning:
+
+- `map_item` bodies are pure data transformation, but four of them
+  (`gab`, `pinterest`, `rednote`, `truth`) call `strip_tags`, which
+  invokes `new DOMParser()`. jsdom provides a spec-compliant native
+  `DOMParser`; node env doesn't.
+- jsdom doesn't ship `fetch`. The standard workaround
+  (`undici`) crashes inside jsdom because it pokes at
+  `clearImmediate` / `markResourceTiming` / fast-now timers that jsdom
+  shadows. `cross-fetch` wraps `node-fetch` v2 internally and doesn't
+  hit those Node internals, so it works in jsdom — the comparator
+  imports `cross-fetch/polyfill` to assign `globalThis.fetch`.
+
+The tradeoff is parser parity. jsdom's `DOMParser` (and, on the network
+side, `cross-fetch`-via-`node-fetch`) is not byte-equal to Firefox's
+Gecko implementation, which is what runs in production. Whitespace
+handling around `<br>` and block elements is the usual suspect. If the
+comparator emits false-positive diffs on text fields for the four
+`strip_tags` modules, the right fix is to normalise whitespace in the
+comparator's `deep_equal` rather than chase parser parity. The Selenium
+tier sits above and provides the real-Gecko fidelity check.
diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js
index 37e3e4c..86ab707 100644
--- a/tests/map_item_compare.test.js
+++ b/tests/map_item_compare.test.js
@@ -1,40 +1,60 @@
 /**
- * @jest-environment node
+ * Compare JS map_item output against 4CAT's Python map_item via dataset keys.
  *
- * This file runs in Node test environment (not jsdom) because undici's
- * fetch implementation uses Node-internal APIs (`clearImmediate`,
- * `markResourceTiming`, fast-now timers, etc.) that jsdom shadows or
- * doesn't expose. Polyfilling them into jsdom is whack-a-mole; node env
- * has them all natively.
+ * For each 4CAT dataset key in FOURCAT_DATASETS, this test:
+ *   1. fetches /api/dataset/<key>/metadata/ to learn the datasource id
+ *   2. translates that id back to a Zeeschuimer module name via
+ *      zeeschuimer-to-4cat.json (used in reverse)
+ *   3. inspects the local module (must export map_item)
+ *   4. fetches in parallel, both as NDJSON:
+ *        /download/<key>                       -> INPUTS (post-wrap)
+ *        /api/dataset/<key>/items/?annotations=no&missing_fields=keep&stream=true
+ *                                              -> mapped EXPECTED OUTPUTS
+ *   5. pairs items by `id`, runs each input through the local map_item, and
+ *      deep-equals the result against the corresponding expected output.
  *
- * Trade-off: no DOMParser in node env. The four modules that use
- * `strip_tags` (gab, pinterest, rednote, truth) will need a DOMParser
- * polyfill (e.g. via linkedom) before the comparator can run against
- * them. Other modules (including instagram) work as-is.
- */
-/**
- * Compare JS map_item output against 4CAT's Python map_item via the API.
+ * The items endpoint is fetched with `stream=true` (NDJSON): its JSON-array
+ * form paginates at `limit=100`, silently dropping rows on larger datasets.
+ * `annotations=no` drops processor-added fields; `missing_fields=keep` keeps
+ * unmapped fields as `{ __missing: true, value: "" }` markers (matching the JS
+ * side) and additionally adds a comma-joined `missing_fields` summary key.
+ * That summary is API-only — the JS map_item never emits it — so it is
+ * excluded from the diff (see API_ONLY_FIELDS); the per-field markers it
+ * summarizes are still compared.
  *
- * For every line in every fixture, runs the JS map_item locally AND sends
- * the same stored item to 4CAT's /api/map-item/<datasource>/ endpoint, then
- * diffs the two outputs field-by-field. Each item is its own Jest test —
- * failures point at exactly which item and which fields diverge.
+ * Items from /download/<key> already have `wrap_for_map_item` applied by
+ * Zeeschuimer pre-upload, so they're fed to map_item directly without
+ * re-wrapping. The trade-off is that this comparator does not exercise
+ * `wrap_for_map_item` itself — see docs/map-item-test-plan.md for the
+ * accepted-gap rationale.
  *
- * Skips itself entirely if FOURCAT_URL / FOURCAT_API_KEY aren't set, so
- * `npm test` keeps working without 4CAT configuration. Drop real values in
- * tests/.env to enable.
+ * Environment notes (fetch + DOMParser):
+ *   - jsdom env so `strip_tags` (used by gab/pinterest/rednote/truth) has
+ *     a native DOMParser.
+ *   - jsdom doesn't ship `fetch`. Spiked three candidates on 2026-06-03
+ *     under node:20-alpine:
+ *       * `undici`     — crashes at import in jsdom (pokes at
+ *                        clearImmediate/markResourceTiming/fast-now
+ *                        timers that jsdom shadows).
+ *       * `node-fetch` v3 — imports clean but `res.text()` throws
+ *                        `ReferenceError: TextDecoder is not defined`
+ *                        (jsdom doesn't expose TextDecoder as a global).
+ *       * `cross-fetch/polyfill` — clean import + working round-trip.
+ *     So this file imports `cross-fetch/polyfill`, which assigns
+ *     `globalThis.fetch` when undefined.
  *
- * Datasource id mapping: tests/zeeschuimer-to-4cat.json (Zeeschuimer
- * module filename → 4CAT datasource id, for the few names that diverge).
+ * Invocation:
+ *   npm run test:compare                 # runs every key in FOURCAT_DATASETS
+ *   npm run test:compare -- <key>        # narrows to one key (must be in
+ *                                        #   FOURCAT_DATASETS to avoid typos)
  *
- * Module-level state is determined upfront by inspect_module() (no
- * map_item / syntax errors / import errors are handled before tests are
- * registered, so they appear once per module, not once per item).
+ * Hard-errors at registration time if FOURCAT_URL, FOURCAT_API_KEY, or
+ * FOURCAT_DATASETS is missing — by Tier 2 contract these are required.
  */
 
+import 'cross-fetch/polyfill';
 import 'dotenv/config';
-import { jest } from '@jest/globals';
-import { readdirSync, readFileSync, statSync, existsSync } from 'node:fs';
+import { readFileSync, existsSync } from 'node:fs';
 import { join, dirname } from 'node:path';
 import { fileURLToPath } from 'node:url';
 import { inspect_module } from './_module-info.js';
@@ -43,56 +63,100 @@ const __dirname = dirname(fileURLToPath(import.meta.url));
 
 const FOURCAT_URL = process.env.FOURCAT_URL?.replace(/\/$/, '');
 const FOURCAT_API_KEY = process.env.FOURCAT_API_KEY;
-const HAS_4CAT = Boolean(
-    FOURCAT_URL && FOURCAT_API_KEY && FOURCAT_API_KEY !== 'your-api-key-here'
-);
 
-// When true (default), once any item in a module fails, subsequent items
-// in that same module skip the HTTP + map_item work and fail fast with a
-// "halted" message. Saves time when generator output is broken at the top.
-// Set FAIL_FAST=0 in env to run all items regardless.
-// Trim because cmd.exe's `set FAIL_FAST=0 && ...` includes the trailing
-// space in the variable value, which would otherwise defeat `!== '0'`.
-const FAIL_FAST = (process.env.FAIL_FAST ?? '').trim() !== '0';
-const halted_modules = new Set();
+// Hard-fail if env is missing — Tier 2 contract.
+function require_env(name, value, placeholder_values = []) {
+    if (!value || placeholder_values.includes(value)) {
+        throw new Error(
+            `${name} is not configured. Set it in tests/.env (see tests/.env.example).`
+        );
+    }
+    return value;
+}
+require_env('FOURCAT_URL', FOURCAT_URL);
+require_env('FOURCAT_API_KEY', FOURCAT_API_KEY, ['your-api-key-here']);
+
+const FOURCAT_DATASETS = require_env(
+    'FOURCAT_DATASETS',
+    process.env.FOURCAT_DATASETS,
+    ['key1,key2,key3'],
+)
+    .split(',')
+    .map(k => k.trim())
+    .filter(k => k.length > 0);
+
+if (FOURCAT_DATASETS.length === 0) {
+    throw new Error('FOURCAT_DATASETS parsed as empty. Set a comma-separated list of dataset keys in tests/.env.');
+}
+
+// Optional narrowing to a single dataset key. The `npm run test:compare --
+// <key>` form is handled by run-compare.mjs, which sets COMPARE_DATASET; jest
+// itself would mis-read a bare key as a test-path-pattern filter and silently
+// run nothing. A narrowed key must still be declared in FOURCAT_DATASETS —
+// erroring on an unlisted key catches typos and keeps the dataset list the
+// single source of truth.
+const COMPARE_DATASET = process.env.COMPARE_DATASET?.trim() || undefined;
+if (COMPARE_DATASET && !FOURCAT_DATASETS.includes(COMPARE_DATASET)) {
+    throw new Error(
+        `COMPARE_DATASET=${COMPARE_DATASET} is not listed in FOURCAT_DATASETS. ` +
+        `Add it to tests/.env before narrowing the run to it.`
+    );
+}
+
+const DATASET_KEYS_TO_RUN = COMPARE_DATASET ? [COMPARE_DATASET] : FOURCAT_DATASETS;
 
-const FIXTURE_ROOT = join(__dirname, 'fixtures');
+// 4CAT datasource id -> Zeeschuimer module name. The on-disk map is
+// authored in the natural direction (zeeschuimer -> 4cat); flip here.
 const ID_MAP_PATH = join(__dirname, 'zeeschuimer-to-4cat.json');
-const ID_MAP = existsSync(ID_MAP_PATH)
+const ZEESCHUIMER_TO_4CAT = existsSync(ID_MAP_PATH)
     ? JSON.parse(readFileSync(ID_MAP_PATH, 'utf8'))
     : {};
+const FOURCAT_TO_ZEESCHUIMER = Object.fromEntries(
+    Object.entries(ZEESCHUIMER_TO_4CAT)
+        .filter(([k]) => !k.startsWith('_'))
+        .map(([z, f]) => [f, z])
+);
+
+// When true (default), comparison of a dataset stops at its first failing
+// item; the remaining items are reported as a single skipped "halted"
+// placeholder rather than one failure each. Trim because `set FAIL_FAST=0 &&
+// ...` in cmd.exe includes the trailing space; treat both '0' and 'false'
+// (case-insensitive) as off.
+const FAIL_FAST_RAW = (process.env.FAIL_FAST ?? '').trim().toLowerCase();
+const FAIL_FAST = FAIL_FAST_RAW !== '0' && FAIL_FAST_RAW !== 'false';
 
-function wrap_for_map_item(stored_item) {
-    const { data, ...meta } = stored_item;
-    return { ...data, __import_meta: meta };
+function auth_headers(extra = {}) {
+    return {
+        // 4CAT accepts the raw key without a `Bearer ` prefix.
+        'Authorization': FOURCAT_API_KEY,
+        ...extra,
+    };
 }
 
-async function call_4cat_map_item(datasource_id, item) {
-    const res = await fetch(`${FOURCAT_URL}/api/map-item/${datasource_id}/`, {
-        method: 'POST',
-        headers: {
-            // 4CAT accepts the raw key without a `Bearer ` prefix, per probe
-            'Authorization': FOURCAT_API_KEY,
-            'Content-Type': 'application/json',
-        },
-        body: JSON.stringify({ item }),
-    });
+async function fetch_json(url) {
+    const res = await fetch(url, { headers: auth_headers() });
     const text = await res.text();
-    if (!res.ok) {
-        throw new Error(`HTTP ${res.status} from 4CAT: ${text}`);
-    }
+    if (!res.ok) throw new Error(`HTTP ${res.status} from ${url}: ${text}`);
     return JSON.parse(text);
 }
 
-// Round-trip a value through JSON so MappedItem, MissingMappedField, etc.
-// become plain JSON-compatible objects matching what 4CAT emits.
+async function fetch_ndjson(url) {
+    const res = await fetch(url, { headers: auth_headers() });
+    const text = await res.text();
+    if (!res.ok) throw new Error(`HTTP ${res.status} from ${url}: ${text}`);
+    return text
+        .split('\n')
+        .filter(line => line.trim().length > 0)
+        .map((line, i) => {
+            try { return JSON.parse(line); }
+            catch (e) { throw new Error(`bad NDJSON at line ${i} of ${url}: ${e.message}`); }
+        });
+}
+
 function normalize(value) {
     return JSON.parse(JSON.stringify(value));
 }
 
-// Recursive structural equality. Doesn't care about object key order, which
-// matters for nested values like {__missing: true, value: ""} where JS and
-// Python might emit keys in different orders.
 function deep_equal(a, b) {
     if (a === b) return true;
     if (a === null || b === null) return a === b;
@@ -138,8 +202,6 @@ function format_diffs(diffs) {
     }).join('\n');
 }
 
-// Pull out the first few module-frame lines from an error's stack so the
-// failure message points at where in modules/<name>.js the throw happened.
 function format_error_with_location(err) {
     if (!err) return String(err);
     const message = err.message || String(err);
@@ -153,131 +215,226 @@ function format_error_with_location(err) {
         : message;
 }
 
-function list_module_dirs() {
-    if (!existsSync(FIXTURE_ROOT)) return [];
-    return readdirSync(FIXTURE_ROOT).filter(name => {
-        try { return statSync(join(FIXTURE_ROOT, name)).isDirectory(); }
-        catch { return false; }
-    });
-}
+// Pair inputs and expected outputs by `id`. Falls back to index pairing
+// (with a logged warning) if either side is missing the field on its
+// first item.
+function pair_items(inputs, outputs, dataset_key) {
+    const probe_in = inputs[0];
+    const probe_out = outputs[0];
+    const has_id_in = probe_in && 'id' in probe_in && probe_in.id != null;
+    const has_id_out = probe_out && 'id' in probe_out && probe_out.id != null;
 
-// Per-test timeout: each test does one HTTP round-trip to 4CAT. Jest's
-// default 5s is tight under load.
-jest.setTimeout(30000);
+    if (!has_id_in || !has_id_out) {
+        // eslint-disable-next-line no-console
+        console.warn(
+            `[compare] ${dataset_key}: no usable 'id' on ${!has_id_in ? '/download' : '/items'} ` +
+            `side — falling back to index pairing for this dataset.`
+        );
+        const n = Math.min(inputs.length, outputs.length);
+        return {
+            mode: 'index',
+            pairs: Array.from({ length: n }, (_, i) => ({ input: inputs[i], expected: outputs[i], id: i })),
+            input_count: inputs.length,
+            output_count: outputs.length,
+            unmatched_inputs: [],
+            unmatched_outputs: [],
+        };
+    }
 
-if (!HAS_4CAT) {
-    describe('map_item compare (JS vs 4CAT Python)', () => {
-        test.skip('FOURCAT_URL / FOURCAT_API_KEY not configured — set them in tests/.env to enable', () => {});
-    });
-} else {
-    const module_dirs = list_module_dirs();
-
-    // Pre-pass: synchronously determine each module's state so we can branch
-    // on it at registration time.
-    const module_info = {};
-    for (const module_name of module_dirs) {
-        module_info[module_name] = await inspect_module(module_name);
+    const by_id_out = new Map();
+    for (const item of outputs) by_id_out.set(String(item.id), item);
+
+    const pairs = [];
+    const unmatched_inputs = [];
+    for (const input of inputs) {
+        const expected = by_id_out.get(String(input.id));
+        if (expected) {
+            pairs.push({ input, expected, id: input.id });
+            by_id_out.delete(String(input.id));
+        } else {
+            unmatched_inputs.push(input.id);
+        }
     }
+    return {
+        mode: 'id',
+        pairs,
+        input_count: inputs.length,
+        output_count: outputs.length,
+        unmatched_inputs,
+        unmatched_outputs: Array.from(by_id_out.keys()),
+    };
+}
+
+// 4CAT exposes the datasource via `metadata.type`, which is the datasource
+// id with a `-search` or `-import` suffix appended (e.g. `tiktok-search`,
+// `xiaohongshu-comments-import`). Strip the trailing suffix to get the bare
+// id, which we then translate to a Zeeschuimer module via
+// FOURCAT_TO_ZEESCHUIMER. Datasource ids themselves may contain hyphens
+// (e.g. `xiaohongshu-comments`), so the strip is anchored to end-of-string.
+function extract_datasource_id(metadata) {
+    const type = metadata?.type;
+    if (!type) return null;
+    return type.replace(/-(search|import)$/, '');
+}
 
-    let any_fixtures = false;
+// Fields 4CAT's API attaches to every mapped item that the JS map_item never
+// produces, so they would otherwise diff as spurious "only_python" entries.
+// `missing_fields` is a comma-joined summary of which fields came back as
+// MissingMappedField — redundant with the per-field `{__missing:true}`
+// markers, which ARE compared.
+const API_ONLY_FIELDS = new Set(['missing_fields']);
 
-    for (const module_name of module_dirs) {
-        const fixture_dir = join(FIXTURE_ROOT, module_name);
-        const fixture_files = readdirSync(fixture_dir).filter(f => f.endsWith('.ndjson'));
-        if (fixture_files.length === 0) continue;
-        any_fixtures = true;
+function strip_api_fields(obj) {
+    if (!obj || typeof obj !== 'object' || Array.isArray(obj)) return obj;
+    const out = {};
+    for (const k of Object.keys(obj)) {
+        if (!API_ONLY_FIELDS.has(k)) out[k] = obj[k];
+    }
+    return out;
+}
 
-        const datasource_id = ID_MAP[module_name] ?? module_name;
-        const info = module_info[module_name];
+// Run each paired input through the local map_item and diff the result
+// against 4CAT's expected output. With FAIL_FAST on (default), stop at the
+// first failing item and record how many were left unchecked — so one bad
+// item yields a single failure plus one skipped "halted" placeholder, not N
+// failures.
+function compare_pairs(pairs, map_item) {
+    const results = [];
+    let halted_count = 0;
+    for (let i = 0; i < pairs.length; i++) {
+        const { input, expected, id } = pairs[i];
+        let message = null;
+        try {
+            let js_result;
+            try {
+                js_result = map_item(input);
+            } catch (e) {
+                throw new Error(`JS map_item threw: ${format_error_with_location(e)}`);
+            }
+            const diffs = diff_objects(
+                strip_api_fields(normalize(js_result)),
+                strip_api_fields(normalize(expected)),
+            );
+            if (diffs.length > 0) {
+                message = `${diffs.length} field(s) differ between JS and 4CAT:\n${format_diffs(diffs)}`;
+            }
+        } catch (e) {
+            message = e.message;
+        }
+        results.push({ id, ok: message === null, message });
+        if (message !== null && FAIL_FAST) {
+            halted_count = pairs.length - (i + 1);
+            break;
+        }
+    }
+    return { results, halted_count };
+}
 
-        if (info.state === 'no_map_item') {
-            // eslint-disable-next-line no-console
-            console.log(`[compare] skipping ${module_name}: modules/${module_name}.js does not export a map_item`);
-            continue;
+// Pre-pass: for each dataset, fetch metadata + items and run the comparison
+// up front, so tests register with knowable counts and a deterministic
+// pass/fail per item. Fetch/setup failures become a single "setup" failure
+// inside that dataset's describe.
+const dataset_state = {};
+for (const key of DATASET_KEYS_TO_RUN) {
+    try {
+        const metadata = await fetch_json(`${FOURCAT_URL}/api/dataset/${key}/metadata/`);
+        const datasource_id = extract_datasource_id(metadata);
+        if (!datasource_id) { // extract_datasource_id only reads metadata.type
+            throw new Error(
+                `metadata for ${key} has no datasource id (checked metadata.type)`
+            );
+        }
+        const module_name = FOURCAT_TO_ZEESCHUIMER[datasource_id] ?? datasource_id;
+        const module_state = await inspect_module(module_name);
 
-        if (info.state === 'syntax_error' || info.state === 'import_error') {
-            const msg = info.state === 'syntax_error'
-                ? `syntax error:\n${info.error}`
-                : `import failed: ${info.error.message}`;
-            describe(`map_item compare: ${module_name}`, () => {
-                test(`module loads`, () => { throw new Error(msg); });
-            });
-            continue;
+        if (module_state.state === 'ok') {
+            // Both sides as NDJSON. `stream=true` on the items endpoint avoids
+            // the JSON-array form's default `limit=100` pagination, which would
+            // silently drop rows (and break id-pairing) on larger datasets.
+            const [inputs, outputs] = await Promise.all([
+                fetch_ndjson(`${FOURCAT_URL}/download/${key}`),
+                fetch_ndjson(`${FOURCAT_URL}/api/dataset/${key}/items/?annotations=no&missing_fields=keep&stream=true`),
+            ]);
+            const pairing = pair_items(inputs, outputs, key);
+            const comparison = compare_pairs(pairing.pairs, module_state.map_item);
+            dataset_state[key] = { metadata, datasource_id, module_name, module_state, pairing, comparison };
+        } else {
+            dataset_state[key] = { metadata, datasource_id, module_name, module_state };
         }
+    } catch (e) {
+        dataset_state[key] = { error: e };
+    }
+}
 
-        // state === 'ok' — register per-item comparison tests
-        const map_item = info.map_item;
-
-        describe(`map_item compare: ${module_name} (4CAT id: ${datasource_id})`, () => {
-            for (const fixture_file of fixture_files) {
-                const lines = readFileSync(join(fixture_dir, fixture_file), 'utf8')
-                    .split('\n')
-                    .filter(line => line.trim().length > 0);
-
-                describe(fixture_file, () => {
-                    lines.forEach((line, i) => {
-                        test(`item ${i}`, async () => {
-                            if (FAIL_FAST && halted_modules.has(module_name)) {
-                                throw new Error(
-                                    '[halted after prior failure in this module — set FAIL_FAST=0 to run all items]'
-                                );
-                            }
-                            try {
-                                const stored_item = JSON.parse(line);
-
-                                // 4CAT side
-                                const response = await call_4cat_map_item(datasource_id, stored_item);
-
-                                // JS side
-                                let js_result;
-                                let js_error;
-                                try {
-                                    js_result = map_item(wrap_for_map_item(stored_item));
-                                } catch (e) {
-                                    js_error = e;
-                                }
-
-                                if (response.status === 'mapped') {
-                                    if (js_error) {
-                                        throw new Error(
-                                            `4CAT mapped this item but JS threw: ${format_error_with_location(js_error)}`
-                                        );
-                                    }
-                                    const js_obj = normalize(js_result);
-                                    const py_obj = normalize(response.item);
-                                    const diffs = diff_objects(js_obj, py_obj);
-                                    if (diffs.length > 0) {
-                                        throw new Error(
-                                            `${diffs.length} field(s) differ between JS and 4CAT:\n${format_diffs(diffs)}`
-                                        );
-                                    }
-                                } else if (response.status === 'skipped') {
-                                    if (!js_error) {
-                                        throw new Error(
-                                            `4CAT skipped this item ("${response.reason}") but JS produced a result`
-                                        );
-                                    }
-                                    // Both rejected — good. Skip reasons may differ in wording.
-                                } else if (response.status === 'error') {
-                                    throw new Error(`4CAT errored on this item: ${response.message}`);
-                                } else {
-                                    throw new Error(`unexpected 4CAT response status: ${JSON.stringify(response)}`);
-                                }
-                            } catch (e) {
-                                if (FAIL_FAST) halted_modules.add(module_name);
-                                throw e;
-                            }
-                        });
-                    });
-                });
-            }
+for (const dataset_key of DATASET_KEYS_TO_RUN) {
+    const info = dataset_state[dataset_key];
+
+    if (info.error) {
+        describe(`map_item compare: dataset ${dataset_key}`, () => {
+            test('setup', () => { throw info.error; });
         });
+        continue;
     }
 
-    if (!any_fixtures) {
-        describe('map_item compare (JS vs 4CAT Python)', () => {
-            test.skip('no fixtures under tests/fixtures/<module>/*.ndjson', () => {});
+    const { datasource_id, module_name, module_state, pairing, comparison } = info;
+    const label = `${dataset_key} (datasource: ${datasource_id}, module: ${module_name})`;
+
+    if (module_state.state === 'no_map_item') {
+        describe(`map_item compare: ${label}`, () => {
+            test.skip(`modules/${module_name}.js has no map_item — nothing to compare`, () => {});
         });
+        continue;
     }
+    if (module_state.state === 'syntax_error' || module_state.state === 'import_error') {
+        const msg = module_state.state === 'syntax_error'
+            ? `syntax error:\n${module_state.error}`
+            : `import failed: ${module_state.error.message}`;
+        describe(`map_item compare: ${label}`, () => {
+            test('module loads', () => { throw new Error(msg); });
+        });
+        continue;
+    }
+
+    describe(`map_item compare: ${label}`, () => {
+        test('pairing', () => {
+            const messages = [];
+            if (pairing.input_count !== pairing.output_count) {
+                messages.push(
+                    `input count ${pairing.input_count} != output count ${pairing.output_count}`
+                );
+            }
+            if (pairing.unmatched_inputs.length) {
+                const shown = pairing.unmatched_inputs.slice(0, 5).join(', ');
+                const extra = pairing.unmatched_inputs.length > 5
+                    ? ` (+${pairing.unmatched_inputs.length - 5} more)`
+                    : '';
+                messages.push(`unmatched input ids: ${shown}${extra}`);
+            }
+            if (pairing.unmatched_outputs.length) {
+                const shown = pairing.unmatched_outputs.slice(0, 5).join(', ');
+                const extra = pairing.unmatched_outputs.length > 5
+                    ? ` (+${pairing.unmatched_outputs.length - 5} more)`
+                    : '';
+                messages.push(`unmatched output ids: ${shown}${extra}`);
+            }
+            if (pairing.mode === 'index') {
+                messages.push(`paired by index (no usable 'id' field) — diffs may be misaligned`);
+            }
+            if (messages.length) throw new Error(messages.join('\n'));
+        });
+
+        comparison.results.forEach(({ id, ok, message }, i) => {
+            test(`item ${i} (id=${id})`, () => {
+                if (!ok) throw new Error(message);
+            });
+        });
+
+        if (comparison.halted_count > 0) {
+            test.skip(
+                `halted after first failure — ${comparison.halted_count} later item(s) not compared ` +
+                `(set FAIL_FAST=0 to compare all)`,
+                () => {},
+            );
+        }
+    });
 }

From d7fcb4c72deb18de311d6056e521b156be299457 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 3 Jun 2026 15:48:12 +0200
Subject: [PATCH 29/33] fast_fail OR --all for tests

---
 tests/README.md                | 18 +++++++++++++++---
 tests/map_item_compare.test.js |  9 +++++----
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/tests/README.md b/tests/README.md
index cd35e0a..beaee44 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -128,9 +128,18 @@ is an accepted gap; see `docs/map-item-test-plan.md`.
 
 The comparator hard-errors at startup if any of these are missing.
 
-**Optional knob:** `FAIL_FAST=0` (or `FAIL_FAST=false`) runs every item in
-every dataset; default is to halt subsequent items in a dataset once one
-has failed.
+**Optional knob:** by default the comparator halts a dataset at its first
+failing item (reporting the rest as one skipped "halted" placeholder). To
+compare *every* item, pass `--all`:
+
+```bash
+npm run test:compare -- <dataset_key> --all
+```
+
+`FAIL_FAST=0` (or `FAIL_FAST=false`) does the same, but prefer `--all`: an
+inline `FAIL_FAST=0 npm run …` does not reliably reach node when npm/node is
+the Windows binary run through WSL interop, and isn't env syntax in cmd.exe.
+A CLI flag crosses every shell.
 
 ### Running
 
@@ -147,6 +156,9 @@ npm run test:compare
 # the comparator narrowed to one dataset key (must still appear in
 # FOURCAT_DATASETS — protects against typos)
 npm run test:compare -- <dataset_key>
+
+# compare every item instead of halting at the first failure
+npm run test:compare -- <dataset_key> --all
 ```
 
 ### Where does a new test go?
diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js
index 86ab707..2ca1d27 100644
--- a/tests/map_item_compare.test.js
+++ b/tests/map_item_compare.test.js
@@ -119,9 +119,10 @@ const FOURCAT_TO_ZEESCHUIMER = Object.fromEntries(
 
 // When true (default), comparison of a dataset stops at its first failing
 // item; the remaining items are reported as a single skipped "halted"
-// placeholder rather than one failure each. Trim because `set FAIL_FAST=0 &&
-// ...` in cmd.exe includes the trailing space; treat both '0' and 'false'
-// (case-insensitive) as off.
+// placeholder rather than one failure each. Disable it with the `--all`
+// launcher flag (preferred — crosses every shell) or FAIL_FAST=0. Trim
+// because `set FAIL_FAST=0 && ...` in cmd.exe includes the trailing space;
+// treat both '0' and 'false' (case-insensitive) as off.
 const FAIL_FAST_RAW = (process.env.FAIL_FAST ?? '').trim().toLowerCase();
 const FAIL_FAST = FAIL_FAST_RAW !== '0' && FAIL_FAST_RAW !== 'false';
 
@@ -432,7 +433,7 @@ for (const dataset_key of DATASET_KEYS_TO_RUN) {
         if (comparison.halted_count > 0) {
             test.skip(
                 `halted after first failure — ${comparison.halted_count} later item(s) not compared ` +
-                `(set FAIL_FAST=0 to compare all)`,
+                `(pass --all, or set FAIL_FAST=0, to compare every item)`,
                 () => {},
             );
         }

From 4f9e69c3dc8e38ed98b4d0fe17f8f413a0b7c40a Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 3 Jun 2026 16:34:30 +0200
Subject: [PATCH 30/33] use headers for datasource

---
 tests/README.md                |  3 +-
 tests/map_item_compare.test.js | 63 ++++++++++++++++++----------------
 2 files changed, 36 insertions(+), 30 deletions(-)

diff --git a/tests/README.md b/tests/README.md
index beaee44..f203b60 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -103,7 +103,8 @@ comparator.
 For every 4CAT dataset key listed in `FOURCAT_DATASETS`,
 `tests/map_item_compare.test.js`:
 
-1. fetches `/api/dataset/<key>/metadata/` to learn the datasource id
+1. sends a HEAD to the items endpoint and reads the datasource id from its
+   `X-4CAT-Dataset-Datasource` response header, or failing that from `X-4CAT-Dataset-Type` (no metadata-endpoint call)
 2. translates that id to a Zeeschuimer module name via
    `zeeschuimer-to-4cat.json` (used in reverse)
 3. fetches `/download/<key>` (NDJSON inputs, already wrapped via
diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js
index 2ca1d27..681076c 100644
--- a/tests/map_item_compare.test.js
+++ b/tests/map_item_compare.test.js
@@ -2,7 +2,8 @@
  * Compare JS map_item output against 4CAT's Python map_item via dataset keys.
  *
  * For each 4CAT dataset key in FOURCAT_DATASETS, this test:
- *   1. fetches /api/dataset/<key>/metadata/ to learn the datasource id
+ *   1. HEADs the items endpoint to read the datasource id from the
+ *      `X-4CAT-Dataset-*` response headers (no metadata-endpoint dependency)
  *   2. translates that id back to a Zeeschuimer module name via
  *      zeeschuimer-to-4cat.json (used in reverse)
  *   3. inspects the local module (must export map_item)
@@ -134,11 +135,10 @@ function auth_headers(extra = {}) {
     };
 }
 
-async function fetch_json(url) {
-    const res = await fetch(url, { headers: auth_headers() });
-    const text = await res.text();
-    if (!res.ok) throw new Error(`HTTP ${res.status} from ${url}: ${text}`);
-    return JSON.parse(text);
+async function fetch_headers(url) {
+    const res = await fetch(url, { method: 'HEAD', headers: auth_headers() });
+    if (!res.ok) throw new Error(`HTTP ${res.status} from HEAD ${url}`);
+    return res.headers;
 }
 
 async function fetch_ndjson(url) {
@@ -266,16 +266,18 @@ function pair_items(inputs, outputs, dataset_key) {
     };
 }
 
-// 4CAT exposes the datasource via `metadata.type`, which is the datasource
-// id with a `-search` or `-import` suffix appended (e.g. `tiktok-search`,
-// `xiaohongshu-comments-import`). Strip the trailing suffix to get the bare
-// id, which we then translate to a Zeeschuimer module via
-// FOURCAT_TO_ZEESCHUIMER. Datasource ids themselves may contain hyphens
-// (e.g. `xiaohongshu-comments`), so the strip is anchored to end-of-string.
-function extract_datasource_id(metadata) {
-    const type = metadata?.type;
-    if (!type) return null;
-    return type.replace(/-(search|import)$/, '');
+// Recover the datasource id from a dataset's response headers. 4CAT exposes it
+// directly as `X-4CAT-Dataset-Datasource`. Older responses may only carry
+// `X-4CAT-Dataset-Type` (the datasource id with a `-search`/`-import` suffix),
+// so fall back to stripping that — anchored to end-of-string because
+// datasource ids can themselves contain hyphens (e.g. `xiaohongshu-comments`).
+// The result is translated to a Zeeschuimer module via FOURCAT_TO_ZEESCHUIMER.
+function datasource_id_from_headers(headers) {
+    const datasource = headers.get('x-4cat-dataset-datasource');
+    if (datasource) return datasource.trim();
+    const type = headers.get('x-4cat-dataset-type');
+    if (type) return type.trim().replace(/-(search|import)$/, '');
+    return null;
 }
 
 // Fields 4CAT's API attaches to every mapped item that the JS map_item never
@@ -331,36 +333,39 @@ function compare_pairs(pairs, map_item) {
     return { results, halted_count };
 }
 
-// Pre-pass: for each dataset, fetch metadata + items and run the comparison
-// up front, so tests register with knowable counts and a deterministic
-// pass/fail per item. Fetch/setup failures become a single "setup" failure
-// inside that dataset's describe.
+// Pre-pass: for each dataset, resolve the datasource (HEAD), fetch items, and
+// run the comparison up front, so tests register with knowable counts and a
+// deterministic pass/fail per item. Fetch/setup failures become a single
+// "setup" failure inside that dataset's describe.
 const dataset_state = {};
 for (const key of DATASET_KEYS_TO_RUN) {
     try {
-        const metadata = await fetch_json(`${FOURCAT_URL}/api/dataset/${key}/metadata/`);
-        const datasource_id = extract_datasource_id(metadata);
+        // The same items URL serves double duty: a HEAD reveals the datasource
+        // (via X-4CAT-Dataset-* headers) with no body; the GET pulls the mapped
+        // rows. `stream=true` avoids the JSON form's limit=100 pagination, which
+        // would silently drop rows (and break id-pairing) on larger datasets.
+        const items_url = `${FOURCAT_URL}/api/dataset/${key}/items/?annotations=no&missing_fields=keep&stream=true`;
+        const headers = await fetch_headers(items_url);
+        const datasource_id = datasource_id_from_headers(headers);
         if (!datasource_id) {
             throw new Error(
-                `metadata for ${key} has no datasource id (checked parameters.datasource, datasource, type)`
+                `no datasource id in response headers for ${key} ` +
+                `(looked for X-4CAT-Dataset-Datasource / X-4CAT-Dataset-Type)`
             );
         }
         const module_name = FOURCAT_TO_ZEESCHUIMER[datasource_id] ?? datasource_id;
         const module_state = await inspect_module(module_name);
 
         if (module_state.state === 'ok') {
-            // Both sides as NDJSON. `stream=true` on the items endpoint avoids
-            // the JSON-array form's default `limit=100` pagination, which would
-            // silently drop rows (and break id-pairing) on larger datasets.
             const [inputs, outputs] = await Promise.all([
                 fetch_ndjson(`${FOURCAT_URL}/download/${key}`),
-                fetch_ndjson(`${FOURCAT_URL}/api/dataset/${key}/items/?annotations=no&missing_fields=keep&stream=true`),
+                fetch_ndjson(items_url),
             ]);
             const pairing = pair_items(inputs, outputs, key);
             const comparison = compare_pairs(pairing.pairs, module_state.map_item);
-            dataset_state[key] = { metadata, datasource_id, module_name, module_state, pairing, comparison };
+            dataset_state[key] = { datasource_id, module_name, module_state, pairing, comparison };
         } else {
-            dataset_state[key] = { metadata, datasource_id, module_name, module_state };
+            dataset_state[key] = { datasource_id, module_name, module_state };
         }
     } catch (e) {
         dataset_state[key] = { error: e };

From 8b918d46ba99f2939610a5f0e34fbf0e3aa434bd Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 3 Jun 2026 16:35:08 +0200
Subject: [PATCH 31/33] add the --all flag instead of just FAIL_FAST

---
 tests/run-compare.mjs | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/tests/run-compare.mjs b/tests/run-compare.mjs
index 69240ab..57efb66 100644
--- a/tests/run-compare.mjs
+++ b/tests/run-compare.mjs
@@ -3,6 +3,7 @@
  *
  *   npm run test:compare              -> compares every key in FOURCAT_DATASETS
  *   npm run test:compare -- <key>     -> narrows the run to a single key
+ *   npm run test:compare -- <key> --all   -> compare every item (no fail-fast)
  *   npm run test:compare -- <key> -t "id=123"   -> key + forwarded jest flags
  *
  * Why this exists instead of invoking jest directly: jest treats any bare
@@ -21,13 +22,22 @@ import { dirname, join } from 'node:path';
 const __dirname = dirname(fileURLToPath(import.meta.url));
 const args = process.argv.slice(2);
 
-// First non-flag arg (if any) is the dataset key to narrow to. Everything
-// that looks like a flag is forwarded to jest verbatim.
+// First non-flag arg (if any) is the dataset key to narrow to.
 const dataset_key = args.find(a => !a.startsWith('-'));
-const jest_flags = args.filter(a => a !== dataset_key);
+const flags = args.filter(a => a !== dataset_key);
+
+// `--all` (alias `--no-fail-fast`) compares every item instead of halting at
+// the first failure. It's offered as a flag, not only via the FAIL_FAST env
+// var, because `FAIL_FAST=0 npm run ...` does not reliably reach node when
+// npm/node is the Windows binary invoked through WSL interop, and isn't env
+// syntax at all in cmd.exe. A CLI flag crosses every shell; the env var still
+// works where it propagates.
+const disable_fail_fast = flags.includes('--all') || flags.includes('--no-fail-fast');
+const jest_flags = flags.filter(f => f !== '--all' && f !== '--no-fail-fast');
 
 const env = { ...process.env };
 if (dataset_key) env.COMPARE_DATASET = dataset_key;
+if (disable_fail_fast) env.FAIL_FAST = '0';
 
 const jest_bin = join(__dirname, 'node_modules', 'jest', 'bin', 'jest.js');
 const child = spawn(

From 00f0369d12804e397202a7206d25b9b864414c82 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@proton.me>
Date: Thu, 4 Jun 2026 16:46:51 +0200
Subject: [PATCH 32/33] map_item_compare.test.js: compare based on mapped `id`
 field not raw `id`

---
 tests/map_item_compare.test.js | 98 ++++++++++++++++++++++------------
 1 file changed, 63 insertions(+), 35 deletions(-)

diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js
index 681076c..2d1403b 100644
--- a/tests/map_item_compare.test.js
+++ b/tests/map_item_compare.test.js
@@ -11,8 +11,11 @@
  *        /download/<key>                       -> INPUTS (post-wrap)
  *        /api/dataset/<key>/items/?annotations=no&missing_fields=keep&stream=true
  *                                              -> mapped EXPECTED OUTPUTS
- *   5. pairs items by `id`, runs each input through the local map_item, and
- *      deep-equals the result against the corresponding expected output.
+ *   5. runs each input through the local map_item, then pairs by the
+ *      resulting MAPPED `id` — which can differ from the raw input id (e.g.
+ *      instagram maps to the post shortcode, not the numeric pk) — and
+ *      deep-equals each mapped result against the corresponding expected
+ *      output.
  *
  * The items endpoint is fetched with `stream=true` (NDJSON): its JSON-array
  * form paginates at `limit=100`, silently dropping rows on larger datasets.
@@ -216,25 +219,49 @@ function format_error_with_location(err) {
         : message;
 }
 
-// Pair inputs and expected outputs by `id`. Falls back to index pairing
-// (with a logged warning) if either side is missing the field on its
-// first item.
-function pair_items(inputs, outputs, dataset_key) {
-    const probe_in = inputs[0];
+// Map each input through the local map_item, then pair the mapped result
+// against the expected output by `id`. Pairing MUST key on the mapped id:
+// some modules emit an `id` that differs from the raw input id — instagram,
+// for instance, maps to the post shortcode (`node.code`), not the numeric pk
+// — so pairing raw input ids against the API's already-mapped ids would match
+// nothing. Falls back to index pairing (with a logged warning) if either side
+// lacks a usable id. A throw inside map_item is captured per-item and surfaced
+// later as that item's failure.
+function map_and_pair(inputs, outputs, map_item, dataset_key) {
+    // Map every input up front so pairing can key on the mapped id.
+    const mapped = inputs.map(input => {
+        try {
+            return { input, js_result: map_item(input), error: null };
+        } catch (e) {
+            return {
+                input,
+                js_result: null,
+                error: new Error(`JS map_item threw: ${format_error_with_location(e)}`),
+            };
+        }
+    });
+
+    const probe_mapped = mapped.find(m => m.js_result)?.js_result;
     const probe_out = outputs[0];
-    const has_id_in = probe_in && 'id' in probe_in && probe_in.id != null;
+    const has_id_mapped = probe_mapped && 'id' in probe_mapped && probe_mapped.id != null;
     const has_id_out = probe_out && 'id' in probe_out && probe_out.id != null;
 
-    if (!has_id_in || !has_id_out) {
+    if (!has_id_mapped || !has_id_out) {
         // eslint-disable-next-line no-console
         console.warn(
-            `[compare] ${dataset_key}: no usable 'id' on ${!has_id_in ? '/download' : '/items'} ` +
+            `[compare] ${dataset_key}: no usable 'id' on ${!has_id_mapped ? 'map_item output' : '/items'} ` +
             `side — falling back to index pairing for this dataset.`
         );
-        const n = Math.min(inputs.length, outputs.length);
+        const n = Math.min(mapped.length, outputs.length);
         return {
             mode: 'index',
-            pairs: Array.from({ length: n }, (_, i) => ({ input: inputs[i], expected: outputs[i], id: i })),
+            pairs: Array.from({ length: n }, (_, i) => ({
+                input: mapped[i].input,
+                js_result: mapped[i].js_result,
+                error: mapped[i].error,
+                expected: outputs[i],
+                id: i,
+            })),
             input_count: inputs.length,
             output_count: outputs.length,
             unmatched_inputs: [],
@@ -247,13 +274,19 @@ function pair_items(inputs, outputs, dataset_key) {
 
     const pairs = [];
     const unmatched_inputs = [];
-    for (const input of inputs) {
-        const expected = by_id_out.get(String(input.id));
+    for (const m of mapped) {
+        // Key on the mapped id when mapping succeeded; for a throw (no mapped
+        // id available) fall back to the raw input id so a pass-through-id
+        // module still surfaces the failure against its expected output.
+        const lookup_id = m.js_result && m.js_result.id != null
+            ? String(m.js_result.id)
+            : (m.input && m.input.id != null ? String(m.input.id) : null);
+        const expected = lookup_id != null ? by_id_out.get(lookup_id) : undefined;
         if (expected) {
-            pairs.push({ input, expected, id: input.id });
-            by_id_out.delete(String(input.id));
+            pairs.push({ input: m.input, js_result: m.js_result, error: m.error, expected, id: lookup_id });
+            by_id_out.delete(lookup_id);
         } else {
-            unmatched_inputs.push(input.id);
+            unmatched_inputs.push(lookup_id);
         }
     }
     return {
@@ -296,24 +329,21 @@ function strip_api_fields(obj) {
     return out;
 }
 
-// Run each paired input through the local map_item and diff the result
-// against 4CAT's expected output. With FAIL_FAST on (default), stop at the
-// first failing item and record how many were left unchecked — so one bad
-// item yields a single failure plus one skipped "halted" placeholder, not N
-// failures.
-function compare_pairs(pairs, map_item) {
+// Diff each paired (already-mapped) JS result against 4CAT's expected output.
+// map_item already ran during pairing (so that pairing could key on the
+// mapped id); here we only diff, or report an input whose map_item threw. With
+// FAIL_FAST on (default), stop at the first failing item and record how many
+// were left unchecked — so one bad item yields a single failure plus one
+// skipped "halted" placeholder, not N failures.
+function compare_pairs(pairs) {
     const results = [];
     let halted_count = 0;
     for (let i = 0; i < pairs.length; i++) {
-        const { input, expected, id } = pairs[i];
+        const { id, js_result, error, expected } = pairs[i];
         let message = null;
-        try {
-            let js_result;
-            try {
-                js_result = map_item(input);
-            } catch (e) {
-                throw new Error(`JS map_item threw: ${format_error_with_location(e)}`);
-            }
+        if (error) {
+            message = error.message;
+        } else {
             const diffs = diff_objects(
                 strip_api_fields(normalize(js_result)),
                 strip_api_fields(normalize(expected)),
@@ -321,8 +351,6 @@ function compare_pairs(pairs, map_item) {
             if (diffs.length > 0) {
                 message = `${diffs.length} field(s) differ between JS and 4CAT:\n${format_diffs(diffs)}`;
             }
-        } catch (e) {
-            message = e.message;
         }
         results.push({ id, ok: message === null, message });
         if (message !== null && FAIL_FAST) {
@@ -361,8 +389,8 @@ for (const key of DATASET_KEYS_TO_RUN) {
                 fetch_ndjson(`${FOURCAT_URL}/download/${key}`),
                 fetch_ndjson(items_url),
             ]);
-            const pairing = pair_items(inputs, outputs, key);
-            const comparison = compare_pairs(pairing.pairs, module_state.map_item);
+            const pairing = map_and_pair(inputs, outputs, module_state.map_item, key);
+            const comparison = compare_pairs(pairing.pairs);
             dataset_state[key] = { datasource_id, module_name, module_state, pairing, comparison };
         } else {
             dataset_state[key] = { datasource_id, module_name, module_state };

From c7bb9ac9b2c7e046ef25d15b1ea07217e3fbeabc Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@proton.me>
Date: Thu, 4 Jun 2026 17:15:05 +0200
Subject: [PATCH 33/33] map_item_compare.test.js: still show errors on failed
 `id` matches

---
 tests/map_item_compare.test.js | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/tests/map_item_compare.test.js b/tests/map_item_compare.test.js
index 2d1403b..8e06979 100644
--- a/tests/map_item_compare.test.js
+++ b/tests/map_item_compare.test.js
@@ -275,15 +275,21 @@ function map_and_pair(inputs, outputs, map_item, dataset_key) {
     const pairs = [];
     const unmatched_inputs = [];
     for (const m of mapped) {
-        // Key on the mapped id when mapping succeeded; for a throw (no mapped
-        // id available) fall back to the raw input id so a pass-through-id
-        // module still surfaces the failure against its expected output.
-        const lookup_id = m.js_result && m.js_result.id != null
-            ? String(m.js_result.id)
-            : (m.input && m.input.id != null ? String(m.input.id) : null);
+        // A throw produces no mapped id to pair on. Surface it as its own
+        // failing item (labelled with the raw input id) rather than burying it
+        // in the unmatched-id list — otherwise an id-transforming module hides
+        // the actual map_item error behind a generic "unmatched input" report.
+        if (m.error) {
+            const label = m.input && m.input.id != null ? String(m.input.id) : '(no id)';
+            pairs.push({ input: m.input, js_result: null, error: m.error, expected: null, id: label });
+            continue;
+        }
+        // Key on the mapped id; a successful map whose id matches no output is
+        // a genuine pairing miss and goes to unmatched_inputs.
+        const lookup_id = m.js_result && m.js_result.id != null ? String(m.js_result.id) : null;
         const expected = lookup_id != null ? by_id_out.get(lookup_id) : undefined;
         if (expected) {
-            pairs.push({ input: m.input, js_result: m.js_result, error: m.error, expected, id: lookup_id });
+            pairs.push({ input: m.input, js_result: m.js_result, error: null, expected, id: lookup_id });
             by_id_out.delete(lookup_id);
         } else {
             unmatched_inputs.push(lookup_id);