Skip to content

Commit d56ea20

Browse files
committed
pin explorer to curated csv slice with on-disk recording manifest
1 parent 0edf715 commit d56ea20

7 files changed

Lines changed: 2327 additions & 17 deletions

File tree

src/components/pages/ExplorerPage.svelte

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,10 +121,8 @@
121121
Browse {facets.stats.total.toLocaleString()} performance workloads
122122
</h1>
123123
<p class="page-desc">
124-
Every workload in this preview slice of FormulaCode, indexed by
125-
repository, optimization level, and task. Click a card to see the
126-
benchmark code, per-agent speedups, and a link to the agent's
127-
terminal session recording.
124+
This is a preview slice of the FormulaCode workload dataset, while we work
125+
on <a href="https://api.formulacode.org/"><code>api.formulacode.org</code></a>. Every workload is indexed by repository, optimization level, and task. Click a card to see the benchmark code, per-agent speedups, and a link to the agent's terminal session recording.
128126
</p>
129127
130128
<div class="stats-strip mini">

src/data/website_data_codes.explorer.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

src/data/website_data_lite.explorer.csv

Lines changed: 2194 additions & 0 deletions
Large diffs are not rendered by default.

src/routes/explorer/+page.server.js

Lines changed: 100 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,106 @@
1-
import rawRows from "$data/website_data_lite.csv";
2-
import codesData from "$data/website_data_codes.json";
1+
import fs from "node:fs";
2+
import path from "node:path";
3+
// The explorer ships a curated slice pinned to commit 8907cb99, where the
4+
// CSV's agent_recording paths line up with the .cast files actually present
5+
// under src/data/recordings/. The site-wide CSV (website_data_lite.csv) was
6+
// later swapped to a different experiment whose recordings were never
7+
// uploaded, which is why we load a separate copy here.
8+
import rawRows from "$data/website_data_lite.explorer.csv";
9+
import codesData from "$data/website_data_codes.explorer.json";
310
import { buildWorkloads, summarizeFacets } from "$utils/explorer.js";
11+
import { AGENT_IDS } from "$utils/agents.js";
12+
13+
// Absolute path of the directory scanned for recordings. The relative path is
// resolved by path.resolve(), i.e. against the process's current working
// directory (presumably the project root at build/serve time — verify).
const RECORDINGS_DIR = path.resolve("src/data/recordings");
14+
15+
// Per the historical CSV at 8907cb99 (which was committed alongside these
16+
// .cast files), the on-disk agent-N suffixes map as follows:
17+
// Keys are strings on purpose: lookups use the digits captured from the run
// directory name (`.agent-<N>-`), and a regex capture is always a string.
// Frozen so this shared module-level table cannot be mutated by accident.
const AGENT_NUM_TO_ID = Object.freeze({
  "2": AGENT_IDS.HUMAN,
  "3": AGENT_IDS.CLAUDE,
  "4": AGENT_IDS.GPT5
});
22+
23+
/**
 * Scan the recordings tree on disk and index every playable recording.
 *
 * Layout walked: `<root>/<ts>/<task_id>/<run_dir>/sessions/agent.cast`.
 * A run directory is indexed only when it contains `sessions/agent.cast`
 * and its name contains `.agent-<N>-` with `N` present in AGENT_NUM_TO_ID.
 *
 * When several runs exist for the same (task_id, agent) pair, the one
 * visited last wins. Directory names are visited in sorted order, so the
 * result is deterministic: the lexicographically greatest `<ts>` (and then
 * `<run_dir>`) is kept.
 *
 * @param {string} root - Directory that holds the `<ts>` folders.
 * @returns {Object<string, Object<string, string>>} Map of
 *   `task_id -> agent_id -> "/recordings/..."` URL. Empty when `root` does
 *   not exist.
 */
function buildRecordingManifest(root) {
  const manifest = {};
  if (!fs.existsSync(root)) return manifest;

  for (const ts of listSubdirectories(root)) {
    const tsPath = path.join(root, ts);
    for (const taskId of listSubdirectories(tsPath)) {
      const taskPath = path.join(tsPath, taskId);
      for (const runDir of listSubdirectories(taskPath)) {
        const castPath = path.join(taskPath, runDir, "sessions", "agent.cast");
        if (!fs.existsSync(castPath)) continue;

        // The agent number is encoded in the run directory name.
        const match = /\.agent-(\d+)-/.exec(runDir);
        if (!match) continue;
        const agentId = AGENT_NUM_TO_ID[match[1]];
        if (!agentId) continue;

        // Create the per-task bucket lazily so tasks without any usable
        // recording never appear in the manifest.
        manifest[taskId] ??= {};
        manifest[taskId][agentId] =
          `/recordings/${ts}/${taskId}/${runDir}/sessions/agent.cast`;
      }
    }
  }
  return manifest;
}

/**
 * List the names of the entries in `dir` that are directories (symlinks are
 * followed), sorted lexicographically.
 *
 * Entries that cannot be stat'ed because their target is missing — a
 * dangling symlink, or a file removed between readdir and stat — are
 * skipped instead of aborting the whole scan.
 *
 * @param {string} dir - Directory to list; must exist.
 * @returns {string[]} Sorted names of the sub-directories.
 */
function listSubdirectories(dir) {
  return fs
    .readdirSync(dir)
    .filter((name) => {
      const stats = fs.statSync(path.join(dir, name), {
        throwIfNoEntry: false
      });
      return stats !== undefined && stats.isDirectory();
    })
    .sort();
}
49+
50+
// task_id like "pandas_dev-pandas_12" → repo_name "pandas_dev-pandas".
51+
// The last `_<number>` segment is the task index; everything before is repo.
52+
/**
 * Derive the repository name from a task id by dropping its trailing
 * `_<number>` task index, e.g. "pandas_dev-pandas_12" -> "pandas_dev-pandas".
 *
 * The input is returned unchanged when it has no trailing `_<number>`
 * segment, or when stripping it would leave an empty repository name
 * (e.g. "_12").
 *
 * @param {string} taskId - Task identifier.
 * @returns {string} Repository portion of the id, or `taskId` itself.
 */
function repoFromTaskId(taskId) {
  // `.+` (not `.*`) so a bare "_<n>" never yields an empty repo name.
  // RegExp#exec coerces its argument, so a non-string id falls through to
  // the "unchanged" branch instead of throwing.
  const match = /^(.+)_\d+$/.exec(taskId);
  return match ? match[1] : taskId;
}
56+
57+
// Build stub workloads for task_ids that have recordings on disk but no row
58+
// in the CSV. They land in the explorer as bare cards: task_id, repo, and
59+
// the per-agent recording links — no speedups or benchmark code to show.
60+
/**
 * Produce placeholder rows for recordings whose task has no CSV entry.
 *
 * @param {Object<string, Object<string, string>>} manifest - Map of
 *   `task_id -> agent_id -> recording URL`.
 * @param {Set<string>} csvTaskIds - Task ids already covered by the CSV.
 * @returns {Array<Object>} One stub per uncovered task, in manifest order.
 *   Ids are consecutive numeric strings starting at "1000000"; the task's
 *   recordings are carried on `__orphanRecordings`.
 */
function synthesizeOrphans(manifest, csvTaskIds) {
  const firstSyntheticId = 1_000_000;

  return Object.entries(manifest)
    .filter(([taskId]) => !csvTaskIds.has(taskId))
    .map(([taskId, recordings], position) => ({
      // Numbering counts only the tasks that survived the filter above.
      id: String(firstSyntheticId + position),
      task_id: taskId,
      level: "",
      benchmark_name: taskId,
      repo_name: repoFromTaskId(taskId),
      agent_id: "",
      "agent/nop": null,
      "oracle/nop": null,
      agent_recording: null,
      __orphanRecordings: recordings
    }));
}
480

581
export async function load() {
6-
const workloads = buildWorkloads(rawRows, codesData);
82+
const recordings = buildRecordingManifest(RECORDINGS_DIR);
83+
const csvTaskIds = new Set(rawRows.map((r) => r.task_id));
84+
const orphanWorkloads = synthesizeOrphans(recordings, csvTaskIds);
85+
const workloads = [
86+
...buildWorkloads(rawRows, codesData),
87+
...orphanWorkloads.map((o) => ({
88+
key: `${o.task_id}::::`,
89+
task_id: o.task_id,
90+
level: o.level,
91+
benchmark_name: o.benchmark_name,
92+
repo_name: o.repo_name,
93+
id: o.id,
94+
oracle: null,
95+
agents: {},
96+
recordings: o.__orphanRecordings,
97+
codeText: null,
98+
codeFqName: null,
99+
bestAgentId: null,
100+
bestAgentSpeedup: null,
101+
beatsOracle: false
102+
}))
103+
];
7104
const facets = summarizeFacets(workloads);
8105
return {
9106
workloads,

src/routes/player/[...recordingPath]/+page.js

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
1-
import allBenchmarkData from "$data/website_data_lite.csv";
1+
import mainBenchmarkData from "$data/website_data_lite.csv";
2+
import explorerBenchmarkData from "$data/website_data_lite.explorer.csv";
23

34
export const ssr = false;
45
export const prerender = false;
56

7+
// Search both the main scrollytelling CSV and the explorer's curated slice.
8+
// The two cover different experiment runs; the explorer CSV is what carries
9+
// the agent_recording paths that actually resolve to files on disk.
10+
const allBenchmarkData = [...mainBenchmarkData, ...explorerBenchmarkData];
11+
612
export async function load({ params, url }) {
713
const rawRecordingPath = params.recordingPath || "";
814
// Trim any trailing slash that SvelteKit's catch-all may include so the
@@ -23,12 +29,16 @@ export async function load({ params, url }) {
2329
);
2430
}
2531

26-
// Lazy load the heavy code data
32+
// Lazy load the heavy code data — try both the main and explorer maps
33+
// since the benchmark may have come from either CSV.
2734
if (benchmark) {
2835
try {
29-
const module = await import("$data/website_data_codes.json");
30-
const heavyData = module.default;
31-
const extra = heavyData[benchmark.id];
36+
const [mainMod, explorerMod] = await Promise.all([
37+
import("$data/website_data_codes.json"),
38+
import("$data/website_data_codes.explorer.json")
39+
]);
40+
const extra =
41+
mainMod.default[benchmark.id] ?? explorerMod.default[benchmark.id];
3242
if (extra) {
3343
benchmark = { ...benchmark, ...extra };
3444
}

src/utils/explorer.js

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,15 @@ export function levelShort(level) {
5050
* (task_id, level, benchmark_name) with per-agent stats.
5151
*
5252
* The CSV structure has each benchmark replicated across the three agents
53-
* (oracle, claude, gpt-5). Oracle's `agent/nop` ≡ `oracle/nop`. The agent
54-
* rows additionally carry a recording path; oracle rows do not.
53+
* (oracle, claude, gpt-5). Oracle's `agent/nop` ≡ `oracle/nop`.
54+
*
55+
* `recordingManifest` is an optional `{task_id: {agent_id: url}}` map built
56+
* from what's actually on disk. When provided, workloads whose task_id has
57+
* no on-disk recording are dropped entirely, and the per-agent recording
58+
* URLs come from the manifest rather than the CSV (the CSV's paths point
59+
* to a now-stale filename scheme).
5560
*/
56-
export function buildWorkloads(rows, codeMap) {
61+
export function buildWorkloads(rows, codeMap, recordingManifest = null) {
5762
const groups = new Map();
5863

5964
for (const r of rows) {
@@ -62,6 +67,8 @@ export function buildWorkloads(rows, codeMap) {
6267
const name = r.benchmark_name;
6368
if (!taskId || !level || !name) continue;
6469

70+
if (recordingManifest && !recordingManifest[taskId]) continue;
71+
6572
const key = `${taskId}::${level}::${name}`;
6673
let wl = groups.get(key);
6774
if (!wl) {
@@ -74,7 +81,9 @@ export function buildWorkloads(rows, codeMap) {
7481
id: r.id,
7582
oracle: null,
7683
agents: {},
77-
recordings: {}
84+
recordings: recordingManifest
85+
? { ...(recordingManifest[taskId] ?? {}) }
86+
: {}
7887
};
7988
groups.set(key, wl);
8089
}
@@ -88,7 +97,7 @@ export function buildWorkloads(rows, codeMap) {
8897
if (agentId && agentNop !== null) {
8998
wl.agents[agentId] = agentNop;
9099
}
91-
if (agentId && r.agent_recording) {
100+
if (!recordingManifest && agentId && r.agent_recording) {
92101
wl.recordings[agentId] = r.agent_recording;
93102
}
94103
}

static/recordings

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../src/data/recordings

0 commit comments

Comments
 (0)