|
| 1 | +<script> |
| 2 | + import HeatmapTable from "./HeatmapTable.svelte"; |
| 3 | + import PaperFigureCaption from "./PaperFigureCaption.svelte"; |
| 4 | + import { findings, needsExport } from "$utils/findings.js"; |
| 5 | +
|
/**
 * Heading text for this finding; rendered inside the section's <h3>.
 * Exposed as a component prop so a parent can override the default.
 * @type {string}
 */
export let title = "Agents improve runtime but underperform experts";

/**
 * Summary sentence rendered in the paragraph beneath the heading.
 * Exposed as a component prop so a parent can override the default.
 * @type {string}
 */
export let description = "Every evaluated agent–model achieves geomean speedup > 1× on FormulaCode-V, but all configurations finish behind the human expert (negative advantage).";
| 9 | +
|
// Leaderboard payload for this finding. Besides `rows`, the markup below also
// reads `data._arxiv` and passes `data` to `needsExport`.
const data = findings.f1_leaderboard;

/**
 * Copies a leaderboard row and attaches a `_key` string built from the row's
 * agent, model and position in the list. The position is part of the key, so
 * two rows with the same agent/model still receive different keys.
 * NOTE(review): presumably `_key` is what HeatmapTable uses to key its rows — confirm.
 *
 * @param {object} row - One entry of `data.rows`.
 * @param {number} position - Zero-based index of the row.
 * @returns {object} Shallow copy of `row` with `_key` added (or overwritten).
 */
function keyedRow(row, position) {
  const rowKey = `${row.agent}-${row.model}-${position}`;
  return { ...row, _key: rowKey };
}

// Fall back to an empty list when the payload carries no rows.
const sourceRows = data.rows || [];
const rows = sourceRows.map((row, position) => keyedRow(row, position));
| 15 | +
|
// Column descriptors handed to HeatmapTable, one named constant per column.
// NOTE(review): the meaning of `signed`, `suffix` and the `color` modes is
// defined by HeatmapTable, which is not visible here — confirm there.

// The first three columns match `rowLabelCols={3}` in the markup.
const rankColumn = { key: "rp_rank", label: "RP", numeric: true, decimals: 0 };
const agentColumn = { key: "agent", label: "Agent" };
const modelColumn = { key: "model", label: "Model" };

// Advantage column: four decimals, flagged as signed, "diverging" color mode.
const advantageColumn = {
  key: "advantage",
  label: "Advantage",
  numeric: true,
  decimals: 4,
  signed: true,
  color: "diverging"
};

// Geomean speedup column: four decimals, "×" suffix, "sequential" color mode.
const speedupColumn = {
  key: "speedup_geomean",
  label: "Speedup (geomean)",
  numeric: true,
  decimals: 4,
  suffix: "×",
  color: "sequential"
};

// Display order, left to right.
const columns = [
  rankColumn,
  agentColumn,
  modelColumn,
  advantageColumn,
  speedupColumn
];
| 37 | +</script> |
| 38 | + |
<section class="f1">
  <!-- Heading and summary, both supplied through the component's props. -->
  <header class="f1-head">
    <h3 class="f1-title">{title}</h3>
    <p class="f1-desc">{description}</p>
  </header>

  <!-- Leaderboard table, fed with the column descriptors and keyed rows from the script. -->
  <HeatmapTable
    columns={columns}
    rows={rows}
    caption="Global leaderboard on FormulaCode-V. Negative advantage = trails human expert; positive = beats expert."
    rowLabelCols={3}
  />

  <!-- Paper reference; the link and export flag come from the findings payload. -->
  <PaperFigureCaption
    artifact="Table 1 (Global leaderboard)"
    arxivUrl={data._arxiv}
    needsExport={needsExport(data)}
  />
</section>
| 58 | + |
<style>
  /* The section and its header both stack their children vertically. */
  .f1,
  .f1-head {
    display: flex;
    flex-direction: column;
  }

  /* Spacing between stacked children: wider for the section, tighter in the header. */
  .f1 {
    gap: 10px;
  }

  .f1-head {
    gap: 4px;
  }

  /* Typography shared by the heading and the summary paragraph. */
  .f1-title,
  .f1-desc {
    margin: 0;
    font-family: var(--sans);
  }

  .f1-title {
    font-size: 1.05rem;
    color: var(--text-primary);
  }

  .f1-desc {
    font-size: 0.9rem;
    color: var(--text-muted);
    line-height: 1.55;
  }
</style>
0 commit comments