From 6a779abb73dec7b3e789e8c5a2c55c6d4f1fbe58 Mon Sep 17 00:00:00 2001
From: IanM <ianmag@cs.washington.edu>
Date: Tue, 7 Apr 2026 18:47:13 -0700
Subject: [PATCH 1/3] example first draft

---
 README.md                               |  42 +++
 docs/serialized_tasks.md                | 338 ++++++++++++++++++++++++
 examples/serialize_task_example.py      | 111 ++++++++
 src/olmo_eval/evals/suites/code.py      |   1 +
 src/olmo_eval/evals/tasks/serialized.py |  28 +-
 5 files changed, 517 insertions(+), 3 deletions(-)
 create mode 100644 docs/serialized_tasks.md
 create mode 100644 examples/serialize_task_example.py

diff --git a/README.md b/README.md
index 77cbd0ffa..f0504685d 100644
--- a/README.md
+++ b/README.md
@@ -625,6 +625,48 @@ register_regime("my_task", "3shot", num_fewshot=3)
 
 Usage: `olmo-eval run -t my_task:bpb:3shot`
 
+### Serialized Tasks
+
+If you already have evaluation data in a static format -- for example,
+migrated from another framework or generated by a custom script -- you can
+add a task without writing a new Task class. The built-in `SerializedTask`
+loads a JSONL file where each line carries the fields needed to build both
+an `Instance` (for scoring) and an `LMRequest` (for inference). You just
+register a variant pointing at your file and choose your metrics:
+
+```python
+register_variant(
+    "serialized", "my_task_name",
+    data_source=DataSource(path="s3://my-bucket/my_task.jsonl"),
+    metrics=(MyMetric(),),
+)
+```
+
+Each JSONL record looks like:
+
+```jsonc
+{
+  "doc_id": 0,                                    // unique int id
+  "question": "Write a function ...",              // raw question text
+  "gold_answers": ["def foo(): ..."],              // correct answer(s)
+  "choices": null,                                 // MC answer choices, or null
+  "metadata": {"task_name": "my_task", "id": 1},   // arbitrary scorer-visible metadata
+  "request_type": "loglikelihood",                 // "completion", "loglikelihood", or "chat"
+  "prompt": "You are an expert ...\n\n",           // formatted prompt string
+  "messages": null,                                // chat messages list, or null
+  "continuations": ["def foo(): ..."]              // strings to score (loglikelihood)
+}
+```
+
+The serialized file captures what goes into the model (the fully formatted
+prompt, including baked-in few-shot examples and chat templates). It does
+**not** capture SamplingParams, Scorers, Metrics, or answer extraction
+logic -- those are configured on the olmo-eval side.
+
+For a full walkthrough -- including how to migrate a task from oe-eval and
+how to create a new serialized task from scratch -- see
+[docs/serialized_tasks.md](docs/serialized_tasks.md).
+
 ## Tool-Augmented Evaluation
 
 olmo-eval supports evaluating models with tool use through the **Harness** abstraction. This enables comparing baseline model performance against tool-augmented performance on the same tasks.
diff --git a/docs/serialized_tasks.md b/docs/serialized_tasks.md
new file mode 100644
index 000000000..a3f172de9
--- /dev/null
+++ b/docs/serialized_tasks.md
@@ -0,0 +1,338 @@
+# Serialized Tasks
+
+A serialized task lets you add an evaluation to olmo-eval without
+implementing a new Task class. You provide a JSONL file containing
+pre-formatted instances and model requests; the built-in `SerializedTask`
+class handles loading and plumbing. All you need to do is register a variant
+that points at your data and specifies the metrics you want.
+
+The goal is to keep this path as simple as possible for common task shapes
+(completion, loglikelihood, chat). More exotic task types that need custom
+runtime behavior (per-instance sampling params, dynamic prompt construction,
+multi-request-per-instance patterns) may still require a full Task subclass,
+but we aim to grow `SerializedTask` over time to cover the common cases.
+
+## JSONL schema at a glance
+
+Each line is a JSON object with two groups of fields -- one that maps to an
+`Instance` (used for scoring) and one that maps to an `LMRequest` (sent to
+the model):
+
+**Loglikelihood / BPB example** (single continuation scored against the prompt):
+
+```json
+{
+  "doc_id": 0,
+  "question": "Write a function to find the maximum element in a list.",
+  "gold_answers": ["def max_element(lst):\n    return max(lst)"],
+  "choices": null,
+  "metadata": {"task_name": "mbpp:3shot:bpb", "id": 601, "test": "assert max_element([1,2,3]) == 3"},
+  "request_type": "loglikelihood",
+  "prompt": "You are an expert Python programmer...\n\n",
+  "messages": null,
+  "continuations": ["def max_element(lst):\n    return max(lst)"]
+}
+```
+
+**Multiple-choice example** (one continuation per choice, scored by logprob):
+
+```json
+{
+  "doc_id": 0,
+  "question": "What is the powerhouse of the cell?",
+  "gold_answers": ["Mitochondria"],
+  "choices": ["Nucleus", "Mitochondria", "Ribosome", "Golgi apparatus"],
+  "metadata": {"task_name": "sciq", "gold_idx": 1},
+  "request_type": "loglikelihood",
+  "prompt": "What is the powerhouse of the cell?\n  A. Nucleus\n  B. Mitochondria\n  C. Ribosome\n  D. Golgi apparatus\nAnswer:",
+  "messages": null,
+  "continuations": [" A", " B", " C", " D"]
+}
+```
+
+**Chat example** (generation request with message list):
+
+```json
+{
+  "doc_id": 0,
+  "question": "What is the capital of France?",
+  "gold_answers": ["Paris"],
+  "choices": null,
+  "metadata": {"task_name": "trivia_chat"},
+  "request_type": "chat",
+  "prompt": null,
+  "messages": [
+    {"role": "system", "content": "Answer concisely."},
+    {"role": "user", "content": "What is the capital of France?"}
+  ],
+  "continuations": null
+}
+```
+
+**Instance-level fields** (for scoring):
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `doc_id` | `int` | Unique numeric ID within the file |
+| `question` | `str` | Raw question text (best-effort; may not be a clean question for all task types) |
+| `gold_answers` | `list[str]` | Valid gold answers; first element maps to `Instance.gold_answer` |
+| `choices` | `list[str] \| null` | Answer choices for MC tasks, null otherwise |
+| `metadata` | `dict` | Arbitrary per-instance metadata; always includes `task_name`, may include `gold_idx` for MC, plus any scorer-specific fields |
+
+**LMRequest-level fields** (for inference):
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `request_type` | `str` | `"completion"`, `"loglikelihood"`, or `"chat"` |
+| `prompt` | `str \| null` | Formatted prompt string (completion / loglikelihood) |
+| `messages` | `list[dict] \| null` | Chat messages list (chat requests) |
+| `continuations` | `list[str] \| null` | Continuation strings for loglikelihood scoring |
+
+### What gets serialized and what does not
+
+The JSONL captures **what goes into the model**: the fully formatted prompt
+or chat messages, including baked-in few-shot examples, chat templates, and
+continuation strings. It also captures Instance fields needed by scorers
+(question, gold answers, choices, metadata).
+
+The JSONL does **not** capture:
+
+- **SamplingParams** (max_tokens, temperature, stop_sequences, etc.) --
+  these are runtime concerns configured via `TaskConfig` or CLI overrides.
+- **Formatter, Scorer, and Metric definitions** -- these live in olmo-eval
+  task registrations, not in the data.
+- **Answer extraction logic** -- implemented by scorers in olmo-eval.
+- **system_prompt** -- supported by `LMRequest` but not currently read from
+  metadata by the default `SerializedTask` (easy to add by subclassing).
+
+---
+
+## Example 1: Migrating a task from oe-eval-internal
+
+This walkthrough covers how the `olmo3:base_easy:code:bpb` suite was
+migrated from oe-eval-internal using serialized data. Use it as a reference
+if you have a task that already exists in oe-eval.
+
+### Step 1: Serialize the data
+
+The script
+[`oe_eval/serialize_benchmark.py`](https://github.com/allenai/oe-eval-internal/blob/main/oe_eval/serialize_benchmark.py)
+converts oe-eval tasks into JSONL files. It loads a task through oe-eval's
+own `load_task` / `build_all_requests` pipeline, then extracts the formatted
+prompt and instance fields into `SerializedRecord` objects
+(see lines 43-81 of that script for the schema dataclass).
+
+```bash
+python -m oe_eval.serialize_benchmark \
+    --task-suite olmo3:base_easy:code_bpb \
+    --output-dir /tmp/serialized_benchmarks
+```
+
+This produces one JSONL file per leaf task plus a `manifest.json`. The
+script handles collapsing multiple-choice request instances into a single
+record with `choices` and `continuations` lists, and extracts metadata
+fields like `test` and `entry_point` from the underlying dataset document
+(see lines 221-256 for the extraction logic).
+
+Note that this script does not necessarily cover every task type in oe-eval
+out of the box. It was written to migrate the code BPB suite and serves as
+an example of how to approach serialization for other tasks. Tasks with
+unusual request types (e.g. `generate_until_and_loglikelihood`) or
+non-standard `doc_to_target()` implementations may need adjustments -- see
+the [Limitations](#limitations-and-gotchas) section below.
+
+### Step 2: Upload the data
+
+Upload the JSONL files to a location accessible by `DataLoader` (S3, GCS,
+or a local path):
+
+```bash
+aws s3 cp /tmp/serialized_benchmarks/ s3://my-bucket/serialized/ --recursive
+```
+
+### Step 3: Register variants in olmo-eval
+
+Each JSONL file becomes a variant of the `serialized` base task. Add
+registrations in
+[`src/olmo_eval/evals/tasks/serialized.py`](../src/olmo_eval/evals/tasks/serialized.py).
+Here is what the code BPB migration looks like (simplified):
+
+```python
+from olmo_eval.common.metrics import BPBMetricInstanceAvg
+from olmo_eval.data import DataSource
+from olmo_eval.evals.tasks.common import register_variant
+
+_S3_BASE = "s3://ai2-llm/ianm/oe-eval-serialized/olmo3_base_easy_code_bpb"
+_BPB_METRICS = (BPBMetricInstanceAvg(),)
+
+register_variant(
+    "serialized",
+    "codex_humaneval_3shot_bpb",
+    data_source=DataSource(path=f"{_S3_BASE}/codex_humaneval_3shot_bpb__none.jsonl"),
+    metrics=_BPB_METRICS,
+)
+
+register_variant(
+    "serialized",
+    "mbpp_3shot_bpb",
+    data_source=DataSource(path=f"{_S3_BASE}/mbpp_3shot_bpb__none.jsonl"),
+    metrics=_BPB_METRICS,
+    limit=500,
+)
+```
+
+Each call overrides `TaskConfig` fields on the `serialized` base task.
+The kwargs you can pass are any `TaskConfig` field: `data_source`,
+`metrics`, `limit`, `primary_metric`, `sampling_params`, etc.
+
+For the full set of registrations including the per-language multilingual
+MBPP variants, see
+[`serialized.py` lines 96-149](../src/olmo_eval/evals/tasks/serialized.py).
+
+### Step 4: Define a suite
+
+Group related serialized tasks into a suite in
+[`src/olmo_eval/evals/suites/code.py`](../src/olmo_eval/evals/suites/code.py):
+
+```python
+OLMO3_BASE_EASY_CODE_BPB_SERIALIZED = register(
+    Suite(
+        name="olmo3:base_easy:code:bpb:serialized",
+        tasks=(
+            "serialized:codex_humaneval_3shot_bpb",
+            "serialized:mbpp_3shot_bpb",
+            _SERIALIZED_MT_MBPP_V2FIX_BPB,
+        ),
+        aggregation=AggregationStrategy.AVERAGE_OF_AVERAGES,
+        description="OLMo3 base_easy code BPB suite from serialized data",
+    )
+)
+```
+
+### Step 5: Run it
+
+```bash
+olmo-eval run -t serialized:codex_humaneval_3shot_bpb -m my-model
+# or the whole suite:
+olmo-eval run -s olmo3:base_easy:code:bpb:serialized -m my-model
+```
+
+---
+
+## Example 2: Creating a new serialized task from scratch
+
+This walkthrough covers creating a serialized task that does not come from
+oe-eval -- you write a standalone script to produce the JSONL.
+
+### Step 1: Write a serialization script
+
+See
+[`examples/serialize_task_example.py`](../examples/serialize_task_example.py)
+for a complete, runnable example. It loads the SciQ dataset (a simple
+4-choice science QA benchmark) from Hugging Face and writes a JSONL
+file. The key function is `serialize_doc`, which builds one record per
+example:
+
+```python
+def serialize_doc(doc_id, doc):
+    # ... shuffle choices, find gold_idx ...
+    return {
+        "doc_id": doc_id,
+        "question": doc["question"],
+        "gold_answers": [doc["correct_answer"]],
+        "choices": choices,
+        "metadata": {"task_name": "sciq_serialized", "gold_idx": gold_idx},
+        "request_type": "loglikelihood",
+        "prompt": make_prompt(doc, choices),
+        "messages": None,
+        "continuations": [f" {label}" for label in "ABCD"],
+    }
+```
+
+Run it:
+
+```bash
+pip install datasets
+python examples/serialize_task_example.py --output /tmp/sciq_serialized.jsonl
+```
+
+### Step 2: Register the variant
+
+Add a registration pointing at your JSONL. For a multiple-choice task
+scored by logprob:
+
+```python
+from olmo_eval.common.metrics import AccuracyMetric
+from olmo_eval.common.scorers import MultipleChoiceScorer
+from olmo_eval.data import DataSource
+from olmo_eval.evals.tasks.common import register_variant
+
+register_variant(
+    "serialized",
+    "sciq_mc",
+    data_source=DataSource(path="/tmp/sciq_serialized.jsonl"),
+    metrics=(AccuracyMetric(scorer=MultipleChoiceScorer),),
+)
+```
+
+### Step 3: Run it
+
+```bash
+olmo-eval run -t serialized:sciq_mc -m my-model
+```
+
+---
+
+## Passing task-specific metadata
+
+Scorers and metrics sometimes need per-instance data beyond the basic
+Instance fields. For example, code execution scoring needs `test` and
+`entry_point`, and IFEval constraint checking needs `instruction_id_list`.
+
+Put these fields in the `metadata` dict of each JSONL record:
+
+```json
+{"metadata": {"test": "assert foo(1) == 2", "entry_point": "foo"}, ...}
+```
+
+The `SerializedTask` passes `metadata` through to `Instance.metadata`
+unchanged, so these fields are available at scoring time, e.g. via
+`instance.metadata["test"]`.
+
+If you need behavior the default `SerializedTask` does not provide -- for
+example, a custom `extract_answer` method, per-instance `SamplingParams`,
+or reading `system_prompt` from metadata into `LMRequest` -- subclass
+`SerializedTask` and register your subclass as a new base task.
+
+---
+
+## Limitations and gotchas
+
+Serialization works well for straightforward completion, loglikelihood, and
+chat tasks. More complex task types can introduce complications:
+
+- **Baked-in formatting is model-specific.** The serialized prompt includes
+  any chat template, FIM tokens, or context-window truncation applied during
+  serialization. A JSONL file produced for one model family may not be valid
+  for another. This is usually fine since evaluation suites tend to be
+  model-specific already.
+
+- **Dual-mode tasks (generation + BPB).** Some oe-eval tasks issue both a
+  `generate_until` and a `loglikelihood` request per instance. These need
+  two separate JSONL files -- one per request type -- registered as separate
+  variants with different metrics.
+
+- **Multiple-choice PMI normalization.** Unconditioned-prompt requests (used
+  for `acc_norm`) are not serialized. If needed, the olmo-eval consumer
+  would construct them at runtime from the `continuations` list and a
+  task-level unconditioned prompt string.
+
+- **Rolling / corpus perplexity.** Short-document perplexity can be
+  serialized as loglikelihood with `prompt=""` and a single continuation.
+  True sliding-window rolling for long documents requires olmo-eval
+  infrastructure that does not exist yet.
+
+- **Gold answer reliability.** When serializing from oe-eval,
+  `doc_to_target()` extracts `gold_answers`, but this method is not always
+  the canonical answer source for every task. Verify the serialized answers
+  match what scoring expects before running at scale.
diff --git a/examples/serialize_task_example.py b/examples/serialize_task_example.py
new file mode 100644
index 000000000..d3e74689f
--- /dev/null
+++ b/examples/serialize_task_example.py
@@ -0,0 +1,111 @@
+"""Example: serialize a HuggingFace dataset to the JSONL format used by SerializedTask.
+
+This standalone script does not depend on oe-eval.  It loads the SciQ
+dataset (a simple science multiple-choice QA benchmark), formats each
+item as a 0-shot loglikelihood request, and writes a JSONL file that
+can be consumed directly by ``SerializedTask`` in olmo-eval.
+
+Usage:
+    pip install datasets
+    python examples/serialize_task_example.py --output /tmp/sciq_serialized.jsonl
+
+After producing the file you can register it in olmo-eval:
+
+    register_variant(
+        "serialized",
+        "sciq_mc",
+        data_source=DataSource(path="/tmp/sciq_serialized.jsonl"),
+        metrics=(AccuracyMetric(scorer=MultipleChoiceScorer),),
+    )
+
+See docs/serialized_tasks.md for the full guide.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import random
+from typing import Any
+
+
+def load_sciq_test() -> list[dict[str, Any]]:
+    """Load the SciQ test split from HuggingFace."""
+    from datasets import load_dataset
+
+    ds = load_dataset("allenai/sciq", split="test")
+    return list(ds)
+
+
+def make_prompt(doc: dict[str, Any], choices: list[str]) -> str:
+    """Build a simple 0-shot MC prompt."""
+    labels = "ABCD"
+    lines = [doc["question"]]
+    for i, choice in enumerate(choices):
+        lines.append(f"  {labels[i]}. {choice}")
+    lines.append("Answer:")
+    return "\n".join(lines)
+
+
+def serialize_doc(doc_id: int, doc: dict[str, Any]) -> dict[str, Any]:
+    """Convert one SciQ document into a serialized record dict.
+
+    SciQ has three distractor fields and one correct_answer field.
+    We assemble the four choices, shuffle them, and record the gold
+    index in metadata.
+    """
+    choices = [doc["distractor1"], doc["distractor2"], doc["distractor3"], doc["correct_answer"]]
+    gold_idx = 3  # correct_answer is last before shuffle
+
+    rng = random.Random(doc_id)
+    order = list(range(4))
+    rng.shuffle(order)
+    choices = [choices[i] for i in order]
+    gold_idx = order.index(3)
+
+    prompt = make_prompt(doc, choices)
+
+    # Each continuation is one answer choice; the model scores each via
+    # loglikelihood and the highest-scoring continuation is the prediction.
+    labels = "ABCD"
+    continuations = [f" {labels[i]}" for i in range(len(choices))]
+
+    return {
+        # -- Instance-level fields (for scoring) --
+        "doc_id": doc_id,
+        "question": doc["question"],
+        "gold_answers": [doc["correct_answer"]],
+        "choices": choices,
+        "metadata": {
+            "task_name": "sciq_serialized",
+            "gold_idx": gold_idx,
+            "support": doc.get("support", ""),
+        },
+        # -- LMRequest-level fields (for inference) --
+        "request_type": "loglikelihood",
+        "prompt": prompt,
+        "messages": None,
+        "continuations": continuations,
+    }
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument("--output", type=str, required=True, help="Path for the output JSONL file")
+    args = parser.parse_args()
+
+    docs = load_sciq_test()
+    print(f"Loaded {len(docs)} SciQ test examples")
+
+    with open(args.output, "w") as f:
+        for doc_id, doc in enumerate(docs):
+            record = serialize_doc(doc_id, doc)
+            f.write(json.dumps(record) + "\n")
+
+    print(f"Wrote {args.output}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/olmo_eval/evals/suites/code.py b/src/olmo_eval/evals/suites/code.py
index ac996b640..7cc08c85e 100644
--- a/src/olmo_eval/evals/suites/code.py
+++ b/src/olmo_eval/evals/suites/code.py
@@ -75,6 +75,7 @@
         description="OLMo3 base_easy code BPB suite from serialized data",
     )
 )
+
 # =============================================================================
 # MULTIPL_E Suites
 # =============================================================================
diff --git a/src/olmo_eval/evals/tasks/serialized.py b/src/olmo_eval/evals/tasks/serialized.py
index 24c6342ea..98e2da38d 100644
--- a/src/olmo_eval/evals/tasks/serialized.py
+++ b/src/olmo_eval/evals/tasks/serialized.py
@@ -2,8 +2,16 @@
 
 These tasks bypass the standard Formatter pipeline because the serialized
 data already contains both raw Instance fields (for scoring) and fully
-formatted LMRequest fields (for inference).  The serialized JSONL is
-produced by oe-eval-internal's serialize_benchmark.py.
+formatted LMRequest fields (for inference).  The serialized JSONL can be
+produced by oe-eval-internal's serialize_benchmark.py or by any script
+that emits the schema documented in ``docs/serialized_tasks.md``.
+
+To add a new serialized task, call ``register_variant`` with a
+``DataSource`` pointing at your JSONL file and the desired metrics.
+You can add registrations directly in this file or in a new module
+that imports ``SerializedTask``.  See the existing registrations at
+the bottom of this file, ``examples/serialize_task_example.py``, and
+``docs/serialized_tasks.md`` for full walkthroughs.
 
 Top-level JSONL fields (used for Instance / LMRequest building):
     doc_id, question, gold_answers, choices, metadata,
@@ -16,7 +24,8 @@
 from collections.abc import Iterator
 from typing import Any
 
-from olmo_eval.common.metrics import BPBMetricInstanceAvg
+from olmo_eval.common.metrics import AccuracyMetric, BPBMetricInstanceAvg
+from olmo_eval.common.scorers import MultipleChoiceScorer
 from olmo_eval.common.types import Instance, LMRequest, RequestType
 from olmo_eval.data import DataLoader, DataSource
 from olmo_eval.evals.tasks.common import Task, register, register_variant
@@ -140,3 +149,16 @@ def format_request(self, instance: Instance) -> LMRequest:
         data_source=DataSource(path=f"{_S3_BASE}/mt_mbpp_v2fix_{_lang}.jsonl"),
         metrics=_BPB_METRICS,
     )
+
+# =============================================================================
+# Example: SciQ multiple-choice (from examples/serialize_task_example.py)
+# =============================================================================
+
+register_variant(
+    "serialized",
+    "sciq_mc",
+    data_source=DataSource(
+        path="s3://ai2-llm/ianm/oe-eval-serialized/examples/sciq_serialized.jsonl"
+    ),
+    metrics=(AccuracyMetric(scorer=MultipleChoiceScorer),),
+)

From bd339193220edc9144bc52bab4a1b4c59fc91f13 Mon Sep 17 00:00:00 2001
From: IanM <ianmag@cs.washington.edu>
Date: Tue, 7 Apr 2026 19:49:55 -0700
Subject: [PATCH 2/3] fix metric

---
 docs/serialized_tasks.md                | 5 ++---
 examples/serialize_task_example.py      | 2 +-
 src/olmo_eval/evals/tasks/serialized.py | 5 ++---
 3 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/docs/serialized_tasks.md b/docs/serialized_tasks.md
index a3f172de9..1055ee9d9 100644
--- a/docs/serialized_tasks.md
+++ b/docs/serialized_tasks.md
@@ -262,8 +262,7 @@ Add a registration pointing at your JSONL. For a multiple-choice task
 scored by logprob:
 
 ```python
-from olmo_eval.common.metrics import AccuracyMetric
-from olmo_eval.common.scorers import MultipleChoiceScorer
+from olmo_eval.common.metrics import LogprobMCAccuracyMetric
 from olmo_eval.data import DataSource
 from olmo_eval.evals.tasks.common import register_variant
 
@@ -271,7 +270,7 @@ register_variant(
     "serialized",
     "sciq_mc",
     data_source=DataSource(path="/tmp/sciq_serialized.jsonl"),
-    metrics=(AccuracyMetric(scorer=MultipleChoiceScorer),),
+    metrics=(LogprobMCAccuracyMetric(),),
 )
 ```
 
diff --git a/examples/serialize_task_example.py b/examples/serialize_task_example.py
index d3e74689f..c3b7685be 100644
--- a/examples/serialize_task_example.py
+++ b/examples/serialize_task_example.py
@@ -15,7 +15,7 @@
         "serialized",
         "sciq_mc",
         data_source=DataSource(path="/tmp/sciq_serialized.jsonl"),
-        metrics=(AccuracyMetric(scorer=MultipleChoiceScorer),),
+        metrics=(LogprobMCAccuracyMetric(),),
     )
 
 See docs/serialized_tasks.md for the full guide.
diff --git a/src/olmo_eval/evals/tasks/serialized.py b/src/olmo_eval/evals/tasks/serialized.py
index 98e2da38d..d468ee3fb 100644
--- a/src/olmo_eval/evals/tasks/serialized.py
+++ b/src/olmo_eval/evals/tasks/serialized.py
@@ -24,8 +24,7 @@
 from collections.abc import Iterator
 from typing import Any
 
-from olmo_eval.common.metrics import AccuracyMetric, BPBMetricInstanceAvg
-from olmo_eval.common.scorers import MultipleChoiceScorer
+from olmo_eval.common.metrics import BPBMetricInstanceAvg, LogprobMCAccuracyMetric
 from olmo_eval.common.types import Instance, LMRequest, RequestType
 from olmo_eval.data import DataLoader, DataSource
 from olmo_eval.evals.tasks.common import Task, register, register_variant
@@ -160,5 +159,5 @@ def format_request(self, instance: Instance) -> LMRequest:
     data_source=DataSource(
         path="s3://ai2-llm/ianm/oe-eval-serialized/examples/sciq_serialized.jsonl"
     ),
-    metrics=(AccuracyMetric(scorer=MultipleChoiceScorer),),
+    metrics=(LogprobMCAccuracyMetric(),),
 )

From 487ca103194c4e74092c251abf0abf27736a16aa Mon Sep 17 00:00:00 2001
From: IanM <ianmag@cs.washington.edu>
Date: Tue, 7 Apr 2026 20:26:07 -0700
Subject: [PATCH 3/3] edits

---
 docs/serialized_tasks.md | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/docs/serialized_tasks.md b/docs/serialized_tasks.md
index 1055ee9d9..c3ea20d44 100644
--- a/docs/serialized_tasks.md
+++ b/docs/serialized_tasks.md
@@ -74,7 +74,7 @@ the model):
 | Field | Type | Description |
 |-------|------|-------------|
 | `doc_id` | `int` | Unique numeric ID within the file |
-| `question` | `str` | Raw question text (best-effort; may not be a clean question for all task types) |
+| `question` | `str` | Raw question text |
 | `gold_answers` | `list[str]` | Valid gold answers; first element maps to `Instance.gold_answer` |
 | `choices` | `list[str] \| null` | Answer choices for MC tasks, null otherwise |
 | `metadata` | `dict` | Arbitrary per-instance metadata; always includes `task_name`, may include `gold_idx` for MC, plus any scorer-specific fields |
@@ -99,11 +99,9 @@ The JSONL does **not** capture:
 
 - **SamplingParams** (max_tokens, temperature, stop_sequences, etc.) --
   these are runtime concerns configured via `TaskConfig` or CLI overrides.
-- **Formatter, Scorer, and Metric definitions** -- these live in olmo-eval
+- **Scorer and Metric definitions** -- these live in olmo-eval
   task registrations, not in the data.
 - **Answer extraction logic** -- implemented by scorers in olmo-eval.
-- **system_prompt** -- supported by `LMRequest` but not currently read from
-  metadata by the default `SerializedTask` (easy to add by subclassing).
 
 ---
 
@@ -116,7 +114,7 @@ if you have a task that already exists in oe-eval.
 ### Step 1: Serialize the data
 
 The script
-[`oe_eval/serialize_benchmark.py`](https://github.com/allenai/oe-eval-internal/blob/main/oe_eval/serialize_benchmark.py)
+[`oe_eval/serialize_benchmark.py`](https://github.com/allenai/oe-eval-internal/blob/ianm-serialize-bench-data-vllm-schema/oe_eval/serialize_benchmark.py)
 converts oe-eval tasks into JSONL files. It loads a task through oe-eval's
 own `load_task` / `build_all_requests` pipeline, then extracts the formatted
 prompt and instance fields into `SerializedRecord` objects
@@ -131,8 +129,7 @@ python -m oe_eval.serialize_benchmark \
 This produces one JSONL file per leaf task plus a `manifest.json`. The
 script handles collapsing multiple-choice request instances into a single
 record with `choices` and `continuations` lists, and extracts metadata
-fields like `test` and `entry_point` from the underlying dataset document
-(see lines 221-256 for the extraction logic).
+fields like `test` and `entry_point` from the underlying dataset document.
 
 Note that this script does not necessarily cover every task type in oe-eval
 out of the box. It was written to migrate the code BPB suite and serves as
@@ -185,10 +182,6 @@ Each call overrides `TaskConfig` fields on the `serialized` base task.
 The kwargs you can pass are any `TaskConfig` field: `data_source`,
 `metrics`, `limit`, `primary_metric`, `sampling_params`, etc.
 
-For the full set of registrations including the per-language multilingual
-MBPP variants, see
-[`serialized.py` lines 96-149](../src/olmo_eval/evals/tasks/serialized.py).
-
 ### Step 4: Define a suite
 
 Group related serialized tasks into a suite in
@@ -310,13 +303,12 @@ or reading `system_prompt` from metadata into `LMRequest` -- subclass
 Serialization works well for straightforward completion, loglikelihood, and
 chat tasks. More complex task types can introduce complications:
 
-- **Baked-in formatting is model-specific.** The serialized prompt includes
+- **Baked-in formatting is model-specific.** The serialized prompt may include
   any chat template, FIM tokens, or context-window truncation applied during
   serialization. A JSONL file produced for one model family may not be valid
-  for another. This is usually fine since evaluation suites tend to be
-  model-specific already.
+  for another.
 
-- **Dual-mode tasks (generation + BPB).** Some oe-eval tasks issue both a
+- **Dual-mode tasks (generation + BPB).** Some tasks issue both a
   `generate_until` and a `loglikelihood` request per instance. These need
   two separate JSONL files -- one per request type -- registered as separate
   variants with different metrics.