From 6a779abb73dec7b3e789e8c5a2c55c6d4f1fbe58 Mon Sep 17 00:00:00 2001 From: IanM Date: Tue, 7 Apr 2026 18:47:13 -0700 Subject: [PATCH 1/3] example first draft --- README.md | 42 +++ docs/serialized_tasks.md | 338 ++++++++++++++++++++++++ examples/serialize_task_example.py | 111 ++++++++ src/olmo_eval/evals/suites/code.py | 1 + src/olmo_eval/evals/tasks/serialized.py | 28 +- 5 files changed, 517 insertions(+), 3 deletions(-) create mode 100644 docs/serialized_tasks.md create mode 100644 examples/serialize_task_example.py diff --git a/README.md b/README.md index 77cbd0ffa..f0504685d 100644 --- a/README.md +++ b/README.md @@ -625,6 +625,48 @@ register_regime("my_task", "3shot", num_fewshot=3) Usage: `olmo-eval run -t my_task:bpb:3shot` +### Serialized Tasks + +If you already have evaluation data in a static format -- for example, +migrated from another framework or generated by a custom script -- you can +add a task without writing a new Task class. The built-in `SerializedTask` +loads a JSONL file where each line carries the fields needed to build both +an `Instance` (for scoring) and an `LMRequest` (for inference). 
You just +register a variant pointing at your file and choose your metrics: + +```python +register_variant( + "serialized", "my_task_name", + data_source=DataSource(path="s3://my-bucket/my_task.jsonl"), + metrics=(MyMetric(),), +) +``` + +Each JSONL record looks like: + +```jsonc +{ + "doc_id": 0, // unique int id + "question": "Write a function ...", // raw question text + "gold_answers": ["def foo(): ..."], // correct answer(s) + "choices": null, // MC answer choices, or null + "metadata": {"task_name": "my_task", "id": 1}, // arbitrary scorer-visible metadata + "request_type": "loglikelihood", // "completion", "loglikelihood", or "chat" + "prompt": "You are an expert ...\n\n", // formatted prompt string + "messages": null, // chat messages list, or null + "continuations": ["def foo(): ..."] // strings to score (loglikelihood) +} +``` + +The serialized file captures what goes into the model (the fully formatted +prompt, including baked-in few-shot examples and chat templates). It does +**not** capture SamplingParams, Scorers, Metrics, or answer extraction +logic -- those are configured on the olmo-eval side. + +For a full walkthrough -- including how to migrate a task from oe-eval and +how to create a new serialized task from scratch -- see +[docs/serialized_tasks.md](docs/serialized_tasks.md). + ## Tool-Augmented Evaluation olmo-eval supports evaluating models with tool use through the **Harness** abstraction. This enables comparing baseline model performance against tool-augmented performance on the same tasks. diff --git a/docs/serialized_tasks.md b/docs/serialized_tasks.md new file mode 100644 index 000000000..a3f172de9 --- /dev/null +++ b/docs/serialized_tasks.md @@ -0,0 +1,338 @@ +# Serialized Tasks + +A serialized task lets you add an evaluation to olmo-eval without +implementing a new Task class. You provide a JSONL file containing +pre-formatted instances and model requests; the built-in `SerializedTask` +class handles loading and plumbing. 
All you need to do is register a variant +that points at your data and specifies the metrics you want. + +The goal is to keep this path as simple as possible for common task shapes +(completion, loglikelihood, chat). More exotic task types that need custom +runtime behavior (per-instance sampling params, dynamic prompt construction, +multi-request-per-instance patterns) may still require a full Task subclass, +but we aim to grow `SerializedTask` over time to cover the common cases. + +## JSONL schema at a glance + +Each line is a JSON object with two groups of fields -- one that maps to an +`Instance` (used for scoring) and one that maps to an `LMRequest` (sent to +the model): + +**Loglikelihood / BPB example** (single continuation scored against the prompt): + +```json +{ + "doc_id": 0, + "question": "Write a function to find the maximum element in a list.", + "gold_answers": ["def max_element(lst):\n return max(lst)"], + "choices": null, + "metadata": {"task_name": "mbpp:3shot:bpb", "id": 601, "test": "assert max_element([1,2,3]) == 3"}, + "request_type": "loglikelihood", + "prompt": "You are an expert Python programmer...\n\n", + "messages": null, + "continuations": ["def max_element(lst):\n return max(lst)"] +} +``` + +**Multiple-choice example** (one continuation per choice, scored by logprob): + +```json +{ + "doc_id": 0, + "question": "What is the powerhouse of the cell?", + "gold_answers": ["Mitochondria"], + "choices": ["Nucleus", "Mitochondria", "Ribosome", "Golgi apparatus"], + "metadata": {"task_name": "sciq", "gold_idx": 1}, + "request_type": "loglikelihood", + "prompt": "What is the powerhouse of the cell?\n A. Nucleus\n B. Mitochondria\n C. Ribosome\n D. 
Golgi apparatus\nAnswer:", + "messages": null, + "continuations": [" A", " B", " C", " D"] +} +``` + +**Chat example** (generation request with message list): + +```json +{ + "doc_id": 0, + "question": "What is the capital of France?", + "gold_answers": ["Paris"], + "choices": null, + "metadata": {"task_name": "trivia_chat"}, + "request_type": "chat", + "prompt": null, + "messages": [ + {"role": "system", "content": "Answer concisely."}, + {"role": "user", "content": "What is the capital of France?"} + ], + "continuations": null +} +``` + +**Instance-level fields** (for scoring): + +| Field | Type | Description | +|-------|------|-------------| +| `doc_id` | `int` | Unique numeric ID within the file | +| `question` | `str` | Raw question text (best-effort; may not be a clean question for all task types) | +| `gold_answers` | `list[str]` | Valid gold answers; first element maps to `Instance.gold_answer` | +| `choices` | `list[str] \| null` | Answer choices for MC tasks, null otherwise | +| `metadata` | `dict` | Arbitrary per-instance metadata; always includes `task_name`, may include `gold_idx` for MC, plus any scorer-specific fields | + +**LMRequest-level fields** (for inference): + +| Field | Type | Description | +|-------|------|-------------| +| `request_type` | `str` | `"completion"`, `"loglikelihood"`, or `"chat"` | +| `prompt` | `str \| null` | Formatted prompt string (completion / loglikelihood) | +| `messages` | `list[dict] \| null` | Chat messages list (chat requests) | +| `continuations` | `list[str] \| null` | Continuation strings for loglikelihood scoring | + +### What gets serialized and what does not + +The JSONL captures **what goes into the model**: the fully formatted prompt +or chat messages, including baked-in few-shot examples, chat templates, and +continuation strings. It also captures Instance fields needed by scorers +(question, gold answers, choices, metadata). 
+ +The JSONL does **not** capture: + +- **SamplingParams** (max_tokens, temperature, stop_sequences, etc.) -- + these are runtime concerns configured via `TaskConfig` or CLI overrides. +- **Formatter, Scorer, and Metric definitions** -- these live in olmo-eval + task registrations, not in the data. +- **Answer extraction logic** -- implemented by scorers in olmo-eval. +- **system_prompt** -- supported by `LMRequest` but not currently read from + metadata by the default `SerializedTask` (easy to add by subclassing). + +--- + +## Example 1: Migrating a task from oe-eval-internal + +This walkthrough covers how the `olmo3:base_easy:code:bpb` suite was +migrated from oe-eval-internal using serialized data. Use it as a reference +if you have a task that already exists in oe-eval. + +### Step 1: Serialize the data + +The script +[`oe_eval/serialize_benchmark.py`](https://github.com/allenai/oe-eval-internal/blob/main/oe_eval/serialize_benchmark.py) +converts oe-eval tasks into JSONL files. It loads a task through oe-eval's +own `load_task` / `build_all_requests` pipeline, then extracts the formatted +prompt and instance fields into `SerializedRecord` objects +(see lines 43-81 of that script for the schema dataclass). + +```bash +python -m oe_eval.serialize_benchmark \ + --task-suite olmo3:base_easy:code_bpb \ + --output-dir /tmp/serialized_benchmarks +``` + +This produces one JSONL file per leaf task plus a `manifest.json`. The +script handles collapsing multiple-choice request instances into a single +record with `choices` and `continuations` lists, and extracts metadata +fields like `test` and `entry_point` from the underlying dataset document +(see lines 221-256 for the extraction logic). + +Note that this script does not necessarily cover every task type in oe-eval +out of the box. It was written to migrate the code BPB suite and serves as +an example of how to approach serialization for other tasks. Tasks with +unusual request types (e.g. 
`generate_until_and_loglikelihood`) or +non-standard `doc_to_target()` implementations may need adjustments -- see +the [Limitations](#limitations-and-gotchas) section below. + +### Step 2: Upload the data + +Upload the JSONL files to a location accessible by `DataLoader` (S3, GCS, +or a local path): + +```bash +aws s3 cp /tmp/serialized_benchmarks/ s3://my-bucket/serialized/ --recursive +``` + +### Step 3: Register variants in olmo-eval + +Each JSONL file becomes a variant of the `serialized` base task. Add +registrations in +[`src/olmo_eval/evals/tasks/serialized.py`](../src/olmo_eval/evals/tasks/serialized.py). +Here is what the code BPB migration looks like (simplified): + +```python +from olmo_eval.common.metrics import BPBMetricInstanceAvg +from olmo_eval.data import DataSource +from olmo_eval.evals.tasks.common import register_variant + +_S3_BASE = "s3://ai2-llm/ianm/oe-eval-serialized/olmo3_base_easy_code_bpb" +_BPB_METRICS = (BPBMetricInstanceAvg(),) + +register_variant( + "serialized", + "codex_humaneval_3shot_bpb", + data_source=DataSource(path=f"{_S3_BASE}/codex_humaneval_3shot_bpb__none.jsonl"), + metrics=_BPB_METRICS, +) + +register_variant( + "serialized", + "mbpp_3shot_bpb", + data_source=DataSource(path=f"{_S3_BASE}/mbpp_3shot_bpb__none.jsonl"), + metrics=_BPB_METRICS, + limit=500, +) +``` + +Each call overrides `TaskConfig` fields on the `serialized` base task. +The kwargs you can pass are any `TaskConfig` field: `data_source`, +`metrics`, `limit`, `primary_metric`, `sampling_params`, etc. + +For the full set of registrations including the per-language multilingual +MBPP variants, see +[`serialized.py` lines 96-149](../src/olmo_eval/evals/tasks/serialized.py). 
+ +### Step 4: Define a suite + +Group related serialized tasks into a suite in +[`src/olmo_eval/evals/suites/code.py`](../src/olmo_eval/evals/suites/code.py): + +```python +OLMO3_BASE_EASY_CODE_BPB_SERIALIZED = register( + Suite( + name="olmo3:base_easy:code:bpb:serialized", + tasks=( + "serialized:codex_humaneval_3shot_bpb", + "serialized:mbpp_3shot_bpb", + _SERIALIZED_MT_MBPP_V2FIX_BPB, + ), + aggregation=AggregationStrategy.AVERAGE_OF_AVERAGES, + description="OLMo3 base_easy code BPB suite from serialized data", + ) +) +``` + +### Step 5: Run it + +```bash +olmo-eval run -t serialized:codex_humaneval_3shot_bpb -m my-model +# or the whole suite: +olmo-eval run -s olmo3:base_easy:code:bpb:serialized -m my-model +``` + +--- + +## Example 2: Creating a new serialized task from scratch + +This walkthrough covers creating a serialized task that does not come from +oe-eval -- you write a standalone script to produce the JSONL. + +### Step 1: Write a serialization script + +See +[`examples/serialize_task_example.py`](../examples/serialize_task_example.py) +for a complete, runnable example. It loads the SciQ dataset from +HuggingFace (a simple 4-choice science QA benchmark) and writes a JSONL +file. The key function is `serialize_doc`, which builds one record per +example: + +```python +def serialize_doc(doc_id, doc): + # ... shuffle choices, find gold_idx ... + return { + "doc_id": doc_id, + "question": doc["question"], + "gold_answers": [doc["correct_answer"]], + "choices": choices, + "metadata": {"task_name": "sciq_serialized", "gold_idx": gold_idx}, + "request_type": "loglikelihood", + "prompt": make_prompt(doc, choices), + "messages": None, + "continuations": [f" {label}" for label in "ABCD"], + } +``` + +Run it: + +```bash +pip install datasets +python examples/serialize_task_example.py --output /tmp/sciq_serialized.jsonl +``` + +### Step 2: Register the variant + +Add a registration pointing at your JSONL. 
For a multiple-choice task +scored by logprob: + +```python +from olmo_eval.common.metrics import AccuracyMetric +from olmo_eval.common.scorers import MultipleChoiceScorer +from olmo_eval.data import DataSource +from olmo_eval.evals.tasks.common import register_variant + +register_variant( + "serialized", + "sciq_mc", + data_source=DataSource(path="/tmp/sciq_serialized.jsonl"), + metrics=(AccuracyMetric(scorer=MultipleChoiceScorer),), +) +``` + +### Step 3: Run it + +```bash +olmo-eval run -t serialized:sciq_mc -m my-model +``` + +--- + +## Passing task-specific metadata + +Scorers and metrics sometimes need per-instance data beyond the basic +Instance fields. For example, code execution scoring needs `test` and +`entry_point`, and IFEval constraint checking needs `instruction_id_list`. + +Put these fields in the `metadata` dict of each JSONL record: + +```json +{"metadata": {"test": "assert foo(1) == 2", "entry_point": "foo"}, ...} +``` + +The `SerializedTask` passes `metadata` through to `Instance.metadata` +unchanged, so they are available at scoring time via +`instance.metadata["test"]`. + +If you need behavior the default `SerializedTask` does not provide -- for +example, a custom `extract_answer` method, per-instance `SamplingParams`, +or reading `system_prompt` from metadata into `LMRequest` -- subclass +`SerializedTask` and register your subclass as a new base task. + +--- + +## Limitations and gotchas + +Serialization works well for straightforward completion, loglikelihood, and +chat tasks. More complex task types can introduce complications: + +- **Baked-in formatting is model-specific.** The serialized prompt includes + any chat template, FIM tokens, or context-window truncation applied during + serialization. A JSONL file produced for one model family may not be valid + for another. This is usually fine since evaluation suites tend to be + model-specific already. 
+ +- **Dual-mode tasks (generation + BPB).** Some oe-eval tasks issue both a + `generate_until` and a `loglikelihood` request per instance. These need + two separate JSONL files -- one per request type -- registered as separate + variants with different metrics. + +- **Multiple-choice PMI normalization.** Unconditioned-prompt requests (used + for `acc_norm`) are not serialized. If needed, the olmo-eval consumer + would construct them at runtime from the `continuations` list and a + task-level unconditioned prompt string. + +- **Rolling / corpus perplexity.** Short-document perplexity can be + serialized as loglikelihood with `prompt=""` and a single continuation. + True sliding-window rolling for long documents requires olmo-eval + infrastructure that does not exist yet. + +- **Gold answer reliability.** When serializing from oe-eval, + `doc_to_target()` extracts `gold_answers`, but this method is not always + the canonical answer source for every task. Verify the serialized answers + match what scoring expects before running at scale. diff --git a/examples/serialize_task_example.py b/examples/serialize_task_example.py new file mode 100644 index 000000000..d3e74689f --- /dev/null +++ b/examples/serialize_task_example.py @@ -0,0 +1,111 @@ +"""Example: serialize a HuggingFace dataset to the JSONL format used by SerializedTask. + +This standalone script does not depend on oe-eval. It loads the SciQ +dataset (a simple science multiple-choice QA benchmark), formats each +item as a 0-shot loglikelihood request, and writes a JSONL file that +can be consumed directly by ``SerializedTask`` in olmo-eval. 
+ +Usage: + pip install datasets + python examples/serialize_task_example.py --output /tmp/sciq_serialized.jsonl + +After producing the file you can register it in olmo-eval: + + register_variant( + "serialized", + "sciq_mc", + data_source=DataSource(path="/tmp/sciq_serialized.jsonl"), + metrics=(AccuracyMetric(scorer=MultipleChoiceScorer),), + ) + +See docs/serialized_tasks.md for the full guide. +""" + +from __future__ import annotations + +import argparse +import json +import random +from typing import Any + + +def load_sciq_test() -> list[dict[str, Any]]: + """Load the SciQ test split from HuggingFace.""" + from datasets import load_dataset + + ds = load_dataset("allenai/sciq", split="test") + return list(ds) + + +def make_prompt(doc: dict[str, Any], choices: list[str]) -> str: + """Build a simple 0-shot MC prompt.""" + labels = "ABCD" + lines = [doc["question"]] + for i, choice in enumerate(choices): + lines.append(f" {labels[i]}. {choice}") + lines.append("Answer:") + return "\n".join(lines) + + +def serialize_doc(doc_id: int, doc: dict[str, Any]) -> dict[str, Any]: + """Convert one SciQ document into a serialized record dict. + + SciQ has three distractor fields and one correct_answer field. + We assemble the four choices, shuffle them, and record the gold + index in metadata. + """ + choices = [doc["distractor1"], doc["distractor2"], doc["distractor3"], doc["correct_answer"]] + gold_idx = 3 # correct_answer is last before shuffle + + rng = random.Random(doc_id) + order = list(range(4)) + rng.shuffle(order) + choices = [choices[i] for i in order] + gold_idx = order.index(3) + + prompt = make_prompt(doc, choices) + + # Each continuation is one answer choice; the model scores each via + # loglikelihood and the highest-scoring continuation is the prediction. 
+ labels = "ABCD" + continuations = [f" {labels[i]}" for i in range(len(choices))] + + return { + # -- Instance-level fields (for scoring) -- + "doc_id": doc_id, + "question": doc["question"], + "gold_answers": [doc["correct_answer"]], + "choices": choices, + "metadata": { + "task_name": "sciq_serialized", + "gold_idx": gold_idx, + "support": doc.get("support", ""), + }, + # -- LMRequest-level fields (for inference) -- + "request_type": "loglikelihood", + "prompt": prompt, + "messages": None, + "continuations": continuations, + } + + +def main() -> None: + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument("--output", type=str, required=True, help="Path for the output JSONL file") + args = parser.parse_args() + + docs = load_sciq_test() + print(f"Loaded {len(docs)} SciQ test examples") + + with open(args.output, "w") as f: + for doc_id, doc in enumerate(docs): + record = serialize_doc(doc_id, doc) + f.write(json.dumps(record) + "\n") + + print(f"Wrote {args.output}") + + +if __name__ == "__main__": + main() diff --git a/src/olmo_eval/evals/suites/code.py b/src/olmo_eval/evals/suites/code.py index ac996b640..7cc08c85e 100644 --- a/src/olmo_eval/evals/suites/code.py +++ b/src/olmo_eval/evals/suites/code.py @@ -75,6 +75,7 @@ description="OLMo3 base_easy code BPB suite from serialized data", ) ) + # ============================================================================= # MULTIPL_E Suites # ============================================================================= diff --git a/src/olmo_eval/evals/tasks/serialized.py b/src/olmo_eval/evals/tasks/serialized.py index 24c6342ea..98e2da38d 100644 --- a/src/olmo_eval/evals/tasks/serialized.py +++ b/src/olmo_eval/evals/tasks/serialized.py @@ -2,8 +2,16 @@ These tasks bypass the standard Formatter pipeline because the serialized data already contains both raw Instance fields (for scoring) and fully -formatted LMRequest fields (for 
inference). The serialized JSONL is -produced by oe-eval-internal's serialize_benchmark.py. +formatted LMRequest fields (for inference). The serialized JSONL can be +produced by oe-eval-internal's serialize_benchmark.py or by any script +that emits the schema documented in ``docs/serialized_tasks.md``. + +To add a new serialized task, call ``register_variant`` with a +``DataSource`` pointing at your JSONL file and the desired metrics. +You can add registrations directly in this file or in a new module +that imports ``SerializedTask``. See the existing registrations at +the bottom of this file, ``examples/serialize_task_example.py``, and +``docs/serialized_tasks.md`` for full walkthroughs. Top-level JSONL fields (used for Instance / LMRequest building): doc_id, question, gold_answers, choices, metadata, @@ -16,7 +24,8 @@ from collections.abc import Iterator from typing import Any -from olmo_eval.common.metrics import BPBMetricInstanceAvg +from olmo_eval.common.metrics import AccuracyMetric, BPBMetricInstanceAvg +from olmo_eval.common.scorers import MultipleChoiceScorer from olmo_eval.common.types import Instance, LMRequest, RequestType from olmo_eval.data import DataLoader, DataSource from olmo_eval.evals.tasks.common import Task, register, register_variant @@ -140,3 +149,16 @@ def format_request(self, instance: Instance) -> LMRequest: data_source=DataSource(path=f"{_S3_BASE}/mt_mbpp_v2fix_{_lang}.jsonl"), metrics=_BPB_METRICS, ) + +# ============================================================================= +# Example: SciQ multiple-choice (from examples/serialize_task_example.py) +# ============================================================================= + +register_variant( + "serialized", + "sciq_mc", + data_source=DataSource( + path="s3://ai2-llm/ianm/oe-eval-serialized/examples/sciq_serialized.jsonl" + ), + metrics=(AccuracyMetric(scorer=MultipleChoiceScorer),), +) From bd339193220edc9144bc52bab4a1b4c59fc91f13 Mon Sep 17 00:00:00 2001 From: IanM Date: 
Tue, 7 Apr 2026 19:49:55 -0700 Subject: [PATCH 2/3] fix metric --- docs/serialized_tasks.md | 5 ++--- examples/serialize_task_example.py | 2 +- src/olmo_eval/evals/tasks/serialized.py | 5 ++--- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/docs/serialized_tasks.md b/docs/serialized_tasks.md index a3f172de9..1055ee9d9 100644 --- a/docs/serialized_tasks.md +++ b/docs/serialized_tasks.md @@ -262,8 +262,7 @@ Add a registration pointing at your JSONL. For a multiple-choice task scored by logprob: ```python -from olmo_eval.common.metrics import AccuracyMetric -from olmo_eval.common.scorers import MultipleChoiceScorer +from olmo_eval.common.metrics import LogprobMCAccuracyMetric from olmo_eval.data import DataSource from olmo_eval.evals.tasks.common import register_variant @@ -271,7 +270,7 @@ register_variant( "serialized", "sciq_mc", data_source=DataSource(path="/tmp/sciq_serialized.jsonl"), - metrics=(AccuracyMetric(scorer=MultipleChoiceScorer),), + metrics=(LogprobMCAccuracyMetric(),), ) ``` diff --git a/examples/serialize_task_example.py b/examples/serialize_task_example.py index d3e74689f..c3b7685be 100644 --- a/examples/serialize_task_example.py +++ b/examples/serialize_task_example.py @@ -15,7 +15,7 @@ "serialized", "sciq_mc", data_source=DataSource(path="/tmp/sciq_serialized.jsonl"), - metrics=(AccuracyMetric(scorer=MultipleChoiceScorer),), + metrics=(LogprobMCAccuracyMetric(),), ) See docs/serialized_tasks.md for the full guide. 
diff --git a/src/olmo_eval/evals/tasks/serialized.py b/src/olmo_eval/evals/tasks/serialized.py index 98e2da38d..d468ee3fb 100644 --- a/src/olmo_eval/evals/tasks/serialized.py +++ b/src/olmo_eval/evals/tasks/serialized.py @@ -24,8 +24,7 @@ from collections.abc import Iterator from typing import Any -from olmo_eval.common.metrics import AccuracyMetric, BPBMetricInstanceAvg -from olmo_eval.common.scorers import MultipleChoiceScorer +from olmo_eval.common.metrics import BPBMetricInstanceAvg, LogprobMCAccuracyMetric from olmo_eval.common.types import Instance, LMRequest, RequestType from olmo_eval.data import DataLoader, DataSource from olmo_eval.evals.tasks.common import Task, register, register_variant @@ -160,5 +159,5 @@ def format_request(self, instance: Instance) -> LMRequest: data_source=DataSource( path="s3://ai2-llm/ianm/oe-eval-serialized/examples/sciq_serialized.jsonl" ), - metrics=(AccuracyMetric(scorer=MultipleChoiceScorer),), + metrics=(LogprobMCAccuracyMetric(),), ) From 487ca103194c4e74092c251abf0abf27736a16aa Mon Sep 17 00:00:00 2001 From: IanM Date: Tue, 7 Apr 2026 20:26:07 -0700 Subject: [PATCH 3/3] edits --- docs/serialized_tasks.md | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/docs/serialized_tasks.md b/docs/serialized_tasks.md index 1055ee9d9..c3ea20d44 100644 --- a/docs/serialized_tasks.md +++ b/docs/serialized_tasks.md @@ -74,7 +74,7 @@ the model): | Field | Type | Description | |-------|------|-------------| | `doc_id` | `int` | Unique numeric ID within the file | -| `question` | `str` | Raw question text (best-effort; may not be a clean question for all task types) | +| `question` | `str` | Raw question text | | `gold_answers` | `list[str]` | Valid gold answers; first element maps to `Instance.gold_answer` | | `choices` | `list[str] \| null` | Answer choices for MC tasks, null otherwise | | `metadata` | `dict` | Arbitrary per-instance metadata; always includes `task_name`, may include `gold_idx` for 
MC, plus any scorer-specific fields | @@ -99,11 +99,9 @@ The JSONL does **not** capture: - **SamplingParams** (max_tokens, temperature, stop_sequences, etc.) -- these are runtime concerns configured via `TaskConfig` or CLI overrides. -- **Formatter, Scorer, and Metric definitions** -- these live in olmo-eval +- **Scorer and Metric definitions** -- these live in olmo-eval task registrations, not in the data. - **Answer extraction logic** -- implemented by scorers in olmo-eval. -- **system_prompt** -- supported by `LMRequest` but not currently read from - metadata by the default `SerializedTask` (easy to add by subclassing). --- @@ -116,7 +114,7 @@ if you have a task that already exists in oe-eval. ### Step 1: Serialize the data The script -[`oe_eval/serialize_benchmark.py`](https://github.com/allenai/oe-eval-internal/blob/main/oe_eval/serialize_benchmark.py) +[`oe_eval/serialize_benchmark.py`](https://github.com/allenai/oe-eval-internal/blob/ianm-serialize-bench-data-vllm-schema/oe_eval/serialize_benchmark.py) converts oe-eval tasks into JSONL files. It loads a task through oe-eval's own `load_task` / `build_all_requests` pipeline, then extracts the formatted prompt and instance fields into `SerializedRecord` objects @@ -131,8 +129,7 @@ python -m oe_eval.serialize_benchmark \ This produces one JSONL file per leaf task plus a `manifest.json`. The script handles collapsing multiple-choice request instances into a single record with `choices` and `continuations` lists, and extracts metadata -fields like `test` and `entry_point` from the underlying dataset document -(see lines 221-256 for the extraction logic). +fields like `test` and `entry_point` from the underlying dataset document. Note that this script does not necessarily cover every task type in oe-eval out of the box. It was written to migrate the code BPB suite and serves as @@ -185,10 +182,6 @@ Each call overrides `TaskConfig` fields on the `serialized` base task. 
The kwargs you can pass are any `TaskConfig` field: `data_source`, `metrics`, `limit`, `primary_metric`, `sampling_params`, etc. -For the full set of registrations including the per-language multilingual -MBPP variants, see -[`serialized.py` lines 96-149](../src/olmo_eval/evals/tasks/serialized.py). - ### Step 4: Define a suite Group related serialized tasks into a suite in @@ -310,13 +303,12 @@ or reading `system_prompt` from metadata into `LMRequest` -- subclass Serialization works well for straightforward completion, loglikelihood, and chat tasks. More complex task types can introduce complications: -- **Baked-in formatting is model-specific.** The serialized prompt includes +- **Baked-in formatting is model-specific.** The serialized prompt may include any chat template, FIM tokens, or context-window truncation applied during serialization. A JSONL file produced for one model family may not be valid - for another. This is usually fine since evaluation suites tend to be - model-specific already. + for another. -- **Dual-mode tasks (generation + BPB).** Some oe-eval tasks issue both a +- **Dual-mode tasks (generation + BPB).** Some tasks issue both a `generate_until` and a `loglikelihood` request per instance. These need two separate JSONL files -- one per request type -- registered as separate variants with different metrics.