From 23f9dcd8a2771e81084427c1b78e5b8c42c45c52 Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Wed, 4 Mar 2026 12:26:16 -0800 Subject: [PATCH 01/24] Init --- src/olmo_eval/evals/tasks/hard_reasoning.py | 164 ++++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 src/olmo_eval/evals/tasks/hard_reasoning.py diff --git a/src/olmo_eval/evals/tasks/hard_reasoning.py b/src/olmo_eval/evals/tasks/hard_reasoning.py new file mode 100644 index 000000000..c5804adfd --- /dev/null +++ b/src/olmo_eval/evals/tasks/hard_reasoning.py @@ -0,0 +1,164 @@ +"""HardReasoning evaluation tasks. + +Logic puzzles and reasoning tasks that require multi-step constraint satisfaction. + +Dataset: allenai/hard-reasoning + +Usage: + olmo-eval run -m my-model -t hard_reasoning_bringing_toys + olmo-eval run -m my-model -t hard_reasoning_bringing_toys:chat +""" + +from __future__ import annotations + +import json +import sys +from collections.abc import Iterator +from typing import Any + +from olmo_eval.common.formatters import ChatFormatter +from olmo_eval.common.metrics import AccuracyMetric +from olmo_eval.common.types import Instance, LMOutput, LMRequest, RequestType, SamplingParams +from olmo_eval.evals.tasks.common import Task, register, register_variant + +HARD_REASONING_TASKS: tuple[str, ...] = ( + "bringing_toys", + "classroom_assignment", + "dinner_party", + "expense_splitting", + "printing_jobs", + "secret_santa", + "social_gathering", + "wedding_planning", + "wedding_supplies", +) + + +def _extract_last_complete_json(s: str) -> dict | None: + """Extract the last complete JSON object from a string.""" + stack: list[int] = [] + last_json_start: int | None = None + last_json_str: str | None = None + for i, char in enumerate(s): + if char == "{": + stack.append(i) + if last_json_start is None: + last_json_start = i + elif char == "}": + if stack: + stack.pop() + if not stack: + last_json_str = s[last_json_start : i + 1] + last_json_start = None + if last_json_str: + try: + return json.loads(last_json_str.replace("\n", "")) + except json.JSONDecodeError: + pass + return None + + +class HardReasoningBase(Task): + """Base class for HardReasoning logic puzzle tasks. + + Each subtask loads from a specific subset of the allenai/hard-reasoning dataset, + where files are organized as {subset}/dev_t1.jsonl and {subset}/test_t1.jsonl. + """ + + subset: str = "bringing_toys" + sampling_params = SamplingParams( + max_tokens=4096, + temperature=0.0, + stop_sequences=("\n\n",), + ) + metrics = (AccuracyMetric(),) + + @property + def instances(self) -> Iterator[Instance]: + if self._instances_cache is None: + self._instances_cache = list(self._load_hard_reasoning_split("test")) + yield from self._instances_cache + + def _load_hard_reasoning_split(self, split: str) -> Iterator[Instance]: + """Load instances from a specific split of the hard-reasoning dataset.""" + import datasets as hf_datasets + + file_name = "dev_t1.jsonl" if split == "validation" else "test_t1.jsonl" + dataset = hf_datasets.load_dataset( + "allenai/hard-reasoning", + data_files={split: f"{self.subset}/{file_name}"}, + )[split] + for index, doc in enumerate(dataset): + instance = self.process_doc(doc, index) + if instance is not None: + yield instance + + def process_doc(self, doc: dict[str, Any], index: int = 0) -> Instance | None: + answer = doc["solver"]["status"] + gold_answer = ( + json.dumps(answer, sort_keys=True) if isinstance(answer, dict) else str(answer) + ) + return Instance( + question=doc["problem"]["prompt"], + gold_answer=gold_answer, + metadata={"id": doc.get("id", index)}, + ) + + def format_request(self, instance: Instance) -> LMRequest: + if self.config.formatter is not None: + return self.config.formatter.format(instance, self.get_fewshot()) + return LMRequest( + request_type=RequestType.COMPLETION, + prompt=instance.question, + ) + + def extract_answer(self, output: LMOutput) -> str | None: + """Extract the solution from the model's JSON response.""" + json_obj = _extract_last_complete_json(output.text) + if json_obj is not None and "solution" in json_obj: + return json.dumps(json_obj["solution"], sort_keys=True) + return output.text.strip() or None + + def _build_fewshot(self) -> list[Instance]: + """Build few-shot examples from the dev split.""" + import random + + if self.config.num_fewshot == 0: + return [] + all_instances = list(self._load_hard_reasoning_split("validation")) + if not all_instances: + return [] + rng = random.Random(self.config.fewshot_seed) + return rng.sample(all_instances, min(self.config.num_fewshot, len(all_instances))) + + +# ============================================================================= +# Task Registration +# ============================================================================= + +for _subset in HARD_REASONING_TASKS: + _task_name = f"hard_reasoning_{_subset}" + _class_name = f"HardReasoning_{_subset.title().replace('_', '')}" + _cls = type( + _class_name, + (HardReasoningBase,), + { + "subset": _subset, + "__module__": __name__, + "__qualname__": _class_name, + }, + ) + setattr(sys.modules[__name__], _class_name, _cls) + register(_task_name)(_cls) + register_variant( + _task_name, + "chat", + formatter=ChatFormatter(), + sampling_params=SamplingParams(max_tokens=4096, temperature=0.0), + ) + + +__all__ = [ + "HARD_REASONING_TASKS", + "HardReasoningBase", +] From 2be29eb4faac66451f100db3e9ccf7e088aacd7d Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Wed, 4 Mar 2026 12:33:02 -0800 Subject: [PATCH 02/24] Longer context --- src/olmo_eval/evals/tasks/hard_reasoning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/olmo_eval/evals/tasks/hard_reasoning.py b/src/olmo_eval/evals/tasks/hard_reasoning.py index c5804adfd..52942b1ab 100644 --- a/src/olmo_eval/evals/tasks/hard_reasoning.py +++ b/src/olmo_eval/evals/tasks/hard_reasoning.py @@ -154,7 +154,7 @@ def _build_fewshot(self) -> list[Instance]: _task_name, "chat", formatter=ChatFormatter(), - sampling_params=SamplingParams(max_tokens=4096, temperature=0.0), + sampling_params=SamplingParams(max_tokens=32768, temperature=0.0), ) From 6b2102de32323d96d5e4ff78654ea708d2c05afd Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Wed, 4 Mar 2026 12:57:49 -0800 Subject: [PATCH 03/24] Chat without backend --- src/olmo_eval/runners/asynq/processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/olmo_eval/runners/asynq/processing.py b/src/olmo_eval/runners/asynq/processing.py index 118f524f1..2e68d5b50 100644 --- a/src/olmo_eval/runners/asynq/processing.py +++ b/src/olmo_eval/runners/asynq/processing.py @@ -254,7 +254,7 @@ async def process_items( batchable_items: list[QueueItem] = [] for item in items: - if item.request.request_type == RequestType.CHAT: + if item.request.request_type == RequestType.CHAT and harness.config.backend: chat_items.append(item) else: batchable_items.append(item) From 92f2b46b92b808070f2a19751eb6f2854691395e Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Wed, 25 Mar 2026 13:07:46 -0700 Subject: [PATCH 04/24] HardReasoning scorer based on verifier instead of gold answer --- pyproject.toml | 9 +++++ src/olmo_eval/evals/tasks/hard_reasoning.py | 43 ++++++++++++++++----- 2 files changed, 42 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cff1203ab..52f09ed7d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,6 +74,9 @@ postgres = [ gpu = [ "nvidia-ml-py>=12.560", ] +hard-reasoning = [ + "np-hard-reasoning @ git+https://github.com/allenai/np-hard-reasoning.git", +] clients = [ "openai~=2.21.0", ] @@ -132,6 +135,12 @@ include = ["src/olmo_eval/harness/backends/**"] [tool.ty.overrides.rules] unresolved-import = "ignore" +# ty overrides for hard_reasoning task (uses optional np-hard-reasoning dependency) +[[tool.ty.overrides]] +include = ["src/olmo_eval/evals/tasks/hard_reasoning.py"] +[tool.ty.overrides.rules] +unresolved-import = "ignore" + # ty overrides for adapters module (uses optional openhands/swe-rex dependencies) [[tool.ty.overrides]] include = ["src/olmo_eval/harness/adapters/**"] diff --git a/src/olmo_eval/evals/tasks/hard_reasoning.py b/src/olmo_eval/evals/tasks/hard_reasoning.py index 52942b1ab..7f0e761ea 100644 --- a/src/olmo_eval/evals/tasks/hard_reasoning.py +++ b/src/olmo_eval/evals/tasks/hard_reasoning.py @@ -14,10 +14,12 @@ import json import sys from collections.abc import Iterator +from dataclasses import dataclass from typing import Any from olmo_eval.common.formatters import ChatFormatter from olmo_eval.common.metrics import AccuracyMetric +from olmo_eval.common.scorers import Scorer from olmo_eval.common.types import Instance, LMOutput, LMRequest, RequestType, SamplingParams from olmo_eval.evals.tasks.common import Task, register, register_variant @@ -58,6 +60,28 @@ def _extract_last_complete_json(s: str) -> dict | None: return None +@dataclass(frozen=True, slots=True) +class HardReasoningScorer(Scorer): + """Score using the np_hard_reasoning check() function.""" + + name: str = "hard_reasoning_check" + + def score(self, instance: Instance, output: LMOutput) -> float: + from np_hard_reasoning.scenarios.registry import SCENARIO_REGISTRY + + if output.extracted_answer is None: + return 0.0 + subset = instance.metadata.get("subset", "") + scenario_cls = SCENARIO_REGISTRY.get(subset) + if scenario_cls is None: + return 0.0 + try: + scenario = scenario_cls.load_from_json(instance.metadata["scenario_data"]) + return 1.0 if scenario.check_json(str(output.extracted_answer)) else 0.0 + except Exception: + return 0.0 + + class HardReasoningBase(Task): """Base class for HardReasoning logic puzzle tasks. @@ -71,7 +95,7 @@ class HardReasoningBase(Task): temperature=0.0, stop_sequences=("\n\n",), ) - metrics = (AccuracyMetric(),) + metrics = (AccuracyMetric(scorer=HardReasoningScorer),) @property def instances(self) -> Iterator[Instance]: @@ -94,14 +118,13 @@ def _load_hard_reasoning_split(self, split: str) -> Iterator[Instance]: yield instance def process_doc(self, doc: dict[str, Any], index: int = 0) -> Instance | None: - answer = doc["solver"]["status"] - gold_answer = ( - json.dumps(answer, sort_keys=True) if isinstance(answer, dict) else str(answer) - ) return Instance( question=doc["problem"]["prompt"], - gold_answer=gold_answer, - metadata={"id": doc.get("id", index)}, + metadata={ + "id": doc.get("id", index), + "scenario_data": doc["problem"], + "subset": self.subset, + }, ) def format_request(self, instance: Instance) -> LMRequest: @@ -113,10 +136,10 @@ def format_request(self, instance: Instance) -> LMRequest: ) def extract_answer(self, output: LMOutput) -> str | None: - """Extract the solution from the model's JSON response.""" + """Extract the last complete JSON object from the model's response.""" json_obj = _extract_last_complete_json(output.text) - if json_obj is not None and "solution" in json_obj: - return json.dumps(json_obj["solution"], sort_keys=True) + if json_obj is not None: + return json.dumps(json_obj) return output.text.strip() or None def _build_fewshot(self) -> list[Instance]: From eb38117ff92b22c7e1a32a040b48b92ae629caf5 Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Tue, 7 Apr 2026 16:40:45 -0700 Subject: [PATCH 05/24] Adding unit tests for _extract_last_complete_json per PR review --- tests/evals/tasks/test_hard_reasoning.py | 30 ++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 tests/evals/tasks/test_hard_reasoning.py diff --git a/tests/evals/tasks/test_hard_reasoning.py b/tests/evals/tasks/test_hard_reasoning.py new file mode 100644 index 000000000..ecfd7b499 --- /dev/null +++ b/tests/evals/tasks/test_hard_reasoning.py @@ -0,0 +1,30 @@ +"""Tests for HardReasoning task logic.""" + +from olmo_eval.evals.tasks.hard_reasoning import _extract_last_complete_json + + +class TestExtractLastCompleteJson: + def test_simple_json(self): + assert _extract_last_complete_json('{"a": 1}') == {"a": 1} + + def test_json_after_text(self): + assert _extract_last_complete_json('Some text {"key": "value"}') == {"key": "value"} + + def test_returns_last_json(self): + result = _extract_last_complete_json('{"first": 1} some text {"second": 2}') + assert result == {"second": 2} + + def test_nested_json(self): + assert _extract_last_complete_json('{"outer": {"inner": 42}}') == {"outer": {"inner": 42}} + + def test_json_with_newlines(self): + assert _extract_last_complete_json('{"a":\n1}') == {"a": 1} + + def test_no_json_returns_none(self): + assert _extract_last_complete_json("no json here") is None + + def test_incomplete_json_returns_none(self): + assert _extract_last_complete_json('{"incomplete": ') is None + + def test_empty_string_returns_none(self): + assert _extract_last_complete_json("") is None From 996e2765043d7a8eb262ab5eea19a1c19e43c367 Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Tue, 7 Apr 2026 16:42:34 -0700 Subject: [PATCH 06/24] Pinning np hard reasoning to a specific commit, per PR review --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index feb65befd..e2468d852 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,7 +75,7 @@ gpu = [ "nvidia-ml-py>=12.560", ] hard-reasoning = [ - "np-hard-reasoning @ git+https://github.com/allenai/np-hard-reasoning.git", + "np-hard-reasoning @ git+https://github.com/allenai/np-hard-reasoning.git@fa8bbb2a5554e34a7ce051b71e9357e44dbabd0f", ] clients = [ "openai~=2.21.0", From 3195b9a03c45155c90cf9f892bc635a26cd1504c Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Wed, 8 Apr 2026 16:42:02 -0700 Subject: [PATCH 07/24] add num_instances=4 directly to the model preset --- src/olmo_eval/common/constants/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/olmo_eval/common/constants/models.py b/src/olmo_eval/common/constants/models.py index 60eef6c45..00a7be50d 100644 --- a/src/olmo_eval/common/constants/models.py +++ b/src/olmo_eval/common/constants/models.py @@ -77,6 +77,7 @@ def get_model_presets() -> dict[str, ProviderConfig]: kind=ProviderKind.VLLM, model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", max_model_len=32768, + num_instances=4, ), "mistral-7b": ProviderConfig( kind=ProviderKind.VLLM, From bd9ba5f8ebdc14db66ff51fb1c5e45c32220d56e Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Wed, 8 Apr 2026 17:03:02 -0700 Subject: [PATCH 08/24] reverting model preset --- src/olmo_eval/common/constants/models.py | 1 - src/olmo_eval/evals/tasks/hard_reasoning.py | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/olmo_eval/common/constants/models.py b/src/olmo_eval/common/constants/models.py index 00a7be50d..60eef6c45 100644 --- a/src/olmo_eval/common/constants/models.py +++ b/src/olmo_eval/common/constants/models.py @@ -77,7 +77,6 @@ def get_model_presets() -> dict[str, ProviderConfig]: kind=ProviderKind.VLLM, model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", max_model_len=32768, - num_instances=4, ), "mistral-7b": ProviderConfig( kind=ProviderKind.VLLM, diff --git a/src/olmo_eval/evals/tasks/hard_reasoning.py b/src/olmo_eval/evals/tasks/hard_reasoning.py index 7f0e761ea..c77203c2c 100644 --- a/src/olmo_eval/evals/tasks/hard_reasoning.py +++ b/src/olmo_eval/evals/tasks/hard_reasoning.py @@ -90,6 +90,9 @@ class HardReasoningBase(Task): """ subset: str = "bringing_toys" + dependencies = [ + "git+https://github.com/allenai/np-hard-reasoning.git@fa8bbb2a5554e34a7ce051b71e9357e44dbabd0f", + ] sampling_params = SamplingParams( max_tokens=4096, temperature=0.0, From 16f1a98c0dfd64d442bf8ca319613aeaa151a367 Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Wed, 8 Apr 2026 17:10:24 -0700 Subject: [PATCH 09/24] injecting HF token --- src/olmo_eval/evals/tasks/hard_reasoning.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/olmo_eval/evals/tasks/hard_reasoning.py b/src/olmo_eval/evals/tasks/hard_reasoning.py index c77203c2c..a227f3380 100644 --- a/src/olmo_eval/evals/tasks/hard_reasoning.py +++ b/src/olmo_eval/evals/tasks/hard_reasoning.py @@ -108,12 +108,15 @@ def instances(self) -> Iterator[Instance]: def _load_hard_reasoning_split(self, split: str) -> Iterator[Instance]: """Load instances from a specific split of the hard-reasoning dataset.""" + import os + import datasets as hf_datasets file_name = "dev_t1.jsonl" if split == "validation" else "test_t1.jsonl" dataset = hf_datasets.load_dataset( "allenai/hard-reasoning", data_files={split: f"{self.subset}/{file_name}"}, + token=os.environ.get("HF_TOKEN"), )[split] for index, doc in enumerate(dataset): instance = self.process_doc(doc, index) From f0230412d0c716a8e9e5b16c3012bfb38b3a68db Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Wed, 8 Apr 2026 17:15:37 -0700 Subject: [PATCH 10/24] streaming hf ds --- src/olmo_eval/evals/tasks/hard_reasoning.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/olmo_eval/evals/tasks/hard_reasoning.py b/src/olmo_eval/evals/tasks/hard_reasoning.py index a227f3380..d6b224f67 100644 --- a/src/olmo_eval/evals/tasks/hard_reasoning.py +++ b/src/olmo_eval/evals/tasks/hard_reasoning.py @@ -117,6 +117,7 @@ def _load_hard_reasoning_split(self, split: str) -> Iterator[Instance]: "allenai/hard-reasoning", data_files={split: f"{self.subset}/{file_name}"}, token=os.environ.get("HF_TOKEN"), + streaming=True, )[split] for index, doc in enumerate(dataset): instance = self.process_doc(doc, index) From 8f4e9940b4200b165ac67dd6060d05a9d2dc75b3 Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Wed, 8 Apr 2026 21:36:39 -0700 Subject: [PATCH 11/24] hf hub --- src/olmo_eval/evals/tasks/hard_reasoning.py | 23 ++++++++++++--------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/olmo_eval/evals/tasks/hard_reasoning.py b/src/olmo_eval/evals/tasks/hard_reasoning.py index d6b224f67..4c49932c5 100644 --- a/src/olmo_eval/evals/tasks/hard_reasoning.py +++ b/src/olmo_eval/evals/tasks/hard_reasoning.py @@ -108,21 +108,24 @@ def instances(self) -> Iterator[Instance]: def _load_hard_reasoning_split(self, split: str) -> Iterator[Instance]: """Load instances from a specific split of the hard-reasoning dataset.""" + import json import os - import datasets as hf_datasets + from huggingface_hub import hf_hub_download file_name = "dev_t1.jsonl" if split == "validation" else "test_t1.jsonl" - dataset = hf_datasets.load_dataset( - "allenai/hard-reasoning", - data_files={split: f"{self.subset}/{file_name}"}, + local_path = hf_hub_download( + repo_id="allenai/hard-reasoning", + filename=f"{self.subset}/{file_name}", + repo_type="dataset", token=os.environ.get("HF_TOKEN"), - streaming=True, - )[split] - for index, doc in enumerate(dataset): - instance = self.process_doc(doc, index) - if instance is not None: - yield instance + ) + with open(local_path) as f: + for index, line in enumerate(f): + doc = json.loads(line) + instance = self.process_doc(doc, index) + if instance is not None: + yield instance def process_doc(self, doc: dict[str, Any], index: int = 0) -> Instance | None: return Instance( From a25bc92a22789ae9e253e1d8e4dd6efab2fcdb3b Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Wed, 8 Apr 2026 21:45:32 -0700 Subject: [PATCH 12/24] task dependencies --- src/olmo_eval/evals/tasks/hard_reasoning.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/olmo_eval/evals/tasks/hard_reasoning.py b/src/olmo_eval/evals/tasks/hard_reasoning.py index 4c49932c5..f9bfbf63b 100644 --- a/src/olmo_eval/evals/tasks/hard_reasoning.py +++ b/src/olmo_eval/evals/tasks/hard_reasoning.py @@ -92,6 +92,8 @@ class HardReasoningBase(Task): subset: str = "bringing_toys" dependencies = [ "git+https://github.com/allenai/np-hard-reasoning.git@fa8bbb2a5554e34a7ce051b71e9357e44dbabd0f", + "z3-solver", + "networkx", ] sampling_params = SamplingParams( max_tokens=4096, From f613ef1bafd9dcbcbb7add4aeacd3597a163b339 Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Thu, 9 Apr 2026 12:08:36 -0700 Subject: [PATCH 13/24] vLLM and chat --- src/olmo_eval/inference/providers/vllm.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/olmo_eval/inference/providers/vllm.py b/src/olmo_eval/inference/providers/vllm.py index eb64447a4..14d570942 100644 --- a/src/olmo_eval/inference/providers/vllm.py +++ b/src/olmo_eval/inference/providers/vllm.py @@ -216,6 +216,19 @@ def _build_sampling_params(self, params: SamplingParams) -> Any: return VLLMSamplingParams(**kwargs) + def _format_prompt(self, request: LMRequest) -> str: + """Convert a request to a prompt string, applying chat template for CHAT requests.""" + from olmo_eval.common.types import RequestType + + if request.request_type == RequestType.CHAT and request.messages: + tokenizer = self.llm.get_tokenizer() + return tokenizer.apply_chat_template( + list(request.messages), + tokenize=False, + add_generation_prompt=True, + ) + return request.prompt + def generate( self, requests: list[LMRequest], @@ -224,7 +237,7 @@ def generate( params = self._default_sampling_params(sampling_params) vllm_params = self._build_sampling_params(params) - prompt_strs = [req.prompt for req in requests] + prompt_strs = [self._format_prompt(req) for req in requests] if is_debug_requests(): for i, prompt in enumerate(prompt_strs): From 52b8c3a1b4f606c0e7533098d4e6cb3d73b7dd31 Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Thu, 9 Apr 2026 14:18:53 -0700 Subject: [PATCH 14/24] olmo-3-7b-instruct --- src/olmo_eval/common/constants/models.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/olmo_eval/common/constants/models.py b/src/olmo_eval/common/constants/models.py index 60eef6c45..a6d71e58e 100644 --- a/src/olmo_eval/common/constants/models.py +++ b/src/olmo_eval/common/constants/models.py @@ -44,6 +44,10 @@ def get_model_presets() -> dict[str, ProviderConfig]: revision="stage2-step47684", kwargs={"gpu_memory_utilization": 0.7, "add_bos_token": False}, ), + "olmo-3-7b-instruct": ProviderConfig( + kind=ProviderKind.VLLM, + model="allenai/Olmo-3-7B-Instruct", + ), "olmo-2-7b": ProviderConfig( kind=ProviderKind.VLLM, model="allenai/OLMo-2-1124-7B", From b716ef30e4ee909736e726f7ad1756c1196d17a9 Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Thu, 9 Apr 2026 21:39:11 -0700 Subject: [PATCH 15/24] parse_rate --- src/olmo_eval/evals/tasks/hard_reasoning.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/olmo_eval/evals/tasks/hard_reasoning.py b/src/olmo_eval/evals/tasks/hard_reasoning.py index f9bfbf63b..09eaa7a88 100644 --- a/src/olmo_eval/evals/tasks/hard_reasoning.py +++ b/src/olmo_eval/evals/tasks/hard_reasoning.py @@ -60,6 +60,22 @@ def _extract_last_complete_json(s: str) -> dict | None: return None +@dataclass(frozen=True, slots=True) +class HardReasoningParsedScorer(Scorer): + """Score 1.0 if the model output was parsed as valid JSON, else 0.0.""" + + name: str = "parsed" + + def score(self, instance: Instance, output: LMOutput) -> float: + if output.extracted_answer is None: + return 0.0 + try: + json.loads(output.extracted_answer) + return 1.0 + except (json.JSONDecodeError, TypeError): + return 0.0 + + @dataclass(frozen=True, slots=True) class HardReasoningScorer(Scorer): """Score using the np_hard_reasoning check() function.""" @@ -100,7 +116,10 @@ class HardReasoningBase(Task): temperature=0.0, stop_sequences=("\n\n",), ) - metrics = (AccuracyMetric(scorer=HardReasoningScorer),) + metrics = ( + AccuracyMetric(scorer=HardReasoningScorer), + AccuracyMetric(name="parse_rate", scorer=HardReasoningParsedScorer), + ) @property def instances(self) -> Iterator[Instance]: From cfd2d43d4a850cb38c1c57e2a7401122142c5ddf Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Fri, 10 Apr 2026 08:56:53 -0700 Subject: [PATCH 16/24] olmo 3.1 32B instruct preset --- src/olmo_eval/common/constants/models.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/olmo_eval/common/constants/models.py b/src/olmo_eval/common/constants/models.py index a6d71e58e..8beb76732 100644 --- a/src/olmo_eval/common/constants/models.py +++ b/src/olmo_eval/common/constants/models.py @@ -48,6 +48,11 @@ def get_model_presets() -> dict[str, ProviderConfig]: kind=ProviderKind.VLLM, model="allenai/Olmo-3-7B-Instruct", ), + "olmo-3.1-32b-instruct": ProviderConfig( + kind=ProviderKind.VLLM, + model="allenai/Olmo-3.1-32B-Instruct", + kwargs={"tensor_parallel_size": 2}, + ), "olmo-2-7b": ProviderConfig( kind=ProviderKind.VLLM, model="allenai/OLMo-2-1124-7B", From 455298ca9d54b9fb106fada182e4dca0cf1b61a7 Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Fri, 10 Apr 2026 09:03:47 -0700 Subject: [PATCH 17/24] qwen3 VL 32B instruct preset --- src/olmo_eval/common/constants/models.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/olmo_eval/common/constants/models.py b/src/olmo_eval/common/constants/models.py index 8beb76732..b177c4249 100644 --- a/src/olmo_eval/common/constants/models.py +++ b/src/olmo_eval/common/constants/models.py @@ -67,6 +67,11 @@ def get_model_presets() -> dict[str, ProviderConfig]: kind=ProviderKind.VLLM, model="Qwen/Qwen2.5-7B", ), + "qwen3-vl-32b-instruct": ProviderConfig( + kind=ProviderKind.VLLM, + model="Qwen/Qwen3-VL-32B-Instruct", + kwargs={"tensor_parallel_size": 2}, + ), "qwen3-coder-30b": ProviderConfig( kind=ProviderKind.VLLM_SERVER, model="Qwen/Qwen3-Coder-30B-A3B-Instruct", From b8659f8c8861179f934cd2dc251603c3aa5aa535 Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Fri, 10 Apr 2026 13:27:54 -0700 Subject: [PATCH 18/24] qwen3 32B instruct preset --- src/olmo_eval/common/constants/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/olmo_eval/common/constants/models.py b/src/olmo_eval/common/constants/models.py index b177c4249..e68bf1eea 100644 --- a/src/olmo_eval/common/constants/models.py +++ b/src/olmo_eval/common/constants/models.py @@ -67,9 +67,9 @@ def get_model_presets() -> dict[str, ProviderConfig]: kind=ProviderKind.VLLM, model="Qwen/Qwen2.5-7B", ), - "qwen3-vl-32b-instruct": ProviderConfig( + "qwen3-32b-instruct": ProviderConfig( kind=ProviderKind.VLLM, - model="Qwen/Qwen3-VL-32B-Instruct", + model="Qwen/Qwen3-32B-Instruct", kwargs={"tensor_parallel_size": 2}, ), "qwen3-coder-30b": ProviderConfig( From e565f0121ede7509af19bc69b6bf744b760b9a37 Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Fri, 10 Apr 2026 21:16:10 -0700 Subject: [PATCH 19/24] gemma, olmo3-think, r1-32B --- src/olmo_eval/common/constants/models.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/olmo_eval/common/constants/models.py b/src/olmo_eval/common/constants/models.py index e68bf1eea..8b2c4b7ac 100644 --- a/src/olmo_eval/common/constants/models.py +++ b/src/olmo_eval/common/constants/models.py @@ -53,6 +53,11 @@ def get_model_presets() -> dict[str, ProviderConfig]: model="allenai/Olmo-3.1-32B-Instruct", kwargs={"tensor_parallel_size": 2}, ), + "olmo-3.1-32b-think": ProviderConfig( + kind=ProviderKind.VLLM, + model="allenai/Olmo-3.1-32B-Think", + kwargs={"tensor_parallel_size": 2}, + ), "olmo-2-7b": ProviderConfig( kind=ProviderKind.VLLM, model="allenai/OLMo-2-1124-7B", @@ -72,6 +77,11 @@ def get_model_presets() -> dict[str, ProviderConfig]: model="Qwen/Qwen3-32B-Instruct", kwargs={"tensor_parallel_size": 2}, ), + "gemma-3-27b-it": ProviderConfig( + kind=ProviderKind.VLLM, + model="google/gemma-3-27b-it", + kwargs={"tensor_parallel_size": 2}, + ), "qwen3-coder-30b": ProviderConfig( kind=ProviderKind.VLLM_SERVER, model="Qwen/Qwen3-Coder-30B-A3B-Instruct", @@ -92,6 +102,12 @@ def get_model_presets() -> dict[str, ProviderConfig]: model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", max_model_len=32768, ), + "deepseek-r1-distill-qwen-32b": ProviderConfig( + kind=ProviderKind.VLLM, + model="deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + max_model_len=32768, + kwargs={"tensor_parallel_size": 2, "enforce_eager": True}, + ), "mistral-7b": ProviderConfig( kind=ProviderKind.VLLM, model="mistralai/Mistral-7B-v0.3", From cdf35fbc098453cd945bde312fe371c7c434503b Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Sat, 11 Apr 2026 08:13:16 -0700 Subject: [PATCH 20/24] qwen3-32B --- src/olmo_eval/common/constants/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/olmo_eval/common/constants/models.py b/src/olmo_eval/common/constants/models.py index 8b2c4b7ac..7d7a4d4e2 100644 --- a/src/olmo_eval/common/constants/models.py +++ b/src/olmo_eval/common/constants/models.py @@ -72,9 +72,9 @@ def get_model_presets() -> dict[str, ProviderConfig]: kind=ProviderKind.VLLM, model="Qwen/Qwen2.5-7B", ), - "qwen3-32b-instruct": ProviderConfig( + "qwen3-32b": ProviderConfig( kind=ProviderKind.VLLM, - model="Qwen/Qwen3-32B-Instruct", + model="Qwen/Qwen3-32B", kwargs={"tensor_parallel_size": 2}, ), "gemma-3-27b-it": ProviderConfig( From 555bda9fe4a3c58f37043b45a9026ffca71d3a0e Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Fri, 17 Apr 2026 17:42:47 -0700 Subject: [PATCH 21/24] r1-qwen3-8b --- src/olmo_eval/common/constants/models.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/olmo_eval/common/constants/models.py b/src/olmo_eval/common/constants/models.py index 7d7a4d4e2..88beba9be 100644 --- a/src/olmo_eval/common/constants/models.py +++ b/src/olmo_eval/common/constants/models.py @@ -102,6 +102,11 @@ def get_model_presets() -> dict[str, ProviderConfig]: model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", max_model_len=32768, ), + "deepseek-r1-0528-qwen3-8b": ProviderConfig( + kind=ProviderKind.VLLM, + model="deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", + max_model_len=32768, + ), "deepseek-r1-distill-qwen-32b": ProviderConfig( kind=ProviderKind.VLLM, model="deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", From 3f9b90350d2a46acce326bc1f5b566ce0e504fc3 Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Sat, 18 Apr 2026 13:46:28 -0700 Subject: [PATCH 22/24] gpt-oss-20b --- src/olmo_eval/common/constants/models.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/olmo_eval/common/constants/models.py b/src/olmo_eval/common/constants/models.py index 88beba9be..6d76f95d3 100644 --- a/src/olmo_eval/common/constants/models.py +++ b/src/olmo_eval/common/constants/models.py @@ -107,6 +107,10 @@ def get_model_presets() -> dict[str, ProviderConfig]: model="deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", max_model_len=32768, ), + "gpt-oss-20b": ProviderConfig( + kind=ProviderKind.VLLM, + model="openai/gpt-oss-20b", + ), "deepseek-r1-distill-qwen-32b": ProviderConfig( kind=ProviderKind.VLLM, model="deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", From 349b91cef00eb5f8a793f2803c83c16e9ab960ff Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Sat, 18 Apr 2026 14:35:06 -0700 Subject: [PATCH 23/24] task split --- src/olmo_eval/evals/tasks/hard_reasoning.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/olmo_eval/evals/tasks/hard_reasoning.py b/src/olmo_eval/evals/tasks/hard_reasoning.py index 09eaa7a88..62cd2523e 100644 --- a/src/olmo_eval/evals/tasks/hard_reasoning.py +++ b/src/olmo_eval/evals/tasks/hard_reasoning.py @@ -20,7 +20,7 @@ from olmo_eval.common.formatters import ChatFormatter from olmo_eval.common.metrics import AccuracyMetric from olmo_eval.common.scorers import Scorer -from olmo_eval.common.types import Instance, LMOutput, LMRequest, RequestType, SamplingParams +from olmo_eval.common.types import Instance, LMOutput, LMRequest, RequestType, SamplingParams, Split from olmo_eval.evals.tasks.common import Task, register, register_variant HARD_REASONING_TASKS: tuple[str, ...] = ( @@ -124,7 +124,7 @@ class HardReasoningBase(Task): @property def instances(self) -> Iterator[Instance]: if self._instances_cache is None: - self._instances_cache = list(self._load_hard_reasoning_split("test")) + self._instances_cache = list(self._load_hard_reasoning_split(self.config.split)) yield from self._instances_cache def _load_hard_reasoning_split(self, split: str) -> Iterator[Instance]: @@ -210,6 +210,11 @@ def _build_fewshot(self) -> list[Instance]: formatter=ChatFormatter(), sampling_params=SamplingParams(max_tokens=32768, temperature=0.0), ) + register_variant( + _task_name, + "dev", + split=Split.VALIDATION, + ) __all__ = [ From 1465f0bf4bcdc17bceb39f6fff5bc63ae9dce1dd Mon Sep 17 00:00:00 2001 From: Ronan Le Bras Date: Sun, 19 Apr 2026 15:33:58 -0700 Subject: [PATCH 24/24] new data format --- src/olmo_eval/evals/tasks/hard_reasoning.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/olmo_eval/evals/tasks/hard_reasoning.py b/src/olmo_eval/evals/tasks/hard_reasoning.py index 62cd2523e..7293adfcf 100644 --- a/src/olmo_eval/evals/tasks/hard_reasoning.py +++ b/src/olmo_eval/evals/tasks/hard_reasoning.py @@ -150,10 +150,10 @@ def _load_hard_reasoning_split(self, split: str) -> Iterator[Instance]: def process_doc(self, doc: dict[str, Any], index: int = 0) -> Instance | None: return Instance( - question=doc["problem"]["prompt"], + question=doc["prompt"], metadata={ "id": doc.get("id", index), - "scenario_data": doc["problem"], + "scenario_data": doc["instance"], "subset": self.subset, }, )