From 34d8e145272dc80bd33c0b7c73a145804d1ac43b Mon Sep 17 00:00:00 2001
From: Paul Laskowski <paull@allenai.org>
Date: Fri, 27 Feb 2026 10:08:04 -0800
Subject: [PATCH 1/4] adds simple smoke tests

---
 src/olmo_eval/common/types/base.py           |  14 +
 src/olmo_eval/evals/suites/smoke.py          |  22 ++
 src/olmo_eval/evals/tasks/smoke_tests.py     | 286 +++++++++++++++++++
 src/olmo_eval/inference/providers/litellm.py |  39 ++-
 4 files changed, 358 insertions(+), 3 deletions(-)
 create mode 100644 src/olmo_eval/evals/suites/smoke.py
 create mode 100644 src/olmo_eval/evals/tasks/smoke_tests.py

diff --git a/src/olmo_eval/common/types/base.py b/src/olmo_eval/common/types/base.py
index 625c973d7..d4736cdcf 100644
--- a/src/olmo_eval/common/types/base.py
+++ b/src/olmo_eval/common/types/base.py
@@ -169,12 +169,26 @@ class LMOutput:
     extracted_answer: Any = None
     metadata: dict[str, Any] = field(default_factory=dict)
     tool_calls: list[ToolCall] | None = None
+    reasoning: str | None = None
+    reasoning_content: str | None = None
 
     @property
     def has_tool_calls(self) -> bool:
         """Check if this output contains tool calls."""
         return self.tool_calls is not None and len(self.tool_calls) > 0
 
+    @property
+    def has_reasoning(self) -> bool:
+        """Check if this output contains reasoning content.
+
+        Returns True if either 'reasoning' or 'reasoning_content' field is present.
+        """
+        has_reasoning_field = self.reasoning is not None and len(self.reasoning) > 0
+        has_reasoning_content_field = (
+            self.reasoning_content is not None and len(self.reasoning_content) > 0
+        )
+        return has_reasoning_field or has_reasoning_content_field
+
 
 @dataclass(slots=True)
 class Response:
diff --git a/src/olmo_eval/evals/suites/smoke.py b/src/olmo_eval/evals/suites/smoke.py
new file mode 100644
index 000000000..a765cee73
--- /dev/null
+++ b/src/olmo_eval/evals/suites/smoke.py
@@ -0,0 +1,22 @@
+"""Smoke test suites for basic model sanity checks."""
+
+from olmo_eval.evals.suites.registry import AggregationStrategy, make_suite
+
+# =============================================================================
+# Smoke Test Suites
+# =============================================================================
+
+
+OLMO_INSTRUCT_SMOKE = make_suite(
+    name="olmo:instruct:smoke",
+    tasks=("smoke_hello", "smoke_identity_olmo", "smoke_toolcall"),
+    aggregation=AggregationStrategy.NONE,
+    description="Smoke tests for Olmo3 instruct models",
+)
+
+OLMO_THINK_SMOKE = make_suite(
+    name="olmo:think:smoke",
+    tasks=("smoke_hello", "smoke_identity_olmo", "smoke_reasoning"),
+    aggregation=AggregationStrategy.NONE,
+    description="Smoke tests for Olmo3 think models",
+)
diff --git a/src/olmo_eval/evals/tasks/smoke_tests.py b/src/olmo_eval/evals/tasks/smoke_tests.py
new file mode 100644
index 000000000..52ab36d0b
--- /dev/null
+++ b/src/olmo_eval/evals/tasks/smoke_tests.py
@@ -0,0 +1,286 @@
+"""Smoke tests for basic model sanity checks.
+
+Simple, single-instance tests to verify models respond correctly to basic prompts.
+
+Usage:
+    # Generic identity check (no scoring, just captures response)
+    olmo-eval run -m any-model -t smoke_identity
+
+    # Model-specific identity checks (scores against expected substring)
+    olmo-eval run -m olmo-model -t smoke_identity_olmo
+    olmo-eval run -m llama-model -t smoke_identity_llama
+    olmo-eval run -m gpt-model -t smoke_identity_gpt
+
+    # Basic hello test
+    olmo-eval run -m any-model -t smoke_hello
+
+    # Tool calling test (verifies model can make tool calls)
+    olmo-eval run -m any-model -t smoke_toolcall
+
+    # Reasoning test (verifies model returns reasoning content)
+    olmo-eval run -m reasoning-model -t smoke_reasoning
+"""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+from dataclasses import dataclass
+from typing import ClassVar
+
+from olmo_eval.common.formatters import CompletionFormatter
+from olmo_eval.common.metrics import AccuracyMetric
+from olmo_eval.common.scorers import Scorer, ToolCallScorer
+from olmo_eval.common.types import (
+    Instance,
+    LMOutput,
+    LMRequest,
+    RequestType,
+    SamplingParams,
+    ToolSchema,
+)
+from olmo_eval.evals.tasks.common import Task, register
+
+# =============================================================================
+# Substring Scorer
+# =============================================================================
+
+
+@dataclass(frozen=True, slots=True)
+class SubstringScorer(Scorer):
+    """Score 1.0 if gold answer substring appears in the output, else 0.0.
+
+    This is useful for identity checks where we want to verify the model
+    mentions a specific name/identifier in its response.
+    """
+
+    name: ClassVar[str] = "substring_match"
+    case_sensitive: bool = False
+
+    def score(self, instance: Instance, output: LMOutput) -> float:
+        if not instance.gold_answer:
+            # No expected substring configured - skip scoring
+            return 1.0
+
+        text = output.text or ""
+        expected = instance.gold_answer
+
+        if not self.case_sensitive:
+            text = text.lower()
+            expected = expected.lower()
+
+        return 1.0 if expected in text else 0.0
+
+
+@dataclass(frozen=True, slots=True)
+class NonEmptyResponseScorer(Scorer):
+    """Score 1.0 if model produced a non-empty response, else 0.0."""
+
+    name: ClassVar[str] = "non_empty_response"
+
+    def score(self, instance: Instance, output: LMOutput) -> float:
+        return 1.0 if output.text and output.text.strip() else 0.0
+
+
+@dataclass(frozen=True, slots=True)
+class ReasoningScorer(Scorer):
+    """Score 1.0 if model produced reasoning content, else 0.0.
+
+    This verifies that reasoning models correctly return their chain-of-thought
+    in the reasoning field of the response.
+    """
+
+    name: ClassVar[str] = "reasoning_present"
+
+    def score(self, instance: Instance, output: LMOutput) -> float:
+        return 1.0 if output.has_reasoning else 0.0
+
+
+# =============================================================================
+# Smoke Test Base Classes
+# =============================================================================
+
+
+class IdentitySmokeBase(Task):
+    """Base class for identity smoke tests.
+
+    Subclasses set `expected_substring` to define what the model should say.
+    Uses CompletionFormatter (not ChatFormatter which produces CHAT requests
+    that require a backend for agentic loops).
+    """
+
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=1024)
+    formatter = CompletionFormatter(template="User: {question}\nAssistant:")
+
+    # Override in subclasses to set expected model identity
+    expected_substring: str = ""
+
+    @property
+    def instances(self) -> Iterator[Instance]:
+        yield Instance(
+            question="Who are you?",
+            gold_answer=self.expected_substring,
+            metadata={"id": "identity", "check_type": "substring"},
+        )
+
+    def format_request(self, instance: Instance) -> LMRequest:
+        return self.config.formatter.format(instance, self.get_fewshot())
+
+
+# =============================================================================
+# Registered Smoke Test Tasks
+# =============================================================================
+
+
+@register("smoke_identity")
+class IdentitySmoke(IdentitySmokeBase):
+    """Smoke test: does the model correctly identify itself?
+
+    This is the generic version with no expected substring - it just checks
+    that the model produces a non-empty response. Use model-specific tasks
+    for substring matching:
+        - smoke_identity_olmo
+        - smoke_identity_llama
+        - smoke_identity_gpt
+        - etc.
+    """
+
+    expected_substring = ""
+    metrics = (AccuracyMetric(scorer=NonEmptyResponseScorer),)
+
+
+@register("smoke_identity_olmo")
+class IdentitySmokeOlmo(IdentitySmokeBase):
+    """Identity smoke test expecting 'Olmo' in response."""
+
+    expected_substring = "Olmo"
+    metrics = (AccuracyMetric(scorer=SubstringScorer),)
+
+
+@register("smoke_identity_llama")
+class IdentitySmokeLlama(IdentitySmokeBase):
+    """Identity smoke test expecting 'Llama' in response."""
+
+    expected_substring = "Llama"
+    metrics = (AccuracyMetric(scorer=SubstringScorer),)
+
+
+@register("smoke_identity_gpt")
+class IdentitySmokeGpt(IdentitySmokeBase):
+    """Identity smoke test expecting 'GPT' in response."""
+
+    expected_substring = "GPT"
+    metrics = (AccuracyMetric(scorer=SubstringScorer),)
+
+
+@register("smoke_hello")
+class HelloSmoke(Task):
+    """Smoke test: can the model respond to a greeting?
+
+    A basic sanity check that the model can produce a non-empty response.
+    Scores 1.0 if response is non-empty, 0.0 otherwise.
+    """
+
+    sampling_params = SamplingParams(temperature=0.0)
+    formatter = CompletionFormatter(template="User: {question}\nAssistant:")
+    metrics = (AccuracyMetric(scorer=NonEmptyResponseScorer),)
+
+    @property
+    def instances(self) -> Iterator[Instance]:
+        yield Instance(
+            question="Hello!",
+            gold_answer="",
+            metadata={"id": "hello"},
+        )
+
+    def format_request(self, instance: Instance) -> LMRequest:
+        return self.config.formatter.format(instance, self.get_fewshot())
+
+
+# =============================================================================
+# Tool Calling Smoke Test
+# =============================================================================
+
+# Weather tool schema for testing tool calls
+_WEATHER_TOOL = ToolSchema(
+    name="get_current_weather",
+    description="Get the current weather in a given location",
+    parameters={
+        "type": "object",
+        "properties": {
+            "location": {
+                "type": "string",
+                "description": "The city and state, e.g. San Francisco, CA",
+            },
+            "unit": {
+                "type": "string",
+                "enum": ["celsius", "fahrenheit"],
+            },
+        },
+        "required": ["location"],
+    },
+)
+
+
+@register("smoke_toolcall")
+class ToolCallSmoke(Task):
+    """Smoke test: can the model make tool calls?
+
+    Verifies that the model can correctly invoke a tool when provided with
+    a tool schema. The test asks about weather, expecting the model to call
+    the get_current_weather tool.
+
+    Scores 1.0 if the model calls the expected tool, 0.0 otherwise.
+    """
+
+    sampling_params = SamplingParams(temperature=0.0)
+    metrics = (AccuracyMetric(scorer=ToolCallScorer),)
+
+    @property
+    def instances(self) -> Iterator[Instance]:
+        yield Instance(
+            question="What's the weather like in Seattle?",
+            gold_answer="",
+            expected_tool_calls=({"name": "get_current_weather"},),
+            metadata={"id": "toolcall", "check_type": "tool_call"},
+        )
+
+    def format_request(self, instance: Instance) -> LMRequest:
+        return LMRequest(
+            request_type=RequestType.COMPLETION,
+            messages=({"role": "user", "content": instance.question},),
+            tools=(_WEATHER_TOOL,),
+        )
+
+
+# =============================================================================
+# Reasoning Smoke Test
+# =============================================================================
+
+
+@register("smoke_reasoning")
+class ReasoningSmoke(Task):
+    """Smoke test: does the model return reasoning content?
+
+    Verifies that reasoning models correctly parse and return their
+    chain-of-thought reasoning in the response. This test asks a simple
+    question and checks that the reasoning field is populated.
+
+    Scores 1.0 if reasoning is present, 0.0 otherwise.
+    """
+
+    sampling_params = SamplingParams(temperature=0.0)
+    metrics = (AccuracyMetric(scorer=ReasoningScorer),)
+
+    @property
+    def instances(self) -> Iterator[Instance]:
+        yield Instance(
+            question="Who are you?",
+            gold_answer="",
+            metadata={"id": "reasoning", "check_type": "reasoning_present"},
+        )
+
+    def format_request(self, instance: Instance) -> LMRequest:
+        return LMRequest(
+            request_type=RequestType.COMPLETION,
+            messages=({"role": "user", "content": instance.question},),
+        )
diff --git a/src/olmo_eval/inference/providers/litellm.py b/src/olmo_eval/inference/providers/litellm.py
index 3cfbeeee5..a73ec453f 100644
--- a/src/olmo_eval/inference/providers/litellm.py
+++ b/src/olmo_eval/inference/providers/litellm.py
@@ -7,7 +7,7 @@
 
 from olmo_eval.common.debug import is_debug_provider
 from olmo_eval.common.logging import get_logger
-from olmo_eval.common.types import LMOutput, LMRequest, LogProbEntry, SamplingParams
+from olmo_eval.common.types import LMOutput, LMRequest, LogProbEntry, SamplingParams, ToolCall
 from olmo_eval.inference.base import InferenceProvider
 from olmo_eval.inference.retry import retry_with_backoff
 from olmo_eval.inference.utils import run_async
@@ -134,7 +134,11 @@ async def _generate_single_impl(
             kwargs["temperature"] = params.temperature
         if params.stop_sequences:
             kwargs["stop"] = list(params.stop_sequences)[:_MAX_STOP_SEQUENCES]
-        # Always request logprobs for metrics computation
+
+        # Pass tools if provided in the request
+        if request.tools:
+            kwargs["tools"] = [tool.to_openai() for tool in request.tools]
+
         kwargs["logprobs"] = True
         kwargs["top_logprobs"] = (
             1  # NOTE: workaround for litellm proxy issue https://github.com/BerriAI/litellm/issues/21932
@@ -168,7 +172,36 @@ async def _generate_single_impl(
                     "num_tokens_all": num_tokens,
                 }
 
-            outputs.append(LMOutput(text=text, logprobs=logprob_entries, metadata=metadata))
+            # Extract tool calls from response
+            tool_calls: list[ToolCall] | None = None
+            message_tool_calls = getattr(choice.message, "tool_calls", None)
+            if message_tool_calls:
+                tool_calls = [ToolCall.from_openai(tc.model_dump()) for tc in message_tool_calls]
+
+            # Extract reasoning from response (for reasoning models)
+            # Check both 'reasoning' and 'reasoning_content' fields
+            reasoning: str | None = None
+            reasoning_content: str | None = None
+            message_content = getattr(choice.message, "content", None)
+            if message_content is not None:
+                if hasattr(message_content, "reasoning"):
+                    reasoning = message_content.reasoning
+                if hasattr(message_content, "reasoning_content"):
+                    reasoning_content = message_content.reasoning_content
+            # Also check directly on message for reasoning_content (some APIs use this)
+            if reasoning_content is None:
+                reasoning_content = getattr(choice.message, "reasoning_content", None)
+
+            outputs.append(
+                LMOutput(
+                    text=text,
+                    logprobs=logprob_entries,
+                    metadata=metadata,
+                    tool_calls=tool_calls,
+                    reasoning=reasoning,
+                    reasoning_content=reasoning_content,
+                )
+            )
 
         return outputs
 

From 87f5995f21fd011072fd2d2acfac73e426c417a0 Mon Sep 17 00:00:00 2001
From: Paul Laskowski <paull@allenai.org>
Date: Tue, 17 Mar 2026 16:58:11 -0700
Subject: [PATCH 2/4] fix type errors

---
 src/olmo_eval/evals/tasks/smoke_tests.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/olmo_eval/evals/tasks/smoke_tests.py b/src/olmo_eval/evals/tasks/smoke_tests.py
index 52ab36d0b..b72bc319a 100644
--- a/src/olmo_eval/evals/tasks/smoke_tests.py
+++ b/src/olmo_eval/evals/tasks/smoke_tests.py
@@ -123,7 +123,11 @@ def instances(self) -> Iterator[Instance]:
         )
 
     def format_request(self, instance: Instance) -> LMRequest:
-        return self.config.formatter.format(instance, self.get_fewshot())
+        """Format instance for the language model."""
+        if self.config.formatter is not None:
+            return self.config.formatter.format(instance, self.get_fewshot())
+        # Fallback formatting
+        return LMRequest(request_type=self.request_type, prompt=instance.question)
 
 
 # =============================================================================
@@ -193,7 +197,11 @@ def instances(self) -> Iterator[Instance]:
         )
 
     def format_request(self, instance: Instance) -> LMRequest:
-        return self.config.formatter.format(instance, self.get_fewshot())
+        """Format instance for the language model."""
+        if self.config.formatter is not None:
+            return self.config.formatter.format(instance, self.get_fewshot())
+        # Fallback formatting
+        return LMRequest(request_type=self.request_type, prompt=instance.question)
 
 
 # =============================================================================

From ca7a14c56e2d5d92739bc4a48d21086ca9810ca6 Mon Sep 17 00:00:00 2001
From: Paul Laskowski <paull@allenai.org>
Date: Thu, 19 Mar 2026 13:44:58 -0700
Subject: [PATCH 3/4] refactor reasoning check into provider specific dict

---
 src/olmo_eval/common/types/base.py           | 17 ++++++----------
 src/olmo_eval/inference/providers/litellm.py | 21 +++++++++-----------
 2 files changed, 15 insertions(+), 23 deletions(-)

diff --git a/src/olmo_eval/common/types/base.py b/src/olmo_eval/common/types/base.py
index d4736cdcf..3789aa938 100644
--- a/src/olmo_eval/common/types/base.py
+++ b/src/olmo_eval/common/types/base.py
@@ -162,6 +162,9 @@ class LMOutput:
     """Output from a language model.
 
     Supports both text generation and tool calling outputs.
+
+    The provider_extras dict holds provider-specific fields (e.g., has_reasoning
+    flag from reasoning models). Only providers that need it populate this field.
     """
 
     text: str
@@ -169,8 +172,7 @@ class LMOutput:
     extracted_answer: Any = None
     metadata: dict[str, Any] = field(default_factory=dict)
     tool_calls: list[ToolCall] | None = None
-    reasoning: str | None = None
-    reasoning_content: str | None = None
+    provider_extras: dict[str, Any] = field(default_factory=dict)
 
     @property
     def has_tool_calls(self) -> bool:
@@ -179,15 +181,8 @@ def has_tool_calls(self) -> bool:
 
     @property
     def has_reasoning(self) -> bool:
-        """Check if this output contains reasoning content.
-
-        Returns True if either 'reasoning' or 'reasoning_content' field is present.
-        """
-        has_reasoning_field = self.reasoning is not None and len(self.reasoning) > 0
-        has_reasoning_content_field = (
-            self.reasoning_content is not None and len(self.reasoning_content) > 0
-        )
-        return has_reasoning_field or has_reasoning_content_field
+        """Check if this output contains reasoning content."""
+        return self.provider_extras.get("has_reasoning", False)
 
 
 @dataclass(slots=True)
diff --git a/src/olmo_eval/inference/providers/litellm.py b/src/olmo_eval/inference/providers/litellm.py
index a73ec453f..df9eb3f75 100644
--- a/src/olmo_eval/inference/providers/litellm.py
+++ b/src/olmo_eval/inference/providers/litellm.py
@@ -178,19 +178,17 @@ async def _generate_single_impl(
             if message_tool_calls:
                 tool_calls = [ToolCall.from_openai(tc.model_dump()) for tc in message_tool_calls]
 
-            # Extract reasoning from response (for reasoning models)
-            # Check both 'reasoning' and 'reasoning_content' fields
-            reasoning: str | None = None
-            reasoning_content: str | None = None
+            # Check for reasoning content (for reasoning models)
+            has_reasoning = False
             message_content = getattr(choice.message, "content", None)
             if message_content is not None:
-                if hasattr(message_content, "reasoning"):
-                    reasoning = message_content.reasoning
-                if hasattr(message_content, "reasoning_content"):
-                    reasoning_content = message_content.reasoning_content
+                if getattr(message_content, "reasoning", None):
+                    has_reasoning = True
+                if getattr(message_content, "reasoning_content", None):
+                    has_reasoning = True
             # Also check directly on message for reasoning_content (some APIs use this)
-            if reasoning_content is None:
-                reasoning_content = getattr(choice.message, "reasoning_content", None)
+            if not has_reasoning and getattr(choice.message, "reasoning_content", None):
+                has_reasoning = True
 
             outputs.append(
                 LMOutput(
@@ -198,8 +196,7 @@ async def _generate_single_impl(
                     logprobs=logprob_entries,
                     metadata=metadata,
                     tool_calls=tool_calls,
-                    reasoning=reasoning,
-                    reasoning_content=reasoning_content,
+                    provider_extras={"has_reasoning": True} if has_reasoning else {},
                 )
             )
 

From e922f339ffd0d3f441f69d517157ec310549a536 Mon Sep 17 00:00:00 2001
From: Paul Laskowski <paull@allenai.org>
Date: Wed, 8 Apr 2026 08:30:47 -0700
Subject: [PATCH 4/4] refactor for feedback

---
 src/olmo_eval/evals/suites/smoke.py          |  22 --
 src/olmo_eval/evals/tasks/response_checks.py | 183 ++++++++++++
 src/olmo_eval/evals/tasks/smoke_tests.py     | 294 -------------------
 3 files changed, 183 insertions(+), 316 deletions(-)
 delete mode 100644 src/olmo_eval/evals/suites/smoke.py
 create mode 100644 src/olmo_eval/evals/tasks/response_checks.py
 delete mode 100644 src/olmo_eval/evals/tasks/smoke_tests.py

diff --git a/src/olmo_eval/evals/suites/smoke.py b/src/olmo_eval/evals/suites/smoke.py
deleted file mode 100644
index a765cee73..000000000
--- a/src/olmo_eval/evals/suites/smoke.py
+++ /dev/null
@@ -1,22 +0,0 @@
-"""Smoke test suites for basic model sanity checks."""
-
-from olmo_eval.evals.suites.registry import AggregationStrategy, make_suite
-
-# =============================================================================
-# Smoke Test Suites
-# =============================================================================
-
-
-OLMO_INSTRUCT_SMOKE = make_suite(
-    name="olmo:instruct:smoke",
-    tasks=("smoke_hello", "smoke_identity_olmo", "smoke_toolcall"),
-    aggregation=AggregationStrategy.NONE,
-    description="Smoke tests for Olmo3 instruct models",
-)
-
-OLMO_THINK_SMOKE = make_suite(
-    name="olmo:think:smoke",
-    tasks=("smoke_hello", "smoke_identity_olmo", "smoke_reasoning"),
-    aggregation=AggregationStrategy.NONE,
-    description="Smoke tests for Olmo3 think models",
-)
diff --git a/src/olmo_eval/evals/tasks/response_checks.py b/src/olmo_eval/evals/tasks/response_checks.py
new file mode 100644
index 000000000..cd449e310
--- /dev/null
+++ b/src/olmo_eval/evals/tasks/response_checks.py
@@ -0,0 +1,183 @@
+"""Response tests for verifying model response properties.
+
+Simple tests to verify models respond correctly with expected properties.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+from dataclasses import dataclass
+from typing import ClassVar
+
+from olmo_eval.common.formatters import CompletionFormatter
+from olmo_eval.common.metrics import AccuracyMetric
+from olmo_eval.common.scorers import Scorer, SubstringRecallScorer, ToolCallScorer
+from olmo_eval.common.types import (
+    Instance,
+    LMOutput,
+    LMRequest,
+    RequestType,
+    SamplingParams,
+    ToolSchema,
+)
+from olmo_eval.evals.tasks.common import Task, register
+
+
+@dataclass(frozen=True, slots=True)
+class NonEmptyResponseScorer(Scorer):
+    """Score 1.0 if model produced a non-empty response, else 0.0."""
+
+    name: ClassVar[str] = "non_empty_response"
+
+    def score(self, instance: Instance, output: LMOutput) -> float:
+        return 1.0 if output.text and output.text.strip() else 0.0
+
+
+@dataclass(frozen=True, slots=True)
+class ReasoningResponseScorer(Scorer):
+    """Score 1.0 if model produced reasoning content, else 0.0.
+
+    This verifies that reasoning models return their chain-of-thought, as
+    reported by the ``has_reasoning`` flag on the model output.
+    """
+
+    name: ClassVar[str] = "reasoning_present"
+
+    def score(self, instance: Instance, output: LMOutput) -> float:
+        return 1.0 if output.has_reasoning else 0.0
+
+
+# =============================================================================
+# Content Verification Response Test
+# =============================================================================
+
+
+@register("response_match")
+class ResponseContentVerify(Task):
+    """Verify that model responses contain expected content.
+
+    - Use without data_source (default): Asks "Who are you?" and checks for non-empty response
+
+    - Use with adhoc data_source: Loads prompts and expected substrings from file
+        and checks that each response contains the expected substring.
+
+    Data file format (JSONL):
+        {"question": "Who are you?", "expected_substring": "OLMo"}
+    """
+
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=1024)
+    formatter = CompletionFormatter(template="User: {question}\nAssistant:")
+    metrics = (
+        AccuracyMetric(scorer=SubstringRecallScorer),
+        AccuracyMetric(scorer=NonEmptyResponseScorer),
+    )
+    primary_metric = AccuracyMetric(scorer=SubstringRecallScorer)
+
+    def process_doc(self, doc: dict, index: int = 0) -> Instance:
+        return Instance(
+            question=doc["question"],
+            gold_answer=doc.get("expected_substring", ""),
+            metadata={"id": f"response_match_{index}", "check_type": "substring"},
+        )
+
+    @property
+    def instances(self) -> Iterator[Instance]:
+        if self.config.data_source is not None:
+            yield from self._load_instances()
+        else:
+            yield Instance(
+                question="Who are you?",
+                gold_answer="",
+                metadata={"id": "response_match_default", "check_type": "substring"},
+            )
+
+    def format_request(self, instance: Instance) -> LMRequest:
+        if self.config.formatter is not None:
+            return self.config.formatter.format(instance, self.get_fewshot())
+        return LMRequest(request_type=self.request_type, prompt=instance.question)
+
+
+# =============================================================================
+# Tool Calling Response Test
+# =============================================================================
+
+# Weather tool schema for testing tool calls
+_WEATHER_TOOL = ToolSchema(
+    name="get_current_weather",
+    description="Get the current weather in a given location",
+    parameters={
+        "type": "object",
+        "properties": {
+            "location": {
+                "type": "string",
+                "description": "The city and state, e.g. San Francisco, CA",
+            },
+            "unit": {
+                "type": "string",
+                "enum": ["celsius", "fahrenheit"],
+            },
+        },
+        "required": ["location"],
+    },
+)
+
+
+@register("response_toolcall")
+class ResponseToolCall(Task):
+    """Response test: can the model make tool calls?
+
+    Verifies that the model can correctly invoke a tool when provided with
+    a tool schema. The test asks about weather, expecting the model to call
+    the get_current_weather tool.
+    """
+
+    sampling_params = SamplingParams(temperature=0.0)
+    metrics = (AccuracyMetric(scorer=ToolCallScorer),)
+
+    @property
+    def instances(self) -> Iterator[Instance]:
+        yield Instance(
+            question="What's the weather like in Seattle?",
+            gold_answer="",
+            expected_tool_calls=({"name": "get_current_weather"},),
+            metadata={"id": "toolcall", "check_type": "tool_call"},
+        )
+
+    def format_request(self, instance: Instance) -> LMRequest:
+        return LMRequest(
+            request_type=RequestType.COMPLETION,
+            messages=({"role": "user", "content": instance.question},),
+            tools=(_WEATHER_TOOL,),
+        )
+
+
+# =============================================================================
+# Reasoning Response Test
+# =============================================================================
+
+
+@register("response_reasoning")
+class ResponseReasoning(Task):
+    """Response test: does the model return reasoning content?
+
+    Verifies that reasoning models correctly parse and return their
+    chain-of-thought reasoning in the response. This test asks a simple
+    question and checks that the output is flagged as containing reasoning.
+    """
+
+    sampling_params = SamplingParams(temperature=0.0)
+    metrics = (AccuracyMetric(scorer=ReasoningResponseScorer),)
+
+    @property
+    def instances(self) -> Iterator[Instance]:
+        yield Instance(
+            question="Who are you?",
+            gold_answer="",
+            metadata={"id": "reasoning", "check_type": "reasoning_present"},
+        )
+
+    def format_request(self, instance: Instance) -> LMRequest:
+        return LMRequest(
+            request_type=RequestType.COMPLETION,
+            messages=({"role": "user", "content": instance.question},),
+        )
diff --git a/src/olmo_eval/evals/tasks/smoke_tests.py b/src/olmo_eval/evals/tasks/smoke_tests.py
deleted file mode 100644
index b72bc319a..000000000
--- a/src/olmo_eval/evals/tasks/smoke_tests.py
+++ /dev/null
@@ -1,294 +0,0 @@
-"""Smoke tests for basic model sanity checks.
-
-Simple, single-instance tests to verify models respond correctly to basic prompts.
-
-Usage:
-    # Generic identity check (no scoring, just captures response)
-    olmo-eval run -m any-model -t smoke_identity
-
-    # Model-specific identity checks (scores against expected substring)
-    olmo-eval run -m olmo-model -t smoke_identity_olmo
-    olmo-eval run -m llama-model -t smoke_identity_llama
-    olmo-eval run -m gpt-model -t smoke_identity_gpt
-
-    # Basic hello test
-    olmo-eval run -m any-model -t smoke_hello
-
-    # Tool calling test (verifies model can make tool calls)
-    olmo-eval run -m any-model -t smoke_toolcall
-
-    # Reasoning test (verifies model returns reasoning content)
-    olmo-eval run -m reasoning-model -t smoke_reasoning
-"""
-
-from __future__ import annotations
-
-from collections.abc import Iterator
-from dataclasses import dataclass
-from typing import ClassVar
-
-from olmo_eval.common.formatters import CompletionFormatter
-from olmo_eval.common.metrics import AccuracyMetric
-from olmo_eval.common.scorers import Scorer, ToolCallScorer
-from olmo_eval.common.types import (
-    Instance,
-    LMOutput,
-    LMRequest,
-    RequestType,
-    SamplingParams,
-    ToolSchema,
-)
-from olmo_eval.evals.tasks.common import Task, register
-
-# =============================================================================
-# Substring Scorer
-# =============================================================================
-
-
-@dataclass(frozen=True, slots=True)
-class SubstringScorer(Scorer):
-    """Score 1.0 if gold answer substring appears in the output, else 0.0.
-
-    This is useful for identity checks where we want to verify the model
-    mentions a specific name/identifier in its response.
-    """
-
-    name: ClassVar[str] = "substring_match"
-    case_sensitive: bool = False
-
-    def score(self, instance: Instance, output: LMOutput) -> float:
-        if not instance.gold_answer:
-            # No expected substring configured - skip scoring
-            return 1.0
-
-        text = output.text or ""
-        expected = instance.gold_answer
-
-        if not self.case_sensitive:
-            text = text.lower()
-            expected = expected.lower()
-
-        return 1.0 if expected in text else 0.0
-
-
-@dataclass(frozen=True, slots=True)
-class NonEmptyResponseScorer(Scorer):
-    """Score 1.0 if model produced a non-empty response, else 0.0."""
-
-    name: ClassVar[str] = "non_empty_response"
-
-    def score(self, instance: Instance, output: LMOutput) -> float:
-        return 1.0 if output.text and output.text.strip() else 0.0
-
-
-@dataclass(frozen=True, slots=True)
-class ReasoningScorer(Scorer):
-    """Score 1.0 if model produced reasoning content, else 0.0.
-
-    This verifies that reasoning models correctly return their chain-of-thought
-    in the reasoning field of the response.
-    """
-
-    name: ClassVar[str] = "reasoning_present"
-
-    def score(self, instance: Instance, output: LMOutput) -> float:
-        return 1.0 if output.has_reasoning else 0.0
-
-
-# =============================================================================
-# Smoke Test Base Classes
-# =============================================================================
-
-
-class IdentitySmokeBase(Task):
-    """Base class for identity smoke tests.
-
-    Subclasses set `expected_substring` to define what the model should say.
-    Uses CompletionFormatter (not ChatFormatter which produces CHAT requests
-    that require a backend for agentic loops).
-    """
-
-    sampling_params = SamplingParams(temperature=0.0, max_tokens=1024)
-    formatter = CompletionFormatter(template="User: {question}\nAssistant:")
-
-    # Override in subclasses to set expected model identity
-    expected_substring: str = ""
-
-    @property
-    def instances(self) -> Iterator[Instance]:
-        yield Instance(
-            question="Who are you?",
-            gold_answer=self.expected_substring,
-            metadata={"id": "identity", "check_type": "substring"},
-        )
-
-    def format_request(self, instance: Instance) -> LMRequest:
-        """Format instance for the language model."""
-        if self.config.formatter is not None:
-            return self.config.formatter.format(instance, self.get_fewshot())
-        # Fallback formatting
-        return LMRequest(request_type=self.request_type, prompt=instance.question)
-
-
-# =============================================================================
-# Registered Smoke Test Tasks
-# =============================================================================
-
-
-@register("smoke_identity")
-class IdentitySmoke(IdentitySmokeBase):
-    """Smoke test: does the model correctly identify itself?
-
-    This is the generic version with no expected substring - it just checks
-    that the model produces a non-empty response. Use model-specific tasks
-    for substring matching:
-        - smoke_identity_olmo
-        - smoke_identity_llama
-        - smoke_identity_gpt
-        - etc.
-    """
-
-    expected_substring = ""
-    metrics = (AccuracyMetric(scorer=NonEmptyResponseScorer),)
-
-
-@register("smoke_identity_olmo")
-class IdentitySmokeOlmo(IdentitySmokeBase):
-    """Identity smoke test expecting 'Olmo' in response."""
-
-    expected_substring = "Olmo"
-    metrics = (AccuracyMetric(scorer=SubstringScorer),)
-
-
-@register("smoke_identity_llama")
-class IdentitySmokeLlama(IdentitySmokeBase):
-    """Identity smoke test expecting 'Llama' in response."""
-
-    expected_substring = "Llama"
-    metrics = (AccuracyMetric(scorer=SubstringScorer),)
-
-
-@register("smoke_identity_gpt")
-class IdentitySmokeGpt(IdentitySmokeBase):
-    """Identity smoke test expecting 'GPT' in response."""
-
-    expected_substring = "GPT"
-    metrics = (AccuracyMetric(scorer=SubstringScorer),)
-
-
-@register("smoke_hello")
-class HelloSmoke(Task):
-    """Smoke test: can the model respond to a greeting?
-
-    A basic sanity check that the model can produce a non-empty response.
-    Scores 1.0 if response is non-empty, 0.0 otherwise.
-    """
-
-    sampling_params = SamplingParams(temperature=0.0)
-    formatter = CompletionFormatter(template="User: {question}\nAssistant:")
-    metrics = (AccuracyMetric(scorer=NonEmptyResponseScorer),)
-
-    @property
-    def instances(self) -> Iterator[Instance]:
-        yield Instance(
-            question="Hello!",
-            gold_answer="",
-            metadata={"id": "hello"},
-        )
-
-    def format_request(self, instance: Instance) -> LMRequest:
-        """Format instance for the language model."""
-        if self.config.formatter is not None:
-            return self.config.formatter.format(instance, self.get_fewshot())
-        # Fallback formatting
-        return LMRequest(request_type=self.request_type, prompt=instance.question)
-
-
-# =============================================================================
-# Tool Calling Smoke Test
-# =============================================================================
-
-# Weather tool schema for testing tool calls
-_WEATHER_TOOL = ToolSchema(
-    name="get_current_weather",
-    description="Get the current weather in a given location",
-    parameters={
-        "type": "object",
-        "properties": {
-            "location": {
-                "type": "string",
-                "description": "The city and state, e.g. San Francisco, CA",
-            },
-            "unit": {
-                "type": "string",
-                "enum": ["celsius", "fahrenheit"],
-            },
-        },
-        "required": ["location"],
-    },
-)
-
-
-@register("smoke_toolcall")
-class ToolCallSmoke(Task):
-    """Smoke test: can the model make tool calls?
-
-    Verifies that the model can correctly invoke a tool when provided with
-    a tool schema. The test asks about weather, expecting the model to call
-    the get_current_weather tool.
-
-    Scores 1.0 if the model calls the expected tool, 0.0 otherwise.
-    """
-
-    sampling_params = SamplingParams(temperature=0.0)
-    metrics = (AccuracyMetric(scorer=ToolCallScorer),)
-
-    @property
-    def instances(self) -> Iterator[Instance]:
-        yield Instance(
-            question="What's the weather like in Seattle?",
-            gold_answer="",
-            expected_tool_calls=({"name": "get_current_weather"},),
-            metadata={"id": "toolcall", "check_type": "tool_call"},
-        )
-
-    def format_request(self, instance: Instance) -> LMRequest:
-        return LMRequest(
-            request_type=RequestType.COMPLETION,
-            messages=({"role": "user", "content": instance.question},),
-            tools=(_WEATHER_TOOL,),
-        )
-
-
-# =============================================================================
-# Reasoning Smoke Test
-# =============================================================================
-
-
-@register("smoke_reasoning")
-class ReasoningSmoke(Task):
-    """Smoke test: does the model return reasoning content?
-
-    Verifies that reasoning models correctly parse and return their
-    chain-of-thought reasoning in the response. This test asks a simple
-    question and checks that the reasoning field is populated.
-
-    Scores 1.0 if reasoning is present, 0.0 otherwise.
-    """
-
-    sampling_params = SamplingParams(temperature=0.0)
-    metrics = (AccuracyMetric(scorer=ReasoningScorer),)
-
-    @property
-    def instances(self) -> Iterator[Instance]:
-        yield Instance(
-            question="Who are you?",
-            gold_answer="",
-            metadata={"id": "reasoning", "check_type": "reasoning_present"},
-        )
-
-    def format_request(self, instance: Instance) -> LMRequest:
-        return LMRequest(
-            request_type=RequestType.COMPLETION,
-            messages=({"role": "user", "content": instance.question},),
-        )