From 34d8e145272dc80bd33c0b7c73a145804d1ac43b Mon Sep 17 00:00:00 2001 From: Paul Laskowski Date: Fri, 27 Feb 2026 10:08:04 -0800 Subject: [PATCH 1/4] adds smoke simple smoke tests --- src/olmo_eval/common/types/base.py | 14 + src/olmo_eval/evals/suites/smoke.py | 22 ++ src/olmo_eval/evals/tasks/smoke_tests.py | 286 +++++++++++++++++++ src/olmo_eval/inference/providers/litellm.py | 39 ++- 4 files changed, 358 insertions(+), 3 deletions(-) create mode 100644 src/olmo_eval/evals/suites/smoke.py create mode 100644 src/olmo_eval/evals/tasks/smoke_tests.py diff --git a/src/olmo_eval/common/types/base.py b/src/olmo_eval/common/types/base.py index 625c973d7..d4736cdcf 100644 --- a/src/olmo_eval/common/types/base.py +++ b/src/olmo_eval/common/types/base.py @@ -169,12 +169,26 @@ class LMOutput: extracted_answer: Any = None metadata: dict[str, Any] = field(default_factory=dict) tool_calls: list[ToolCall] | None = None + reasoning: str | None = None + reasoning_content: str | None = None @property def has_tool_calls(self) -> bool: """Check if this output contains tool calls.""" return self.tool_calls is not None and len(self.tool_calls) > 0 + @property + def has_reasoning(self) -> bool: + """Check if this output contains reasoning content. + + Returns True if either 'reasoning' or 'reasoning_content' field is present. + """ + has_reasoning_field = self.reasoning is not None and len(self.reasoning) > 0 + has_reasoning_content_field = ( + self.reasoning_content is not None and len(self.reasoning_content) > 0 + ) + return has_reasoning_field or has_reasoning_content_field + @dataclass(slots=True) class Response: diff --git a/src/olmo_eval/evals/suites/smoke.py b/src/olmo_eval/evals/suites/smoke.py new file mode 100644 index 000000000..a765cee73 --- /dev/null +++ b/src/olmo_eval/evals/suites/smoke.py @@ -0,0 +1,22 @@ +"""Smoke test suites for basic model sanity checks.""" + +from olmo_eval.evals.suites.registry import AggregationStrategy, make_suite + +# ============================================================================= +# Smoke Test Suites +# ============================================================================= + + +OLMO_INSTRUCT_SMOKE = make_suite( + name="olmo:instruct:smoke", + tasks=("smoke_hello", "smoke_identity_olmo", "smoke_toolcall"), + aggregation=AggregationStrategy.NONE, + description="Smoke tests for Olmo3 instruct models", +) + +OLMO_THINK_SMOKE = make_suite( + name="olmo:think:smoke", + tasks=("smoke_hello", "smoke_identity_olmo", "smoke_reasoning"), + aggregation=AggregationStrategy.NONE, + description="Smoke tests for Olmo3 think models", +) diff --git a/src/olmo_eval/evals/tasks/smoke_tests.py b/src/olmo_eval/evals/tasks/smoke_tests.py new file mode 100644 index 000000000..52ab36d0b --- /dev/null +++ b/src/olmo_eval/evals/tasks/smoke_tests.py @@ -0,0 +1,286 @@ +"""Smoke tests for basic model sanity checks. + +Simple, single-instance tests to verify models respond correctly to basic prompts. + +Usage: + # Generic identity check (no scoring, just captures response) + olmo-eval run -m any-model -t smoke_identity + + # Model-specific identity checks (scores against expected substring) + olmo-eval run -m olmo-model -t smoke_identity_olmo + olmo-eval run -m llama-model -t smoke_identity_llama + olmo-eval run -m gpt-model -t smoke_identity_gpt + + # Basic hello test + olmo-eval run -m any-model -t smoke_hello + + # Tool calling test (verifies model can make tool calls) + olmo-eval run -m any-model -t smoke_toolcall + + # Reasoning test (verifies model returns reasoning content) + olmo-eval run -m reasoning-model -t smoke_reasoning +""" + +from __future__ import annotations + +from collections.abc import Iterator +from dataclasses import dataclass +from typing import ClassVar + +from olmo_eval.common.formatters import CompletionFormatter +from olmo_eval.common.metrics import AccuracyMetric +from olmo_eval.common.scorers import Scorer, ToolCallScorer +from olmo_eval.common.types import ( + Instance, + LMOutput, + LMRequest, + RequestType, + SamplingParams, + ToolSchema, +) +from olmo_eval.evals.tasks.common import Task, register + +# ============================================================================= +# Substring Scorer +# ============================================================================= + + +@dataclass(frozen=True, slots=True) +class SubstringScorer(Scorer): + """Score 1.0 if gold answer substring appears in the output, else 0.0. + + This is useful for identity checks where we want to verify the model + mentions a specific name/identifier in its response. + """ + + name: ClassVar[str] = "substring_match" + case_sensitive: bool = False + + def score(self, instance: Instance, output: LMOutput) -> float: + if not instance.gold_answer: + # No expected substring configured - skip scoring + return 1.0 + + text = output.text or "" + expected = instance.gold_answer + + if not self.case_sensitive: + text = text.lower() + expected = expected.lower() + + return 1.0 if expected in text else 0.0 + + +@dataclass(frozen=True, slots=True) +class NonEmptyResponseScorer(Scorer): + """Score 1.0 if model produced a non-empty response, else 0.0.""" + + name: ClassVar[str] = "non_empty_response" + + def score(self, instance: Instance, output: LMOutput) -> float: + return 1.0 if output.text and output.text.strip() else 0.0 + + +@dataclass(frozen=True, slots=True) +class ReasoningScorer(Scorer): + """Score 1.0 if model produced reasoning content, else 0.0. + + This verifies that reasoning models correctly return their chain-of-thought + in the reasoning field of the response. + """ + + name: ClassVar[str] = "reasoning_present" + + def score(self, instance: Instance, output: LMOutput) -> float: + return 1.0 if output.has_reasoning else 0.0 + + +# ============================================================================= +# Smoke Test Base Classes +# ============================================================================= + + +class IdentitySmokeBase(Task): + """Base class for identity smoke tests. + + Subclasses set `expected_substring` to define what the model should say. + Uses CompletionFormatter (not ChatFormatter which produces CHAT requests + that require a backend for agentic loops). + """ + + sampling_params = SamplingParams(temperature=0.0, max_tokens=1024) + formatter = CompletionFormatter(template="User: {question}\nAssistant:") + + # Override in subclasses to set expected model identity + expected_substring: str = "" + + @property + def instances(self) -> Iterator[Instance]: + yield Instance( + question="Who are you?", + gold_answer=self.expected_substring, + metadata={"id": "identity", "check_type": "substring"}, + ) + + def format_request(self, instance: Instance) -> LMRequest: + return self.config.formatter.format(instance, self.get_fewshot()) + + +# ============================================================================= +# Registered Smoke Test Tasks +# ============================================================================= + + +@register("smoke_identity") +class IdentitySmoke(IdentitySmokeBase): + """Smoke test: does the model correctly identify itself? + + This is the generic version with no expected substring - it just checks + that the model produces a non-empty response. Use model-specific tasks + for substring matching: + - smoke_identity_olmo + - smoke_identity_llama + - smoke_identity_gpt + - etc. + """ + + expected_substring = "" + metrics = (AccuracyMetric(scorer=NonEmptyResponseScorer),) + + +@register("smoke_identity_olmo") +class IdentitySmokeOlmo(IdentitySmokeBase): + """Identity smoke test expecting 'Olmo' in response.""" + + expected_substring = "Olmo" + metrics = (AccuracyMetric(scorer=SubstringScorer),) + + +@register("smoke_identity_llama") +class IdentitySmokeLlama(IdentitySmokeBase): + """Identity smoke test expecting 'Llama' in response.""" + + expected_substring = "Llama" + metrics = (AccuracyMetric(scorer=SubstringScorer),) + + +@register("smoke_identity_gpt") +class IdentitySmokeGpt(IdentitySmokeBase): + """Identity smoke test expecting 'GPT' in response.""" + + expected_substring = "GPT" + metrics = (AccuracyMetric(scorer=SubstringScorer),) + + +@register("smoke_hello") +class HelloSmoke(Task): + """Smoke test: can the model respond to a greeting? + + A basic sanity check that the model can produce a non-empty response. + Scores 1.0 if response is non-empty, 0.0 otherwise. + """ + + sampling_params = SamplingParams(temperature=0.0) + formatter = CompletionFormatter(template="User: {question}\nAssistant:") + metrics = (AccuracyMetric(scorer=NonEmptyResponseScorer),) + + @property + def instances(self) -> Iterator[Instance]: + yield Instance( + question="Hello!", + gold_answer="", + metadata={"id": "hello"}, + ) + + def format_request(self, instance: Instance) -> LMRequest: + return self.config.formatter.format(instance, self.get_fewshot()) + + +# ============================================================================= +# Tool Calling Smoke Test +# ============================================================================= + +# Weather tool schema for testing tool calls +_WEATHER_TOOL = ToolSchema( + name="get_current_weather", + description="Get the current weather in a given location", + parameters={ + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + }, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + }, + }, + "required": ["location"], + }, +) + + +@register("smoke_toolcall") +class ToolCallSmoke(Task): + """Smoke test: can the model make tool calls? + + Verifies that the model can correctly invoke a tool when provided with + a tool schema. The test asks about weather, expecting the model to call + the get_current_weather tool. + + Scores 1.0 if the model calls the expected tool, 0.0 otherwise. + """ + + sampling_params = SamplingParams(temperature=0.0) + metrics = (AccuracyMetric(scorer=ToolCallScorer),) + + @property + def instances(self) -> Iterator[Instance]: + yield Instance( + question="What's the weather like in Seattle?", + gold_answer="", + expected_tool_calls=({"name": "get_current_weather"},), + metadata={"id": "toolcall", "check_type": "tool_call"}, + ) + + def format_request(self, instance: Instance) -> LMRequest: + return LMRequest( + request_type=RequestType.COMPLETION, + messages=({"role": "user", "content": instance.question},), + tools=(_WEATHER_TOOL,), + ) + + +# ============================================================================= +# Reasoning Smoke Test +# ============================================================================= + + +@register("smoke_reasoning") +class ReasoningSmoke(Task): + """Smoke test: does the model return reasoning content? + + Verifies that reasoning models correctly parse and return their + chain-of-thought reasoning in the response. This test asks a simple + question and checks that the reasoning field is populated. + + Scores 1.0 if reasoning is present, 0.0 otherwise. + """ + + sampling_params = SamplingParams(temperature=0.0) + metrics = (AccuracyMetric(scorer=ReasoningScorer),) + + @property + def instances(self) -> Iterator[Instance]: + yield Instance( + question="Who are you?", + gold_answer="", + metadata={"id": "reasoning", "check_type": "reasoning_present"}, + ) + + def format_request(self, instance: Instance) -> LMRequest: + return LMRequest( + request_type=RequestType.COMPLETION, + messages=({"role": "user", "content": instance.question},), + ) diff --git a/src/olmo_eval/inference/providers/litellm.py b/src/olmo_eval/inference/providers/litellm.py index 3cfbeeee5..a73ec453f 100644 --- a/src/olmo_eval/inference/providers/litellm.py +++ b/src/olmo_eval/inference/providers/litellm.py @@ -7,7 +7,7 @@ from olmo_eval.common.debug import is_debug_provider from olmo_eval.common.logging import get_logger -from olmo_eval.common.types import LMOutput, LMRequest, LogProbEntry, SamplingParams +from olmo_eval.common.types import LMOutput, LMRequest, LogProbEntry, SamplingParams, ToolCall from olmo_eval.inference.base import InferenceProvider from olmo_eval.inference.retry import retry_with_backoff from olmo_eval.inference.utils import run_async @@ -134,7 +134,11 @@ async def _generate_single_impl( kwargs["temperature"] = params.temperature if params.stop_sequences: kwargs["stop"] = list(params.stop_sequences)[:_MAX_STOP_SEQUENCES] - # Always request logprobs for metrics computation + + # Pass tools if provided in the request + if request.tools: + kwargs["tools"] = [tool.to_openai() for tool in request.tools] + kwargs["logprobs"] = True kwargs["top_logprobs"] = ( 1 # NOTE: workaround for litellm proxy issue https://github.com/BerriAI/litellm/issues/21932 @@ -168,7 +172,36 @@ async def _generate_single_impl( "num_tokens_all": num_tokens, } - outputs.append(LMOutput(text=text, logprobs=logprob_entries, metadata=metadata)) + # Extract tool calls from response + tool_calls: list[ToolCall] | None = None + message_tool_calls = getattr(choice.message, "tool_calls", None) + if message_tool_calls: + tool_calls = [ToolCall.from_openai(tc.model_dump()) for tc in message_tool_calls] + + # Extract reasoning from response (for reasoning models) + # Check both 'reasoning' and 'reasoning_content' fields + reasoning: str | None = None + reasoning_content: str | None = None + message_content = getattr(choice.message, "content", None) + if message_content is not None: + if hasattr(message_content, "reasoning"): + reasoning = message_content.reasoning + if hasattr(message_content, "reasoning_content"): + reasoning_content = message_content.reasoning_content + # Also check directly on message for reasoning_content (some APIs use this) + if reasoning_content is None: + reasoning_content = getattr(choice.message, "reasoning_content", None) + + outputs.append( + LMOutput( + text=text, + logprobs=logprob_entries, + metadata=metadata, + tool_calls=tool_calls, + reasoning=reasoning, + reasoning_content=reasoning_content, + ) + ) return outputs From 87f5995f21fd011072fd2d2acfac73e426c417a0 Mon Sep 17 00:00:00 2001 From: Paul Laskowski Date: Tue, 17 Mar 2026 16:58:11 -0700 Subject: [PATCH 2/4] fix type errors --- src/olmo_eval/evals/tasks/smoke_tests.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/olmo_eval/evals/tasks/smoke_tests.py b/src/olmo_eval/evals/tasks/smoke_tests.py index 52ab36d0b..b72bc319a 100644 --- a/src/olmo_eval/evals/tasks/smoke_tests.py +++ b/src/olmo_eval/evals/tasks/smoke_tests.py @@ -123,7 +123,11 @@ def instances(self) -> Iterator[Instance]: ) def format_request(self, instance: Instance) -> LMRequest: - return self.config.formatter.format(instance, self.get_fewshot()) + """Format instance for the language model.""" + if self.config.formatter is not None: + return self.config.formatter.format(instance, self.get_fewshot()) + # Fallback formatting + return LMRequest(request_type=self.request_type, prompt=instance.question) # ============================================================================= @@ -193,7 +197,11 @@ def instances(self) -> Iterator[Instance]: ) def format_request(self, instance: Instance) -> LMRequest: - return self.config.formatter.format(instance, self.get_fewshot()) + """Format instance for the language model.""" + if self.config.formatter is not None: + return self.config.formatter.format(instance, self.get_fewshot()) + # Fallback formatting + return LMRequest(request_type=self.request_type, prompt=instance.question) # ============================================================================= From ca7a14c56e2d5d92739bc4a48d21086ca9810ca6 Mon Sep 17 00:00:00 2001 From: Paul Laskowski Date: Thu, 19 Mar 2026 13:44:58 -0700 Subject: [PATCH 3/4] refactor reasoning check into provider specific dict --- src/olmo_eval/common/types/base.py | 17 ++++++---------- src/olmo_eval/inference/providers/litellm.py | 21 +++++++++----------- 2 files changed, 15 insertions(+), 23 deletions(-) diff --git a/src/olmo_eval/common/types/base.py b/src/olmo_eval/common/types/base.py index d4736cdcf..3789aa938 100644 --- a/src/olmo_eval/common/types/base.py +++ b/src/olmo_eval/common/types/base.py @@ -162,6 +162,9 @@ class LMOutput: """Output from a language model. Supports both text generation and tool calling outputs. + + The provider_extras dict holds provider-specific fields (e.g., has_reasoning + flag from reasoning models). Only providers that need it populate this field. """ text: str @@ -169,8 +172,7 @@ class LMOutput: extracted_answer: Any = None metadata: dict[str, Any] = field(default_factory=dict) tool_calls: list[ToolCall] | None = None - reasoning: str | None = None - reasoning_content: str | None = None + provider_extras: dict[str, Any] = field(default_factory=dict) @property def has_tool_calls(self) -> bool: @@ -179,15 +181,8 @@ def has_tool_calls(self) -> bool: @property def has_reasoning(self) -> bool: - """Check if this output contains reasoning content. - - Returns True if either 'reasoning' or 'reasoning_content' field is present. - """ - has_reasoning_field = self.reasoning is not None and len(self.reasoning) > 0 - has_reasoning_content_field = ( - self.reasoning_content is not None and len(self.reasoning_content) > 0 - ) - return has_reasoning_field or has_reasoning_content_field + """Check if this output contains reasoning content.""" + return self.provider_extras.get("has_reasoning", False) @dataclass(slots=True) diff --git a/src/olmo_eval/inference/providers/litellm.py b/src/olmo_eval/inference/providers/litellm.py index a73ec453f..df9eb3f75 100644 --- a/src/olmo_eval/inference/providers/litellm.py +++ b/src/olmo_eval/inference/providers/litellm.py @@ -178,19 +178,17 @@ async def _generate_single_impl( if message_tool_calls: tool_calls = [ToolCall.from_openai(tc.model_dump()) for tc in message_tool_calls] - # Extract reasoning from response (for reasoning models) - # Check both 'reasoning' and 'reasoning_content' fields - reasoning: str | None = None - reasoning_content: str | None = None + # Check for reasoning content (for reasoning models) + has_reasoning = False message_content = getattr(choice.message, "content", None) if message_content is not None: - if hasattr(message_content, "reasoning"): - reasoning = message_content.reasoning - if hasattr(message_content, "reasoning_content"): - reasoning_content = message_content.reasoning_content + if getattr(message_content, "reasoning", None): + has_reasoning = True + if getattr(message_content, "reasoning_content", None): + has_reasoning = True # Also check directly on message for reasoning_content (some APIs use this) - if reasoning_content is None: - reasoning_content = getattr(choice.message, "reasoning_content", None) + if not has_reasoning and getattr(choice.message, "reasoning_content", None): + has_reasoning = True outputs.append( LMOutput( @@ -198,8 +196,7 @@ async def _generate_single_impl( logprobs=logprob_entries, metadata=metadata, tool_calls=tool_calls, - reasoning=reasoning, - reasoning_content=reasoning_content, + provider_extras={"has_reasoning": True} if has_reasoning else {}, ) ) From e922f339ffd0d3f441f69d517157ec310549a536 Mon Sep 17 00:00:00 2001 From: Paul Laskowski Date: Wed, 8 Apr 2026 08:30:47 -0700 Subject: [PATCH 4/4] refactor for feedback --- src/olmo_eval/evals/suites/smoke.py | 22 -- src/olmo_eval/evals/tasks/response_checks.py | 183 ++++++++++++ src/olmo_eval/evals/tasks/smoke_tests.py | 294 ------------------- 3 files changed, 183 insertions(+), 316 deletions(-) delete mode 100644 src/olmo_eval/evals/suites/smoke.py create mode 100644 src/olmo_eval/evals/tasks/response_checks.py delete mode 100644 src/olmo_eval/evals/tasks/smoke_tests.py diff --git a/src/olmo_eval/evals/suites/smoke.py b/src/olmo_eval/evals/suites/smoke.py deleted file mode 100644 index a765cee73..000000000 --- a/src/olmo_eval/evals/suites/smoke.py +++ /dev/null @@ -1,22 +0,0 @@ -"""Smoke test suites for basic model sanity checks.""" - -from olmo_eval.evals.suites.registry import AggregationStrategy, make_suite - -# ============================================================================= -# Smoke Test Suites -# ============================================================================= - - -OLMO_INSTRUCT_SMOKE = make_suite( - name="olmo:instruct:smoke", - tasks=("smoke_hello", "smoke_identity_olmo", "smoke_toolcall"), - aggregation=AggregationStrategy.NONE, - description="Smoke tests for Olmo3 instruct models", -) - -OLMO_THINK_SMOKE = make_suite( - name="olmo:think:smoke", - tasks=("smoke_hello", "smoke_identity_olmo", "smoke_reasoning"), - aggregation=AggregationStrategy.NONE, - description="Smoke tests for Olmo3 think models", -) diff --git a/src/olmo_eval/evals/tasks/response_checks.py b/src/olmo_eval/evals/tasks/response_checks.py new file mode 100644 index 000000000..cd449e310 --- /dev/null +++ b/src/olmo_eval/evals/tasks/response_checks.py @@ -0,0 +1,183 @@ +"""Response tests for verifying model response properties. + +Simple tests to verify models respond correctly with expected properties. +""" + +from __future__ import annotations + +from collections.abc import Iterator +from dataclasses import dataclass +from typing import ClassVar + +from olmo_eval.common.formatters import CompletionFormatter +from olmo_eval.common.metrics import AccuracyMetric +from olmo_eval.common.scorers import Scorer, SubstringRecallScorer, ToolCallScorer +from olmo_eval.common.types import ( + Instance, + LMOutput, + LMRequest, + RequestType, + SamplingParams, + ToolSchema, +) +from olmo_eval.evals.tasks.common import Task, register + + +@dataclass(frozen=True, slots=True) +class NonEmptyResponseScorer(Scorer): + """Score 1.0 if model produced a non-empty response, else 0.0.""" + + name: ClassVar[str] = "non_empty_response" + + def score(self, instance: Instance, output: LMOutput) -> float: + return 1.0 if output.text and output.text.strip() else 0.0 + + +@dataclass(frozen=True, slots=True) +class ReasoningResponseScorer(Scorer): + """Score 1.0 if model produced reasoning content, else 0.0. + + This verifies that reasoning models correctly return their chain-of-thought + in the reasoning field of the response. + """ + + name: ClassVar[str] = "reasoning_present" + + def score(self, instance: Instance, output: LMOutput) -> float: + return 1.0 if output.has_reasoning else 0.0 + + +# ============================================================================= +# Content Verification Response Test +# ============================================================================= + + +@register("response_match") +class ResponseContentVerify(Task): + """Verify that model responses contain expected content. + + - Use without data_source (default): Asks "Who are you?" and checks for non-empty response + + - Use with adhoc data_source: Loads prompts and expected substrings from file + and checks that each response contains the expected substring. + + Data file format (JSONL): + {"question": "Who are you?", "expected_substring": "OLMo"} + """ + + sampling_params = SamplingParams(temperature=0.0, max_tokens=1024) + formatter = CompletionFormatter(template="User: {question}\nAssistant:") + metrics = ( + AccuracyMetric(scorer=SubstringRecallScorer), + AccuracyMetric(scorer=NonEmptyResponseScorer), + ) + primary_metric = AccuracyMetric(scorer=SubstringRecallScorer) + + def process_doc(self, doc: dict, index: int = 0) -> Instance: + return Instance( + question=doc["question"], + gold_answer=doc.get("expected_substring", ""), + metadata={"id": f"response_match_{index}", "check_type": "substring"}, + ) + + @property + def instances(self) -> Iterator[Instance]: + if self.config.data_source is not None: + yield from self._load_instances() + else: + yield Instance( + question="Who are you?", + gold_answer="", + metadata={"id": "response_match_default", "check_type": "substring"}, + ) + + def format_request(self, instance: Instance) -> LMRequest: + if self.config.formatter is not None: + return self.config.formatter.format(instance, self.get_fewshot()) + return LMRequest(request_type=self.request_type, prompt=instance.question) + + +# ============================================================================= +# Tool Calling Response Test +# ============================================================================= + +# Weather tool schema for testing tool calls +_WEATHER_TOOL = ToolSchema( + name="get_current_weather", + description="Get the current weather in a given location", + parameters={ + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + }, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + }, + }, + "required": ["location"], + }, +) + + +@register("response_toolcall") +class ResponseToolCall(Task): + """Response test: can the model make tool calls? + + Verifies that the model can correctly invoke a tool when provided with + a tool schema. The test asks about weather, expecting the model to call + the get_current_weather tool. + """ + + sampling_params = SamplingParams(temperature=0.0) + metrics = (AccuracyMetric(scorer=ToolCallScorer),) + + @property + def instances(self) -> Iterator[Instance]: + yield Instance( + question="What's the weather like in Seattle?", + gold_answer="", + expected_tool_calls=({"name": "get_current_weather"},), + metadata={"id": "toolcall", "check_type": "tool_call"}, + ) + + def format_request(self, instance: Instance) -> LMRequest: + return LMRequest( + request_type=RequestType.COMPLETION, + messages=({"role": "user", "content": instance.question},), + tools=(_WEATHER_TOOL,), + ) + + +# ============================================================================= +# Reasoning Response Test +# ============================================================================= + + +@register("response_reasoning") +class ResponseReasoning(Task): + """Response test: does the model return reasoning content? + + Verifies that reasoning models correctly parse and return their + chain-of-thought reasoning in the response. This test asks a simple + question and checks that the reasoning field is populated. + """ + + sampling_params = SamplingParams(temperature=0.0) + metrics = (AccuracyMetric(scorer=ReasoningResponseScorer),) + + @property + def instances(self) -> Iterator[Instance]: + yield Instance( + question="Who are you?", + gold_answer="", + metadata={"id": "reasoning", "check_type": "reasoning_present"}, + ) + + def format_request(self, instance: Instance) -> LMRequest: + return LMRequest( + request_type=RequestType.COMPLETION, + messages=({"role": "user", "content": instance.question},), + ) diff --git a/src/olmo_eval/evals/tasks/smoke_tests.py b/src/olmo_eval/evals/tasks/smoke_tests.py deleted file mode 100644 index b72bc319a..000000000 --- a/src/olmo_eval/evals/tasks/smoke_tests.py +++ /dev/null @@ -1,294 +0,0 @@ -"""Smoke tests for basic model sanity checks. - -Simple, single-instance tests to verify models respond correctly to basic prompts. - -Usage: - # Generic identity check (no scoring, just captures response) - olmo-eval run -m any-model -t smoke_identity - - # Model-specific identity checks (scores against expected substring) - olmo-eval run -m olmo-model -t smoke_identity_olmo - olmo-eval run -m llama-model -t smoke_identity_llama - olmo-eval run -m gpt-model -t smoke_identity_gpt - - # Basic hello test - olmo-eval run -m any-model -t smoke_hello - - # Tool calling test (verifies model can make tool calls) - olmo-eval run -m any-model -t smoke_toolcall - - # Reasoning test (verifies model returns reasoning content) - olmo-eval run -m reasoning-model -t smoke_reasoning -""" - -from __future__ import annotations - -from collections.abc import Iterator -from dataclasses import dataclass -from typing import ClassVar - -from olmo_eval.common.formatters import CompletionFormatter -from olmo_eval.common.metrics import AccuracyMetric -from olmo_eval.common.scorers import Scorer, ToolCallScorer -from olmo_eval.common.types import ( - Instance, - LMOutput, - LMRequest, - RequestType, - SamplingParams, - ToolSchema, -) -from olmo_eval.evals.tasks.common import Task, register - -# ============================================================================= -# Substring Scorer -# ============================================================================= - - -@dataclass(frozen=True, slots=True) -class SubstringScorer(Scorer): - """Score 1.0 if gold answer substring appears in the output, else 0.0. - - This is useful for identity checks where we want to verify the model - mentions a specific name/identifier in its response. - """ - - name: ClassVar[str] = "substring_match" - case_sensitive: bool = False - - def score(self, instance: Instance, output: LMOutput) -> float: - if not instance.gold_answer: - # No expected substring configured - skip scoring - return 1.0 - - text = output.text or "" - expected = instance.gold_answer - - if not self.case_sensitive: - text = text.lower() - expected = expected.lower() - - return 1.0 if expected in text else 0.0 - - -@dataclass(frozen=True, slots=True) -class NonEmptyResponseScorer(Scorer): - """Score 1.0 if model produced a non-empty response, else 0.0.""" - - name: ClassVar[str] = "non_empty_response" - - def score(self, instance: Instance, output: LMOutput) -> float: - return 1.0 if output.text and output.text.strip() else 0.0 - - -@dataclass(frozen=True, slots=True) -class ReasoningScorer(Scorer): - """Score 1.0 if model produced reasoning content, else 0.0. - - This verifies that reasoning models correctly return their chain-of-thought - in the reasoning field of the response. - """ - - name: ClassVar[str] = "reasoning_present" - - def score(self, instance: Instance, output: LMOutput) -> float: - return 1.0 if output.has_reasoning else 0.0 - - -# ============================================================================= -# Smoke Test Base Classes -# ============================================================================= - - -class IdentitySmokeBase(Task): - """Base class for identity smoke tests. - - Subclasses set `expected_substring` to define what the model should say. - Uses CompletionFormatter (not ChatFormatter which produces CHAT requests - that require a backend for agentic loops). - """ - - sampling_params = SamplingParams(temperature=0.0, max_tokens=1024) - formatter = CompletionFormatter(template="User: {question}\nAssistant:") - - # Override in subclasses to set expected model identity - expected_substring: str = "" - - @property - def instances(self) -> Iterator[Instance]: - yield Instance( - question="Who are you?", - gold_answer=self.expected_substring, - metadata={"id": "identity", "check_type": "substring"}, - ) - - def format_request(self, instance: Instance) -> LMRequest: - """Format instance for the language model.""" - if self.config.formatter is not None: - return self.config.formatter.format(instance, self.get_fewshot()) - # Fallback formatting - return LMRequest(request_type=self.request_type, prompt=instance.question) - - -# ============================================================================= -# Registered Smoke Test Tasks -# ============================================================================= - - -@register("smoke_identity") -class IdentitySmoke(IdentitySmokeBase): - """Smoke test: does the model correctly identify itself? - - This is the generic version with no expected substring - it just checks - that the model produces a non-empty response. Use model-specific tasks - for substring matching: - - smoke_identity_olmo - - smoke_identity_llama - - smoke_identity_gpt - - etc. - """ - - expected_substring = "" - metrics = (AccuracyMetric(scorer=NonEmptyResponseScorer),) - - -@register("smoke_identity_olmo") -class IdentitySmokeOlmo(IdentitySmokeBase): - """Identity smoke test expecting 'Olmo' in response.""" - - expected_substring = "Olmo" - metrics = (AccuracyMetric(scorer=SubstringScorer),) - - -@register("smoke_identity_llama") -class IdentitySmokeLlama(IdentitySmokeBase): - """Identity smoke test expecting 'Llama' in response.""" - - expected_substring = "Llama" - metrics = (AccuracyMetric(scorer=SubstringScorer),) - - -@register("smoke_identity_gpt") -class IdentitySmokeGpt(IdentitySmokeBase): - """Identity smoke test expecting 'GPT' in response.""" - - expected_substring = "GPT" - metrics = (AccuracyMetric(scorer=SubstringScorer),) - - -@register("smoke_hello") -class HelloSmoke(Task): - """Smoke test: can the model respond to a greeting? - - A basic sanity check that the model can produce a non-empty response. - Scores 1.0 if response is non-empty, 0.0 otherwise. - """ - - sampling_params = SamplingParams(temperature=0.0) - formatter = CompletionFormatter(template="User: {question}\nAssistant:") - metrics = (AccuracyMetric(scorer=NonEmptyResponseScorer),) - - @property - def instances(self) -> Iterator[Instance]: - yield Instance( - question="Hello!", - gold_answer="", - metadata={"id": "hello"}, - ) - - def format_request(self, instance: Instance) -> LMRequest: - """Format instance for the language model.""" - if self.config.formatter is not None: - return self.config.formatter.format(instance, self.get_fewshot()) - # Fallback formatting - return LMRequest(request_type=self.request_type, prompt=instance.question) - - -# ============================================================================= -# Tool Calling Smoke Test -# ============================================================================= - -# Weather tool schema for testing tool calls -_WEATHER_TOOL = ToolSchema( - name="get_current_weather", - description="Get the current weather in a given location", - parameters={ - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The city and state, e.g. San Francisco, CA", - }, - "unit": { - "type": "string", - "enum": ["celsius", "fahrenheit"], - }, - }, - "required": ["location"], - }, -) - - -@register("smoke_toolcall") -class ToolCallSmoke(Task): - """Smoke test: can the model make tool calls? - - Verifies that the model can correctly invoke a tool when provided with - a tool schema. The test asks about weather, expecting the model to call - the get_current_weather tool. - - Scores 1.0 if the model calls the expected tool, 0.0 otherwise. - """ - - sampling_params = SamplingParams(temperature=0.0) - metrics = (AccuracyMetric(scorer=ToolCallScorer),) - - @property - def instances(self) -> Iterator[Instance]: - yield Instance( - question="What's the weather like in Seattle?", - gold_answer="", - expected_tool_calls=({"name": "get_current_weather"},), - metadata={"id": "toolcall", "check_type": "tool_call"}, - ) - - def format_request(self, instance: Instance) -> LMRequest: - return LMRequest( - request_type=RequestType.COMPLETION, - messages=({"role": "user", "content": instance.question},), - tools=(_WEATHER_TOOL,), - ) - - -# ============================================================================= -# Reasoning Smoke Test -# ============================================================================= - - -@register("smoke_reasoning") -class ReasoningSmoke(Task): - """Smoke test: does the model return reasoning content? - - Verifies that reasoning models correctly parse and return their - chain-of-thought reasoning in the response. This test asks a simple - question and checks that the reasoning field is populated. - - Scores 1.0 if reasoning is present, 0.0 otherwise. - """ - - sampling_params = SamplingParams(temperature=0.0) - metrics = (AccuracyMetric(scorer=ReasoningScorer),) - - @property - def instances(self) -> Iterator[Instance]: - yield Instance( - question="Who are you?", - gold_answer="", - metadata={"id": "reasoning", "check_type": "reasoning_present"}, - ) - - def format_request(self, instance: Instance) -> LMRequest: - return LMRequest( - request_type=RequestType.COMPLETION, - messages=({"role": "user", "content": instance.question},), - )