Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions gauss_cli/runtime_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,3 +295,32 @@ def format_runtime_provider_error(error: Exception) -> str:
if isinstance(error, AuthError):
return format_auth_error(error)
return str(error)


def supports_anthropic_prompt_caching(
    *,
    api_mode: str,
    model: str,
    base_url: Optional[str] = None,
) -> bool:
    """Report whether Anthropic-style prompt caching applies to a runtime.

    Caching is driven by ``cache_control`` breakpoints, so the answer depends
    on the API mode and the model name only:

    * ``api_mode == "anthropic_messages"`` -> always True (native Anthropic
      Messages API).
    * ``api_mode == "chat_completions"`` -> True only when the model name
      contains ``"claude"``. This covers OpenRouter as well as custom
      Anthropic-compatible gateways (Z.ai, LiteLLM, self-hosted proxies).
    * Any other mode (for example Codex Responses) -> False.

    Both ``api_mode`` and ``model`` are compared case-insensitively;
    ``api_mode`` is additionally stripped of surrounding whitespace. A falsy
    value for either is treated as an empty string.

    ``base_url`` is accepted as a keyword but is not consulted by this
    function: the decision is deliberately not tied to any endpoint URL.
    """
    normalized_mode = (api_mode or "").strip().lower()

    # The native Anthropic API qualifies regardless of the model name.
    if normalized_mode == "anthropic_messages":
        return True

    # Everything except chat-completions is ruled out at this point.
    if normalized_mode != "chat_completions":
        return False

    # On chat-completions endpoints only Claude models qualify.
    model_name = (model or "").lower()
    return "claude" in model_name
32 changes: 20 additions & 12 deletions run_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@
)
from agent.context_compressor import ContextCompressor
from agent.prompt_caching import apply_anthropic_cache_control
from gauss_cli.runtime_provider import supports_anthropic_prompt_caching
from agent.prompt_builder import build_skills_system_prompt, build_context_files_prompt
from agent.display import (
KawaiiSpinner, build_tool_preview as _build_tool_preview,
Expand Down Expand Up @@ -380,13 +381,18 @@ def __init__(
self.reasoning_config = reasoning_config # None = use default (medium for OpenRouter)
self.prefill_messages = prefill_messages or [] # Prefilled conversation turns

# Anthropic prompt caching: auto-enabled for Claude models via OpenRouter.
# Reduces input costs by ~75% on multi-turn conversations by caching the
# conversation prefix. Uses system_and_3 strategy (4 breakpoints).
is_openrouter = "openrouter" in self.base_url.lower()
is_claude = "claude" in self.model.lower()
is_native_anthropic = self.api_mode == "anthropic_messages"
self._use_prompt_caching = (is_openrouter and is_claude) or is_native_anthropic
# Anthropic prompt caching: enabled whenever the resolved runtime can
# honor cache_control breakpoints. Reduces input costs by ~75% on
# multi-turn conversations by caching the conversation prefix. Uses
# system_and_3 strategy (4 breakpoints). Capability is decided by
# provider/api_mode rather than a base_url string match, so Claude on
# custom Anthropic-compatible endpoints (Z.ai, LiteLLM, self-hosted)
# keeps caching instead of silently losing it.
self._use_prompt_caching = supports_anthropic_prompt_caching(
api_mode=self.api_mode,
model=self.model,
base_url=self.base_url,
)
self._cache_ttl = "5m" # Default 5-minute TTL (1.25x write cost)

# Iteration budget pressure: warn the LLM as it approaches max_iterations.
Expand Down Expand Up @@ -2642,11 +2648,13 @@ def _try_activate_fallback(self) -> bool:
"base_url": fb_base_url,
}

# Re-evaluate prompt caching for the new provider/model
is_native_anthropic = fb_api_mode == "anthropic_messages"
self._use_prompt_caching = (
("openrouter" in fb_base_url.lower() and "claude" in fb_model.lower())
or is_native_anthropic
# Re-evaluate prompt caching for the new provider/model using the
# same capability check as __init__, so a Claude fallback on a
# custom Anthropic-compatible endpoint keeps caching.
self._use_prompt_caching = supports_anthropic_prompt_caching(
api_mode=fb_api_mode,
model=fb_model,
base_url=fb_base_url,
)

print(
Expand Down
27 changes: 25 additions & 2 deletions tests/test_run_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,8 +398,14 @@ def test_prompt_caching_non_claude(self):
)
assert a._use_prompt_caching is False

def test_prompt_caching_non_openrouter(self):
"""Custom base_url (not OpenRouter) should disable prompt caching."""
def test_prompt_caching_claude_custom_endpoint(self):
"""Claude on a custom Anthropic-compatible endpoint should enable caching.

Previously this was gated on "openrouter" appearing in the base URL, so a
Claude model served by a custom gateway (Z.ai, LiteLLM, self-hosted proxy)
silently lost caching. Caching is now decided by provider capability, so
any Claude model on a chat_completions endpoint keeps cache_control.
"""
with (
patch("run_agent.get_tool_definitions", return_value=[]),
patch("run_agent.check_toolset_requirements", return_value={}),
Expand All @@ -413,6 +419,23 @@ def test_prompt_caching_non_openrouter(self):
skip_context_files=True,
skip_memory=True,
)
assert a._use_prompt_caching is True

def test_prompt_caching_non_claude_custom_endpoint(self):
    """Prompt caching stays off for a non-Claude model on a custom endpoint.

    Builds an agent for ``openai/gpt-4o`` against a local base URL, with the
    tool lookups and the OpenAI client patched out, and checks that the
    resulting ``_use_prompt_caching`` flag is ``False``.
    """
    # Patch the tool discovery helpers and the OpenAI client in run_agent
    # for the duration of agent construction.
    tool_definitions_patch = patch("run_agent.get_tool_definitions", return_value=[])
    toolset_requirements_patch = patch(
        "run_agent.check_toolset_requirements", return_value={}
    )
    openai_client_patch = patch("run_agent.OpenAI")

    with tool_definitions_patch, toolset_requirements_patch, openai_client_patch:
        agent = AIAgent(
            api_key="test-key-1234567890",
            model="openai/gpt-4o",
            base_url="http://localhost:8080/v1",
            quiet_mode=True,
            skip_context_files=True,
            skip_memory=True,
        )
        assert agent._use_prompt_caching is False

def test_prompt_caching_native_anthropic(self):
Expand Down