From 9d5ea2228d653b95e15c8ddbfeaf92e156c54fe3 Mon Sep 17 00:00:00 2001
From: sumleo
Date: Wed, 17 Jun 2026 09:30:20 +0800
Subject: [PATCH] fix(agent): gate prompt caching by provider capability, not base_url/model string match

---
 gauss_cli/runtime_provider.py | 29 +++++++++++++++++++++++++++++
 run_agent.py                  | 32 ++++++++++++++++++++------------
 tests/test_run_agent.py       | 27 +++++++++++++++++++++++++--
 3 files changed, 74 insertions(+), 14 deletions(-)

diff --git a/gauss_cli/runtime_provider.py b/gauss_cli/runtime_provider.py
index 4646924..4e24628 100644
--- a/gauss_cli/runtime_provider.py
+++ b/gauss_cli/runtime_provider.py
@@ -295,3 +295,32 @@ def format_runtime_provider_error(error: Exception) -> str:
     if isinstance(error, AuthError):
         return format_auth_error(error)
     return str(error)
+
+
+def supports_anthropic_prompt_caching(
+    *,
+    api_mode: str,
+    model: str,
+    base_url: Optional[str] = None,
+) -> bool:
+    """Return True when the resolved runtime can honor Anthropic prompt caching.
+
+    Prompt caching relies on Anthropic-style ``cache_control`` breakpoints. Two
+    runtime shapes accept them:
+
+    * The native Anthropic Messages API (``api_mode == "anthropic_messages"``).
+    * Any chat-completions endpoint serving a Claude model. OpenRouter is one
+      such endpoint, but Claude is also reachable through custom
+      Anthropic-compatible gateways (Z.ai, LiteLLM, self-hosted proxies). These
+      pass ``cache_control`` straight through to Anthropic, so caching must not
+      be gated on the OpenRouter base URL string.
+
+    Codex Responses and non-Claude chat-completions models do not support these
+    breakpoints and return False.
+    """
+    mode = (api_mode or "").strip().lower()
+    if mode == "anthropic_messages":
+        return True
+    if mode == "chat_completions":
+        return "claude" in (model or "").lower()
+    return False
diff --git a/run_agent.py b/run_agent.py
index 5df7a9d..63d1962 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -83,6 +83,7 @@
 )
 from agent.context_compressor import ContextCompressor
 from agent.prompt_caching import apply_anthropic_cache_control
+from gauss_cli.runtime_provider import supports_anthropic_prompt_caching
 from agent.prompt_builder import build_skills_system_prompt, build_context_files_prompt
 from agent.display import (
     KawaiiSpinner, build_tool_preview as _build_tool_preview,
@@ -380,13 +381,18 @@ def __init__(
         self.reasoning_config = reasoning_config  # None = use default (medium for OpenRouter)
         self.prefill_messages = prefill_messages or []  # Prefilled conversation turns
 
-        # Anthropic prompt caching: auto-enabled for Claude models via OpenRouter.
-        # Reduces input costs by ~75% on multi-turn conversations by caching the
-        # conversation prefix. Uses system_and_3 strategy (4 breakpoints).
-        is_openrouter = "openrouter" in self.base_url.lower()
-        is_claude = "claude" in self.model.lower()
-        is_native_anthropic = self.api_mode == "anthropic_messages"
-        self._use_prompt_caching = (is_openrouter and is_claude) or is_native_anthropic
+        # Anthropic prompt caching: enabled whenever the resolved runtime can
+        # honor cache_control breakpoints. Reduces input costs by ~75% on
+        # multi-turn conversations by caching the conversation prefix. Uses
+        # system_and_3 strategy (4 breakpoints). Capability is decided by
+        # provider/api_mode rather than a base_url string match, so Claude on
+        # custom Anthropic-compatible endpoints (Z.ai, LiteLLM, self-hosted)
+        # keeps caching instead of silently losing it.
+        self._use_prompt_caching = supports_anthropic_prompt_caching(
+            api_mode=self.api_mode,
+            model=self.model,
+            base_url=self.base_url,
+        )
         self._cache_ttl = "5m"  # Default 5-minute TTL (1.25x write cost)
 
         # Iteration budget pressure: warn the LLM as it approaches max_iterations.
@@ -2642,11 +2648,13 @@ def _try_activate_fallback(self) -> bool:
                 "base_url": fb_base_url,
             }
 
-            # Re-evaluate prompt caching for the new provider/model
-            is_native_anthropic = fb_api_mode == "anthropic_messages"
-            self._use_prompt_caching = (
-                ("openrouter" in fb_base_url.lower() and "claude" in fb_model.lower())
-                or is_native_anthropic
+            # Re-evaluate prompt caching for the new provider/model using the
+            # same capability check as __init__, so a Claude fallback on a
+            # custom Anthropic-compatible endpoint keeps caching.
+            self._use_prompt_caching = supports_anthropic_prompt_caching(
+                api_mode=fb_api_mode,
+                model=fb_model,
+                base_url=fb_base_url,
             )
 
             print(
diff --git a/tests/test_run_agent.py b/tests/test_run_agent.py
index e1abb7b..ad6822f 100644
--- a/tests/test_run_agent.py
+++ b/tests/test_run_agent.py
@@ -398,8 +398,14 @@ def test_prompt_caching_non_claude(self):
             )
             assert a._use_prompt_caching is False
 
-    def test_prompt_caching_non_openrouter(self):
-        """Custom base_url (not OpenRouter) should disable prompt caching."""
+    def test_prompt_caching_claude_custom_endpoint(self):
+        """Claude on a custom Anthropic-compatible endpoint should enable caching.
+
+        Previously this was gated on "openrouter" appearing in the base URL, so a
+        Claude model served by a custom gateway (Z.ai, LiteLLM, self-hosted proxy)
+        silently lost caching. Caching is now decided by provider capability, so
+        any Claude model on a chat_completions endpoint keeps cache_control.
+        """
         with (
             patch("run_agent.get_tool_definitions", return_value=[]),
             patch("run_agent.check_toolset_requirements", return_value={}),
@@ -413,6 +419,23 @@ def test_prompt_caching_non_openrouter(self):
                 skip_context_files=True,
                 skip_memory=True,
             )
+            assert a._use_prompt_caching is True
+
+    def test_prompt_caching_non_claude_custom_endpoint(self):
+        """A non-Claude model on a custom endpoint should still disable caching."""
+        with (
+            patch("run_agent.get_tool_definitions", return_value=[]),
+            patch("run_agent.check_toolset_requirements", return_value={}),
+            patch("run_agent.OpenAI"),
+        ):
+            a = AIAgent(
+                api_key="test-key-1234567890",
+                model="openai/gpt-4o",
+                base_url="http://localhost:8080/v1",
+                quiet_mode=True,
+                skip_context_files=True,
+                skip_memory=True,
+            )
             assert a._use_prompt_caching is False
 
     def test_prompt_caching_native_anthropic(self):