From 9d5ea2228d653b95e15c8ddbfeaf92e156c54fe3 Mon Sep 17 00:00:00 2001
From: sumleo <sumleo@users.noreply.github.com>
Date: Wed, 17 Jun 2026 09:30:20 +0800
Subject: [PATCH] fix(agent): gate prompt caching by provider capability, not
 base_url string match

---
 gauss_cli/runtime_provider.py | 29 +++++++++++++++++++++++++++++
 run_agent.py                  | 32 ++++++++++++++++++++------------
 tests/test_run_agent.py       | 27 +++++++++++++++++++++++++--
 3 files changed, 74 insertions(+), 14 deletions(-)

diff --git a/gauss_cli/runtime_provider.py b/gauss_cli/runtime_provider.py
index 4646924..4e24628 100644
--- a/gauss_cli/runtime_provider.py
+++ b/gauss_cli/runtime_provider.py
@@ -295,3 +295,32 @@ def format_runtime_provider_error(error: Exception) -> str:
     if isinstance(error, AuthError):
         return format_auth_error(error)
     return str(error)
+
+
+def supports_anthropic_prompt_caching(
+    *,
+    api_mode: str,
+    model: str,
+    base_url: Optional[str] = None,
+) -> bool:
+    """Report whether the runtime accepts Anthropic ``cache_control`` markers.
+
+    The answer is derived from ``api_mode`` and ``model`` only:
+
+    * ``"anthropic_messages"``: always True (native Anthropic Messages API).
+    * ``"chat_completions"``: True only when ``model`` contains "claude",
+      compared case-insensitively. The endpoint is deliberately not checked,
+      so Claude behind OpenRouter or any other gateway is treated alike.
+    * Anything else (e.g. Codex Responses, or a missing mode): False.
+
+    ``api_mode`` is stripped and lower-cased before comparison; a falsy
+    ``api_mode`` or ``model`` is treated as an empty string. ``base_url`` is
+    accepted from callers but is not consulted.
+    """
+    normalized_mode = (api_mode or "").strip().lower()
+    if normalized_mode not in ("anthropic_messages", "chat_completions"):
+        return False
+    return (
+        normalized_mode == "anthropic_messages"
+        or "claude" in (model or "").lower()
+    )
diff --git a/run_agent.py b/run_agent.py
index 5df7a9d..63d1962 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -83,6 +83,7 @@
 )
 from agent.context_compressor import ContextCompressor
 from agent.prompt_caching import apply_anthropic_cache_control
+from gauss_cli.runtime_provider import supports_anthropic_prompt_caching
 from agent.prompt_builder import build_skills_system_prompt, build_context_files_prompt
 from agent.display import (
     KawaiiSpinner, build_tool_preview as _build_tool_preview,
@@ -380,13 +381,18 @@ def __init__(
         self.reasoning_config = reasoning_config  # None = use default (medium for OpenRouter)
         self.prefill_messages = prefill_messages or []  # Prefilled conversation turns
         
-        # Anthropic prompt caching: auto-enabled for Claude models via OpenRouter.
-        # Reduces input costs by ~75% on multi-turn conversations by caching the
-        # conversation prefix. Uses system_and_3 strategy (4 breakpoints).
-        is_openrouter = "openrouter" in self.base_url.lower()
-        is_claude = "claude" in self.model.lower()
-        is_native_anthropic = self.api_mode == "anthropic_messages"
-        self._use_prompt_caching = (is_openrouter and is_claude) or is_native_anthropic
+        # Anthropic prompt caching: enabled whenever the resolved runtime can
+        # honor cache_control breakpoints. Reduces input costs by ~75% on
+        # multi-turn conversations by caching the conversation prefix. Uses
+        # system_and_3 strategy (4 breakpoints). Capability is decided from
+        # api_mode and the model name rather than a base_url string match, so
+        # Claude on custom Anthropic-compatible endpoints (Z.ai, LiteLLM,
+        # self-hosted) keeps caching instead of silently losing it.
+        self._use_prompt_caching = supports_anthropic_prompt_caching(
+            api_mode=self.api_mode,
+            model=self.model,
+            base_url=self.base_url,
+        )
         self._cache_ttl = "5m"  # Default 5-minute TTL (1.25x write cost)
         
         # Iteration budget pressure: warn the LLM as it approaches max_iterations.
@@ -2642,11 +2648,13 @@ def _try_activate_fallback(self) -> bool:
                     "base_url": fb_base_url,
                 }
 
-            # Re-evaluate prompt caching for the new provider/model
-            is_native_anthropic = fb_api_mode == "anthropic_messages"
-            self._use_prompt_caching = (
-                ("openrouter" in fb_base_url.lower() and "claude" in fb_model.lower())
-                or is_native_anthropic
+            # Re-evaluate prompt caching for the new provider/model using the
+            # same capability check as __init__, so a Claude fallback on a
+            # custom Anthropic-compatible endpoint keeps caching.
+            self._use_prompt_caching = supports_anthropic_prompt_caching(
+                api_mode=fb_api_mode,
+                model=fb_model,
+                base_url=fb_base_url,
             )
 
             print(
diff --git a/tests/test_run_agent.py b/tests/test_run_agent.py
index e1abb7b..ad6822f 100644
--- a/tests/test_run_agent.py
+++ b/tests/test_run_agent.py
@@ -398,8 +398,14 @@ def test_prompt_caching_non_claude(self):
             )
             assert a._use_prompt_caching is False
 
-    def test_prompt_caching_non_openrouter(self):
-        """Custom base_url (not OpenRouter) should disable prompt caching."""
+    def test_prompt_caching_claude_custom_endpoint(self):
+        """Claude on a custom Anthropic-compatible endpoint should enable caching.
+
+        Previously this was gated on "openrouter" appearing in the base URL, so a
+        Claude model served by a custom gateway (Z.ai, LiteLLM, self-hosted proxy)
+        silently lost caching. Caching is now decided by provider capability, so
+        any Claude model on a chat_completions endpoint keeps cache_control.
+        """
         with (
             patch("run_agent.get_tool_definitions", return_value=[]),
             patch("run_agent.check_toolset_requirements", return_value={}),
@@ -413,6 +419,23 @@ def test_prompt_caching_non_openrouter(self):
                 skip_context_files=True,
                 skip_memory=True,
             )
+            assert a._use_prompt_caching is True
+
+    def test_prompt_caching_non_claude_custom_endpoint(self):
+        """A non-Claude model (gpt-4o) on a custom base_url must not enable caching."""
+        with (
+            patch("run_agent.get_tool_definitions", return_value=[]),
+            patch("run_agent.check_toolset_requirements", return_value={}),
+            patch("run_agent.OpenAI"),
+        ):
+            a = AIAgent(
+                api_key="test-key-1234567890",
+                model="openai/gpt-4o",
+                base_url="http://localhost:8080/v1",
+                quiet_mode=True,
+                skip_context_files=True,
+                skip_memory=True,
+            )
             assert a._use_prompt_caching is False
 
     def test_prompt_caching_native_anthropic(self):