Commit 0c8674b
Merge branch 'main' into feat/reasoning
2 parents 21b1481 + ec189ab

File tree

2 files changed (+96 -8 lines):

* CHANGELOG.md
* chatlas/_provider_anthropic.py

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -14,13 +14,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * `ChatOpenAI()` and `ChatAzureOpenAI()` gain access to latest models, built-in tools, image generation, etc. as a result of moving to the new [Responses API](https://platform.openai.com/docs/api-reference/responses). (#192)
 * `ChatOpenAI()`, `ChatAnthropic()`, and `ChatGoogle()` gain a new `reasoning` parameter to easily opt into, and fully customize, reasoning capabilities. (#202)
 * A new `ContentThinking` content type was added and captures the "thinking" portion of a reasoning model. (#192)
+* `ChatAnthropic()` and `ChatBedrockAnthropic()` gain a new `cache` parameter to control caching. By default it is set to `"5m"`. This should (on average) reduce the cost of your chats. (#215)
 * Added support for systematic evaluation via [Inspect AI](https://inspect.aisi.org.uk/). This includes:
   * A new `.export_eval()` method for exporting conversation history as an Inspect eval dataset sample. This supports multi-turn conversations, tool calls, images, PDFs, and structured data.
   * A new `.to_solver()` method for translating chat instances into Inspect solvers that can be used with Inspect's evaluation framework.
   * A new `Turn.to_inspect_messages()` method for converting turns to Inspect's message format.
   * Comprehensive documentation in the [Evals guide](https://posit-dev.github.io/chatlas/misc/evals.html).
-
 ### Changes
 
 * `ChatOpenAI()` and `ChatAzureOpenAI()` move from OpenAI's Completions API to the [Responses API](https://platform.openai.com/docs/api-reference/responses). If this happens to break behavior, change `ChatOpenAI()` -> `ChatOpenAICompletions()` (or `ChatAzureOpenAI()` -> `ChatAzureOpenAICompletions()`). (#192)
```
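For reference, a minimal sketch of the new `cache` parameter in use (the prompts are hypothetical; `ChatAnthropic`, `cache`, and `chat.get_tokens()` are from this changeset and its docs):

```python
from chatlas import ChatAnthropic

# Caching defaults to "5m"; use "1h" for longer-lived caching
# (at a higher write cost) or "none" to disable it entirely.
chat = ChatAnthropic(
    system_prompt="You are a terse assistant.",  # hypothetical prompt
    cache="5m",
)
chat.chat("What does prompt caching buy me?")  # hypothetical user turn
print(chat.get_tokens())  # per-turn input / cached-input / output tokens
```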

chatlas/_provider_anthropic.py

Lines changed: 95 additions & 7 deletions
```diff
@@ -47,6 +47,7 @@
     ToolParam,
     ToolUseBlock,
 )
+from anthropic.types.cache_control_ephemeral_param import CacheControlEphemeralParam
 from anthropic.types.document_block_param import DocumentBlockParam
 from anthropic.types.image_block_param import ImageBlockParam
 from anthropic.types.message_create_params import MessageCreateParamsNonStreaming
```
```diff
@@ -77,6 +78,7 @@ def ChatAnthropic(
     system_prompt: Optional[str] = None,
     model: "Optional[ModelParam]" = None,
     max_tokens: int = 4096,
+    cache: Literal["5m", "1h", "none"] = "5m",
    reasoning: Optional["int | ThinkingConfigEnabledParam"] = None,
    api_key: Optional[str] = None,
    kwargs: Optional["ChatClientArgs"] = None,
```
```diff
@@ -127,6 +129,10 @@ def ChatAnthropic(
         choosing a model for all but the most casual use.
     max_tokens
         Maximum number of tokens to generate before stopping.
+    cache
+        How long to cache inputs? Defaults to "5m" (five minutes).
+        Set to "none" to disable caching or "1h" to cache for one hour.
+        See the Caching section for details.
     reasoning
         Determines how many tokens Claude can be allocated to reasoning. Must be
         ≥1024 and less than `max_tokens`. Larger budgets can enable more
```
````diff
@@ -182,6 +188,46 @@ def ChatAnthropic(
     ```shell
     export ANTHROPIC_API_KEY=...
     ```
+
+    Caching
+    -------
+
+    Caching with Claude is a bit more complicated than with other providers,
+    but we believe that on average it will save you both money and time, so
+    we have enabled it by default. With other providers, like OpenAI and
+    Google, you only pay for cache reads, which cost 10% of the normal price.
+    With Claude, you also pay for cache writes, which cost 125% of the normal
+    price for 5-minute caching and 200% of the normal price for 1-hour
+    caching.
+
+    How does this affect the total cost of a conversation? Imagine the first
+    turn sends 1000 input tokens and receives 200 output tokens. The second
+    turn must first send both the input and output from the previous turn
+    (1200 tokens). It then sends a further 1000 tokens and receives 200
+    tokens back.
+
+    To compare the prices of these two approaches we can ignore the cost of
+    output tokens, because it is the same for both. How much will the input
+    tokens cost? If we don't use caching, we send 1000 tokens in the first
+    turn and 2200 (1000 + 200 + 1000) tokens in the second turn, for a total
+    of 3200 tokens. If we use caching, we send (the equivalent of)
+    1000 * 1.25 = 1250 tokens in the first turn. In the second turn, 1000 of
+    the input tokens will be cached, so the cost is
+    1000 * 0.1 + (200 + 1000) * 1.25 = 1600 tokens. That makes a total of
+    2850 tokens, i.e. 11% fewer tokens, decreasing the overall cost.
+
+    Obviously, the details will vary from conversation to conversation, but
+    if you have a large system prompt that you re-use many times, you should
+    expect to see larger savings. You can see exactly how many input and
+    cached input tokens each turn uses, along with the total cost, with
+    `chat.get_tokens()`. If you don't see savings for your use case, you can
+    suppress caching with `cache="none"`.
+
+    Note: Claude only caches longer prompts, with caching requiring at least
+    1024-4096 tokens, depending on the model. So don't be surprised if you
+    don't see any difference from caching if you have a short prompt.
+
+    See all the details at
+    <https://docs.claude.com/en/docs/build-with-claude/prompt-caching>.
     """
 
     if model is None:
````
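As a sanity check on the arithmetic in that docstring, here is a small self-contained sketch (plain Python, not part of this commit) that reproduces the 3200-vs-2850 comparison:

```python
# Reproduces the docstring's example: input-token cost of a two-turn chat,
# with and without 5-minute caching (write multiplier 1.25, read 0.1).

def effective_input_tokens(cached: bool) -> float:
    turn1 = 1000             # first turn sends 1000 fresh input tokens
    turn2_prev = 1000 + 200  # second turn resends turn 1's input + output
    turn2_new = 1000         # ...plus 1000 new input tokens
    if not cached:
        return turn1 + turn2_prev + turn2_new  # 3200
    write_mult, read_mult = 1.25, 0.1
    first = turn1 * write_mult                                    # 1250 (cache write)
    second = turn1 * read_mult + (200 + turn2_new) * write_mult   # 100 + 1500
    return first + second                                         # 2850

assert effective_input_tokens(cached=False) == 3200
assert effective_input_tokens(cached=True) == 2850  # ~11% fewer
```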
```diff
@@ -198,6 +244,7 @@ def ChatAnthropic(
             api_key=api_key,
             model=model,
             max_tokens=max_tokens,
+            cache=cache,
             kwargs=kwargs,
         ),
         system_prompt=system_prompt,
```
```diff
@@ -215,6 +262,7 @@ def __init__(
         model: str,
         api_key: Optional[str] = None,
         name: str = "Anthropic",
+        cache: Literal["5m", "1h", "none"] = "5m",
         kwargs: Optional["ChatClientArgs"] = None,
     ):
         super().__init__(name=name, model=model)
```
```diff
@@ -226,6 +274,7 @@ def __init__(
                 "You can install it with 'pip install anthropic'."
             )
         self._max_tokens = max_tokens
+        self._cache: Literal["5m", "1h", "none"] = cache
 
         kwargs_full: "ChatClientArgs" = {
             "api_key": api_key,
```
```diff
@@ -385,7 +434,13 @@ def _structured_tool_call(**kwargs: Any):
 
         if "system" not in kwargs_full:
             if len(turns) > 0 and turns[0].role == "system":
-                kwargs_full["system"] = turns[0].text
+                sys_param: "TextBlockParam" = {
+                    "type": "text",
+                    "text": turns[0].text,
+                }
+                if self._cache_control():
+                    sys_param["cache_control"] = self._cache_control()
+                kwargs_full["system"] = [sys_param]
 
         return kwargs_full
```

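In other words, with the default `cache="5m"` the system prompt is now sent as a list containing one text block that carries a `cache_control` marker, rather than a bare string. An illustrative sketch (not code from this commit; the prompt text is hypothetical):

```python
# Illustrative: shape of the "system" parameter after this change,
# assuming cache="5m" and a hypothetical system prompt.
kwargs_full = {
    "system": [
        {
            "type": "text",
            "text": "You are a terse assistant.",  # hypothetical prompt
            "cache_control": {"type": "ephemeral", "ttl": "5m"},
        }
    ]
}
```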
```diff
@@ -447,11 +502,16 @@ def value_turn(self, completion, has_data_model) -> Turn:
 
     def value_tokens(self, completion):
         usage = completion.usage
-        # N.B. Currently, Anthropic doesn't cache by default and we currently do not support
-        # manual caching in chatlas. Note also that this only tracks reads, NOT writes, which
-        # have their own cost. To track that properly, we would need another caching category and per-token cost.
+        input_tokens = completion.usage.input_tokens
+
+        # Account for cache writes by adjusting input tokens.
+        # Cache writes cost 125% for 5m and 200% for 1h:
+        # https://docs.claude.com/en/docs/build-with-claude/prompt-caching
+        cache_input = usage.cache_creation_input_tokens or 0
+        cache_mult = 2.0 if self._cache == "1h" else 1.25
+
         return (
-            completion.usage.input_tokens,
+            input_tokens + int(cache_input * cache_mult),
             completion.usage.output_tokens,
             usage.cache_read_input_tokens if usage.cache_read_input_tokens else 0,
         )
```
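A standalone restatement of that adjustment, using hypothetical usage numbers (the real method reads them from `completion.usage`):

```python
# Hypothetical usage for one turn with the default "5m" cache:
input_tokens = 150             # uncached input tokens this turn
cache_creation_tokens = 1000   # tokens written to the cache this turn
cache_read_tokens = 0          # tokens read from the cache this turn

cache_mult = 1.25  # would be 2.0 if the TTL were "1h"
effective_input = input_tokens + int(cache_creation_tokens * cache_mult)
assert effective_input == 1400  # 150 fresh + 1250 write-adjusted tokens
```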
```diff
@@ -539,13 +599,21 @@ def supported_model_params(self) -> set[StandardModelParamNames]:
 
     def _as_message_params(self, turns: list[Turn]) -> list["MessageParam"]:
         messages: list["MessageParam"] = []
-        for turn in turns:
+        for i, turn in enumerate(turns):
             if turn.role == "system":
                 continue  # system prompt passed as separate arg
             if turn.role not in ["user", "assistant"]:
                 raise ValueError(f"Unknown role {turn.role}")
 
             content = [self._as_content_block(c) for c in turn.contents]
+
+            # Add cache control to the last content block in the last turn
+            # https://docs.claude.com/en/docs/build-with-claude/prompt-caching#how-automatic-prefix-checking-works
+            is_last_turn = i == len(turns) - 1
+            if is_last_turn and len(content) > 0:
+                if self._cache_control():
+                    content[-1]["cache_control"] = self._cache_control()
+
             role = "user" if turn.role == "user" else "assistant"
             messages.append({"role": role, "content": content})
         return messages
```
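So the marker lands on the final content block of the final turn, which is how Anthropic's automatic prefix checking caches everything before it. A two-turn request might then serialize roughly like this (illustrative values, assuming `cache="5m"`):

```python
# Illustrative messages payload; only the last block of the last turn
# carries cache_control, marking the end of the cacheable prefix.
messages = [
    {"role": "user", "content": [{"type": "text", "text": "First question"}]},
    {"role": "assistant", "content": [{"type": "text", "text": "First answer"}]},
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Follow-up question",
                "cache_control": {"type": "ephemeral", "ttl": "5m"},
            }
        ],
    },
]
```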
```diff
@@ -787,11 +855,20 @@ def batch_result_turn(self, result, has_data_model: bool = False) -> Turn | None
         message = result.result.message
         return self._as_turn(message, has_data_model)
 
+    def _cache_control(self) -> "Optional[CacheControlEphemeralParam]":
+        if self._cache == "none":
+            return None
+        return {
+            "type": "ephemeral",
+            "ttl": self._cache,
+        }
+
 
 def ChatBedrockAnthropic(
     *,
     model: Optional[str] = None,
     max_tokens: int = 4096,
+    cache: Literal["5m", "1h", "none"] = "5m",
     aws_secret_key: Optional[str] = None,
     aws_access_key: Optional[str] = None,
     aws_region: Optional[str] = None,
```
```diff
@@ -847,6 +924,10 @@ def ChatBedrockAnthropic(
         The model to use for the chat.
     max_tokens
         Maximum number of tokens to generate before stopping.
+    cache
+        How long to cache inputs? Defaults to "5m" (five minutes).
+        Set to "none" to disable caching or "1h" to cache for one hour.
+        See the Caching section of `ChatAnthropic` for details.
     aws_secret_key
         The AWS secret key to use for authentication.
     aws_access_key
```
```diff
@@ -928,6 +1009,7 @@ def ChatBedrockAnthropic(
         provider=AnthropicBedrockProvider(
             model=model,
             max_tokens=max_tokens,
+            cache=cache,
             aws_secret_key=aws_secret_key,
             aws_access_key=aws_access_key,
             aws_region=aws_region,
```
```diff
@@ -951,11 +1033,17 @@ def __init__(
         aws_profile: str | None,
         aws_session_token: str | None,
         max_tokens: int = 4096,
+        cache: Literal["5m", "1h", "none"] = "5m",
         base_url: str | None,
         name: str = "AWS/Bedrock",
         kwargs: Optional["ChatBedrockClientArgs"] = None,
     ):
-        super().__init__(name=name, model=model, max_tokens=max_tokens)
+        super().__init__(
+            name=name,
+            model=model,
+            max_tokens=max_tokens,
+            cache=cache,
+        )
 
         try:
             from anthropic import AnthropicBedrock, AsyncAnthropicBedrock
```
