Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
211 changes: 211 additions & 0 deletions python/examples/test_multiturn_tool_calls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
"""
E2E test: 10-turn multiturn conversation with tool calls.

Verifies that the user simulator works correctly after the agent makes tool
calls (role reversal must summarize tool messages as plain text).
"""

import json
import random

import pytest
import litellm
import scenario
from function_schema import get_function_schema


# ── Tool definitions ──────────────────────────────────────────────────────────


def search_products(query: str) -> str:
    """
    Search the mock catalog for products by name or category.

    Args:
        query: The search query for products.

    Returns:
        A JSON-encoded list of matching products with prices, or a single
        "not found" entry when no catalog category appears in the query.
    """
    catalog = {
        "headphones": [
            "Wireless Headphones Pro - $79.99",
            "Budget Earbuds - $19.99",
            "Noise Cancelling Over-Ear - $149.99",
        ],
        "laptop": [
            "UltraBook 14 - $999.99",
            "Gaming Laptop X - $1,499.99",
            "Budget Chromebook - $299.99",
        ],
        "keyboard": [
            "Mechanical RGB Keyboard - $89.99",
            "Wireless Compact Keyboard - $49.99",
        ],
    }
    needle = query.lower()
    # First category whose key is a substring of the query wins.
    matched = next(
        (items for category, items in catalog.items() if category in needle),
        None,
    )
    if matched is not None:
        return json.dumps(matched)
    return json.dumps(["No products found for: " + query])


def check_stock(product_name: str) -> str:
    """
    Check mock stock availability for a specific product.

    Availability is randomized (3-in-4 chance of being in stock) so the
    agent sees varied tool results across conversation turns.

    Args:
        product_name: The exact product name to check stock for.

    Returns:
        A JSON-encoded object with availability, quantity, and shipping info.
    """
    # One choice() draw, then randint() only when available — seeded runs
    # therefore consume randomness in a reproducible order.
    available = random.choice([True, True, True, False])
    quantity = random.randint(1, 50) if available else 0
    payload = {
        "product": product_name,
        "in_stock": available,
        "quantity": quantity,
        "ships_in": "2-3 business days" if available else "out of stock",
    }
    return json.dumps(payload)


TOOLS = [search_products, check_stock]


# ── Agent implementation ──────────────────────────────────────────────────────


@scenario.cache()
def shopping_agent(messages, response_messages: list | None = None) -> scenario.AgentReturnTypes:
    """
    Shopping-assistant agent: asks the LLM for a reply, executes any tools
    it requests, and recurses until the model produces a plain assistant
    message.

    Args:
        messages: Conversation history supplied by the scenario runner.
        response_messages: Assistant/tool messages accumulated by earlier
            recursion steps within the same turn.

    Returns:
        Every message produced this turn (tool-call rounds plus final reply).

    Raises:
        ValueError: If the model requests a tool that is not registered.
    """
    accumulated = response_messages if response_messages is not None else []

    system_prompt = {
        "role": "system",
        "content": (
            "You are a helpful online shopping assistant. "
            "Use the search_products tool to find items and "
            "check_stock to verify availability. "
            "Be concise and helpful."
        ),
    }
    completion = litellm.completion(
        model="openai/gpt-4.1-nano",
        messages=[system_prompt, *messages, *accumulated],
        tools=[
            {"type": "function", "function": get_function_schema(tool)}
            for tool in TOOLS
        ],
        tool_choice="auto",
    )

    reply = completion.choices[0].message  # type: ignore

    if not reply.tool_calls:
        # Plain assistant message: this turn is complete.
        return [*accumulated, reply]  # type: ignore

    registry = {tool.__name__: tool for tool in TOOLS}
    tool_results = []
    for call in reply.tool_calls:
        fn_name = call.function.name
        fn_args = json.loads(call.function.arguments)
        if fn_name not in registry:
            raise ValueError(f"Tool {fn_name} not found")
        tool_results.append(
            {
                "role": "tool",
                "tool_call_id": call.id,
                "content": registry[fn_name](**fn_args),
            }
        )

    # Recurse so the model can use the tool results (or call more tools).
    return shopping_agent(messages, [*accumulated, reply, *tool_results])


# ── Test ──────────────────────────────────────────────────────────────────────


@pytest.mark.agent_test
@pytest.mark.asyncio
# LLM-driven runs are nondeterministic; allow up to two reruns before failing.
@pytest.mark.flaky(reruns=2)
async def test_multiturn_shopping_with_tool_calls():
    """10-turn conversation where the agent makes tool calls on multiple turns.

    The user simulator must handle role-reversed tool messages correctly
    throughout the entire conversation without crashing.
    """

    # Adapter wrapping the module-level shopping_agent for the scenario runner.
    class ShoppingAgent(scenario.AgentAdapter):
        async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
            return shopping_agent(input.messages)

    # Mid-script assertion: the agent must have issued a search_products call.
    def check_search_tool_used(state: scenario.ScenarioState):
        assert state.has_tool_call("search_products"), "Agent should have used search_products"

    # Mid-script assertion: the agent must have issued a check_stock call.
    def check_stock_tool_used(state: scenario.ScenarioState):
        assert state.has_tool_call("check_stock"), "Agent should have used check_stock"

    result = await scenario.run(
        name="10-turn shopping with tool calls",
        description=(
            "A user browses an online shop, asking about different products, "
            "checking stock, comparing options, and eventually deciding what to buy. "
            "The agent uses search and stock-check tools throughout."
        ),
        agents=[
            ShoppingAgent(),
            # Cheap model for the simulated user; quality matters less here.
            scenario.UserSimulatorAgent(model="openai/gpt-4.1-nano"),
            # Stronger model judges the full conversation at the end.
            scenario.JudgeAgent(
                model="openai/gpt-4.1",
                criteria=[
                    "The agent used tools to look up products when the user asked about them",
                    "The agent provided helpful product information and pricing",
                    "The agent maintained context across the full conversation",
                ],
            ),
        ],
        # Scripted turns interleave fixed user prompts, free-form simulator
        # turns, and mid-conversation assertions; order IS the test.
        script=[
            # Turn 1: user asks about headphones -> agent should search
            scenario.user("hey do you have any headphones"),
            scenario.agent(),
            check_search_tool_used,
            # Turn 2: user asks follow-up (user sim runs with tool messages in history)
            scenario.user(),
            scenario.agent(),
            # Turn 3: user asks about a different category
            scenario.user("what about keyboards"),
            scenario.agent(),
            # Turn 4: free-form follow-up from user sim
            scenario.user(),
            scenario.agent(),
            # Turn 5: user asks to check stock
            scenario.user("can you check if the mechanical keyboard is in stock"),
            scenario.agent(),
            check_stock_tool_used,
            # Turn 6-10: let the user sim drive the rest of the conversation
            scenario.user(),
            scenario.agent(),
            scenario.user(),
            scenario.agent(),
            scenario.user(),
            scenario.agent(),
            scenario.user(),
            scenario.agent(),
            scenario.user(),
            scenario.agent(),
            # Judge the full conversation
            scenario.judge(),
        ],
        set_id="python-examples",
    )

    assert result.success
138 changes: 119 additions & 19 deletions python/scenario/_utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
Any,
Iterator,
Optional,
Sequence,
Union,
TypeVar,
Awaitable,
Expand Down Expand Up @@ -380,38 +381,137 @@ def _is_valid_openai_message(message: Any) -> bool:
)


def _stringify_value(value: Any) -> str:
"""Convert a value to a string representation for tool summaries.

Strings are returned as-is. All other values go through json.dumps so that
None becomes "null", dicts/lists get JSON notation, and numbers stringify
without a Python-specific repr. Non-serializable objects fall back to str().
"""
if isinstance(value, str):
return value
try:
return json.dumps(value)
except (TypeError, ValueError):
return str(value)


def _has_tool_content(message: Any) -> bool:
    """
    Return True when *message* carries tool-protocol content: either a
    tool-result message (role == "tool") or a message bearing tool_calls.
    Such messages must be summarized as plain text instead of role-reversed,
    because raw tool content on a 'user' message is rejected by both the
    OpenAI and Anthropic APIs.
    """
    is_tool_result = safe_attr_or_key(message, "role") == "tool"
    has_calls = bool(safe_attr_or_key(message, "tool_calls"))
    return is_tool_result or has_calls


def _summarize_tool_message(message: Any) -> Optional[str]:
    """
    Render a tool-protocol message as a plain-text summary so the user
    simulator can see what the agent did without receiving raw tool
    protocol messages.

    Handles the OpenAI message format:
      - Tool results: {"role": "tool", "tool_call_id": "...", "name": "...", "content": "..."}
      - Tool calls: {"role": "assistant", "tool_calls": [{"function": {"name": "...", "arguments": "..."}}]}

    Returns None for messages with neither shape. When an assistant message
    carries both text content and tool_calls, the text is intentionally
    dropped and only the calls are summarized — this matches the JS
    messageRoleReversal() behaviour; the tool-call summary is what the
    user simulator needs to understand what the agent did.
    """
    # Tool result message (role == "tool").
    if safe_attr_or_key(message, "role") == "tool":
        tool_name = safe_attr_or_key(message, "name", "unknown tool")
        payload = _stringify_value(safe_attr_or_key(message, "content"))
        return f"[Tool result from {tool_name}: {payload}]"

    # Assistant message carrying tool_calls.
    calls = safe_attr_or_key(message, "tool_calls")
    if not calls:
        return None

    lines = []
    for call in calls:
        fn = safe_attr_or_key(call, "function")
        if not fn:
            continue
        fn_name = safe_attr_or_key(fn, "name", "unknown tool")
        fn_args = safe_attr_or_key(fn, "arguments", "{}")
        lines.append(f"[Called tool {fn_name} with: {fn_args}]")
    return "\n".join(lines) if lines else None


def reverse_roles(
messages: list[ChatCompletionMessageParam],
messages: Sequence[ChatCompletionMessageParam],
) -> list[ChatCompletionMessageParam]:
"""
Reverses the roles of the messages in the list.
Reverses user <-> assistant roles for the user simulator agent.

Every message is processed individually:
1. Tool messages (role 'tool' or containing tool_calls)
-> summarized as plain text attributed to 'user' (the agent after reversal)
2. User messages -> become 'assistant' (so the LLM generates as "assistant")
3. Assistant messages -> become 'user' (the agent's words become context)
4. System messages -> preserved unchanged

This flat per-message approach is correct because every non-tool message must
be reversed regardless of whether nearby messages contain tool calls. The old
segment-based approach incorrectly left non-tool messages unreversed in segments
that contained tools, causing the user simulator to see the wrong roles and
respond as an assistant instead of simulating a user.

Args:
messages: The list of messages to reverse the roles of.
"""

reversed_messages = []
role_map = {
"user": "assistant",
"assistant": "user",
}

reversed_messages: list[ChatCompletionMessageParam] = []
for message in messages:
message = copy.deepcopy(message)
# Can't reverse tool calls
if not safe_attr_or_key(message, "content") or safe_attr_or_key(
message, "tool_calls"
):
# If no content nor tool calls, we should skip it entirely, as anthropic may generate some invalid ones e.g. pure {"role": "assistant"}
if safe_attr_or_key(message, "tool_calls"):
reversed_messages.append(message)

if _has_tool_content(message):
summary = _summarize_tool_message(message)
if summary is None:
continue
reversed_messages.append({"role": "user", "content": summary})
continue

role = safe_attr_or_key(message, "role")
new_role = role_map.get(role) # type: ignore
if not new_role:
# Preserve system and other messages unchanged
reversed_messages.append(message)
continue

# Skip bare role-only messages that have no content key at all.
# Some models (notably Anthropic) occasionally emit {"role": "assistant"}
# with no content field; passing that on as {"role": "user"} would cause
# an API validation error on the user simulator's next request.
# Note: explicit content=None is kept — that is valid in the OpenAI format
# for assistant messages that accompany tool_calls.
has_content_key = (
"content" in message
if isinstance(message, dict)
else hasattr(message, "content")
)
if not has_content_key:
continue

if type(message) == dict:
if message["role"] == "user":
message["role"] = "assistant"
elif message["role"] == "assistant":
message["role"] = "user"
if isinstance(message, dict):
message["role"] = new_role # type: ignore[typeddict-item]
else:
if getattr(message, "role", None) == "user":
message.role = "assistant" # type: ignore
elif getattr(message, "role", None) == "assistant":
message.role = "user" # type: ignore
message.role = new_role # type: ignore

reversed_messages.append(message)

Expand Down
Loading
Loading