Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
211 changes: 211 additions & 0 deletions python/examples/test_multiturn_tool_calls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
"""
E2E test: 10-turn multiturn conversation with tool calls.

Verifies that the user simulator works correctly after the agent makes tool
calls (role reversal must summarize tool messages as plain text).
"""

import json
import random

import pytest
import litellm
import scenario
from function_schema import get_function_schema


# ── Tool definitions ──────────────────────────────────────────────────────────


def search_products(query: str) -> str:
    """
    Search the mock catalog for products by name or category.

    Args:
        query: The search query for products.

    Returns:
        A JSON-encoded list of matching products with prices, or a single
        "not found" entry when no catalog category appears in the query.
    """
    catalog = {
        "headphones": [
            "Wireless Headphones Pro - $79.99",
            "Budget Earbuds - $19.99",
            "Noise Cancelling Over-Ear - $149.99",
        ],
        "laptop": [
            "UltraBook 14 - $999.99",
            "Gaming Laptop X - $1,499.99",
            "Budget Chromebook - $299.99",
        ],
        "keyboard": [
            "Mechanical RGB Keyboard - $89.99",
            "Wireless Compact Keyboard - $49.99",
        ],
    }
    needle = query.lower()
    # First category whose key is a substring of the query wins.
    matched = next(
        (items for category, items in catalog.items() if category in needle),
        None,
    )
    if matched is not None:
        return json.dumps(matched)
    return json.dumps(["No products found for: " + query])


def check_stock(product_name: str) -> str:
    """
    Check mock stock availability for a specific product.

    Availability is randomized (3-in-4 chance of being in stock) so the
    agent sees varied tool results across conversation turns.

    Args:
        product_name: The exact product name to check stock for.

    Returns:
        A JSON-encoded object with availability, quantity, and shipping info.
    """
    # One choice() draw, then randint() only when available — seeded runs
    # therefore consume randomness in a reproducible order.
    available = random.choice([True, True, True, False])
    quantity = random.randint(1, 50) if available else 0
    payload = {
        "product": product_name,
        "in_stock": available,
        "quantity": quantity,
        "ships_in": "2-3 business days" if available else "out of stock",
    }
    return json.dumps(payload)


TOOLS = [search_products, check_stock]


# ── Agent implementation ──────────────────────────────────────────────────────


@scenario.cache()
def shopping_agent(messages, response_messages: list | None = None) -> scenario.AgentReturnTypes:
    """
    Shopping-assistant agent: asks the LLM for a reply, executes any tools
    it requests, and recurses until the model produces a plain assistant
    message.

    Args:
        messages: Conversation history supplied by the scenario runner.
        response_messages: Assistant/tool messages accumulated by earlier
            recursion steps within the same turn.

    Returns:
        Every message produced this turn (tool-call rounds plus final reply).

    Raises:
        ValueError: If the model requests a tool that is not registered.
    """
    accumulated = response_messages if response_messages is not None else []

    system_prompt = {
        "role": "system",
        "content": (
            "You are a helpful online shopping assistant. "
            "Use the search_products tool to find items and "
            "check_stock to verify availability. "
            "Be concise and helpful."
        ),
    }
    completion = litellm.completion(
        model="openai/gpt-4.1-nano",
        messages=[system_prompt, *messages, *accumulated],
        tools=[
            {"type": "function", "function": get_function_schema(tool)}
            for tool in TOOLS
        ],
        tool_choice="auto",
    )

    reply = completion.choices[0].message  # type: ignore

    if not reply.tool_calls:
        # Plain assistant message: this turn is complete.
        return [*accumulated, reply]  # type: ignore

    registry = {tool.__name__: tool for tool in TOOLS}
    tool_results = []
    for call in reply.tool_calls:
        fn_name = call.function.name
        fn_args = json.loads(call.function.arguments)
        if fn_name not in registry:
            raise ValueError(f"Tool {fn_name} not found")
        tool_results.append(
            {
                "role": "tool",
                "tool_call_id": call.id,
                "content": registry[fn_name](**fn_args),
            }
        )

    # Recurse so the model can use the tool results (or call more tools).
    return shopping_agent(messages, [*accumulated, reply, *tool_results])


# ── Test ──────────────────────────────────────────────────────────────────────


@pytest.mark.agent_test
@pytest.mark.asyncio
# LLM-driven runs are nondeterministic; allow up to two reruns before failing.
@pytest.mark.flaky(reruns=2)
async def test_multiturn_shopping_with_tool_calls():
    """10-turn conversation where the agent makes tool calls on multiple turns.

    The user simulator must handle role-reversed tool messages correctly
    throughout the entire conversation without crashing.
    """

    # Adapter wrapping the module-level shopping_agent for the scenario runner.
    class ShoppingAgent(scenario.AgentAdapter):
        async def call(self, input: scenario.AgentInput) -> scenario.AgentReturnTypes:
            return shopping_agent(input.messages)

    # Mid-script assertion: the agent must have issued a search_products call.
    def check_search_tool_used(state: scenario.ScenarioState):
        assert state.has_tool_call("search_products"), "Agent should have used search_products"

    # Mid-script assertion: the agent must have issued a check_stock call.
    def check_stock_tool_used(state: scenario.ScenarioState):
        assert state.has_tool_call("check_stock"), "Agent should have used check_stock"

    result = await scenario.run(
        name="10-turn shopping with tool calls",
        description=(
            "A user browses an online shop, asking about different products, "
            "checking stock, comparing options, and eventually deciding what to buy. "
            "The agent uses search and stock-check tools throughout."
        ),
        agents=[
            ShoppingAgent(),
            # Cheap model for the simulated user; quality matters less here.
            scenario.UserSimulatorAgent(model="openai/gpt-4.1-nano"),
            # Stronger model judges the full conversation at the end.
            scenario.JudgeAgent(
                model="openai/gpt-4.1",
                criteria=[
                    "The agent used tools to look up products when the user asked about them",
                    "The agent provided helpful product information and pricing",
                    "The agent maintained context across the full conversation",
                ],
            ),
        ],
        # Scripted turns interleave fixed user prompts, free-form simulator
        # turns, and mid-conversation assertions; order IS the test.
        script=[
            # Turn 1: user asks about headphones -> agent should search
            scenario.user("hey do you have any headphones"),
            scenario.agent(),
            check_search_tool_used,
            # Turn 2: user asks follow-up (user sim runs with tool messages in history)
            scenario.user(),
            scenario.agent(),
            # Turn 3: user asks about a different category
            scenario.user("what about keyboards"),
            scenario.agent(),
            # Turn 4: free-form follow-up from user sim
            scenario.user(),
            scenario.agent(),
            # Turn 5: user asks to check stock
            scenario.user("can you check if the mechanical keyboard is in stock"),
            scenario.agent(),
            check_stock_tool_used,
            # Turn 6-10: let the user sim drive the rest of the conversation
            scenario.user(),
            scenario.agent(),
            scenario.user(),
            scenario.agent(),
            scenario.user(),
            scenario.agent(),
            scenario.user(),
            scenario.agent(),
            scenario.user(),
            scenario.agent(),
            # Judge the full conversation
            scenario.judge(),
        ],
        set_id="python-examples",
    )

    assert result.success
138 changes: 119 additions & 19 deletions python/scenario/_utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
Any,
Iterator,
Optional,
Sequence,
Union,
TypeVar,
Awaitable,
Expand Down Expand Up @@ -380,38 +381,137 @@ def _is_valid_openai_message(message: Any) -> bool:
)


def _stringify_value(value: Any) -> str:
"""Convert a value to a string representation for tool summaries.

Strings are returned as-is. All other values go through json.dumps so that
None becomes "null", dicts/lists get JSON notation, and numbers stringify
without a Python-specific repr. Non-serializable objects fall back to str().
"""
if isinstance(value, str):
return value
try:
return json.dumps(value)
except (TypeError, ValueError):
return str(value)


def _has_tool_content(message: Any) -> bool:
    """
    Return True when *message* carries tool-protocol content: either a
    tool-result message (role == "tool") or a message bearing tool_calls.
    Such messages must be summarized as plain text instead of role-reversed,
    because raw tool content on a 'user' message is rejected by both the
    OpenAI and Anthropic APIs.
    """
    is_tool_result = safe_attr_or_key(message, "role") == "tool"
    has_calls = bool(safe_attr_or_key(message, "tool_calls"))
    return is_tool_result or has_calls


def _summarize_tool_message(message: Any) -> Optional[str]:
    """
    Render a tool-protocol message as a plain-text summary so the user
    simulator can see what the agent did without receiving raw tool
    protocol messages.

    Handles the OpenAI message format:
      - Tool results: {"role": "tool", "tool_call_id": "...", "name": "...", "content": "..."}
      - Tool calls: {"role": "assistant", "tool_calls": [{"function": {"name": "...", "arguments": "..."}}]}

    Returns None for messages with neither shape. When an assistant message
    carries both text content and tool_calls, the text is intentionally
    dropped and only the calls are summarized — this matches the JS
    messageRoleReversal() behaviour; the tool-call summary is what the
    user simulator needs to understand what the agent did.
    """
    # Tool result message (role == "tool").
    if safe_attr_or_key(message, "role") == "tool":
        tool_name = safe_attr_or_key(message, "name", "unknown tool")
        payload = _stringify_value(safe_attr_or_key(message, "content"))
        return f"[Tool result from {tool_name}: {payload}]"

    # Assistant message carrying tool_calls.
    calls = safe_attr_or_key(message, "tool_calls")
    if not calls:
        return None

    lines = []
    for call in calls:
        fn = safe_attr_or_key(call, "function")
        if not fn:
            continue
        fn_name = safe_attr_or_key(fn, "name", "unknown tool")
        fn_args = safe_attr_or_key(fn, "arguments", "{}")
        lines.append(f"[Called tool {fn_name} with: {fn_args}]")
    return "\n".join(lines) if lines else None


def reverse_roles(
messages: list[ChatCompletionMessageParam],
messages: Sequence[ChatCompletionMessageParam],
) -> list[ChatCompletionMessageParam]:
"""
Reverses the roles of the messages in the list.
Reverses user <-> assistant roles for the user simulator agent.

Every message is processed individually:
1. Tool messages (role 'tool' or containing tool_calls)
-> summarized as plain text attributed to 'user' (the agent after reversal)
2. User messages -> become 'assistant' (so the LLM generates as "assistant")
3. Assistant messages -> become 'user' (the agent's words become context)
4. System messages -> preserved unchanged

This flat per-message approach is correct because every non-tool message must
be reversed regardless of whether nearby messages contain tool calls. The old
segment-based approach incorrectly left non-tool messages unreversed in segments
that contained tools, causing the user simulator to see the wrong roles and
respond as an assistant instead of simulating a user.

Args:
messages: The list of messages to reverse the roles of.
"""

reversed_messages = []
role_map = {
"user": "assistant",
"assistant": "user",
}

reversed_messages: list[ChatCompletionMessageParam] = []
for message in messages:
message = copy.deepcopy(message)
# Can't reverse tool calls
if not safe_attr_or_key(message, "content") or safe_attr_or_key(
message, "tool_calls"
):
# If no content nor tool calls, we should skip it entirely, as anthropic may generate some invalid ones e.g. pure {"role": "assistant"}
if safe_attr_or_key(message, "tool_calls"):
reversed_messages.append(message)

if _has_tool_content(message):
summary = _summarize_tool_message(message)
if summary is None:
continue
reversed_messages.append({"role": "user", "content": summary})
continue

role = safe_attr_or_key(message, "role")
new_role = role_map.get(role) # type: ignore
if not new_role:
# Preserve system and other messages unchanged
reversed_messages.append(message)
continue

# Skip bare role-only messages that have no content key at all.
# Some models (notably Anthropic) occasionally emit {"role": "assistant"}
# with no content field; passing that on as {"role": "user"} would cause
# an API validation error on the user simulator's next request.
# Note: explicit content=None is kept — that is valid in the OpenAI format
# for assistant messages that accompany tool_calls.
has_content_key = (
"content" in message
if isinstance(message, dict)
else hasattr(message, "content")
)
if not has_content_key:
continue

if type(message) == dict:
if message["role"] == "user":
message["role"] = "assistant"
elif message["role"] == "assistant":
message["role"] = "user"
if isinstance(message, dict):
message["role"] = new_role # type: ignore[typeddict-item]
else:
if getattr(message, "role", None) == "user":
message.role = "assistant" # type: ignore
elif getattr(message, "role", None) == "assistant":
message.role = "user" # type: ignore
message.role = new_role # type: ignore

reversed_messages.append(message)

Expand Down
Loading
Loading