2 changes: 2 additions & 0 deletions .gitignore
@@ -14,6 +14,7 @@

# Ignore all contents of the virtual environment directory
.venv/
venv/

# Handle memory directory
memory/**
@@ -46,3 +47,4 @@ instruments/**
# for browser-use
agent_history.gif

venv/bin/accelerate
3 changes: 2 additions & 1 deletion .vscode/settings.json
@@ -13,5 +13,6 @@
},
// Optional: point VSCode to jsconfig.json if you add one
"jsconfig.json": "${workspaceFolder}/jsconfig.json",
"postman.settings.dotenv-detection-notification-visibility": false
"postman.settings.dotenv-detection-notification-visibility": false,
"cursorpyright.analysis.typeCheckingMode": "standard"
}
230 changes: 164 additions & 66 deletions models.py
@@ -16,6 +16,7 @@

from litellm import completion, acompletion, embedding
import litellm
from litellm.exceptions import RateLimitError as LiteLLMRateLimitError, APIConnectionError as LiteLLMAPIConnectionError
import openai
from litellm.types.utils import ModelResponse

@@ -225,8 +226,36 @@ def get_rate_limiter(
return limiter


def _is_non_transient_error(exc: Exception) -> bool:
"""Check if error is non-transient (should not be retried)"""
error_str = str(exc).lower()

# Model not found errors are not transient
if "model" in error_str and ("not found" in error_str or "does not exist" in error_str):
return True

# Invalid model name errors
if "invalid model" in error_str or "unknown model" in error_str:
return True

# Authentication errors (401, 403) are typically not transient
status_code = getattr(exc, "status_code", None)
if isinstance(status_code, int) and status_code in (401, 403):
return True

return False


def _is_transient_litellm_error(exc: Exception) -> bool:
"""Uses status_code when available, else falls back to exception types"""
# First check if this is a non-transient error (don't retry)
if _is_non_transient_error(exc):
return False

# Check for LiteLLM-specific exceptions first
if isinstance(exc, LiteLLMRateLimitError):
return True

# Prefer explicit status codes if present
status_code = getattr(exc, "status_code", None)
if isinstance(status_code, int):
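The hunk is truncated here, but the intent of the two helpers is testable in isolation. A minimal sketch, assuming only the code shown above; the `FakeAPIError` class and the assertions are illustrative, not part of this PR:

```python
# Illustrative only: a stand-in exception carrying the optional
# status_code attribute that both helpers inspect via getattr().
class FakeAPIError(Exception):
    def __init__(self, message: str, status_code: int | None = None):
        super().__init__(message)
        self.status_code = status_code

# Auth failures and bad model names are classified as non-transient,
# so _is_transient_litellm_error short-circuits to False for them.
assert _is_non_transient_error(FakeAPIError("401 unauthorized", status_code=401))
assert not _is_transient_litellm_error(FakeAPIError("invalid model: gpt-x"))

# A LiteLLMRateLimitError is always treated as transient. The status-code
# branch that follows is cut off by the diff; presumably a 429/5xx would
# be treated as transient there as well.
```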
@@ -485,81 +514,110 @@ async def unified_call(
self.a0_model_conf, str(msgs_conv), rate_limiter_callback
)

# Prepare call kwargs and retry config (strip A0-only params before calling LiteLLM)
# Prepare call kwargs (strip A0-only params before calling LiteLLM)
call_kwargs: dict[str, Any] = {**self.kwargs, **kwargs}
max_retries: int = int(call_kwargs.pop("a0_retry_attempts", 2))
retry_delay_s: float = float(call_kwargs.pop("a0_retry_delay_seconds", 1.5))
stream = reasoning_callback is not None or response_callback is not None or tokens_callback is not None

# results
result = ChatGenerationResult()

attempt = 0
while True:
got_any_chunk = False
try:
# call model
_completion = await acompletion(
model=self.model_name,
messages=msgs_conv,
stream=stream,
**call_kwargs,
)
try:
# call model
_completion = await acompletion(
model=self.model_name,
messages=msgs_conv,
stream=stream,
**call_kwargs,
)

if stream:
# iterate over chunks
async for chunk in _completion: # type: ignore
got_any_chunk = True
# parse chunk
parsed = _parse_chunk(chunk)
output = result.add_chunk(parsed)

# collect reasoning delta and call callbacks
if output["reasoning_delta"]:
if reasoning_callback:
await reasoning_callback(output["reasoning_delta"], result.reasoning)
if tokens_callback:
await tokens_callback(
output["reasoning_delta"],
approximate_tokens(output["reasoning_delta"]),
)
# Add output tokens to rate limiter if configured
if limiter:
limiter.add(output=approximate_tokens(output["reasoning_delta"]))
# collect response delta and call callbacks
if output["response_delta"]:
if response_callback:
await response_callback(output["response_delta"], result.response)
if tokens_callback:
await tokens_callback(
output["response_delta"],
approximate_tokens(output["response_delta"]),
)
# Add output tokens to rate limiter if configured
if limiter:
limiter.add(output=approximate_tokens(output["response_delta"]))

# non-stream response
else:
parsed = _parse_chunk(_completion)
if stream:
# iterate over chunks
async for chunk in _completion: # type: ignore
# parse chunk
parsed = _parse_chunk(chunk)
output = result.add_chunk(parsed)
if limiter:
if output["response_delta"]:
limiter.add(output=approximate_tokens(output["response_delta"]))
if output["reasoning_delta"]:

# collect reasoning delta and call callbacks
if output["reasoning_delta"]:
if reasoning_callback:
await reasoning_callback(output["reasoning_delta"], result.reasoning)
if tokens_callback:
await tokens_callback(
output["reasoning_delta"],
approximate_tokens(output["reasoning_delta"]),
)
# Add output tokens to rate limiter if configured
if limiter:
limiter.add(output=approximate_tokens(output["reasoning_delta"]))
# collect response delta and call callbacks
if output["response_delta"]:
if response_callback:
await response_callback(output["response_delta"], result.response)
if tokens_callback:
await tokens_callback(
output["response_delta"],
approximate_tokens(output["response_delta"]),
)
# Add output tokens to rate limiter if configured
if limiter:
limiter.add(output=approximate_tokens(output["response_delta"]))

# Successful completion of stream
return result.response, result.reasoning
# non-stream response
else:
parsed = _parse_chunk(_completion)
output = result.add_chunk(parsed)
if limiter:
if output["response_delta"]:
limiter.add(output=approximate_tokens(output["response_delta"]))
if output["reasoning_delta"]:
limiter.add(output=approximate_tokens(output["reasoning_delta"]))

except Exception as e:
import asyncio
# Successful completion
return result.response, result.reasoning

# Retry only if no chunks received and error is transient
if got_any_chunk or not _is_transient_litellm_error(e) or attempt >= max_retries:
raise
attempt += 1
await asyncio.sleep(retry_delay_s)
except Exception as e:
# Check for OpenRouter data policy error and provide helpful guidance
error_str = str(e)
if "openrouter" in self.provider.lower() and ("data policy" in error_str.lower() or "free model publication" in error_str.lower()):
raise Exception(
f"OpenRouter data policy error: {error_str}\n\n"
"To fix this, please:\n"
"1. Go to https://openrouter.ai/settings/privacy\n"
"2. Enable 'Free model publication' in your data policy settings\n"
"3. Or use a different model that matches your current data policy"
) from e

# Check for model not found errors (especially Ollama) and provide helpful guidance
if _is_non_transient_error(e):
error_lower = error_str.lower()
if "ollama" in error_lower or "ollama" in self.provider.lower():
if "model" in error_lower and ("not found" in error_lower or "does not exist" in error_lower):
# Extract model name from error if possible
model_name = self.model_name.split("/")[-1] if "/" in self.model_name else self.model_name
raise Exception(
f"Ollama model not found: {error_str}\n\n"
f"To fix this, please:\n"
f"1. Make sure Ollama is running: `ollama serve`\n"
f"2. Pull the model: `ollama pull {model_name}`\n"
f"3. Verify the model exists: `ollama list`\n"
f"4. Check that the model name '{model_name}' is correct"
) from e
raise Exception(f"Configuration error (not retriable): {error_str}") from e

# Provide helpful error message for rate limit errors
if isinstance(e, LiteLLMRateLimitError):
error_msg = f"Rate limit error: {error_str}"
if "openrouter" in self.provider.lower():
error_msg += (
"\n\nOpenRouter rate limit suggestions:\n"
"1. Wait a few moments and try again\n"
"2. Add your own API key at https://openrouter.ai/settings/integrations to accumulate rate limits\n"
"3. Consider using a different model or provider"
)
raise Exception(error_msg) from e

# Re-raise all other errors as-is
raise
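
Note that this revision drops the old in-method retry loop (`a0_retry_attempts` / `a0_retry_delay_seconds` are no longer popped or honored). Callers that still want retries can layer them on top. A minimal sketch under that assumption; the `unified_call(messages=...)` signature is inferred from context, and the attempt count and backoff values are illustrative:

```python
import asyncio

async def call_with_retries(model, messages, attempts: int = 3, delay_s: float = 1.5):
    """Illustrative wrapper: retry unified_call only on transient errors."""
    for attempt in range(attempts):
        try:
            return await model.unified_call(messages=messages)
        except Exception as e:
            # Non-transient errors (bad model name, 401/403) are re-raised
            # immediately; only plausibly-transient failures are retried.
            if not _is_transient_litellm_error(e) or attempt == attempts - 1:
                raise
            await asyncio.sleep(delay_s * (attempt + 1))  # linear backoff
```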


class AsyncAIChatReplacement:
@@ -617,13 +675,12 @@ async def _acall(
# Apply rate limiting if configured
apply_rate_limiter_sync(self._wrapper.a0_model_conf, str(messages))

# Call the model
try:
model = kwargs.pop("model", None)
kwrgs = {**self._wrapper.kwargs, **kwargs}

# hack from browser-use to fix json schema for gemini (additionalProperties, $defs, $ref)
if "response_format" in kwrgs and "json_schema" in kwrgs["response_format"] and model.startswith("gemini/"):
if "response_format" in kwrgs and "json_schema" in kwrgs["response_format"] and model and model.startswith("gemini/"):
kwrgs["response_format"]["json_schema"] = ChatGoogle("")._fix_gemini_schema(kwrgs["response_format"]["json_schema"])

resp = await acompletion(
@@ -644,7 +701,48 @@
pass

except Exception as e:
raise e
# Check for OpenRouter data policy error and provide helpful guidance
error_str = str(e)
if "openrouter" in self.provider.lower() and ("data policy" in error_str.lower() or "free model publication" in error_str.lower()):
raise Exception(
f"OpenRouter data policy error: {error_str}\n\n"
"To fix this, please:\n"
"1. Go to https://openrouter.ai/settings/privacy\n"
"2. Enable 'Free model publication' in your data policy settings\n"
"3. Or use a different model that matches your current data policy"
) from e

# Check for model not found errors (especially Ollama) and provide helpful guidance
if _is_non_transient_error(e):
error_lower = error_str.lower()
if "ollama" in error_lower or "ollama" in self.provider.lower():
if "model" in error_lower and ("not found" in error_lower or "does not exist" in error_lower):
# Extract model name from error if possible
model_name = self.model_name.split("/")[-1] if "/" in self.model_name else self.model_name
raise Exception(
f"Ollama model not found: {error_str}\n\n"
f"To fix this, please:\n"
f"1. Make sure Ollama is running: `ollama serve`\n"
f"2. Pull the model: `ollama pull {model_name}`\n"
f"3. Verify the model exists: `ollama list`\n"
f"4. Check that the model name '{model_name}' is correct"
) from e
raise Exception(f"Configuration error (not retriable): {error_str}") from e

# Provide helpful error message for rate limit errors
if isinstance(e, LiteLLMRateLimitError):
error_msg = f"Rate limit error: {error_str}"
if "openrouter" in self.provider.lower():
error_msg += (
"\n\nOpenRouter rate limit suggestions:\n"
"1. Wait a few moments and try again\n"
"2. Add your own API key at https://openrouter.ai/settings/integrations to accumulate rate limits\n"
"3. Consider using a different model or provider"
)
raise Exception(error_msg) from e

# Re-raise all other errors as-is
raise

# another hack from browser-use to post-process invalid JSON responses
try:
16 changes: 13 additions & 3 deletions python/helpers/memory_consolidation.py
@@ -34,7 +34,7 @@ class ConsolidationConfig:
max_llm_context_memories: int = 5
keyword_extraction_sys_prompt: str = "memory.keyword_extraction.sys.md"
keyword_extraction_msg_prompt: str = "memory.keyword_extraction.msg.md"
processing_timeout_seconds: int = 60
processing_timeout_seconds: int = 180 # Increased from 60 to 180 seconds for complex consolidations
# Add safety threshold for REPLACE actions
replace_similarity_threshold: float = 0.9 # Higher threshold for replacement safety
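
The enforcement of `processing_timeout_seconds` sits outside this hunk. A hedged sketch of how such a budget is typically applied with `asyncio.wait_for`; the `_run_consolidation` helper is hypothetical, not part of this PR:

```python
import asyncio

async def _run_consolidation(coro, timeout_s: float):
    # Illustrative: cancel the consolidation coroutine if it exceeds
    # the configured budget, matching the TimeoutError handler below.
    try:
        return await asyncio.wait_for(coro, timeout=timeout_s)
    except asyncio.TimeoutError:
        return {"success": False, "memory_ids": []}
```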

@@ -102,7 +102,17 @@ async def process_new_memory(
return result

except asyncio.TimeoutError:
PrintStyle().error(f"Memory consolidation timeout for area {area}")
PrintStyle().error(
f"Memory consolidation timeout for area '{area}' "
f"(exceeded {self.config.processing_timeout_seconds}s). "
f"This may occur with large memory databases or slow LLM responses. "
f"Consider increasing processing_timeout_seconds in ConsolidationConfig."
)
if log_item:
log_item.update(
result=f"Timeout after {self.config.processing_timeout_seconds}s",
error="consolidation_timeout"
)
return {"success": False, "memory_ids": []}

except Exception as e:
@@ -790,7 +800,7 @@ def create_memory_consolidator(agent: Agent, **config_overrides) -> MemoryConsol
- replace_similarity_threshold: Safety threshold for REPLACE actions (default 0.9)
- max_similar_memories: Maximum memories to discover (default 10)
- max_llm_context_memories: Maximum memories to send to LLM (default 5)
- processing_timeout_seconds: Timeout for consolidation processing (default 30)
- processing_timeout_seconds: Timeout for consolidation processing (default 180)
"""
config = ConsolidationConfig(**config_overrides)
return MemoryConsolidator(agent, config)
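
A usage sketch for the factory, assuming the override names listed in the docstring above; the argument values are illustrative:

```python
# Give slow local LLMs more headroom and demand higher similarity
# before a REPLACE action is allowed.
consolidator = create_memory_consolidator(
    agent,
    processing_timeout_seconds=300,
    replace_similarity_threshold=0.95,
)
```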
2 changes: 1 addition & 1 deletion python/helpers/settings.py
@@ -1532,7 +1532,7 @@ def get_default_settings() -> Settings:
variables="",
secrets="",
litellm_global_kwargs={},
update_check_enabled=True,
update_check_enabled=False,
)

