diff --git a/backend/app/services/search_service.py b/backend/app/services/search_service.py index 9ee9b1a..1332058 100644 --- a/backend/app/services/search_service.py +++ b/backend/app/services/search_service.py @@ -302,24 +302,22 @@ def search_youtube(query: str) -> List[SearchResult]: raise YouTubeAPIError(f"YouTube search failed: {str(e)}") def perform_search(query: str) -> List[SearchResult]: - """Perform parallel searches on Google, Bing, and YouTube APIs. - - Args: - query: The search query to run - - Returns: - Combined list of unique SearchResult objects + """Perform parallel searches on Google + YouTube and merge results. + + Bing was removed in 2026-05 — Microsoft permanently retired the Bing + Search API on 2025-08-11; every call returned `410 Gone`. Keeping the + future in the pool meant ~3s of pointless wait + log noise on every + /search, contributing to gunicorn worker timeouts on /answer. The + `search_bing` function is retained for now as dead code in case we + swap a different provider in via the same shape — see PR #1 (Tavily). 
""" try: - with ThreadPoolExecutor(max_workers=3) as executor: - bing_future = executor.submit(search_bing, query) + with ThreadPoolExecutor(max_workers=2) as executor: google_future = executor.submit(search_google, query) youtube_future = executor.submit(search_youtube, query) - + results = [] - - # Gather results, handling potential failures - for future in [bing_future, google_future, youtube_future]: + for future in [google_future, youtube_future]: try: results.extend(future.result()) except (SearchAPIError, YouTubeAPIError) as e: diff --git a/backend/gunicorn_config.py b/backend/gunicorn_config.py index 1a52c2c..918f563 100644 --- a/backend/gunicorn_config.py +++ b/backend/gunicorn_config.py @@ -1,3 +1,13 @@ workers = 4 # Number of worker processes worker_class = "uvicorn.workers.UvicornWorker" -bind = "0.0.0.0:8000" # Bind to the appropriate host and port \ No newline at end of file +bind = "0.0.0.0:8000" # Bind to the appropriate host and port + +# `/answer` regularly does: parallel search providers (~3s) → LLM rerank +# (~1–2s) → main answer generation. With reasoning-tuned models like +# `gpt-oss-120b` or `qwq-32b` the answer step alone can run 15–60s +# (chain-of-thought before the final answer). Gunicorn's default 30s +# timeout was killing requests mid-generation — log evidence: +# "WORKER TIMEOUT (pid:42)" → "Worker was sent SIGKILL" +# Lift to 120s. Long-tail requests should now end in the upstream proxy's 504 rather than a 30s worker SIGKILL (NOTE(review): gunicorn never returns a 504 itself and still kills a worker that stays silent past 120s — confirm the proxy timeout is below 120s). +timeout = 120 +graceful_timeout = 30