From bd492e504cdb830b8676c9f38288b08290bdc787 Mon Sep 17 00:00:00 2001
From: Paritosh Tripathi <paritosh.tripathi.work@gmail.com>
Date: Sat, 9 May 2026 22:37:26 +0530
Subject: [PATCH] fix(prod): bump gunicorn timeout to 120s; drop Bing (HTTP
 410)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two prod issues from Render logs at 2026-05-09T17:00:

1. Gunicorn killed /answer mid-generation:
     [CRITICAL] WORKER TIMEOUT (pid:42)
     Worker (pid:42) was sent SIGKILL! Perhaps out of memory?
   The default 30s worker timeout is too short for the new chat path:
   parallel search providers (~3s) → LLM rerank (~1–2s) → main answer
   on a reasoning-tuned model (gpt-oss-120b 5–15s; qwq-32b 30–60s for
   chain-of-thought). Bumped to 120s — long-tail still gets a clean 504
   instead of a SIGKILL.

2. Bing Search API returns 410 Gone on every call:
     Bing search error: 410 Client Error: Gone for url:
     https://api.bing.microsoft.com/v7.0/search?...
   Microsoft permanently retired the Bing v7 Search API on 2025-08-11.
   Removed search_bing from perform_search's executor pool. Until now
   every /search paid ~3s of pointless wait on that call; with it gone,
   /search latency drops noticeably. `search_bing` stays in the file as
   dead code so a replacement provider (Tavily, Brave) can reuse the
   same shape later. See PR #1.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 backend/app/services/search_service.py | 24 +++++++++++-------------
 backend/gunicorn_config.py             | 12 +++++++++++-
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/backend/app/services/search_service.py b/backend/app/services/search_service.py
index 9ee9b1a..1332058 100644
--- a/backend/app/services/search_service.py
+++ b/backend/app/services/search_service.py
@@ -302,24 +302,22 @@ def search_youtube(query: str) -> List[SearchResult]:
         raise YouTubeAPIError(f"YouTube search failed: {str(e)}")
 
 def perform_search(query: str) -> List[SearchResult]:
-    """Perform parallel searches on Google, Bing, and YouTube APIs.
-    
-    Args:
-        query: The search query to run
-    
-    Returns:
-        Combined list of unique SearchResult objects
+    """Perform parallel searches on Google + YouTube and merge results.
+
+    Bing was removed in 2026-05 — Microsoft permanently retired the Bing
+    Search API on 2025-08-11, so every call returned `410 Gone`. Keeping
+    the Bing future in the pool meant ~3s of pointless wait plus log noise
+    on every /search, contributing to gunicorn worker timeouts on /answer.
+    The `search_bing` function is retained for now as dead code in case
+    we swap in a different provider of the same shape — see PR #1 (Tavily).
     """
     try:
-        with ThreadPoolExecutor(max_workers=3) as executor:
-            bing_future = executor.submit(search_bing, query)
+        with ThreadPoolExecutor(max_workers=2) as executor:
             google_future = executor.submit(search_google, query)
             youtube_future = executor.submit(search_youtube, query)
-            
+
             results = []
-            
-            # Gather results, handling potential failures
-            for future in [bing_future, google_future, youtube_future]:
+            for future in [google_future, youtube_future]:
                 try:
                     results.extend(future.result())
                 except (SearchAPIError, YouTubeAPIError) as e:
diff --git a/backend/gunicorn_config.py b/backend/gunicorn_config.py
index 1a52c2c..918f563 100644
--- a/backend/gunicorn_config.py
+++ b/backend/gunicorn_config.py
@@ -1,3 +1,13 @@
 workers = 4  # Number of worker processes
 worker_class = "uvicorn.workers.UvicornWorker"
-bind = "0.0.0.0:8000"  # Bind to the appropriate host and port 
\ No newline at end of file
+bind = "0.0.0.0:8000"  # Bind to the appropriate host and port
+
+# `/answer` regularly does: parallel search providers (~3s) → LLM rerank
+# (~1–2s) → main answer generation. With reasoning-tuned models like
+# `gpt-oss-120b` or `qwq-32b` the answer step alone can run 15–60s
+# (chain-of-thought before the final answer). Gunicorn's default 30s
+# timeout was killing requests mid-generation — log evidence:
+#   "WORKER TIMEOUT (pid:42)" → "Worker was sent SIGKILL"
+# Lift to 120s. Long-tail still gets a clean 504 rather than a 30s SIGKILL.
+timeout = 120
+graceful_timeout = 30