From 0ce4e44d06e77fe1d52b4d3004a2e4ce97d32dae Mon Sep 17 00:00:00 2001 From: Paul Date: Thu, 28 May 2026 20:40:39 -0600 Subject: [PATCH 01/10] Logger improvement --- elm/web/search/run.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/elm/web/search/run.py b/elm/web/search/run.py index 6a4e4449..79e08f8e 100644 --- a/elm/web/search/run.py +++ b/elm/web/search/run.py @@ -313,7 +313,10 @@ async def load_docs(sources, file_loader): docs = await file_loader.fetch_all(*sources) logger.debug("Loaded %d docs from %d sources", len(docs), len(sources)) docs = [doc for doc in docs if not doc.empty] - logger.debug("%d docs are not empty", len(docs)) + if len(docs)== 1: + logger.debug("%d doc is not empty", len(docs)) + else: + logger.debug("%d docs are not empty", len(docs)) page_lens = {} for doc in docs: From c21dc98fc24e5f6659b28c80758bf21ffa2aa073 Mon Sep 17 00:00:00 2001 From: Paul Date: Fri, 29 May 2026 19:45:21 -0600 Subject: [PATCH 02/10] Add raw parameter to search --- elm/web/search/base.py | 86 +++++++++++++++++++++++++++++++++++------- 1 file changed, 72 insertions(+), 14 deletions(-) diff --git a/elm/web/search/base.py b/elm/web/search/base.py index c14ab1fb..33e6c3f0 100644 --- a/elm/web/search/base.py +++ b/elm/web/search/base.py @@ -24,7 +24,7 @@ class SearchEngineLinkSearch(ABC): _SE_NAME = "" - async def results(self, *queries, num_results=10): + async def results(self, *queries, num_results=10, raw=False): """Retrieve links for the first `num_results` of each query This function executes a search for each input query and @@ -44,24 +44,28 @@ async def results(self, *queries, num_results=10): determined by the number of results on a page (excluding ads). You can, however, use this input to limit the number of results returned. By default, ``10``. + raw : bool, optional + If ``True``, return provider-specific records that always + include a ``"url"`` field. By default, ``False``. Returns ------- list List equal to the length of the input queries, where each entry is another list containing no more than `num_results` - links. + links or raw result records. """ queries = map(clean_search_query, queries) - return await self._get_links(queries, num_results) + return await self._get_links(queries, num_results, raw=raw) - async def _get_links(self, queries, num_results): + async def _get_links(self, queries, num_results, raw=False): """Get links for multiple queries""" outer_task_name = asyncio.current_task().get_name() searches = [ asyncio.create_task( - self._skip_exc_search(query, num_results=num_results), + self._skip_exc_search(query, num_results=num_results, + raw=raw), name=outer_task_name, ) for query in queries @@ -72,10 +76,10 @@ async def _get_links(self, queries, num_results): logger.trace("Got results for link search:\n%r", results) return results - async def _skip_exc_search(self, query, num_results=10): + async def _skip_exc_search(self, query, num_results=10, raw=False): """Perform search while ignoring errors""" try: - return await self._search(query, num_results=num_results) + return await self._search(query, num_results=num_results, raw=raw) except KeyboardInterrupt: raise except Exception as e: @@ -113,7 +117,7 @@ async def _move_and_click(self, page, input_el): return await page.mouse.click(x, y) @abstractmethod - async def _search(self, query, num_results=10): + async def _search(self, query, num_results=10, raw=False): """Search web for links related to a query""" raise NotImplementedError @@ -181,7 +185,7 @@ async def _browser_page(self): async with pw_page(**page_kwargs) as page: yield page - async def _search(self, query, num_results=10): + async def _search(self, query, num_results=10, raw=False): """Search web for links related to a query""" logger.debug("Searching %s: %r", self._SE_NAME, query) num_results = min(num_results, self.MAX_RESULTS_CONSIDERED_PER_PAGE) @@ -200,16 +204,17 @@ async def _search(self, query, num_results=10): await _navigate_to_se_url(page, se_url=url, timeout=self.PAGE_LOAD_TIMEOUT) logger.trace("Extracting links for query: %r", query) - return await self._extract_links(page, num_results, query) + return await self._extract_links(page, num_results, query, raw=raw) - async def _get_links(self, queries, num_results): + async def _get_links(self, queries, num_results, raw=False): """Get links for multiple queries""" outer_task_name = asyncio.current_task().get_name() async with async_playwright() as pw_instance: await self._load_browser(pw_instance) searches = [ asyncio.create_task( - self._skip_exc_search(query, num_results=num_results), + self._skip_exc_search(query, num_results=num_results, + raw=raw), name=outer_task_name, ) for query in queries @@ -221,7 +226,7 @@ async def _get_links(self, queries, num_results): await self._close_browser() return results - async def _extract_links(self, page, num_results, query): + async def _extract_links(self, page, num_results, query, raw=False): """Extract links for top `num_results` on page""" await page.wait_for_load_state("networkidle", timeout=self.PAGE_LOAD_TIMEOUT) @@ -243,7 +248,7 @@ async def _extract_links(self, page, num_results, query): if len(links) >= num_results: break - return links + return _format_url_results(links, raw=raw) @property @abstractmethod @@ -320,8 +325,61 @@ def get_response(self, path='/search'): raise e +def format_search_results(results, url_key, raw=False): + """Normalize structured search results into a consistent shape + + Parameters + ---------- + results : iterable of dict + Iterable of search result records, where each record is a dict + containing at least a key corresponding to `url_key` whose value + is the URL of the search result. + url_key : str + Key in each search result record that corresponds to the URL of + the search result. + raw : bool, optional + Option to return a list of dicts with attrs for each query + instead of only a list of url strings. By default, ``False``. + + Returns + ------- + list + List of URLs corresponding to the search results, or if + `raw=True`, a list of dicts containing the URL and attrs for + each search result. + """ + formatted_results = [] + for info in results: + url = _clean_search_result_url(info.get(url_key, "")) + if not url: + continue + + if raw: + formatted_results.append({"url": url, "attrs": info}) + else: + formatted_results.append(url) + + return formatted_results + + async def _navigate_to_se_url(page, se_url, timeout=90_000): """Navigate to search engine url""" await page.goto(se_url) logger.trace("Waiting for load") await page.wait_for_load_state("networkidle", timeout=timeout) + + + +def _clean_search_result_url(url): + """Normalize a search result URL""" + return (url or "").replace("+", "%20") + + +def _format_url_results(urls, raw=False): + """Normalize URL-only search results into a consistent shape""" + formatted_results = list(filter(None, (_clean_search_result_url(url) + for url in urls))) + if not raw: + return formatted_results + + return [{"url": url} for url in formatted_results] \ No newline at end of file From 81583edd6befd8b52a463d56beeef61add05644d Mon Sep 17 00:00:00 2001 From: Paul Date: Fri, 29 May 2026 19:46:08 -0600 Subject: [PATCH 03/10] Use raw parameter in searches --- elm/web/search/duckduckgo.py | 13 +++++++------ elm/web/search/dux.py | 7 +++---- elm/web/search/google.py | 21 ++++++++++----------- elm/web/search/tavily.py | 8 ++++---- 4 files changed, 24 insertions(+), 25 deletions(-) diff --git a/elm/web/search/duckduckgo.py b/elm/web/search/duckduckgo.py index 5bd2cda0..546016ba 100644 --- a/elm/web/search/duckduckgo.py +++ b/elm/web/search/duckduckgo.py @@ -7,7 +7,8 @@ from ddgs import DDGS from elm.web.search.base import (PlaywrightSearchEngineLinkSearch, - SearchEngineLinkSearch) + SearchEngineLinkSearch, + format_search_results) logger = logging.getLogger(__name__) @@ -80,7 +81,7 @@ def __init__(self, region="us-en", timeout=10, verify=False, self.sleep_min_seconds = sleep_min_seconds self.sleep_max_seconds = sleep_max_seconds - async def _search(self, query, num_results=10): + async def _search(self, query, num_results=10, raw=False): """Search web for links related to a query""" ddgs = DDGS(timeout=self.timeout, verify=self.verify) @@ -88,14 +89,14 @@ async def _search(self, query, num_results=10): backend="duckduckgo", num_results=num_results) - return list(filter(None, (info.get('href', "").replace("+", "%20") - for info in results))) + return format_search_results(results, "href", raw=raw) - async def _skip_exc_search(self, query, num_results=10): + async def _skip_exc_search(self, query, num_results=10, raw=False): """Sleep between DDG searched to avoid rate limiting""" async with _DDGS_SEMAPHORE: try: - out = await self._search(query, num_results=num_results) + out = await self._search(query, num_results=num_results, + raw=raw) except Exception as e: logger.exception(e) out = [] diff --git a/elm/web/search/dux.py b/elm/web/search/dux.py index 4dcdaf4c..92e4d1b9 100644 --- a/elm/web/search/dux.py +++ b/elm/web/search/dux.py @@ -4,7 +4,7 @@ from ddgs import DDGS -from elm.web.search.base import SearchEngineLinkSearch +from elm.web.search.base import SearchEngineLinkSearch, format_search_results logger = logging.getLogger(__name__) @@ -68,7 +68,7 @@ def __init__(self, region="us-en", safesearch="moderate", timelimit=None, self.timeout = timeout self.verify = verify - async def _search(self, query, num_results=10): + async def _search(self, query, num_results=10, raw=False): """Search web for links related to a query""" ddgs = DDGS(timeout=self.timeout, verify=self.verify) @@ -79,5 +79,4 @@ async def _search(self, query, num_results=10): backend=self.backend, max_results=num_results) - return list(filter(None, (info.get('href', "").replace("+", "%20") - for info in results))) + return format_search_results(results, "href", raw=raw) diff --git a/elm/web/search/google.py b/elm/web/search/google.py index 7214103c..6a6e27fd 100644 --- a/elm/web/search/google.py +++ b/elm/web/search/google.py @@ -14,7 +14,8 @@ from elm.web.search.base import (PlaywrightSearchEngineLinkSearch, APISearchEngineLinkSearch, - PatchedSerpApiClient) + PatchedSerpApiClient, + format_search_results) logger = logging.getLogger(__name__) @@ -228,7 +229,7 @@ def __init__(self, api_key=None, cse_id=None): super().__init__(api_key=api_key) self.cse_id = cse_id or os.environ.get(self.CSE_ID_VAR or "") - async def _search(self, query, num_results=10): + async def _search(self, query, num_results=10, raw=False): """Search web for links related to a query""" build_args = dict(self._BUILD_ARGS) build_args["developerKey"] = self.api_key @@ -237,7 +238,7 @@ async def _search(self, query, num_results=10): results = build(**build_args).cse().list(**search_args).execute() results = (results or {}).get('items', []) - return list(filter(None, (info.get("link") for info in results))) + return format_search_results(results, "link", raw=raw) class SerpAPIGoogleSearch(APISearchEngineLinkSearch): @@ -264,7 +265,7 @@ def __init__(self, api_key=None, verify=False): super().__init__(api_key=api_key) self.verify = verify - async def _search(self, query, num_results=10, **param_kwargs): + async def _search(self, query, num_results=10, raw=False, **param_kwargs): """Search web for links related to a query""" params = {"q": query, "hl": "en", "gl": "us", "api_key": self.api_key} @@ -273,9 +274,8 @@ async def _search(self, query, num_results=10, **param_kwargs): client = PatchedSerpApiClient(params, engine="google", verify=self.verify) results = client.get_dict() - results = results.get("organic_results", []) - return list(filter(None, (info.get('link', "").replace("+", "%20") - for info in results)))[:num_results] + results = (results or {}).get("organic_results", []) + return format_search_results(results, "link", raw=raw)[:num_results] class APISerperSearch(APISearchEngineLinkSearch): @@ -303,7 +303,7 @@ def __init__(self, api_key=None, verify=False): super().__init__(api_key=api_key) self.verify = verify - async def _search(self, query, num_results=10): + async def _search(self, query, num_results=10, raw=False): """Search web for links related to a query""" payload = json.dumps({"q": query, "num": num_results}) @@ -312,6 +312,5 @@ async def _search(self, query, num_results=10): response = requests.request("POST", self._URL, headers=headers, data=payload, verify=self.verify) - results = json.loads(response.text).get('organic', {}) - return list(filter(None, (result.get("link", "").replace("+", "%20") - for result in results))) + results = json.loads(response.text).get('organic', []) + return format_search_results(results, "link", raw=raw) diff --git a/elm/web/search/tavily.py b/elm/web/search/tavily.py index f1a4b2cb..33d89b31 100644 --- a/elm/web/search/tavily.py +++ b/elm/web/search/tavily.py @@ -8,7 +8,8 @@ from tavily.errors import (UsageLimitExceededError, InvalidAPIKeyError, BadRequestError, ForbiddenError) -from elm.web.search.base import APISearchEngineLinkSearch +from elm.web.search.base import (APISearchEngineLinkSearch, + format_search_results) logger = logging.getLogger(__name__) @@ -140,11 +141,10 @@ def __init__(self, api_key=None, verify=False): super().__init__(api_key=api_key) self.verify = verify - async def _search(self, query, num_results=10): + async def _search(self, query, num_results=10, raw=False): """Search web for links related to a query""" client = _PatchedTavilyClient(api_key=self.api_key, verify=self.verify) response = client.search(query=query, max_results=num_results) results = response.get("results", []) - return list(filter(None, (info.get('url', "").replace("+", "%20") - for info in results))) + return format_search_results(results, "url", raw=raw) From 068065b11f5842cae295f15c5f85aa9af1b1c853 Mon Sep 17 00:00:00 2001 From: Paul Date: Fri, 29 May 2026 20:08:46 -0600 Subject: [PATCH 04/10] Return more attrs with raw result --- elm/web/search/base.py | 21 +++++++++++++++------ elm/web/search/duckduckgo.py | 3 ++- elm/web/search/dux.py | 3 ++- elm/web/search/google.py | 9 ++++++--- elm/web/search/tavily.py | 3 ++- 5 files changed, 27 insertions(+), 12 deletions(-) diff --git a/elm/web/search/base.py b/elm/web/search/base.py index 33e6c3f0..42b83e7a 100644 --- a/elm/web/search/base.py +++ b/elm/web/search/base.py @@ -248,7 +248,7 @@ async def _extract_links(self, page, num_results, query, raw=False): if len(links) >= num_results: break - return _format_url_results(links, raw=raw) + return _format_url_results(self._SE_NAME, query, links, raw=raw) @property @abstractmethod @@ -325,11 +325,15 @@ def get_response(self, path='/search'): raise e -def format_search_results(results, url_key, raw=False): +def format_search_results(se_name, query, results, url_key, raw=False): """Normalize structured search results into a consistent shape Parameters ---------- + se_name : str + Name of the search engine that produced the results. + query : str + The search query corresponding to the search results. results : iterable of dict Iterable of search result records, where each record is a dict containing at least a key corresponding to `url_key` whose value @@ -349,13 +353,16 @@ def format_search_results(results, url_key, raw=False): each search result. """ formatted_results = [] - for info in results: + for rank, info in enumerate(results, start=1): url = _clean_search_result_url(info.get(url_key, "")) if not url: continue if raw: - formatted_results.append({"url": url, "attrs": info}) + formatted_results.append({"url": url, "query": query, + "search_engine": se_name, + "query_rank": rank, + "attrs": info}) else: formatted_results.append(url) @@ -375,11 +382,13 @@ def _clean_search_result_url(url): return (url or "").replace("+", "%20") -def _format_url_results(urls, raw=False): +def _format_url_results(se_name, query, urls, raw=False): """Normalize URL-only search results into a consistent shape""" formatted_results = list(filter(None, (_clean_search_result_url(url) for url in urls))) if not raw: return formatted_results - return [{"url": url} for url in formatted_results] \ No newline at end of file + return [{"url": url, "query": query, "search_engine": se_name, + "query_rank": rank,} + for rank, url in enumerate(formatted_results, start=1)] \ No newline at end of file diff --git a/elm/web/search/duckduckgo.py b/elm/web/search/duckduckgo.py index 546016ba..bf8e2330 100644 --- a/elm/web/search/duckduckgo.py +++ b/elm/web/search/duckduckgo.py @@ -89,7 +89,8 @@ async def _search(self, query, num_results=10, raw=False): backend="duckduckgo", num_results=num_results) - return format_search_results(results, "href", raw=raw) + return format_search_results(self._SE_NAME, query, results, + url_key="href", raw=raw) async def _skip_exc_search(self, query, num_results=10, raw=False): """Sleep between DDG searched to avoid rate limiting""" diff --git a/elm/web/search/dux.py b/elm/web/search/dux.py index 92e4d1b9..48222118 100644 --- a/elm/web/search/dux.py +++ b/elm/web/search/dux.py @@ -79,4 +79,5 @@ async def _search(self, query, num_results=10, raw=False): backend=self.backend, max_results=num_results) - return format_search_results(results, "href", raw=raw) + return format_search_results(self._SE_NAME, query, results, + url_key="href", raw=raw) diff --git a/elm/web/search/google.py b/elm/web/search/google.py index 6a6e27fd..b889e811 100644 --- a/elm/web/search/google.py +++ b/elm/web/search/google.py @@ -238,7 +238,8 @@ async def _search(self, query, num_results=10, raw=False): results = build(**build_args).cse().list(**search_args).execute() results = (results or {}).get('items', []) - return format_search_results(results, "link", raw=raw) + return format_search_results(self._SE_NAME, query, results, + url_key="link", raw=raw) class SerpAPIGoogleSearch(APISearchEngineLinkSearch): @@ -275,7 +276,8 @@ async def _search(self, query, num_results=10, raw=False, **param_kwargs): verify=self.verify) results = client.get_dict() results = (results or {}).get("organic_results", []) - return format_search_results(results, "link", raw=raw)[:num_results] + return format_search_results(self._SE_NAME, query, results, + url_key="link", raw=raw)[:num_results] class APISerperSearch(APISearchEngineLinkSearch): @@ -313,4 +315,5 @@ async def _search(self, query, num_results=10, raw=False): response = requests.request("POST", self._URL, headers=headers, data=payload, verify=self.verify) results = json.loads(response.text).get('organic', []) - return format_search_results(results, "link", raw=raw) + return format_search_results(self._SE_NAME, query, results, + url_key="link", raw=raw) diff --git a/elm/web/search/tavily.py b/elm/web/search/tavily.py index 33d89b31..6f6fe1ac 100644 --- a/elm/web/search/tavily.py +++ b/elm/web/search/tavily.py @@ -147,4 +147,5 @@ async def _search(self, query, num_results=10, raw=False): client = _PatchedTavilyClient(api_key=self.api_key, verify=self.verify) response = client.search(query=query, max_results=num_results) results = response.get("results", []) - return format_search_results(results, "url", raw=raw) + return format_search_results(self._SE_NAME, query, results, + url_key="url", raw=raw) From 8c66936c893a88dcbee35cd003f277c968657aee Mon Sep 17 00:00:00 2001 From: Paul Date: Fri, 29 May 2026 20:09:12 -0600 Subject: [PATCH 05/10] Add tests --- tests/web/search/test_web_search_api.py | 89 ++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 1 deletion(-) diff --git a/tests/web/search/test_web_search_api.py b/tests/web/search/test_web_search_api.py index 76030773..fff7f7cf 100644 --- a/tests/web/search/test_web_search_api.py +++ b/tests/web/search/test_web_search_api.py @@ -7,7 +7,11 @@ import elm.web.search.duckduckgo import elm.web.search.google -from elm.web.search.base import APISearchEngineLinkSearch +from elm.web.search.base import (APISearchEngineLinkSearch, + SearchEngineLinkSearch, + format_search_results, + _format_url_results) +from elm.web.search.run import _single_query_api SE_API_TO_TEST = [(elm.web.search.duckduckgo.APIDuckDuckGoSearch, @@ -43,6 +47,89 @@ async def _search(self, *__, **___): assert MockAPISearchEngine().api_key is None +def test_format_search_results_raw(): + """Test raw structured results preserve URL and attrs""" + results = [{"href": "https://example.com/a+b", "title": "Result A"}, + {"href": "", "title": "Missing URL"}] + + out = format_search_results("test_se", "query", results, "href", raw=False) + assert out == ["https://example.com/a%20b"] + + out = format_search_results("test_se", "query", results, "href", raw=True) + assert out == [{ + "url": "https://example.com/a%20b", + "query": "query", + "search_engine": "test_se", + "query_rank": 1, + "attrs": {"href": "https://example.com/a+b", "title": "Result A"}, + }] + + +def test_format_url_results_raw(): + """Test raw URL-only results still include URL keys""" + out = _format_url_results("test_se", "query", + ["https://example.com/a+b", ""]) + assert out == ["https://example.com/a%20b"] + + out = _format_url_results("test_se", "query", + ["https://example.com/a+b", ""], raw=True) + assert out == [ + { + "url": "https://example.com/a%20b", + "query": "query", + "search_engine": "test_se", + "query_rank": 1, + } + ] + + +@pytest.mark.asyncio +async def test_results_passes_raw_flag(): + """Test SearchEngineLinkSearch.results forwards raw to _search""" + + class MockSearchEngine(SearchEngineLinkSearch): + """MockSearchEngine""" + + async def _search(self, query, num_results=10, raw=False): + if raw: + return [{"url": query, "attrs": {"query": query}}] + return [query] + + search_engine = MockSearchEngine() + + assert await search_engine.results("https://example.com") == [[ + "https://example.com" + ]] + assert await search_engine.results("https://example.com", raw=True) == [[{ + "url": "https://example.com", + 'attrs': {'query': 'https://example.com'}, + }]] + + +@pytest.mark.asyncio +async def test_single_query_api_passes_raw_flag(): + """Test internal API runner preserves raw search outputs""" + + class MockSearchEngine: + """MockSearchEngine""" + + _SE_NAME = "Mock" + + async def results(self, *queries, num_results=10, raw=False): + assert queries == ("query",) + assert num_results > 0 + return [[{"url": "https://example.com", "attrs": {}}]] if raw else [[ + "https://example.com" + ]] + + search_engine = MockSearchEngine() + + assert await _single_query_api(search_engine, "query", raw=True) == [[{ + "url": "https://example.com", + "attrs": {}, + }]] + + @pytest.mark.skipif(os.getenv("GITHUB_ACTIONS") == "true", reason="Fails in GHA due to rate limiting") @pytest.mark.parametrize("queries", [['1. "NatLabRockies elm"'], From 719f153dfbb3feb22f0f624ea89aa562a26ab77f Mon Sep 17 00:00:00 2001 From: Paul Date: Fri, 29 May 2026 20:24:09 -0600 Subject: [PATCH 06/10] Add `search_all_se` function --- elm/web/search/run.py | 147 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 133 insertions(+), 14 deletions(-) diff --git a/elm/web/search/run.py b/elm/web/search/run.py index 79e08f8e..47afca38 100644 --- a/elm/web/search/run.py +++ b/elm/web/search/run.py @@ -131,6 +131,7 @@ async def web_search_links_as_docs(queries, search_engines=_DEFAULT_SE, - ddg_api_kwargs - google_cse_api_kwargs - google_serper_api_kwargs + - google_serpapi_kwargs - tavily_api_kwargs - ddgs_kwargs - cf_google_se_kwargs @@ -235,6 +236,7 @@ async def search_with_fallback(queries, search_engines=_DEFAULT_SE, - ddg_api_kwargs - google_cse_api_kwargs - google_serper_api_kwargs + - google_serpapi_kwargs - tavily_api_kwargs - ddgs_kwargs - cf_google_se_kwargs @@ -280,10 +282,9 @@ async def search_with_fallback(queries, search_engines=_DEFAULT_SE, return urls else: for se_name in search_engines: - logger.debug("Searching web using %r", se_name) urls = await _single_se_search(se_name, queries, num_urls, ignore_url_parts, browser_semaphore, - task_name, kwargs) + task_name, kwargs, raw=False) if urls: return urls @@ -292,6 +293,116 @@ async def search_with_fallback(queries, search_engines=_DEFAULT_SE, return set() +async def search_all_se(queries, search_engines=_DEFAULT_SE, + num_urls=None, ignore_url_parts=None, + browser_semaphore=None, task_name=None, **kwargs): + """Retrieve search query URLs using multiple search engines if needed + + Parameters + ---------- + queries : collection of str + Collection of strings representing google queries. Documents for + the top `num_urls` google search results (from all of these + queries _combined_ will be returned from this function. + search_engines : iterable of str + Ordered collection of search engine names to attempt for web + search. If the first search engine in the list returns a set + of URLs, then iteration will end and documents for each URL will + be returned. Otherwise, the next engine in this list will be + used to run the web search. If this also fails, the next engine + is used and so on. If all web searches fail, an empty list is + returned. See :obj:`~elm.web.search.run.SEARCH_ENGINE_OPTIONS` + for supported search engine options. + By default, ``("PlaywrightGoogleLinkSearch", )``. + num_urls : int, optional + Number of unique top Google search result to return as docs. The + google search results from all queries are interleaved and the + top `num_urls` unique URL's are downloaded as docs. If this + number is less than ``len(queries)``, some of your queries may + not contribute to the final output. By default, ``None``, which + sets ``num_urls = 3 * len(queries)``. + ignore_url_parts : iterable of str, optional + Optional URL components to blacklist. For example, supplying + `ignore_url_parts={"wikipedia.org"}` will ignore all URLs that + contain "wikipedia.org". By default, ``None``. + browser_semaphore : :class:`asyncio.Semaphore`, optional + Semaphore instance that can be used to limit the number of + playwright browsers open concurrently. If ``None``, no limits + are applied. By default, ``None``. + task_name : str, optional + Optional task name to use in :func:`asyncio.create_task`. + By default, ``None``. + use_fallback_per_query : bool, default=True + Option to use the fallback list of search engines on a per-query + basis. This means if a single query fails with one search + engine, the fallback search engines will be attempted for that + query. If this input is ``False``, the fallback search engines + are only used if *all* search queries fail for a single search + engine. By default, ``True``. + **kwargs + Keyword-argument pairs to initialize search engines. This input + can include and any/all of the following keywords: + + - ddg_api_kwargs + - google_cse_api_kwargs + - google_serper_api_kwargs + - google_serpapi_kwargs + - tavily_api_kwargs + - ddgs_kwargs + - cf_google_se_kwargs + - pw_bing_se_kwargs + - pw_ddg_se_kwargs + - pw_google_cse_kwargs + - pw_google_se_kwargs + - pw_yahoo_se_kwargs + - pw_launch_kwargs + + Each of these inputs should be a dictionary with + keyword-argument pairs that you can use to initialize the search + engines in the `search_engines` input. If ``pw_launch_kwargs`` + is detected, it will be added to the kwargs for all of the + PLaywright-based search engines so that you do not have to + repeatedly specify the launch parameters. For example, you may + specify ``pw_launch_kwargs={"headless": False}`` to + have all Playwright-based searches show the browser and _also_ + specify ``google_serper_api_kwargs={"api_key": "..."}`` to + specify the API key for the Google Serper search. + + Returns + ------- + list of list of dict + List of search results for each query, where each search result + is represented as a dictionary containing the following keys: + + - url: URL of the search result + - query: The search query that resulted in this search result + - search_engine: The search engine that returned this result + - query_rank: The rank of this search result for the query + + Other keys such as "attrs" may also be included depending on the + search engine. + + + Raises + ------ + ELMInputError + If `search_engines` input is empty. + """ + num_urls = num_urls or 3 * len(queries) + if len(search_engines) < 1: + msg = f"Must provide at least one search engine! Got {search_engines=}" + logger.error(msg) + raise ELMInputError(msg) + + searchers = [asyncio.create_task( + _single_se_search(se_name, queries, num_urls, + ignore_url_parts, browser_semaphore, + task_name, kwargs, raw=True), + name=task_name) for se_name in search_engines] + + return await asyncio.gather(*searchers) + + async def load_docs(sources, file_loader): """Load a document for each input URL @@ -329,10 +440,14 @@ async def load_docs(sources, file_loader): async def _single_se_search(se_name, queries, num_urls, ignore_url_parts, - browser_sem, task_name, kwargs): + browser_sem, task_name, kwargs, raw=False): """Search for links using a single search engine""" _validate_se_name(se_name) - links = await _run_search(se_name, queries, browser_sem, task_name, kwargs) + logger.debug("Searching web using %r", se_name) + links = await _run_search(se_name, queries, browser_sem, task_name, + kwargs, raw) + if raw: + return [link[0] for link in links] return _down_select_urls(links, num_urls=num_urls, ignore_url_parts=ignore_url_parts) @@ -347,7 +462,7 @@ async def _multi_se_search(search_engines, queries, num_urls, logger.debug("Searching web using %r", se_name) links = await _run_search(se_name, remaining_queries, browser_sem, - task_name, kwargs) + task_name, kwargs, raw=False) logger.trace("Links: %r", links) failed_queries = [] @@ -369,15 +484,16 @@ async def _multi_se_search(search_engines, queries, num_urls, ignore_url_parts=ignore_url_parts) -async def _run_search(se_name, queries, browser_sem, task_name, kwargs): +async def _run_search(se_name, queries, browser_sem, task_name, kwargs, raw): """Run a search for multiple queries on a single search engine""" searchers = [asyncio.create_task(_single_query_search(se_name, query, - browser_sem, kwargs), + browser_sem, kwargs, + raw), name=task_name) for query in queries] return await asyncio.gather(*searchers) -async def _single_query_search(se_name, query, browser_sem, kwargs): +async def _single_query_search(se_name, query, browser_sem, kwargs, raw): """Execute a single search query on a single search engine""" try: search_engine, uses_browser = _init_se(se_name, kwargs) @@ -390,12 +506,13 @@ async def _single_query_search(se_name, query, browser_sem, kwargs): # help avoid some detection by staggering the browser launches await asyncio.sleep(random.uniform(1, 10)) return await _single_query_pw(search_engine, query, - browser_sem=browser_sem) + browser_sem=browser_sem, + raw=raw) - return await _single_query_api(search_engine, query) + return await _single_query_api(search_engine, query, raw=raw) -async def _single_query_pw(search_engine, question, browser_sem): +async def _single_query_pw(search_engine, question, browser_sem, raw=False): """Perform a single browser-based search""" if browser_sem is None: browser_sem = AsyncExitStack() @@ -405,14 +522,16 @@ async def _single_query_pw(search_engine, question, browser_sem): logger.trace("Starting %s search for %r with browser_semaphore=%r", search_engine._SE_NAME, question, browser_sem) return await search_engine.results(question, - num_results=_RESULTS_PER_QUERY) + num_results=_RESULTS_PER_QUERY, + raw=raw) -async def _single_query_api(search_engine, question): +async def _single_query_api(search_engine, question, raw=False): """Perform a single api-based search""" logger.trace("Starting %s search for %r", search_engine._SE_NAME, question) return await search_engine.results(question, - num_results=_RESULTS_PER_QUERY) + num_results=_RESULTS_PER_QUERY, + raw=raw) def _init_se(se_name, kwargs): From 6b9106a7629fd09e5ad35d0a745552ee73e0d977 Mon Sep 17 00:00:00 2001 From: Paul Date: Fri, 29 May 2026 20:24:28 -0600 Subject: [PATCH 07/10] Remove param doc --- elm/web/search/run.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/elm/web/search/run.py b/elm/web/search/run.py index 47afca38..e8f6398a 100644 --- a/elm/web/search/run.py +++ b/elm/web/search/run.py @@ -332,13 +332,6 @@ async def search_all_se(queries, search_engines=_DEFAULT_SE, task_name : str, optional Optional task name to use in :func:`asyncio.create_task`. By default, ``None``. - use_fallback_per_query : bool, default=True - Option to use the fallback list of search engines on a per-query - basis. This means if a single query fails with one search - engine, the fallback search engines will be attempted for that - query. If this input is ``False``, the fallback search engines - are only used if *all* search queries fail for a single search - engine. By default, ``True``. **kwargs Keyword-argument pairs to initialize search engines. This input can include and any/all of the following keywords: From e3c9120c1f2cc2bf352ef502e9fba255ceebb777 Mon Sep 17 00:00:00 2001 From: Paul Date: Fri, 29 May 2026 20:29:49 -0600 Subject: [PATCH 08/10] Add `async_get_dict` --- elm/web/search/base.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/elm/web/search/base.py b/elm/web/search/base.py index 42b83e7a..b9d3552d 100644 --- a/elm/web/search/base.py +++ b/elm/web/search/base.py @@ -324,6 +324,21 @@ def get_response(self, path='/search'): logger.error(e, e.response.status_code) raise e + async def async_get_dict(self, path='/search'): + """Get search response as dict + + Parameters + ---------- + path : str, default='/search' + API path to use for the search. + + Returns + ------- + Dict with the formatted response content + """ + self.params_dict["output"] = "json" + return dict(json.loads(self.get_response(path).text)) + def format_search_results(se_name, query, results, url_key, raw=False): """Normalize structured search results into a consistent shape From d1f2f4c7e8e7738cfb1ccff19afb0db4c96129db Mon Sep 17 00:00:00 2001 From: Paul Date: Fri, 29 May 2026 20:35:27 -0600 Subject: [PATCH 09/10] `SerpAPIGoogleSearch` is now actually async --- elm/web/search/base.py | 32 +++++++++++++++++++++++++++++++- elm/web/search/google.py | 2 +- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/elm/web/search/base.py b/elm/web/search/base.py index b9d3552d..0eb3dc95 100644 --- a/elm/web/search/base.py +++ b/elm/web/search/base.py @@ -1,10 +1,12 @@ # -*- coding: utf-8 -*- """ELM Web Scraping - Base class for search engine search""" import os +import json import random import asyncio import logging import requests +import httpx from urllib.parse import quote from abc import ABC, abstractmethod from contextlib import asynccontextmanager @@ -324,6 +326,34 @@ def get_response(self, path='/search'): logger.error(e, e.response.status_code) raise e + async def async_get_response(self, path='/search'): + """Get search response + + Parameters + ---------- + path : str, default='/search' + API path to use for the search. + + Returns + ------- + Response object provided by ``httpx.AsyncClient.get``. + """ + url = None + try: + url, parameter = self.construct_url(path) + timeout = self.timeout / 1000 + async with httpx.AsyncClient(verify=self.verify, + timeout=timeout) as client: + response = await client.get(url, params=parameter) + return response + except httpx.HTTPError as e: + logger.error("fail: " + url) + if e.response is not None: + logger.error(e, e.response.status_code) + else: + logger.error(e) + raise e + async def async_get_dict(self, path='/search'): """Get search response as dict @@ -337,7 +367,7 @@ async def async_get_dict(self, path='/search'): Dict with the formatted response content """ self.params_dict["output"] = "json" - return dict(json.loads(self.get_response(path).text)) + return dict(json.loads((await self.async_get_response(path)).text)) def format_search_results(se_name, query, results, url_key, raw=False): diff --git a/elm/web/search/google.py b/elm/web/search/google.py index b889e811..7bdf7e63 100644 --- a/elm/web/search/google.py +++ b/elm/web/search/google.py @@ -274,7 +274,7 @@ async def _search(self, query, num_results=10, raw=False, **param_kwargs): client = PatchedSerpApiClient(params, engine="google", verify=self.verify) - results = client.get_dict() + results = await client.async_get_dict() results = (results or {}).get("organic_results", []) return format_search_results(self._SE_NAME, query, results, url_key="link", raw=raw)[:num_results] From 7cb7af88fa9fab1776371f0fd10939e1fcaa75b2 Mon Sep 17 00:00:00 2001 From: Paul Date: Fri, 29 May 2026 20:35:41 -0600 Subject: [PATCH 10/10] Bump version --- elm/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/elm/version.py b/elm/version.py index 61d1a8c6..e8b38249 100644 --- a/elm/version.py +++ b/elm/version.py @@ -2,4 +2,4 @@ ELM version number """ -__version__ = "0.0.40" +__version__ = "0.0.41"