diff --git a/elm/version.py b/elm/version.py index 61d1a8c6..e8b38249 100644 --- a/elm/version.py +++ b/elm/version.py @@ -2,4 +2,4 @@ ELM version number """ -__version__ = "0.0.40" +__version__ = "0.0.41" diff --git a/elm/web/search/base.py b/elm/web/search/base.py index c14ab1fb..0eb3dc95 100644 --- a/elm/web/search/base.py +++ b/elm/web/search/base.py @@ -1,10 +1,12 @@ # -*- coding: utf-8 -*- """ELM Web Scraping - Base class for search engine search""" import os +import json import random import asyncio import logging import requests +import httpx from urllib.parse import quote from abc import ABC, abstractmethod from contextlib import asynccontextmanager @@ -24,7 +26,7 @@ class SearchEngineLinkSearch(ABC): _SE_NAME = "" - async def results(self, *queries, num_results=10): + async def results(self, *queries, num_results=10, raw=False): """Retrieve links for the first `num_results` of each query This function executes a search for each input query and @@ -44,24 +46,28 @@ async def results(self, *queries, num_results=10): determined by the number of results on a page (excluding ads). You can, however, use this input to limit the number of results returned. By default, ``10``. + raw : bool, optional + If ``True``, return provider-specific records that always + include a ``"url"`` field. By default, ``False``. Returns ------- list List equal to the length of the input queries, where each entry is another list containing no more than `num_results` - links. + links or raw result records. 
""" queries = map(clean_search_query, queries) - return await self._get_links(queries, num_results) + return await self._get_links(queries, num_results, raw=raw) - async def _get_links(self, queries, num_results): + async def _get_links(self, queries, num_results, raw=False): """Get links for multiple queries""" outer_task_name = asyncio.current_task().get_name() searches = [ asyncio.create_task( - self._skip_exc_search(query, num_results=num_results), + self._skip_exc_search(query, num_results=num_results, + raw=raw), name=outer_task_name, ) for query in queries @@ -72,10 +78,10 @@ async def _get_links(self, queries, num_results): logger.trace("Got results for link search:\n%r", results) return results - async def _skip_exc_search(self, query, num_results=10): + async def _skip_exc_search(self, query, num_results=10, raw=False): """Perform search while ignoring errors""" try: - return await self._search(query, num_results=num_results) + return await self._search(query, num_results=num_results, raw=raw) except KeyboardInterrupt: raise except Exception as e: @@ -113,7 +119,7 @@ async def _move_and_click(self, page, input_el): return await page.mouse.click(x, y) @abstractmethod - async def _search(self, query, num_results=10): + async def _search(self, query, num_results=10, raw=False): """Search web for links related to a query""" raise NotImplementedError @@ -181,7 +187,7 @@ async def _browser_page(self): async with pw_page(**page_kwargs) as page: yield page - async def _search(self, query, num_results=10): + async def _search(self, query, num_results=10, raw=False): """Search web for links related to a query""" logger.debug("Searching %s: %r", self._SE_NAME, query) num_results = min(num_results, self.MAX_RESULTS_CONSIDERED_PER_PAGE) @@ -200,16 +206,17 @@ async def _search(self, query, num_results=10): await _navigate_to_se_url(page, se_url=url, timeout=self.PAGE_LOAD_TIMEOUT) logger.trace("Extracting links for query: %r", query) - return await 
self._extract_links(page, num_results, query) + return await self._extract_links(page, num_results, query, raw=raw) - async def _get_links(self, queries, num_results): + async def _get_links(self, queries, num_results, raw=False): """Get links for multiple queries""" outer_task_name = asyncio.current_task().get_name() async with async_playwright() as pw_instance: await self._load_browser(pw_instance) searches = [ asyncio.create_task( - self._skip_exc_search(query, num_results=num_results), + self._skip_exc_search(query, num_results=num_results, + raw=raw), name=outer_task_name, ) for query in queries @@ -221,7 +228,7 @@ async def _get_links(self, queries, num_results): await self._close_browser() return results - async def _extract_links(self, page, num_results, query): + async def _extract_links(self, page, num_results, query, raw=False): """Extract links for top `num_results` on page""" await page.wait_for_load_state("networkidle", timeout=self.PAGE_LOAD_TIMEOUT) @@ -243,7 +250,7 @@ async def _extract_links(self, page, num_results, query): if len(links) >= num_results: break - return links + return _format_url_results(self._SE_NAME, query, links, raw=raw) @property @abstractmethod @@ -319,9 +326,114 @@ def get_response(self, path='/search'): logger.error(e, e.response.status_code) raise e + async def async_get_response(self, path='/search'): + """Get search response + + Parameters + ---------- + path : str, default='/search' + API path to use for the search. + + Returns + ------- + Response object provided by ``httpx.AsyncClient.get``. 
+ """ + url = None + try: + url, parameter = self.construct_url(path) + timeout = self.timeout / 1000 + async with httpx.AsyncClient(verify=self.verify, + timeout=timeout) as client: + response = await client.get(url, params=parameter) + return response + except httpx.HTTPError as e: + logger.error("fail: " + url) + if e.response is not None: + logger.error(e, e.response.status_code) + else: + logger.error(e) + raise e + + async def async_get_dict(self, path='/search'): + """Get search response as dict + + Parameters + ---------- + path : str, default='/search' + API path to use for the search. + + Returns + ------- + Dict with the formatted response content + """ + self.params_dict["output"] = "json" + return dict(json.loads((await self.async_get_response(path)).text)) + + +def format_search_results(se_name, query, results, url_key, raw=False): + """Normalize structured search results into a consistent shape + + Parameters + ---------- + se_name : str + Name of the search engine that produced the results. + query : str + The search query corresponding to the search results. + results : iterable of dict + Iterable of search result records, where each record is a dict + containing at least a key corresponding to `url_key` whose value + is the URL of the search result. + url_key : str + Key in each search result record that corresponds to the URL of + the search result. + raw : bool, optional + Option to return a list of dicts with attrs for each query + instead of only a list of url strings. By default, ``False``. + + Returns + ------- + list + List of URLs corresponding to the search results, or if + `raw=True`, a list of dicts containing the URL and attrs for + each search result. 
+ """ + formatted_results = [] + for rank, info in enumerate(results, start=1): + url = _clean_search_result_url(info.get(url_key, "")) + if not url: + continue + + if raw: + formatted_results.append({"url": url, "query": query, + "search_engine": se_name, + "query_rank": rank, + "attrs": info}) + else: + formatted_results.append(url) + + return formatted_results + async def _navigate_to_se_url(page, se_url, timeout=90_000): """Navigate to search engine url""" await page.goto(se_url) logger.trace("Waiting for load") await page.wait_for_load_state("networkidle", timeout=timeout) + + + +def _clean_search_result_url(url): + """Normalize a search result URL""" + return (url or "").replace("+", "%20") + + +def _format_url_results(se_name, query, urls, raw=False): + """Normalize URL-only search results into a consistent shape""" + formatted_results = list(filter(None, (_clean_search_result_url(url) + for url in urls))) + if not raw: + return formatted_results + + return [{"url": url, "query": query, "search_engine": se_name, + "query_rank": rank,} + for rank, url in enumerate(formatted_results, start=1)] \ No newline at end of file diff --git a/elm/web/search/duckduckgo.py b/elm/web/search/duckduckgo.py index 5bd2cda0..bf8e2330 100644 --- a/elm/web/search/duckduckgo.py +++ b/elm/web/search/duckduckgo.py @@ -7,7 +7,8 @@ from ddgs import DDGS from elm.web.search.base import (PlaywrightSearchEngineLinkSearch, - SearchEngineLinkSearch) + SearchEngineLinkSearch, + format_search_results) logger = logging.getLogger(__name__) @@ -80,7 +81,7 @@ def __init__(self, region="us-en", timeout=10, verify=False, self.sleep_min_seconds = sleep_min_seconds self.sleep_max_seconds = sleep_max_seconds - async def _search(self, query, num_results=10): + async def _search(self, query, num_results=10, raw=False): """Search web for links related to a query""" ddgs = DDGS(timeout=self.timeout, verify=self.verify) @@ -88,14 +89,15 @@ async def _search(self, query, num_results=10): 
backend="duckduckgo", num_results=num_results) - return list(filter(None, (info.get('href', "").replace("+", "%20") - for info in results))) + return format_search_results(self._SE_NAME, query, results, + url_key="href", raw=raw) - async def _skip_exc_search(self, query, num_results=10): + async def _skip_exc_search(self, query, num_results=10, raw=False): """Sleep between DDG searched to avoid rate limiting""" async with _DDGS_SEMAPHORE: try: - out = await self._search(query, num_results=num_results) + out = await self._search(query, num_results=num_results, + raw=raw) except Exception as e: logger.exception(e) out = [] diff --git a/elm/web/search/dux.py b/elm/web/search/dux.py index 4dcdaf4c..48222118 100644 --- a/elm/web/search/dux.py +++ b/elm/web/search/dux.py @@ -4,7 +4,7 @@ from ddgs import DDGS -from elm.web.search.base import SearchEngineLinkSearch +from elm.web.search.base import SearchEngineLinkSearch, format_search_results logger = logging.getLogger(__name__) @@ -68,7 +68,7 @@ def __init__(self, region="us-en", safesearch="moderate", timelimit=None, self.timeout = timeout self.verify = verify - async def _search(self, query, num_results=10): + async def _search(self, query, num_results=10, raw=False): """Search web for links related to a query""" ddgs = DDGS(timeout=self.timeout, verify=self.verify) @@ -79,5 +79,5 @@ async def _search(self, query, num_results=10): backend=self.backend, max_results=num_results) - return list(filter(None, (info.get('href', "").replace("+", "%20") - for info in results))) + return format_search_results(self._SE_NAME, query, results, + url_key="href", raw=raw) diff --git a/elm/web/search/google.py b/elm/web/search/google.py index 7214103c..7bdf7e63 100644 --- a/elm/web/search/google.py +++ b/elm/web/search/google.py @@ -14,7 +14,8 @@ from elm.web.search.base import (PlaywrightSearchEngineLinkSearch, APISearchEngineLinkSearch, - PatchedSerpApiClient) + PatchedSerpApiClient, + format_search_results) logger = 
logging.getLogger(__name__) @@ -228,7 +229,7 @@ def __init__(self, api_key=None, cse_id=None): super().__init__(api_key=api_key) self.cse_id = cse_id or os.environ.get(self.CSE_ID_VAR or "") - async def _search(self, query, num_results=10): + async def _search(self, query, num_results=10, raw=False): """Search web for links related to a query""" build_args = dict(self._BUILD_ARGS) build_args["developerKey"] = self.api_key @@ -237,7 +238,8 @@ async def _search(self, query, num_results=10): results = build(**build_args).cse().list(**search_args).execute() results = (results or {}).get('items', []) - return list(filter(None, (info.get("link") for info in results))) + return format_search_results(self._SE_NAME, query, results, + url_key="link", raw=raw) class SerpAPIGoogleSearch(APISearchEngineLinkSearch): @@ -264,7 +266,7 @@ def __init__(self, api_key=None, verify=False): super().__init__(api_key=api_key) self.verify = verify - async def _search(self, query, num_results=10, **param_kwargs): + async def _search(self, query, num_results=10, raw=False, **param_kwargs): """Search web for links related to a query""" params = {"q": query, "hl": "en", "gl": "us", "api_key": self.api_key} @@ -272,10 +274,10 @@ async def _search(self, query, num_results=10, **param_kwargs): client = PatchedSerpApiClient(params, engine="google", verify=self.verify) - results = client.get_dict() - results = results.get("organic_results", []) - return list(filter(None, (info.get('link', "").replace("+", "%20") - for info in results)))[:num_results] + results = await client.async_get_dict() + results = (results or {}).get("organic_results", []) + return format_search_results(self._SE_NAME, query, results, + url_key="link", raw=raw)[:num_results] class APISerperSearch(APISearchEngineLinkSearch): @@ -303,7 +305,7 @@ def __init__(self, api_key=None, verify=False): super().__init__(api_key=api_key) self.verify = verify - async def _search(self, query, num_results=10): + async def _search(self, query, 
num_results=10, raw=False): """Search web for links related to a query""" payload = json.dumps({"q": query, "num": num_results}) @@ -312,6 +314,6 @@ async def _search(self, query, num_results=10): response = requests.request("POST", self._URL, headers=headers, data=payload, verify=self.verify) - results = json.loads(response.text).get('organic', {}) - return list(filter(None, (result.get("link", "").replace("+", "%20") - for result in results))) + results = json.loads(response.text).get('organic', []) + return format_search_results(self._SE_NAME, query, results, + url_key="link", raw=raw) diff --git a/elm/web/search/run.py b/elm/web/search/run.py index 6a4e4449..e8f6398a 100644 --- a/elm/web/search/run.py +++ b/elm/web/search/run.py @@ -131,6 +131,7 @@ async def web_search_links_as_docs(queries, search_engines=_DEFAULT_SE, - ddg_api_kwargs - google_cse_api_kwargs - google_serper_api_kwargs + - google_serpapi_kwargs - tavily_api_kwargs - ddgs_kwargs - cf_google_se_kwargs @@ -235,6 +236,7 @@ async def search_with_fallback(queries, search_engines=_DEFAULT_SE, - ddg_api_kwargs - google_cse_api_kwargs - google_serper_api_kwargs + - google_serpapi_kwargs - tavily_api_kwargs - ddgs_kwargs - cf_google_se_kwargs @@ -280,10 +282,9 @@ async def search_with_fallback(queries, search_engines=_DEFAULT_SE, return urls else: for se_name in search_engines: - logger.debug("Searching web using %r", se_name) urls = await _single_se_search(se_name, queries, num_urls, ignore_url_parts, browser_semaphore, - task_name, kwargs) + task_name, kwargs, raw=False) if urls: return urls @@ -292,6 +293,109 @@ async def search_with_fallback(queries, search_engines=_DEFAULT_SE, return set() +async def search_all_se(queries, search_engines=_DEFAULT_SE, + num_urls=None, ignore_url_parts=None, + browser_semaphore=None, task_name=None, **kwargs): + """Retrieve search query URLs using multiple search engines if needed + + Parameters + ---------- + queries : collection of str + Collection of strings 
representing google queries. Search result + records for each of these queries (from every search engine in + `search_engines`) will be returned from this function. + search_engines : iterable of str + Collection of search engine names to use for the web + search. Unlike :func:`search_with_fallback`, every search + engine in this collection is queried (one concurrent task per + engine) and the results from all of them are returned; there + is no early exit after the first engine that returns results. + The order of the output matches the order of this input. + See :obj:`~elm.web.search.run.SEARCH_ENGINE_OPTIONS` + for supported search engine options. + By default, ``("PlaywrightGoogleLinkSearch", )``. + num_urls : int, optional + Target number of unique top search results. NOTE: this value + is forwarded to each single-engine search but, because this + function requests raw result records, no down-selection to + `num_urls` is currently applied to the returned records. + By default, ``None``, which + sets ``num_urls = 3 * len(queries)``. + ignore_url_parts : iterable of str, optional + Optional URL components to blacklist, for example + `ignore_url_parts={"wikipedia.org"}`. NOTE: this filter is not + currently applied to the raw records. By default, ``None``. + browser_semaphore : :class:`asyncio.Semaphore`, optional + Semaphore instance that can be used to limit the number of + playwright browsers open concurrently. If ``None``, no limits + are applied. By default, ``None``. + task_name : str, optional + Optional task name to use in :func:`asyncio.create_task`. + By default, ``None``. + **kwargs + Keyword-argument pairs to initialize search engines. 
This input + can include any/all of the following keywords: + + - ddg_api_kwargs + - google_cse_api_kwargs + - google_serper_api_kwargs + - google_serpapi_kwargs + - tavily_api_kwargs + - ddgs_kwargs + - cf_google_se_kwargs + - pw_bing_se_kwargs + - pw_ddg_se_kwargs + - pw_google_cse_kwargs + - pw_google_se_kwargs + - pw_yahoo_se_kwargs + - pw_launch_kwargs + + Each of these inputs should be a dictionary with + keyword-argument pairs that you can use to initialize the search + engines in the `search_engines` input. If ``pw_launch_kwargs`` + is detected, it will be added to the kwargs for all of the + Playwright-based search engines so that you do not have to + repeatedly specify the launch parameters. For example, you may + specify ``pw_launch_kwargs={"headless": False}`` to + have all Playwright-based searches show the browser and _also_ + specify ``google_serper_api_kwargs={"api_key": "..."}`` to + specify the API key for the Google Serper search. + + Returns + ------- + list of list of list of dict + One entry per search engine, each holding one list of results per + query; each result is a dictionary containing the following keys: + + - url: URL of the search result + - query: The search query that resulted in this search result + - search_engine: The search engine that returned this result + - query_rank: The rank of this search result for the query + + Other keys such as "attrs" may also be included depending on the + search engine. + + + Raises + ------ + ELMInputError + If `search_engines` input is empty. + """ + num_urls = num_urls or 3 * len(queries) + if len(search_engines) < 1: + msg = f"Must provide at least one search engine! 
Got {search_engines=}" + logger.error(msg) + raise ELMInputError(msg) + + searchers = [asyncio.create_task( + _single_se_search(se_name, queries, num_urls, + ignore_url_parts, browser_semaphore, + task_name, kwargs, raw=True), + name=task_name) for se_name in search_engines] + + return await asyncio.gather(*searchers) + + async def load_docs(sources, file_loader): """Load a document for each input URL @@ -313,7 +417,10 @@ async def load_docs(sources, file_loader): docs = await file_loader.fetch_all(*sources) logger.debug("Loaded %d docs from %d sources", len(docs), len(sources)) docs = [doc for doc in docs if not doc.empty] - logger.debug("%d docs are not empty", len(docs)) + if len(docs)== 1: + logger.debug("%d doc is not empty", len(docs)) + else: + logger.debug("%d docs are not empty", len(docs)) page_lens = {} for doc in docs: @@ -326,10 +433,14 @@ async def load_docs(sources, file_loader): async def _single_se_search(se_name, queries, num_urls, ignore_url_parts, - browser_sem, task_name, kwargs): + browser_sem, task_name, kwargs, raw=False): """Search for links using a single search engine""" _validate_se_name(se_name) - links = await _run_search(se_name, queries, browser_sem, task_name, kwargs) + logger.debug("Searching web using %r", se_name) + links = await _run_search(se_name, queries, browser_sem, task_name, + kwargs, raw) + if raw: + return [link[0] for link in links] return _down_select_urls(links, num_urls=num_urls, ignore_url_parts=ignore_url_parts) @@ -344,7 +455,7 @@ async def _multi_se_search(search_engines, queries, num_urls, logger.debug("Searching web using %r", se_name) links = await _run_search(se_name, remaining_queries, browser_sem, - task_name, kwargs) + task_name, kwargs, raw=False) logger.trace("Links: %r", links) failed_queries = [] @@ -366,15 +477,16 @@ async def _multi_se_search(search_engines, queries, num_urls, ignore_url_parts=ignore_url_parts) -async def _run_search(se_name, queries, browser_sem, task_name, kwargs): +async def 
_run_search(se_name, queries, browser_sem, task_name, kwargs, raw): """Run a search for multiple queries on a single search engine""" searchers = [asyncio.create_task(_single_query_search(se_name, query, - browser_sem, kwargs), + browser_sem, kwargs, + raw), name=task_name) for query in queries] return await asyncio.gather(*searchers) -async def _single_query_search(se_name, query, browser_sem, kwargs): +async def _single_query_search(se_name, query, browser_sem, kwargs, raw): """Execute a single search query on a single search engine""" try: search_engine, uses_browser = _init_se(se_name, kwargs) @@ -387,12 +499,13 @@ async def _single_query_search(se_name, query, browser_sem, kwargs): # help avoid some detection by staggering the browser launches await asyncio.sleep(random.uniform(1, 10)) return await _single_query_pw(search_engine, query, - browser_sem=browser_sem) + browser_sem=browser_sem, + raw=raw) - return await _single_query_api(search_engine, query) + return await _single_query_api(search_engine, query, raw=raw) -async def _single_query_pw(search_engine, question, browser_sem): +async def _single_query_pw(search_engine, question, browser_sem, raw=False): """Perform a single browser-based search""" if browser_sem is None: browser_sem = AsyncExitStack() @@ -402,14 +515,16 @@ async def _single_query_pw(search_engine, question, browser_sem): logger.trace("Starting %s search for %r with browser_semaphore=%r", search_engine._SE_NAME, question, browser_sem) return await search_engine.results(question, - num_results=_RESULTS_PER_QUERY) + num_results=_RESULTS_PER_QUERY, + raw=raw) -async def _single_query_api(search_engine, question): +async def _single_query_api(search_engine, question, raw=False): """Perform a single api-based search""" logger.trace("Starting %s search for %r", search_engine._SE_NAME, question) return await search_engine.results(question, - num_results=_RESULTS_PER_QUERY) + num_results=_RESULTS_PER_QUERY, + raw=raw) def _init_se(se_name, 
kwargs): diff --git a/elm/web/search/tavily.py b/elm/web/search/tavily.py index f1a4b2cb..6f6fe1ac 100644 --- a/elm/web/search/tavily.py +++ b/elm/web/search/tavily.py @@ -8,7 +8,8 @@ from tavily.errors import (UsageLimitExceededError, InvalidAPIKeyError, BadRequestError, ForbiddenError) -from elm.web.search.base import APISearchEngineLinkSearch +from elm.web.search.base import (APISearchEngineLinkSearch, + format_search_results) logger = logging.getLogger(__name__) @@ -140,11 +141,11 @@ def __init__(self, api_key=None, verify=False): super().__init__(api_key=api_key) self.verify = verify - async def _search(self, query, num_results=10): + async def _search(self, query, num_results=10, raw=False): """Search web for links related to a query""" client = _PatchedTavilyClient(api_key=self.api_key, verify=self.verify) response = client.search(query=query, max_results=num_results) results = response.get("results", []) - return list(filter(None, (info.get('url', "").replace("+", "%20") - for info in results))) + return format_search_results(self._SE_NAME, query, results, + url_key="url", raw=raw) diff --git a/tests/web/search/test_web_search_api.py b/tests/web/search/test_web_search_api.py index 76030773..fff7f7cf 100644 --- a/tests/web/search/test_web_search_api.py +++ b/tests/web/search/test_web_search_api.py @@ -7,7 +7,11 @@ import elm.web.search.duckduckgo import elm.web.search.google -from elm.web.search.base import APISearchEngineLinkSearch +from elm.web.search.base import (APISearchEngineLinkSearch, + SearchEngineLinkSearch, + format_search_results, + _format_url_results) +from elm.web.search.run import _single_query_api SE_API_TO_TEST = [(elm.web.search.duckduckgo.APIDuckDuckGoSearch, @@ -43,6 +47,89 @@ async def _search(self, *__, **___): assert MockAPISearchEngine().api_key is None +def test_format_search_results_raw(): + """Test raw structured results preserve URL and attrs""" + results = [{"href": "https://example.com/a+b", "title": "Result A"}, + {"href": "", 
"title": "Missing URL"}] + + out = format_search_results("test_se", "query", results, "href", raw=False) + assert out == ["https://example.com/a%20b"] + + out = format_search_results("test_se", "query", results, "href", raw=True) + assert out == [{ + "url": "https://example.com/a%20b", + "query": "query", + "search_engine": "test_se", + "query_rank": 1, + "attrs": {"href": "https://example.com/a+b", "title": "Result A"}, + }] + + +def test_format_url_results_raw(): + """Test raw URL-only results still include URL keys""" + out = _format_url_results("test_se", "query", + ["https://example.com/a+b", ""]) + assert out == ["https://example.com/a%20b"] + + out = _format_url_results("test_se", "query", + ["https://example.com/a+b", ""], raw=True) + assert out == [ + { + "url": "https://example.com/a%20b", + "query": "query", + "search_engine": "test_se", + "query_rank": 1, + } + ] + + +@pytest.mark.asyncio +async def test_results_passes_raw_flag(): + """Test SearchEngineLinkSearch.results forwards raw to _search""" + + class MockSearchEngine(SearchEngineLinkSearch): + """MockSearchEngine""" + + async def _search(self, query, num_results=10, raw=False): + if raw: + return [{"url": query, "attrs": {"query": query}}] + return [query] + + search_engine = MockSearchEngine() + + assert await search_engine.results("https://example.com") == [[ + "https://example.com" + ]] + assert await search_engine.results("https://example.com", raw=True) == [[{ + "url": "https://example.com", + 'attrs': {'query': 'https://example.com'}, + }]] + + +@pytest.mark.asyncio +async def test_single_query_api_passes_raw_flag(): + """Test internal API runner preserves raw search outputs""" + + class MockSearchEngine: + """MockSearchEngine""" + + _SE_NAME = "Mock" + + async def results(self, *queries, num_results=10, raw=False): + assert queries == ("query",) + assert num_results > 0 + return [[{"url": "https://example.com", "attrs": {}}]] if raw else [[ + "https://example.com" + ]] + + search_engine = 
MockSearchEngine() + + assert await _single_query_api(search_engine, "query", raw=True) == [[{ + "url": "https://example.com", + "attrs": {}, + }]] + + @pytest.mark.skipif(os.getenv("GITHUB_ACTIONS") == "true", reason="Fails in GHA due to rate limiting") @pytest.mark.parametrize("queries", [['1. "NatLabRockies elm"'],