Commit 302c3ea

Enhance news scraper and API response handling
- Updated the `.env.template` to include new API keys for the news scrapers, ensuring proper configuration for future integrations.
- Introduced a fallback mechanism in `news_scraper.py` that uses Scrapling to handle request failures, particularly 403 and 429 status codes, improving resilience against blocking.
- Refactored `get_scholar` and `get_scholar_news` in `serve.py` to enforce stricter data retrieval rules: news and media items are now fetched through dedicated endpoints, which streamlines the API response structure (sketched below, after the change summary).
1 parent 233bc8b commit 302c3ea

5 files changed

Lines changed: 476 additions & 16 deletions
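
The `serve.py` changes themselves are not included in the diffs below. As a rough illustration of the endpoint split described in the commit message, a minimal Flask sketch might look like the following; the route paths, the in-memory data, and the import path of `filter_media_items` are assumptions for illustration, not code from this commit.

# Hypothetical sketch only: route paths, data, and helper wiring are assumed.
from flask import Flask, jsonify

from news_filters import filter_media_items  # module added in this commit; import path may differ

app = Flask(__name__)

# Stand-in data so the sketch runs; the real app loads scholar records elsewhere.
SCHOLARS = {
    "jodie-rummer": {
        "name": "Jodie Rummer",
        "news": [{"title": "Example item", "url": "https://example.org/article"}],
    }
}


@app.route("/api/scholar/<scholar_id>")
def get_scholar(scholar_id: str):
    # Core scholar record only; news/media are no longer embedded in this response.
    scholar = dict(SCHOLARS.get(scholar_id, {}))
    scholar.pop("news", None)
    return jsonify(scholar)


@app.route("/api/scholar/<scholar_id>/news")
def get_scholar_news(scholar_id: str):
    # Dedicated endpoint: fetch news/media items and drop dead or off-topic links.
    items = SCHOLARS.get(scholar_id, {}).get("news", [])
    return jsonify(filter_media_items(items))

Splitting the payload this way keeps the core scholar response small and lets the slower URL checks in `filter_media_items` run only when the news endpoint is actually hit.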

.env.template

Lines changed: 9 additions & 4 deletions
@@ -1,10 +1,14 @@
 EMAIL_USER=your_email_here
 EMAIL_PASS=your_email_password_here

-NEWS_API_ORG_KEY=your_newsapi_key_here
-THE_GUARDIAN_API_KEY=your_guardian_api_key_here
-GOOGLE_API_KEY=your_google_api_key_here
-GOOGLE_CX_ID=your_google_cx_id_here
+# News Scrapers
+SCRAPER_API_KEY=
+NEWS_API_ORG_KEY=
+NEWSAPI_AI_KEY=
+NEWSAPI_COM_KEY=
+THE_GUARDIAN_API_KEY=
+GOOGLE_API_KEY=
+GOOGLE_CX_ID=

 # Optional: Flask configuration
 FLASK_ENV=production
@@ -36,3 +40,4 @@ PUBLICATION_DELAY=1
 # 3 blocks/failures, requests fall back to SOCKS5_PROXIES.
 # TOR_PROXY=http://localhost:3128
 TOR_PROXY=
+
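
For context on how these keys are typically consumed, here is a small sketch of reading them after `load_dotenv()`. Only the variable names come from the template above; the helper itself is illustrative rather than part of the repository.

# Illustrative only: variable names are from .env.template; this helper is not in the repo.
import os

from dotenv import load_dotenv

load_dotenv()  # news_scraper.py also calls this at import time


def news_api_keys() -> dict[str, str | None]:
    """Collect the news-scraper keys; unset keys come back as None."""
    names = (
        "SCRAPER_API_KEY",
        "NEWS_API_ORG_KEY",
        "NEWSAPI_AI_KEY",
        "NEWSAPI_COM_KEY",
        "THE_GUARDIAN_API_KEY",
        "GOOGLE_API_KEY",
        "GOOGLE_CX_ID",
    )
    return {name: os.environ.get(name) for name in names}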

src/news_filters.py

Lines changed: 281 additions & 0 deletions
@@ -0,0 +1,281 @@
+from __future__ import annotations
+
+import json
+import os
+from functools import lru_cache
+from hashlib import sha256
+from pathlib import Path
+from time import time
+
+import requests
+try:
+    from scrapling.fetchers import FetcherSession  # type: ignore
+
+    _SCRAPLING_AVAILABLE = True
+except Exception:  # pragma: no cover
+    FetcherSession = None  # type: ignore
+    _SCRAPLING_AVAILABLE = False
+
+URL_CHECK_HEADERS = {
+    "User-Agent": "Mozilla/5.0 (compatible; RummerLab/1.0; +https://rummerlab.org)",
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+}
+
+RUMMER_STRONG_MARKERS = (
+    "rummerlab",
+    "physioshark",
+    "physiologyfish",
+)
+
+RUMMER_NAME_MARKERS = (
+    "jodie rummer",
+    "dr jodie rummer",
+    "professor jodie rummer",
+)
+
+RUMMER_CONTEXT_MARKERS = (
+    "shark",
+    "sharks",
+    "fish",
+    "marine",
+    "jcu",
+    "james cook university",
+)
+
+NEWS_HTML_CACHE_DIR = Path(os.environ.get("CACHE_DIR", "cache")) / "news_html"
+NEWS_HTML_CACHE_DIR.mkdir(parents=True, exist_ok=True)
+
+def _news_html_cache_max_age_seconds() -> int | None:
+    """
+    Returns:
+    - None: keep forever (no expiry)
+    - int: max age in seconds
+    """
+    raw = os.environ.get("NEWS_HTML_CACHE_EXPIRE_SECONDS", None)
+    if raw is None:
+        return 60 * 60 * 24 * 365  # 365 days default
+    if raw.strip() == "":
+        return None
+    return int(raw)
+
+
+NEWS_HTML_CACHE_MAX_AGE_SECONDS = _news_html_cache_max_age_seconds()
+
+
+def _url_cache_key(url: str) -> str:
+    return sha256(url.encode("utf-8")).hexdigest()
+
+
+def _cache_paths_for_url(url: str) -> tuple[Path, Path]:
+    key = _url_cache_key(url)
+    return (NEWS_HTML_CACHE_DIR / f"{key}.html", NEWS_HTML_CACHE_DIR / f"{key}.json")
+
+
+def _is_likely_blocked_or_captcha(text_lower: str) -> bool:
+    """
+    Heuristics to avoid persisting "fake 200" pages (captcha / bot detection / blocked).
+    Keep this broad; false positives are preferable to caching a captcha page.
+    """
+    markers = (
+        "captcha",
+        "recaptcha",
+        "hcaptcha",
+        "cloudflare",
+        "ddos protection",
+        "attention required",
+        "verify you are human",
+        "are you a robot",
+        "access denied",
+        "request blocked",
+        "bot detection",
+        "unusual traffic",
+        "temporarily unavailable",
+        "incapsula",
+        "imperva",
+        "akamai",
+        "sucuri",
+    )
+    return any(m in text_lower for m in markers)
+
+
+def _load_cached_html(url: str) -> str | None:
+    html_path, meta_path = _cache_paths_for_url(url)
+    if not html_path.exists() or not meta_path.exists():
+        return None
+    try:
+        meta = json.loads(meta_path.read_text(encoding="utf-8"))
+        fetched_at = float(meta.get("fetched_at", 0))
+        if fetched_at <= 0:
+            return None
+        if NEWS_HTML_CACHE_MAX_AGE_SECONDS is not None:
+            if time() - fetched_at > NEWS_HTML_CACHE_MAX_AGE_SECONDS:
+                return None
+        return html_path.read_text(encoding="utf-8", errors="ignore")
+    except OSError:
+        return None
+    except (ValueError, TypeError):
+        return None
+
+
+def _save_cached_html(url: str, html: str) -> None:
+    html_path, meta_path = _cache_paths_for_url(url)
+    try:
+        html_path.write_text(html, encoding="utf-8", errors="ignore")
+        meta_path.write_text(
+            json.dumps({"url": url, "fetched_at": time()}, ensure_ascii=False),
+            encoding="utf-8",
+        )
+    except OSError:
+        # Best-effort cache only
+        return
+
+
+def _scrapling_fetch_html_prefix(url: str, *, timeout_s: int = 8) -> tuple[int, str, str] | None:
+    """
+    Fetch a page using Scrapling's static engine (browser-like TLS + headers).
+
+    Returns (status_code, content_type_lower, text_lower) or None on any error.
+    Only returns a bounded prefix of the page text (to keep it fast).
+    """
+    if not _SCRAPLING_AVAILABLE or FetcherSession is None:
+        return None
+    try:
+        # NOTE: we create a short-lived session per call to keep this simple and safe.
+        # If this becomes a hotspot, we can switch to a longer-lived session/pool.
+        with FetcherSession(
+            impersonate="chrome",
+            timeout=timeout_s,
+            stealthy_headers=True,
+            follow_redirects=True,
+            retries=2,
+            retry_delay=1,
+            verify=True,
+        ) as s:
+            resp = s.get(url)
+            status_code = int(getattr(resp, "status_code", 0) or 0)
+            headers = getattr(resp, "headers", {}) or {}
+            # curl-cffi headers are case-insensitive; treat like dict
+            ctype = (headers.get("Content-Type") or headers.get("content-type") or "").lower()
+            text = getattr(resp, "text", "") or ""
+            if not isinstance(text, str):
+                return None
+            text_lower = text[:200_000].lower()
+            return status_code, ctype, text_lower
+    except Exception:
+        return None
+
+
+@lru_cache(maxsize=4096)
+def url_is_definitely_404(url: str) -> bool:
+    """
+    Return True only when we're confident the URL is a 404.
+    We do HEAD with redirects and a short timeout, with GET fallback for sites
+    that block/disable HEAD.
+    """
+    try:
+        resp = requests.head(
+            url,
+            allow_redirects=True,
+            timeout=5,
+            headers=URL_CHECK_HEADERS,
+        )
+        if resp.status_code == 404:
+            return True
+        if resp.status_code in (405, 403):
+            resp = requests.get(
+                url,
+                allow_redirects=True,
+                timeout=5,
+                headers=URL_CHECK_HEADERS,
+                stream=True,
+            )
+            return resp.status_code == 404
+        return False
+    except requests.RequestException:
+        # Only exclude on definitive 404. Any network hiccup -> keep the item.
+        return False
+
+
+@lru_cache(maxsize=4096)
+def url_page_is_about_rummer(url: str) -> bool | None:
+    """
+    Best-effort content check.
+
+    Returns:
+    - True: confident the page mentions Dr Jodie Rummer / lab terms
+    - False: confident it does NOT (based on fetched content)
+    - None: unknown (blocked, paywall, non-HTML, network error) -> do not exclude
+    """
+    cached_html = _load_cached_html(url)
+    if cached_html:
+        text = cached_html.lower()
+        if any(m in text for m in RUMMER_STRONG_MARKERS):
+            return True
+        if any(m in text for m in RUMMER_NAME_MARKERS):
+            return True
+        if "rummer" in text and any(m in text for m in ("jodie", *(RUMMER_CONTEXT_MARKERS))):
+            return True
+        return False
+
+    try:
+        fetched = _scrapling_fetch_html_prefix(url, timeout_s=8)
+        if not fetched:
+            return None
+        status_code, ctype, text = fetched
+        if status_code == 404:
+            # Let the 404 filter handle it; treat as unknown here.
+            return None
+        if ctype and ("text/html" not in ctype and "application/xhtml+xml" not in ctype):
+            return None
+        if not text.strip():
+            return None
+        if _is_likely_blocked_or_captcha(text):
+            return None
+
+        if any(m in text for m in RUMMER_STRONG_MARKERS):
+            _save_cached_html(url, text)
+            return True
+        if any(m in text for m in RUMMER_NAME_MARKERS):
+            _save_cached_html(url, text)
+            return True
+
+        # We only accept generic "rummer" when there's some relevant context.
+        if "rummer" in text and any(m in text for m in ("jodie", *(RUMMER_CONTEXT_MARKERS))):
+            _save_cached_html(url, text)
+            return True
+
+        # If we successfully fetched HTML and found none of the markers, treat as not about.
+        _save_cached_html(url, text)
+        return False
+    except requests.RequestException:
+        return None
+
+
+def filter_media_items(items: list[dict]) -> list[dict]:
+    """
+    Filter media items:
+    - drop items with absolute URL that is definitively 404
+    - drop items with absolute URL that we confidently conclude are not about Rummer
+    - keep on unknown (errors, blocked, non-HTML) to avoid false negatives
+    - keep items without an absolute URL
+    """
+    filtered: list[dict] = []
+    for item in items:
+        url = (item.get("url") or "").strip() if isinstance(item, dict) else ""
+        # Keep items without an absolute URL (e.g. curated/in-site items).
+        if not url or not (url.startswith("http://") or url.startswith("https://")):
+            filtered.append(item)
+            continue
+        if url_is_definitely_404(url):
+            continue
+        about = url_page_is_about_rummer(url)
+        if about is False:
+            continue
+        filtered.append(item)
+    return filtered
+
+
+def clear_caches() -> None:
+    url_is_definitely_404.cache_clear()
+    url_page_is_about_rummer.cache_clear()
+
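
As a quick orientation to the new module, a usage sketch follows; the item dicts are made up, and only `filter_media_items` and `clear_caches` come from the diff above. Note that `NEWS_HTML_CACHE_EXPIRE_SECONDS` defaults to 365 days when unset and disables expiry entirely when set to an empty string.

# Made-up items; only the imported names come from src/news_filters.py.
from news_filters import clear_caches, filter_media_items  # import path may differ

items = [
    {"title": "Reef sharks under heat stress", "url": "https://example.com/reef-sharks"},
    {"title": "Curated lab note"},  # no absolute URL -> always kept
    {"title": "Unrelated article", "url": "https://example.com/unrelated"},
]

kept = filter_media_items(items)
print(f"kept {len(kept)} of {len(items)} items")

# Per-URL results are memoised with lru_cache for the life of the process;
# clear them if the upstream pages may have changed.
clear_caches()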

src/news_scraper.py

Lines changed: 62 additions & 2 deletions
@@ -15,6 +15,13 @@
 import requests
 from bs4 import BeautifulSoup
 from dotenv import load_dotenv
+try:
+    from scrapling.fetchers import FetcherSession  # type: ignore
+
+    _SCRAPLING_AVAILABLE = True
+except Exception:  # pragma: no cover
+    FetcherSession = None  # type: ignore
+    _SCRAPLING_AVAILABLE = False

 load_dotenv()

@@ -557,9 +564,62 @@ def cached_request(
         response._content = json.dumps(cached).encode()
         return response

+    def _scrapling_fallback() -> requests.Response:
+        """
+        Fallback fetch using Scrapling's static engine (browser-like TLS + headers).
+        This helps with 403 blocks and some local SSL chain issues.
+        """
+        if not _SCRAPLING_AVAILABLE or FetcherSession is None:
+            raise RuntimeError("Scrapling is not available")
+        if method.lower() != "get":
+            raise RuntimeError("Scrapling fallback only supports GET")
+
+        # Build the final URL including query params.
+        final_url = url
+        if params:
+            final_url = requests.Request("GET", url, params=params).prepare().url  # type: ignore[assignment]
+
+        # Merge headers: caller headers win.
+        merged_headers = dict(DEFAULT_HEADERS)
+        if headers:
+            merged_headers.update(headers)
+
+        with FetcherSession(
+            impersonate="chrome",
+            timeout=timeout,
+            stealthy_headers=True,
+            follow_redirects=True,
+            retries=2,
+            retry_delay=1,
+            headers=merged_headers,
+            verify=True,
+        ) as s:
+            resp = s.get(final_url)
+
+        response = requests.Response()
+        response.status_code = int(getattr(resp, "status_code", 0) or 0)
+        response.url = final_url
+        response._content = (getattr(resp, "text", "") or "").encode("utf-8", errors="ignore")
+        # Best-effort headers
+        try:
+            response.headers.update(getattr(resp, "headers", {}) or {})
+        except Exception:
+            pass
+        response.raise_for_status()
+        return response
+
     # Make the actual request
-    response = requests.request(method, url, headers=headers, params=params, timeout=timeout)
-    response.raise_for_status()
+    try:
+        response = requests.request(method, url, headers=headers, params=params, timeout=timeout)
+        response.raise_for_status()
+    except requests.RequestException as e:
+        # Only fallback on cases where a browser-like client may help.
+        status = getattr(getattr(e, "response", None), "status_code", None)
+        if status in (403, 429) or isinstance(e, requests.exceptions.SSLError):
+            logger.warning("requests failed for %s (%s); trying Scrapling fallback", url, e)
+            response = _scrapling_fallback()
+        else:
+            raise

     # Cache the response data
     try:
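
The try/except added above only retries with Scrapling when a browser-like client could plausibly change the outcome. Restated as a standalone predicate for clarity (this helper is illustrative and does not exist in the repository):

# Illustrative restatement of the fallback condition used in the diff above.
import requests


def should_try_scrapling_fallback(exc: requests.RequestException) -> bool:
    """True only for 403/429 responses and SSL errors, where impersonating a
    browser (TLS fingerprint + headers) may succeed where plain requests fails."""
    if isinstance(exc, requests.exceptions.SSLError):
        return True
    status = getattr(getattr(exc, "response", None), "status_code", None)
    return status in (403, 429)

Other failures (404s, 5xx, timeouts) are re-raised unchanged, since switching clients is unlikely to help and would only delay error handling.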
