From 27108e058fe01fcaba098535be316f61839514c8 Mon Sep 17 00:00:00 2001
From: Brenda Solari
Date: Thu, 16 Oct 2025 18:44:51 -0500
Subject: [PATCH 1/2] Add proxy rotation module

---
 pyproject.toml                             |   3 +
 src/ps_helper/middlewares/proxy_rotator.py | 241 +++++++++++++++++++++
 tests/pdf_analyzer_test.py                 |   2 +-
 tests/test_proxy_middlewares.py            | 122 +++++++++++
 4 files changed, 367 insertions(+), 1 deletion(-)
 create mode 100644 src/ps_helper/middlewares/proxy_rotator.py
 create mode 100644 tests/test_proxy_middlewares.py

diff --git a/pyproject.toml b/pyproject.toml
index 37ef3b4..08f6ff5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,3 +32,6 @@ ps-helper = "ps_helper.cli.main:main"
 
 [tool.setuptools.packages.find]
 where = ["src"]
+
+[tool.pytest.ini_options]
+pythonpath = ["src"]

diff --git a/src/ps_helper/middlewares/proxy_rotator.py b/src/ps_helper/middlewares/proxy_rotator.py
new file mode 100644
index 0000000..0682706
--- /dev/null
+++ b/src/ps_helper/middlewares/proxy_rotator.py
@@ -0,0 +1,241 @@
+import random
+import logging
+import time
+from scrapy import signals
+from scrapy.exceptions import NotConfigured
+
+logger = logging.getLogger(__name__)
+
+
+class BaseProxyRotator:
+    """Base class with shared proxy setup logic and stats."""
+
+    def __init__(self, proxy_providers):
+        self.proxy_providers = proxy_providers
+        self.proxies = self._build_proxy_list(proxy_providers)
+        self.proxy_stats = {
+            proxy: {"requests": 0, "success": 0, "fails": 0, "banned_until": 0}
+            for proxy in self.proxies
+        }
+        logger.info(f"ProxyRotator initialized with {len(self.proxies)} proxies")
+
+    def _build_proxy_list(self, providers_dict):
+        proxies = []
+        for provider, data in providers_dict.items():
+            user = data.get("user")
+            password = data.get("password")
+            url = data.get("url")
+            port = data.get("port")
+            if user and password:
+                proxy = f"http://{user}:{password}@{url}:{port}"
+            else:
+                proxy = f"http://{url}:{port}"
+            proxies.append(proxy)
+        return proxies
+
+    def _record_success(self, proxy):
+        if proxy in self.proxy_stats:
+            self.proxy_stats[proxy]["success"] += 1
+
+    def _record_failure(self, proxy):
+        if proxy in self.proxy_stats:
+            self.proxy_stats[proxy]["fails"] += 1
+
+    def _record_request(self, proxy):
+        if proxy in self.proxy_stats:
+            self.proxy_stats[proxy]["requests"] += 1
+
+    def log_summary(self, spider):
+        # Use the spider's logger throughout so the whole summary ends up
+        # in one log stream.
+        spider.logger.info("=" * 60)
+        spider.logger.info("PROXY USAGE SUMMARY")
+        spider.logger.info("=" * 60)
+        for proxy, stats in self.proxy_stats.items():
+            total = stats["requests"]
+            fails = stats["fails"]
+            success = stats["success"]
+            rate = (success / total * 100) if total else 0
+            banned = "YES" if stats.get("banned_until", 0) > time.time() else "NO"
+            spider.logger.info(
+                f"Proxy: {proxy}\n"
+                f"  Total requests: {total}\n"
+                f"  Successes: {success}\n"
+                f"  Failures: {fails}\n"
+                f"  Success rate: {rate:.1f}%\n"
+                f"  Banned: {banned}\n"
+                f"{'-' * 50}"
+            )
+        spider.logger.info("=" * 60)
+
+
+class SequentialProxyRotatorMiddleware(BaseProxyRotator):
+    """
+    Simple sequential rotation (round-robin) with stats.
+
+    To enable it, add it to DOWNLOADER_MIDDLEWARES option::
+
+        DOWNLOADER_MIDDLEWARES = {
+            # ...
+            'ps_helper.middlewares.proxy_rotator.SequentialProxyRotatorMiddleware': 620,
+            # ...
+        }
+
+    Settings:
+
+    * ``PROXY_PROVIDERS`` - a dict mapping provider names to proxy data
+      (``url``, ``port``, and optional ``user``/``password``)
+    """
+
+    def __init__(self, proxy_providers):
+        super().__init__(proxy_providers)
+        self.current_index = 0
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        providers = crawler.settings.get("PROXY_PROVIDERS")
+        if not providers:
+            raise NotConfigured("PROXY_PROVIDERS not configured")
+
+        middleware = cls(providers)
+        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
+        return middleware
+
+    def get_next_proxy(self):
+        proxy = self.proxies[self.current_index]
+        self.current_index = (self.current_index + 1) % len(self.proxies)
+        return proxy
+
+    def process_request(self, request, spider):
+        proxy = self.get_next_proxy()
+        request.meta["proxy"] = proxy
+        self._record_request(proxy)
+        logger.debug(f"[Sequential] Using proxy: {proxy}")
+        return None
+
+    def process_response(self, request, response, spider):
+        proxy = request.meta.get("proxy")
+        if proxy:
+            if response.status < 400:
+                self._record_success(proxy)
+            else:
+                self._record_failure(proxy)
+        return response
+
+    def process_exception(self, request, exception, spider):
+        proxy = request.meta.get("proxy")
+        if proxy:
+            self._record_failure(proxy)
+            logger.warning(f"[Sequential] Proxy {proxy} exception: {exception}")
+        return None
+
+    def spider_closed(self, spider):
+        self.log_summary(spider)
+
+
+class SmartProxyRotatorMiddleware(BaseProxyRotator):
+    """
+    Advanced rotation with failure tracking, cooldown bans, and stats.
+
+    To enable it, add it to DOWNLOADER_MIDDLEWARES option::
+
+        DOWNLOADER_MIDDLEWARES = {
+            # ...
+            'ps_helper.middlewares.proxy_rotator.SmartProxyRotatorMiddleware': 620,
+            # ...
+        }
+
+    Settings:
+
+    * ``PROXY_PROVIDERS`` - a dict mapping provider names to proxy data
+      (``url``, ``port``, and optional ``user``/``password``)
+    * ``PROXY_BAN_THRESHOLD`` - number of failures before the proxy is banned
+    * ``PROXY_COOLDOWN`` - seconds that a banned proxy stays deactivated
+    * ``PROXY_ROTATION_MODE`` - 'random' or 'round_robin'
+    """
+
+    def __init__(
+        self,
+        proxy_providers,
+        ban_threshold=3,
+        cooldown_time=300,
+        rotation_mode="random",
+    ):
+        super().__init__(proxy_providers)
+        self.ban_threshold = ban_threshold
+        self.cooldown_time = cooldown_time
+        self.rotation_mode = rotation_mode
+        self.current_index = 0  # for round-robin mode
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        providers = crawler.settings.get("PROXY_PROVIDERS")
+        if not providers:
+            raise NotConfigured("PROXY_PROVIDERS not configured")
+
+        ban_threshold = crawler.settings.getint("PROXY_BAN_THRESHOLD", 3)
+        cooldown_time = crawler.settings.getint("PROXY_COOLDOWN", 300)
+        rotation_mode = crawler.settings.get("PROXY_ROTATION_MODE", "random")
+
+        middleware = cls(providers, ban_threshold, cooldown_time, rotation_mode)
+        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
+        return middleware
+
+    def get_available_proxies(self):
+        """Return only proxies not currently banned."""
+        now = time.time()
+        return [p for p, s in self.proxy_stats.items() if s["banned_until"] < now]
+
+    def get_next_proxy(self):
+        available = self.get_available_proxies()
+
+        if not available:
+            logger.warning("[Smart] All proxies are banned! Resetting bans.")
+            for s in self.proxy_stats.values():
+                s["banned_until"] = 0
+            available = self.proxies
+
+        if self.rotation_mode == "round_robin":
+            # Skip banned proxies but keep the round-robin order
+            for _ in range(len(self.proxies)):
+                proxy = self.proxies[self.current_index]
+                self.current_index = (self.current_index + 1) % len(self.proxies)
+                if proxy in available:
+                    return proxy
+            # Fallback if somehow none matched
+            return random.choice(available)
+        else:
+            return random.choice(available)
+
+    def _ban_proxy(self, proxy):
+        stats = self.proxy_stats[proxy]
+        stats["banned_until"] = time.time() + self.cooldown_time
+        logger.info(f"[Smart] Proxy temporarily banned: {proxy}")
+
+    def register_failure(self, proxy):
+        stats = self.proxy_stats[proxy]
+        # Count failures for the ban logic separately from the cumulative
+        # "fails" stat kept by _record_failure; otherwise each failure is
+        # counted twice and proxies get banned before the threshold.
+        stats["consecutive_fails"] = stats.get("consecutive_fails", 0) + 1
+        if stats["consecutive_fails"] >= self.ban_threshold:
+            self._ban_proxy(proxy)
+            stats["consecutive_fails"] = 0  # reset after ban
+
+    def process_request(self, request, spider):
+        proxy = self.get_next_proxy()
+        request.meta["proxy"] = proxy
+        self._record_request(proxy)
+        logger.debug(f"[Smart] Using proxy: {proxy}")
+        return None
+
+    def process_response(self, request, response, spider):
+        proxy = request.meta.get("proxy")
+        if proxy:
+            if response.status >= 400:
+                self.register_failure(proxy)
+                self._record_failure(proxy)
+                logger.warning(f"[Smart] Proxy {proxy} failed (HTTP {response.status})")
+            else:
+                self._record_success(proxy)
+        return response
+
+    def process_exception(self, request, exception, spider):
+        proxy = request.meta.get("proxy")
+        if proxy:
+            self.register_failure(proxy)
+            self._record_failure(proxy)
+            logger.warning(f"[Smart] Proxy {proxy} raised exception: {exception}")
+        return None
+
+    def spider_closed(self, spider):
+        self.log_summary(spider)

diff --git a/tests/pdf_analyzer_test.py b/tests/pdf_analyzer_test.py
index 2075fa3..708c349 100644
--- a/tests/pdf_analyzer_test.py
+++ b/tests/pdf_analyzer_test.py
@@ -1,5 +1,5 @@
 import os
-from ps_helper.pdf_analyzer import PDFAnalyzer
+from ps_helper.pdf.pdf_analyzer import PDFAnalyzer
 
 LOCAL_PDF_PATH = "test_files/scansmpl.pdf"
 
diff --git a/tests/test_proxy_middlewares.py b/tests/test_proxy_middlewares.py
new file mode 100644
index 0000000..8f3c6e2
--- /dev/null
+++ b/tests/test_proxy_middlewares.py
@@ -0,0 +1,122 @@
+import pytest
+import time
+from scrapy.http import Request, Response
+from scrapy.spiders import Spider
+
+from ps_helper.middlewares.proxy_rotator import (
+    SequentialProxyRotatorMiddleware,
+    SmartProxyRotatorMiddleware,
+)
+
+
+class DummySpider(Spider):
+    name = "dummy"
+
+
+def make_request(url="http://example.com", callback=None):
+    return Request(url=url, callback=callback or (lambda r: None))
+
+
+def make_response(request, status=200):
+    return Response(url=request.url, request=request, status=status)
+
+
+def make_exception(exc):
+    return exc
+
+
+@pytest.fixture
+def providers():
+    return {
+        "p1": {"user": "u1", "password": "p1", "url": "127.0.0.1", "port": "1111"},
+        "p2": {"user": "u2", "password": "p2", "url": "127.0.0.1", "port": "2222"},
+    }
+
+
+def test_sequential_rotation_records_stats(providers):
+    middleware = SequentialProxyRotatorMiddleware(providers)
+    spider = DummySpider()
+
+    # simulate two requests, rotated round-robin across the two proxies
+    r1 = make_request("http://a")
+    middleware.process_request(r1, spider)
+    assert "proxy" in r1.meta
+
+    r2 = make_request("http://b")
+    middleware.process_request(r2, spider)
+    assert "proxy" in r2.meta
+
+    # simulate success for r1, failure for r2
+    res1 = make_response(r1, status=200)
+    middleware.process_response(r1, res1, spider)
+    res2 = make_response(r2, status=500)
+    middleware.process_response(r2, res2, spider)
+
+    # check stats recorded
+    stats = middleware.proxy_stats
+    total_requests = sum(s["requests"] for s in stats.values())
+    assert total_requests == 2
+    # one success, one failure total
+    total_success = sum(s["success"] for s in stats.values())
+    total_fails = sum(s["fails"] for s in stats.values())
+    assert total_success + total_fails == 2
+
+
+def test_smart_rotation_bans_after_threshold(providers):
+    """Test that proxies are banned after reaching the failure threshold
+    and become available again after cooldown."""
+    middleware = SmartProxyRotatorMiddleware(
+        providers, ban_threshold=2, cooldown_time=1, rotation_mode="round_robin"
+    )
+    spider = DummySpider()
+
+    # Force enough failures to trigger a ban
+    for i in range(4):
+        req = make_request(f"http://item/{i}")
+        middleware.process_request(req, spider)
+        middleware.process_exception(req, Exception("connection error"), spider)
+
+    # At least one proxy should be temporarily banned
+    banned = [p for p, s in middleware.proxy_stats.items() if s["banned_until"] > time.time()]
+    assert len(banned) >= 1, "At least one proxy should be temporarily banned"
+
+    # Wait for cooldown to expire
+    time.sleep(1.1)
+
+    # All proxies should be available again
+    available = middleware.get_available_proxies()
+    assert len(available) == len(middleware.proxies), "All proxies should be available after cooldown"
+
+
+def test_smart_rotation_round_robin_skips_banned(providers):
+    """Test that the round-robin mode skips banned proxies."""
+    middleware = SmartProxyRotatorMiddleware(
+        providers, ban_threshold=1, cooldown_time=5, rotation_mode="round_robin"
+    )
+    spider = DummySpider()
+
+    # Force one proxy to be banned
+    req = make_request("http://test")
+    middleware.process_request(req, spider)
+    proxy_used = req.meta["proxy"]
+    middleware.register_failure(proxy_used)
+    middleware._record_failure(proxy_used)
+
+    assert middleware.proxy_stats[proxy_used]["banned_until"] > time.time()
+
+    # The next proxy in round-robin mode should skip the banned one
+    next_proxy = middleware.get_next_proxy()
+    assert next_proxy != proxy_used, "Round-robin mode should skip currently banned proxies"
+
+
+def test_smart_rotation_random_mode(providers):
+    """Test that random mode only picks proxies from the configured list."""
+    middleware = SmartProxyRotatorMiddleware(
+        providers, ban_threshold=2, cooldown_time=2, rotation_mode="random"
+    )
+    spider = DummySpider()
+
+    used = set()
+    for i in range(5):
+        req = make_request(f"http://random/{i}")
+        middleware.process_request(req, spider)
+        used.add(req.meta["proxy"])
+
+    assert all(u in middleware.proxies for u in used), "All used proxies must be from the configured list"
+    assert len(used) <= len(providers), "Random mode must not generate unknown proxies"

From c47c63570850cdfc2df3b78734d58e0fe42570bc Mon Sep 17 00:00:00 2001
From: Brenda Solari
Date: Fri, 17 Oct 2025 13:15:12 -0500
Subject: [PATCH 2/2] Update README.md to include proxy rotator middleware

---
 src/ps_helper/middlewares/README.md | 87 +++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/src/ps_helper/middlewares/README.md b/src/ps_helper/middlewares/README.md
index 3f314ce..449f050 100644
--- a/src/ps_helper/middlewares/README.md
+++ b/src/ps_helper/middlewares/README.md
@@ -16,3 +16,90 @@ DOWNLOADER_CLIENTCONTEXTFACTORY = "ps_helper.middlewares.LegacyConnectContextFactory"
 
 Scrapy will then use the LegacyConnectContextFactory for all HTTPS connections.
 --------------------------------------
+
+# Proxy Rotator Middlewares
+
+This module provides two Scrapy downloader middlewares for rotating HTTP proxies with optional smart banning logic and statistics tracking.
+
+---
+
+## 🧩 Middlewares
+
+### **1. SequentialProxyRotatorMiddleware**
+A simple **round-robin** proxy rotation strategy that cycles through the configured proxies sequentially.
+
+#### Enable in `settings.py`
+```python
+DOWNLOADER_MIDDLEWARES = {
+    "ps_helper.middlewares.proxy_rotator.SequentialProxyRotatorMiddleware": 620,
+}
+```
+
+#### Required Setting
+```python
+PROXY_PROVIDERS = {
+    "provider1": {"url": "proxy1.com", "port": 8080},
+    "provider2": {"url": "proxy2.com", "port": 8080, "user": "user", "password": "pass"},
+}
+```
+
+#### Behavior
+- Rotates proxies in order.
+- Logs total requests, successes, failures, and success rate for each proxy when the spider closes.
+
+---
+
+### **2. SmartProxyRotatorMiddleware**
+A more advanced rotation system that temporarily bans failing proxies and supports two rotation modes (`random` or `round_robin`).
+
+#### Enable in `settings.py`
+```python
+DOWNLOADER_MIDDLEWARES = {
+    "ps_helper.middlewares.proxy_rotator.SmartProxyRotatorMiddleware": 620,
+}
+```
+
+#### Available Settings
+```python
+PROXY_PROVIDERS = {
+    "proxy1": {"url": "proxy1.com", "port": 8080},
+    "proxy2": {"url": "proxy2.com", "port": 8080, "user": "user", "password": "pass"},
+}
+
+PROXY_BAN_THRESHOLD = 3         # Number of failures before banning a proxy
+PROXY_COOLDOWN = 300            # Cooldown duration in seconds for banned proxies
+PROXY_ROTATION_MODE = "random"  # 'random' or 'round_robin'
+```
+
+#### Features
+- Automatically bans proxies that fail too many times.
+- Supports a **cooldown** (temporary ban) before a proxy is retried.
+- Chooses proxies randomly or sequentially while skipping banned ones.
+- Displays a detailed summary when the spider closes.
+
+---
+
+## 🧠 Summary Logs Example
+When a spider finishes, a summary like this appears in the logs:
+
+```
+============================================================
+PROXY USAGE SUMMARY
+============================================================
+Proxy: http://proxy1.com:8080
+  Total requests: 120
+  Successes: 110
+  Failures: 10
+  Success rate: 91.7%
+  Banned: NO
+--------------------------------------------------
+Proxy: http://proxy2.com:8080
+  Total requests: 50
+  Successes: 25
+  Failures: 25
+  Success rate: 50.0%
+  Banned: YES
+============================================================
+```
+
+---
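+
+## 🚀 Quick Start Example
+
+A minimal end-to-end sketch of how the pieces fit together, assuming the smart rotator is configured as above. The proxy host (`proxy1.example.com`), the target URL, and the `DemoSpider` itself are placeholders — substitute your own provider data and crawl logic.
+
+```python
+from scrapy.crawler import CrawlerProcess
+from scrapy.spiders import Spider
+
+
+class DemoSpider(Spider):
+    name = "demo"
+    start_urls = ["https://example.com/"]  # placeholder target
+
+    def parse(self, response):
+        # The middleware stores the proxy it chose in request.meta["proxy"]
+        self.logger.info("Fetched %s via %s", response.url, response.request.meta.get("proxy"))
+
+
+process = CrawlerProcess(settings={
+    "DOWNLOADER_MIDDLEWARES": {
+        "ps_helper.middlewares.proxy_rotator.SmartProxyRotatorMiddleware": 620,
+    },
+    "PROXY_PROVIDERS": {
+        # Placeholder provider entry — use a real host/port (plus user/password if required)
+        "provider1": {"url": "proxy1.example.com", "port": 8080},
+    },
+    "PROXY_BAN_THRESHOLD": 3,
+    "PROXY_COOLDOWN": 300,
+    "PROXY_ROTATION_MODE": "round_robin",
+})
+process.crawl(DemoSpider)
+process.start()
+```
+
+When the crawl finishes, the `spider_closed` signal fires and the proxy usage summary shown above is written to the spider log.
+
+---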