
Commit 27108e0

Add proxy rotation module
1 parent b0cf51c commit 27108e0

4 files changed, +367 -1 lines changed

pyproject.toml

Lines changed: 3 additions & 0 deletions
@@ -32,3 +32,6 @@ ps-helper = "ps_helper.cli.main:main"
 
 [tool.setuptools.packages.find]
 where = ["src"]
+
+[tool.pytest.ini_options]
+pythonpath = ["src"]
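The [tool.pytest.ini_options] addition matters for the new test module below: pytest's pythonpath option prepends src to sys.path, so the src-layout package imports without an editable install. A minimal sketch of what this enables, assuming the layout above:

# Under pytest, pythonpath = ["src"] makes this resolve without `pip install -e .`
from ps_helper.middlewares.proxy_rotator import SmartProxyRotatorMiddleware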
src/ps_helper/middlewares/proxy_rotator.py

Lines changed: 241 additions & 0 deletions
@@ -0,0 +1,241 @@
import random
import logging
import time
from scrapy import signals
from scrapy.exceptions import NotConfigured

logger = logging.getLogger(__name__)


class BaseProxyRotator:
    """Base class with shared proxy setup logic and stats."""

    def __init__(self, proxy_providers):
        self.proxy_providers = proxy_providers
        self.proxies = self._build_proxy_list(proxy_providers)
        self.proxy_stats = {
            proxy: {"requests": 0, "success": 0, "fails": 0, "banned_until": 0}
            for proxy in self.proxies
        }
        logger.info(f"ProxyRotator initialized with {len(self.proxies)} proxies")

    def _build_proxy_list(self, providers_dict):
        proxies = []
        for provider, data in providers_dict.items():
            user = data.get("user")
            password = data.get("password")
            url = data.get("url")
            port = data.get("port")
            if user and password:
                proxy = f"http://{user}:{password}@{url}:{port}"
            else:
                proxy = f"http://{url}:{port}"
            proxies.append(proxy)
        return proxies

    def _record_success(self, proxy):
        if proxy in self.proxy_stats:
            self.proxy_stats[proxy]["success"] += 1

    def _record_failure(self, proxy):
        if proxy in self.proxy_stats:
            self.proxy_stats[proxy]["fails"] += 1

    def _record_request(self, proxy):
        if proxy in self.proxy_stats:
            self.proxy_stats[proxy]["requests"] += 1

    def log_summary(self, spider):
        logger.info("=" * 60)
        logger.info("PROXY USAGE SUMMARY")
        logger.info("=" * 60)
        for proxy, stats in self.proxy_stats.items():
            total = stats["requests"]
            fails = stats["fails"]
            success = stats["success"]
            rate = (success / total * 100) if total else 0
            banned = "YES" if stats.get("banned_until", 0) > time.time() else "NO"
            spider.logger.info(
                f"Proxy: {proxy}\n"
                f"  Total requests: {total}\n"
                f"  Successes: {success}\n"
                f"  Failures: {fails}\n"
                f"  Success rate: {rate:.1f}%\n"
                f"  Banned: {banned}\n"
                f"{'-' * 50}"
            )
        logger.info("=" * 60)


class SequentialProxyRotatorMiddleware(BaseProxyRotator):
    """
    Simple sequential rotation (round-robin) with stats.

    To enable it, add it to the DOWNLOADER_MIDDLEWARES option::

        DOWNLOADER_MIDDLEWARES = {
            # ...
            'ps_helper.middlewares.proxy_rotator.SequentialProxyRotatorMiddleware': 620,
            # ...
        }

    Settings:

    * ``PROXY_PROVIDERS`` - a mapping of proxy provider configs to rotate through
    """

    def __init__(self, proxy_providers):
        super().__init__(proxy_providers)
        self.current_index = 0

    @classmethod
    def from_crawler(cls, crawler):
        providers = crawler.settings.get("PROXY_PROVIDERS")
        if not providers:
            raise NotConfigured("PROXY_PROVIDERS not configured")

        middleware = cls(providers)
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def get_next_proxy(self):
        proxy = self.proxies[self.current_index]
        self.current_index = (self.current_index + 1) % len(self.proxies)
        return proxy

    def process_request(self, request, spider):
        proxy = self.get_next_proxy()
        request.meta["proxy"] = proxy
        self._record_request(proxy)
        logger.debug(f"[Sequential] Using proxy: {proxy}")
        return None

    def process_response(self, request, response, spider):
        proxy = request.meta.get("proxy")
        if proxy:
            if response.status < 400:
                self._record_success(proxy)
            else:
                self._record_failure(proxy)
        return response

    def process_exception(self, request, exception, spider):
        proxy = request.meta.get("proxy")
        if proxy:
            self._record_failure(proxy)
            logger.warning(f"[Sequential] Proxy {proxy} exception: {exception}")
        return None

    def spider_closed(self, spider):
        self.log_summary(spider)


class SmartProxyRotatorMiddleware(BaseProxyRotator):
    """
    Advanced rotation with failure tracking, cooldown bans, and stats.

    To enable it, add it to the DOWNLOADER_MIDDLEWARES option::

        DOWNLOADER_MIDDLEWARES = {
            # ...
            'ps_helper.middlewares.proxy_rotator.SmartProxyRotatorMiddleware': 620,
            # ...
        }

    Settings:

    * ``PROXY_PROVIDERS`` - a mapping of proxy provider configs to rotate through
    * ``PROXY_BAN_THRESHOLD`` - number of failures before a proxy is banned
    * ``PROXY_COOLDOWN`` - seconds a banned proxy stays deactivated
    * ``PROXY_ROTATION_MODE`` - 'random' or 'round_robin'
    """

    def __init__(
        self,
        proxy_providers,
        ban_threshold=3,
        cooldown_time=300,
        rotation_mode="random",
    ):
        super().__init__(proxy_providers)
        self.ban_threshold = ban_threshold
        self.cooldown_time = cooldown_time
        self.rotation_mode = rotation_mode
        self.current_index = 0  # for round robin

    @classmethod
    def from_crawler(cls, crawler):
        providers = crawler.settings.get("PROXY_PROVIDERS")
        if not providers:
            raise NotConfigured("PROXY_PROVIDERS not configured")

        ban_threshold = crawler.settings.getint("PROXY_BAN_THRESHOLD", 3)
        cooldown_time = crawler.settings.getint("PROXY_COOLDOWN", 300)
        rotation_mode = crawler.settings.get("PROXY_ROTATION_MODE", "random")

        middleware = cls(providers, ban_threshold, cooldown_time, rotation_mode)
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def get_available_proxies(self):
        """Return only proxies not currently banned."""
        now = time.time()
        return [p for p, s in self.proxy_stats.items() if s["banned_until"] < now]

    def get_next_proxy(self):
        available = self.get_available_proxies()

        if not available:
            logger.warning("[Smart] All proxies are banned! Resetting bans.")
            for s in self.proxy_stats.values():
                s["banned_until"] = 0
            available = self.proxies

        if self.rotation_mode == "round_robin":
            # Skip banned ones but keep round-robin order
            for _ in range(len(self.proxies)):
                proxy = self.proxies[self.current_index]
                self.current_index = (self.current_index + 1) % len(self.proxies)
                if proxy in available:
                    return proxy
            # fallback if somehow none available
            return random.choice(available)
        else:
            return random.choice(available)

    def _ban_proxy(self, proxy):
        stats = self.proxy_stats[proxy]
        stats["banned_until"] = time.time() + self.cooldown_time
        logger.info(f"[Smart] Proxy temporarily banned: {proxy}")

    def register_failure(self, proxy):
        # Counts toward the ban threshold; the counter is cleared once a ban fires.
        stats = self.proxy_stats[proxy]
        stats["fails"] += 1
        if stats["fails"] >= self.ban_threshold:
            self._ban_proxy(proxy)
            stats["fails"] = 0  # reset after ban

    def process_request(self, request, spider):
        proxy = self.get_next_proxy()
        request.meta["proxy"] = proxy
        self._record_request(proxy)
        logger.debug(f"[Smart] Using proxy: {proxy}")

    def process_response(self, request, response, spider):
        proxy = request.meta.get("proxy")
        if proxy:
            if response.status >= 400:
                self.register_failure(proxy)
                self._record_failure(proxy)
                logger.warning(f"[Smart] Proxy {proxy} failed (HTTP {response.status})")
            else:
                self._record_success(proxy)
        return response

    def process_exception(self, request, exception, spider):
        proxy = request.meta.get("proxy")
        if proxy:
            self.register_failure(proxy)
            self._record_failure(proxy)
            logger.warning(f"[Smart] Proxy {proxy} raised exception: {exception}")
        return None

    def spider_closed(self, spider):
        self.log_summary(spider)
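The provider mapping consumed by _build_proxy_list is keyed by provider name, with "url" and "port" required and "user"/"password" optional. A hedged settings.py sketch — the provider names, hosts, and credentials below are placeholders, not part of this commit:

# settings.py sketch (hypothetical providers and credentials)
PROXY_PROVIDERS = {
    "provider_a": {
        "user": "alice",             # with credentials: http://user:pass@host:port
        "password": "secret",
        "url": "proxy-a.example.com",
        "port": "8000",
    },
    "provider_b": {                  # without credentials: http://host:port
        "url": "proxy-b.example.com",
        "port": "8080",
    },
}
PROXY_BAN_THRESHOLD = 3              # Smart only: failures before a cooldown ban
PROXY_COOLDOWN = 300                 # Smart only: seconds a banned proxy sits out
PROXY_ROTATION_MODE = "random"       # Smart only: "random" or "round_robin"

DOWNLOADER_MIDDLEWARES = {
    "ps_helper.middlewares.proxy_rotator.SmartProxyRotatorMiddleware": 620,
}

Only one of the two middlewares should be enabled at a time: both write request.meta["proxy"], so stacking them would just let the later one overwrite the earlier choice.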

tests/pdf_analyzer_test.py

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 import os
-from ps_helper.pdf_analyzer import PDFAnalyzer
+from ps_helper.pdf.pdf_analyzer import PDFAnalyzer
 
 
 LOCAL_PDF_PATH = "test_files/scansmpl.pdf"

tests/test_proxy_middlewares.py

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
import pytest
import time
from scrapy.http import Request, Response
from scrapy.spiders import Spider

from ps_helper.middlewares.proxy_rotator import (
    SequentialProxyRotatorMiddleware,
    SmartProxyRotatorMiddleware,
)


class DummySpider(Spider):
    name = "dummy"


def make_request(url="http://example.com", callback=None):
    return Request(url=url, callback=callback or (lambda r: None))


def make_response(request, status=200):
    return Response(url=request.url, request=request, status=status)


def make_exception(exc):
    return exc


@pytest.fixture
def providers():
    return {
        "p1": {"user": "u1", "password": "p1", "url": "127.0.0.1", "port": "1111"},
        "p2": {"user": "u2", "password": "p2", "url": "127.0.0.1", "port": "2222"},
    }


def test_sequential_rotation_records_stats(providers):
    middleware = SequentialProxyRotatorMiddleware(providers)
    spider = DummySpider()
    # simulate two requests, rotated round-robin across the two proxies
    r1 = make_request("http://a")
    middleware.process_request(r1, spider)
    assert "proxy" in r1.meta

    r2 = make_request("http://b")
    middleware.process_request(r2, spider)
    assert "proxy" in r2.meta
    # simulate success for r1, failure for r2
    res1 = make_response(r1, status=200)
    middleware.process_response(r1, res1, spider)
    res2 = make_response(r2, status=500)
    middleware.process_response(r2, res2, spider)
    # check stats recorded
    stats = middleware.proxy_stats
    total_requests = sum(s["requests"] for s in stats.values())
    assert total_requests == 2
    # one success, one failure total
    total_success = sum(s["success"] for s in stats.values())
    total_fails = sum(s["fails"] for s in stats.values())
    assert total_success + total_fails == 2


def test_smart_rotation_bans_after_threshold(providers):
    """Proxies are banned after reaching the failure threshold and
    become available again once the cooldown expires."""
    middleware = SmartProxyRotatorMiddleware(
        providers, ban_threshold=2, cooldown_time=1, rotation_mode="round_robin"
    )
    spider = DummySpider()

    # Force enough failures to trigger a ban
    for i in range(4):
        req = make_request(f"http://item/{i}")
        middleware.process_request(req, spider)
        middleware.process_exception(req, Exception("connection error"), spider)

    # At least one proxy should be temporarily banned
    banned = [p for p, s in middleware.proxy_stats.items() if s["banned_until"] > time.time()]
    assert len(banned) >= 1, "At least one proxy should be temporarily banned"

    # Wait for cooldown to expire
    time.sleep(1.1)

    # All proxies should be available again
    available = middleware.get_available_proxies()
    assert len(available) == len(middleware.proxies), "All proxies should be available after cooldown"


def test_smart_rotation_round_robin_skips_banned(providers):
    """Round-robin mode skips banned proxies."""
    middleware = SmartProxyRotatorMiddleware(
        providers, ban_threshold=1, cooldown_time=5, rotation_mode="round_robin"
    )
    spider = DummySpider()

    # Force one proxy to be banned
    req = make_request("http://test")
    middleware.process_request(req, spider)
    proxy_used = req.meta["proxy"]
    middleware.register_failure(proxy_used)
    middleware._record_failure(proxy_used)

    assert middleware.proxy_stats[proxy_used]["banned_until"] > time.time()

    # The next proxy in round-robin mode should skip the banned one
    next_proxy = middleware.get_next_proxy()
    assert next_proxy != proxy_used, "Round-robin mode should skip currently banned proxies"


def test_smart_rotation_random_mode(providers):
    """Random mode picks proxies only from the configured list."""
    middleware = SmartProxyRotatorMiddleware(
        providers, ban_threshold=2, cooldown_time=2, rotation_mode="random"
    )
    spider = DummySpider()

    used = set()
    for i in range(5):
        req = make_request(f"http://random/{i}")
        middleware.process_request(req, spider)
        used.add(req.meta["proxy"])

    assert all(u in middleware.proxies for u in used), "All used proxies must be from the configured list"
    assert len(used) <= len(providers), "Random mode must not generate unknown proxies"
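The suite exercises rotation, banning, and cooldown, but not the from_crawler wiring. A hedged sketch of one more test, assuming Scrapy's scrapy.utils.test.get_crawler helper (not used anywhere in this commit):

# Sketch: verify from_crawler reads settings and rejects a missing PROXY_PROVIDERS.
import pytest
from scrapy.exceptions import NotConfigured
from scrapy.utils.test import get_crawler

from ps_helper.middlewares.proxy_rotator import SmartProxyRotatorMiddleware


def test_from_crawler_reads_settings():
    crawler = get_crawler(settings_dict={
        "PROXY_PROVIDERS": {"p1": {"url": "127.0.0.1", "port": "1111"}},
        "PROXY_BAN_THRESHOLD": 5,
        "PROXY_COOLDOWN": 60,
        "PROXY_ROTATION_MODE": "round_robin",
    })
    mw = SmartProxyRotatorMiddleware.from_crawler(crawler)
    assert mw.ban_threshold == 5
    assert mw.rotation_mode == "round_robin"


def test_from_crawler_requires_providers():
    # No PROXY_PROVIDERS configured -> the middleware should refuse to load
    with pytest.raises(NotConfigured):
        SmartProxyRotatorMiddleware.from_crawler(get_crawler(settings_dict={}))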
