import logging
import random
import time

from scrapy import signals
from scrapy.exceptions import NotConfigured

logger = logging.getLogger(__name__)


class BaseProxyRotator:
    """Base class with shared proxy setup logic and stats."""

    def __init__(self, proxy_providers):
        self.proxy_providers = proxy_providers
        self.proxies = self._build_proxy_list(proxy_providers)
        self.proxy_stats = {
            proxy: {"requests": 0, "success": 0, "fails": 0, "banned_until": 0}
            for proxy in self.proxies
        }
        logger.info(f"ProxyRotator initialized with {len(self.proxies)} proxies")

    def _build_proxy_list(self, providers_dict):
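        """Build ``http://[user:password@]url:port`` proxy URLs from the providers mapping."""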
        proxies = []
        for data in providers_dict.values():
            user = data.get("user")
            password = data.get("password")
            url = data.get("url")
            port = data.get("port")
            if user and password:
                proxy = f"http://{user}:{password}@{url}:{port}"
            else:
                proxy = f"http://{url}:{port}"
            proxies.append(proxy)
        return proxies

    def _record_success(self, proxy):
        if proxy in self.proxy_stats:
            self.proxy_stats[proxy]["success"] += 1

    def _record_failure(self, proxy):
        if proxy in self.proxy_stats:
            self.proxy_stats[proxy]["fails"] += 1

    def _record_request(self, proxy):
        if proxy in self.proxy_stats:
            self.proxy_stats[proxy]["requests"] += 1

    def log_summary(self, spider):
        spider.logger.info("=" * 60)
        spider.logger.info("PROXY USAGE SUMMARY")
        spider.logger.info("=" * 60)
        for proxy, stats in self.proxy_stats.items():
            total = stats["requests"]
            fails = stats["fails"]
            success = stats["success"]
            rate = (success / total * 100) if total else 0
            banned = "YES" if stats.get("banned_until", 0) > time.time() else "NO"
            spider.logger.info(
                f"Proxy: {proxy}\n"
                f"  Total requests: {total}\n"
                f"  Successes: {success}\n"
                f"  Failures: {fails}\n"
                f"  Success rate: {rate:.1f}%\n"
                f"  Banned: {banned}\n"
                f"{'-' * 50}"
            )
        spider.logger.info("=" * 60)


class SequentialProxyRotatorMiddleware(BaseProxyRotator):
    """
    Simple sequential (round-robin) proxy rotation with stats.

    To enable it, add it to the DOWNLOADER_MIDDLEWARES option::

        DOWNLOADER_MIDDLEWARES = {
            # ...
            'ps_helper.middlewares.proxy_rotator.SequentialProxyRotatorMiddleware': 620,
            # ...
        }

    Settings:

    * ``PROXY_PROVIDERS`` - a dict mapping provider names to connection
      details (``url``, ``port``, and optional ``user``/``password``)
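
    For illustration, a hypothetical ``PROXY_PROVIDERS`` value (the provider
    names, hosts, and credentials below are placeholders)::

        PROXY_PROVIDERS = {
            'providerA': {'user': 'user', 'password': 'secret',
                          'url': 'proxy-a.example.com', 'port': 8000},
            'providerB': {'url': 'proxy-b.example.com', 'port': 8080},
        }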
    """

    def __init__(self, proxy_providers):
        super().__init__(proxy_providers)
        self.current_index = 0

    @classmethod
    def from_crawler(cls, crawler):
        providers = crawler.settings.get("PROXY_PROVIDERS")
        if not providers:
            raise NotConfigured("PROXY_PROVIDERS not configured")

        middleware = cls(providers)
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def get_next_proxy(self):
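        """Return the next proxy, advancing the round-robin index."""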
        proxy = self.proxies[self.current_index]
        self.current_index = (self.current_index + 1) % len(self.proxies)
        return proxy

    def process_request(self, request, spider):
        proxy = self.get_next_proxy()
        request.meta["proxy"] = proxy
        self._record_request(proxy)
        logger.debug(f"[Sequential] Using proxy: {proxy}")
        return None

    def process_response(self, request, response, spider):
        proxy = request.meta.get("proxy")
        if proxy:
            if response.status < 400:
                self._record_success(proxy)
            else:
                self._record_failure(proxy)
        return response

    def process_exception(self, request, exception, spider):
        proxy = request.meta.get("proxy")
        if proxy:
            self._record_failure(proxy)
            logger.warning(f"[Sequential] Proxy {proxy} exception: {exception}")
        return None

    def spider_closed(self, spider):
        self.log_summary(spider)


class SmartProxyRotatorMiddleware(BaseProxyRotator):
    """
    Advanced rotation with failure tracking, cooldown bans, and stats.

    To enable it, add it to the DOWNLOADER_MIDDLEWARES option::

        DOWNLOADER_MIDDLEWARES = {
            # ...
            'ps_helper.middlewares.proxy_rotator.SmartProxyRotatorMiddleware': 620,
            # ...
        }

    Settings:

    * ``PROXY_PROVIDERS`` - a dict mapping provider names to connection
      details (``url``, ``port``, and optional ``user``/``password``)
    * ``PROXY_BAN_THRESHOLD`` - number of failures before a proxy is
      temporarily banned (default: 3)
    * ``PROXY_COOLDOWN`` - seconds a banned proxy stays deactivated
      (default: 300)
    * ``PROXY_ROTATION_MODE`` - ``'random'`` or ``'round_robin'``
      (default: ``'random'``)
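
    For illustration, hypothetical values for the optional settings::

        PROXY_BAN_THRESHOLD = 5
        PROXY_COOLDOWN = 600
        PROXY_ROTATION_MODE = 'round_robin'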
    """

    def __init__(
        self,
        proxy_providers,
        ban_threshold=3,
        cooldown_time=300,
        rotation_mode="random",
    ):
        super().__init__(proxy_providers)
        self.ban_threshold = ban_threshold
        self.cooldown_time = cooldown_time
        self.rotation_mode = rotation_mode
        self.current_index = 0  # for round robin

    @classmethod
    def from_crawler(cls, crawler):
        providers = crawler.settings.get("PROXY_PROVIDERS")
        if not providers:
            raise NotConfigured("PROXY_PROVIDERS not configured")

        ban_threshold = crawler.settings.getint("PROXY_BAN_THRESHOLD", 3)
        cooldown_time = crawler.settings.getint("PROXY_COOLDOWN", 300)
        rotation_mode = crawler.settings.get("PROXY_ROTATION_MODE", "random")

        middleware = cls(providers, ban_threshold, cooldown_time, rotation_mode)
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def get_available_proxies(self):
        """Return only proxies not currently banned."""
        now = time.time()
        return [p for p, s in self.proxy_stats.items() if s["banned_until"] < now]

    def get_next_proxy(self):
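        """Pick a proxy, honoring bans and the configured rotation mode."""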
        available = self.get_available_proxies()

        if not available:
            logger.warning("[Smart] All proxies are banned! Resetting bans.")
            for s in self.proxy_stats.values():
                s["banned_until"] = 0
            available = self.proxies

        if self.rotation_mode == "round_robin":
            # Skip banned proxies but keep round-robin order
            for _ in range(len(self.proxies)):
                proxy = self.proxies[self.current_index]
                self.current_index = (self.current_index + 1) % len(self.proxies)
                if proxy in available:
                    return proxy
            # Fallback if somehow none is available in order
            return random.choice(available)
        else:
            return random.choice(available)

    def _ban_proxy(self, proxy):
        stats = self.proxy_stats[proxy]
        stats["banned_until"] = time.time() + self.cooldown_time
        logger.info(f"[Smart] Proxy temporarily banned: {proxy}")

    def register_failure(self, proxy):
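        """Count a failure and temporarily ban the proxy once the threshold is reached."""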
        stats = self.proxy_stats[proxy]
        stats["fails"] += 1
        if stats["fails"] >= self.ban_threshold:
            self._ban_proxy(proxy)
            stats["fails"] = 0  # reset the counter after a ban

    def process_request(self, request, spider):
        proxy = self.get_next_proxy()
        request.meta["proxy"] = proxy
        self._record_request(proxy)
        logger.debug(f"[Smart] Using proxy: {proxy}")
        return None

    def process_response(self, request, response, spider):
        proxy = request.meta.get("proxy")
        if proxy:
            if response.status >= 400:
                self.register_failure(proxy)
                self._record_failure(proxy)
                logger.warning(f"[Smart] Proxy {proxy} failed (HTTP {response.status})")
            else:
                self._record_success(proxy)
        return response

    def process_exception(self, request, exception, spider):
        proxy = request.meta.get("proxy")
        if proxy:
            self.register_failure(proxy)
            self._record_failure(proxy)
            logger.warning(f"[Smart] Proxy {proxy} raised exception: {exception}")
        return None

    def spider_closed(self, spider):
        self.log_summary(spider)
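

# A minimal sketch of the proxy-URL building behavior outside of a Scrapy
# crawl; the provider names, hosts, and credentials are hypothetical:
#
#     providers = {
#         "providerA": {"user": "u", "password": "p",
#                       "url": "proxy-a.example.com", "port": 8000},
#         "providerB": {"url": "proxy-b.example.com", "port": 8080},
#     }
#     rotator = SmartProxyRotatorMiddleware(providers)
#     rotator.proxies == ["http://u:p@proxy-a.example.com:8000",
#                         "http://proxy-b.example.com:8080"]  # True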