import base64 from urllib.parse import unquote, urlunparse from urllib.request import getproxies, proxy_bypass, _parse_proxy from scrapy.exceptions import NotConfigured from scrapy.utils.httpobj import urlparse_cached from scrapy.utils.python import to_bytes class HttpProxyMiddleware: def __init__(self, auth_encoding='latin-1'): self.auth_encoding = auth_encoding self.proxies = {} for type_, url in getproxies().items(): try: self.proxies[type_] = self._get_proxy(url, type_) # some values such as '/var/run/docker.sock' can't be parsed # by _parse_proxy and as such should be skipped except ValueError: continue @classmethod def from_crawler(cls, crawler): if not crawler.settings.getbool('HTTPPROXY_ENABLED'): raise NotConfigured auth_encoding = crawler.settings.get('HTTPPROXY_AUTH_ENCODING') return cls(auth_encoding) def _basic_auth_header(self, username, password): user_pass = to_bytes( f'{unquote(username)}:{unquote(password)}', encoding=self.auth_encoding) return base64.b64encode(user_pass) def _get_proxy(self, url, orig_type): proxy_type, user, password, hostport = _parse_proxy(url) proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', '')) if user: creds = self._basic_auth_header(user, password) else: creds = None return creds, proxy_url def process_request(self, request, spider): # ignore if proxy is already set if 'proxy' in request.meta: if request.meta['proxy'] is None: return # extract credentials if present creds, proxy_url = self._get_proxy(request.meta['proxy'], '') request.meta['proxy'] = proxy_url if creds and not request.headers.get('Proxy-Authorization'): request.headers['Proxy-Authorization'] = b'Basic ' + creds return elif not self.proxies: return parsed = urlparse_cached(request) scheme = parsed.scheme # 'no_proxy' is only supported by http schemes if scheme in ('http', 'https') and proxy_bypass(parsed.hostname): return if scheme in self.proxies: self._set_proxy(request, scheme) def _set_proxy(self, request, scheme): creds, proxy = self.proxies[scheme] request.meta['proxy'] = proxy if creds: request.headers['Proxy-Authorization'] = b'Basic ' + creds