42 changes: 29 additions & 13 deletions scrapy_proxies/randomproxy.py
@@ -23,9 +23,9 @@
import base64
import logging

proxy_regex = r'(\w+://)([^:]+?:.+@)?(.+)'
log = logging.getLogger('scrapy.proxies')


class Mode:
RANDOMIZE_PROXY_EVERY_REQUESTS, RANDOMIZE_PROXY_ONCE, SET_CUSTOM_PROXY = range(3)

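For reference, a small sketch of what the new module-level `proxy_regex` accepts; the addresses and credentials below are invented for illustration.

```python
import re

proxy_regex = r'(\w+://)([^:]+?:.+@)?(.+)'

# Entry with credentials (hypothetical values): scheme, user:pass@, endpoint
print(re.match(proxy_regex, 'http://user:secret@10.0.0.1:8080').groups())
# -> ('http://', 'user:secret@', '10.0.0.1:8080')

# Entry without credentials: the middle group is simply None
print(re.match(proxy_regex, 'http://10.0.0.1:8080').groups())
# -> ('http://', None, '10.0.0.1:8080')
```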
@@ -43,7 +43,7 @@ def __init__(self, settings):
fin = open(self.proxy_list)
try:
for line in fin.readlines():
parts = re.match('(\w+://)([^:]+?:[^@]+?@)?(.+)', line.strip())
parts = re.match(proxy_regex, line.strip())
if not parts:
continue

@@ -61,7 +61,7 @@ def __init__(self, settings):
elif self.mode == Mode.SET_CUSTOM_PROXY:
custom_proxy = settings.get('CUSTOM_PROXY')
self.proxies = {}
parts = re.match('(\w+://)([^:]+?:[^@]+?@)?(.+)', custom_proxy.strip())
parts = re.match(proxy_regex, custom_proxy.strip())
if not parts:
raise ValueError('CUSTOM_PROXY is not well formatted')

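The `Mode.SET_CUSTOM_PROXY` branch reads a single proxy from the `CUSTOM_PROXY` setting and validates it against the same regex. A minimal, hypothetical settings snippet (the mode setting name is assumed from the project README, the address is a placeholder):

```python
# settings.py (hypothetical values)
PROXY_MODE = 2                                      # Mode.SET_CUSTOM_PROXY; setting name assumed, not part of this diff
CUSTOM_PROXY = 'http://user:secret@10.0.0.1:8080'   # must match proxy_regex or ValueError is raised
```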
@@ -78,9 +78,12 @@ def from_crawler(cls, crawler):
return cls(crawler.settings)

def process_request(self, request, spider):
if self.mode < 0:
log.warning("Skipping Random Proxy selection(disabled)!")
return;
# Don't overwrite with a random one (server-side state for IP)
if 'proxy' in request.meta:
if request.meta["exception"] is False:
if 'proxy' in request.meta or ('splash' in request.meta and 'proxy' in request.meta['splash']['args']):
if request.meta.get("exception", False) is False:
return
request.meta["exception"] = False
if len(self.proxies) == 0:
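For context, the new check treats a request as already carrying a proxy in two cases; the `request.meta` shapes below are made up for illustration and cover both the plain-Scrapy and scrapy-splash paths.

```python
# Illustrative request.meta shapes that process_request now leaves untouched
# (values are made up; scrapy-splash keeps its arguments under meta['splash']['args'])
plain_meta = {'proxy': 'http://10.0.0.1:8080'}
splash_meta = {'splash': {'args': {'proxy': 'http://10.0.0.1:8080'}}}

for meta in (plain_meta, splash_meta):
    has_proxy = 'proxy' in meta or ('splash' in meta and 'proxy' in meta['splash']['args'])
    print(has_proxy)  # True in both cases
```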
@@ -93,20 +93,20 @@ def process_request(self, request, spider):

proxy_user_pass = self.proxies[proxy_address]

if proxy_user_pass:
request.meta['proxy'] = proxy_address
basic_auth = 'Basic ' + base64.b64encode(proxy_user_pass.encode()).decode()
request.headers['Proxy-Authorization'] = basic_auth
else:
log.debug('Proxy user pass not found')
self.add_scrapy_proxy(request, proxy_address, proxy_user_pass)

log.debug('Using proxy <%s>, %d proxies left' % (
proxy_address, len(self.proxies)))

def process_exception(self, request, exception, spider):
if 'proxy' not in request.meta:
if self.mode < 0 or ('proxy' not in request.meta and not('splash' in request.meta and 'proxy' in request.meta['splash']['args'])):
return
if self.mode == Mode.RANDOMIZE_PROXY_EVERY_REQUESTS or self.mode == Mode.RANDOMIZE_PROXY_ONCE:
proxy = request.meta['proxy']
if ('splash' in request.meta and 'proxy' in request.meta['splash']['args']):
parts = re.match(proxy_regex, request.meta['splash']['args']['proxy'].strip())
proxy = parts.group(1) + parts.group(3)
else:
proxy = request.meta['proxy']
try:
del self.proxies[proxy]
except KeyError:
@@ -116,3 +116,16 @@ def process_exception(self, request, exception, spider):
self.chosen_proxy = random.choice(list(self.proxies.keys()))
log.info('Removing failed proxy <%s>, %d proxies left' % (
proxy, len(self.proxies)))

def add_scrapy_proxy(self, request, address, user_pass=None):
if 'splash' in request.meta:
# If the request goes through Splash, forward the proxy (and any credentials) to it
parts = re.match(r'(\w+://)([\w\W]+)', address.strip())
request.meta['splash']['args']['proxy'] = parts.group(1) + ((user_pass + '@') if user_pass else '') + parts.group(2)
else:
request.meta['proxy'] = address
if user_pass:
basic_auth = 'Basic ' + base64.b64encode(user_pass.encode()).decode()
request.headers['Proxy-Authorization'] = basic_auth
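Taken together, a project exercising these changes might be configured roughly as below. This is a sketch: the setting names other than CUSTOM_PROXY follow the scrapy-proxies README rather than this diff, and every path and address is a placeholder.

```python
# settings.py sketch (setting names assumed from the scrapy-proxies README; values are placeholders)
RETRY_TIMES = 10
RETRY_HTTP_CODES = [500, 503, 504, 400, 403, 404, 408]

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
    'scrapy_proxies.RandomProxy': 100,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
}

PROXY_LIST = '/path/to/proxy/list.txt'   # one proxy per line, each matching proxy_regex
PROXY_MODE = 0                           # Mode.RANDOMIZE_PROXY_EVERY_REQUESTS; a negative value now disables proxy selection
```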