#!/usr/bin/env python3.6

import argparse
import asyncio
import logging
import sys

import crawling
import reporting

ARGS = argparse.ArgumentParser(description="Web crawler")
ARGS.add_argument(
    '--iocp', action='store_true', dest='iocp',
    default=False, help='Use IOCP event loop (Windows only)')
ARGS.add_argument(
    '--select', action='store_true', dest='select',
    default=False, help='Use Select event loop instead of default')
ARGS.add_argument(
    'roots', nargs='*',
    default=[], help='Root URL (may be repeated)')
ARGS.add_argument(
    '--max_redirect', action='store', type=int, metavar='N',
    default=10, help='Limit redirection chains (for 301, 302 etc.)')
ARGS.add_argument(
    '--max_tries', action='store', type=int, metavar='N',
    default=4, help='Limit retries on network errors')
ARGS.add_argument(
    '--max_tasks', action='store', type=int, metavar='N',
    default=100, help='Limit concurrent connections')
ARGS.add_argument(
    '--exclude', action='store', metavar='REGEX',
    help='Exclude matching URLs')
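# --strict and --lenient both store into args.strict; strict host
# matching is the default.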
ARGS.add_argument(
    '--strict', action='store_true',
    default=True, help='Strict host matching (default)')
ARGS.add_argument(
    '--lenient', action='store_false', dest='strict',
    default=False, help='Lenient host matching')
ARGS.add_argument(
    '-v', '--verbose', action='count', dest='level',
    default=2, help='Verbose logging (repeat for more verbose)')
ARGS.add_argument(
    '-q', '--quiet', action='store_const', const=0, dest='level',
    default=2, help='Only log errors')

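# Example invocation (assuming the companion crawling.py and reporting.py
# modules are importable alongside this script):
#
#   python3 crawl.py http://example.com --max_tasks 50 -v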


def fix_url(url):
    """Prefix a scheme-less URL with http://."""
    if '://' not in url:
        url = 'http://' + url
    return url


def main():
    """Main program.

    Parse arguments, set up event loop, run crawler, print report.
    """
    args = ARGS.parse_args()
    if not args.roots:
        print('Use --help for command line help')
        return

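    # Map the verbosity level onto logging levels: the default of 2 gives
    # INFO, each -v adds a step (capped at DEBUG), and -q drops to ERROR.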
    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.level, len(levels)-1)])

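    # Choose the event loop: the IOCP-based ProactorEventLoop on Windows
    # (--iocp), an explicit SelectorEventLoop (--select), or the platform
    # default from asyncio.get_event_loop().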
    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

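    # Normalize the root URLs (prefixing http:// when no scheme is given)
    # and de-duplicate them before handing them to the crawler.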
    roots = {fix_url(root) for root in args.roots}

    crawler = crawling.Crawler(roots,
                               exclude=args.exclude,
                               strict=args.strict,
                               max_redirect=args.max_redirect,
                               max_tries=args.max_tries,
                               max_tasks=args.max_tasks)
    try:
        loop.run_until_complete(crawler.crawl())
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporting.report(crawler)
        crawler.close()

        # next two lines are required for actual aiohttp resource cleanup
        loop.stop()
        loop.run_forever()

        loop.close()


if __name__ == '__main__':
    main()