crawler.py
import wikipedia
import indexer
from collections import defaultdict

# Frontier of page titles still to visit, and pages already crawled.
seeds = ['Djokovic']
pages = set()
visited_pages = set()
NUMBER_PAGES = 100000
NUMBER_PER_PAGE = 100


def get_page_content(page_name):
    """Fetch a Wikipedia page, queue its unseen outgoing links, and return its text."""
    try:
        page = wikipedia.page(page_name)
        for link in page.links:
            if link not in pages and link not in visited_pages:
                pages.add(link)
        return page.content
    except Exception:
        # Missing/ambiguous pages and transient fetch errors are simply skipped.
        return None


if __name__ == '__main__':
    wikipedia.set_rate_limiting(True)
    for seed in seeds:
        pages.add(seed)
    reverse_index = defaultdict(list)
    i = 0
    while pages and i < NUMBER_PAGES:
        page_name = pages.pop()
        visited_pages.add(page_name)
        page_content = get_page_content(page_name)
        if not page_content:
            continue
        text = indexer.parse_document(page_content)
        index = indexer.construct_reverse_index(text, page_name)
        reverse_index = indexer.merge_reverse_indices(reverse_index, index)
        print("Just visited page %s (%dth page)..." % (page_name, i + 1))
        i += 1
        if i % NUMBER_PER_PAGE == 0:
            # Flush the accumulated index to disk every NUMBER_PER_PAGE pages,
            # then start a fresh partial index.
            indexer.dump_index(
                reverse_index, 'indices/index_%d.txt' % (i // NUMBER_PER_PAGE))
            reverse_index = defaultdict(list)
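
The `indexer` module imported above is not shown on this page. For reference, here is a minimal sketch of the interface the crawler relies on. The function names and call shapes come from crawler.py itself; everything else (tokenization, postings-list format, JSON on-disk format) is an assumption, not the repository's actual implementation.

indexer.py (hypothetical sketch)

import json
import re
from collections import defaultdict


def parse_document(content):
    """Lowercase the raw page text and split it into word tokens (assumed)."""
    return re.findall(r'[a-z0-9]+', content.lower())


def construct_reverse_index(tokens, page_name):
    """Map each distinct term to the list of pages it appears on (assumed postings format)."""
    index = defaultdict(list)
    for token in set(tokens):
        index[token].append(page_name)
    return index


def merge_reverse_indices(base, other):
    """Fold the postings of `other` into `base` and return the merged index."""
    for term, postings in other.items():
        base[term].extend(postings)
    return base


def dump_index(index, path):
    """Write one partial index to disk; JSON is an assumed serialization."""
    with open(path, 'w') as f:
        json.dump(index, f)

Under these assumptions, each file in indices/ holds the postings for one batch of NUMBER_PER_PAGE pages, which keeps memory bounded and leaves a final merge of the partial indices as a separate step.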