amazon_scrape.py
'''
Looking at Amazon's books for data analysis
'http://www.amazon.com/s/ref=sr_pg_2?rh=n%3A283155%2Cn%3A%211000%2Cn%3A5%2Cn%3A549646%2Cn%3A3654&page=1&ie=UTF8'
'''
import multiprocessing
import re
from urllib.robotparser import RobotFileParser

import bs4
import requests

base_url = 'http://www.amazon.com'
ext_url = '/s/ref=sr_pg_2?rh=n%3A283155%2Cn%3A%211000%2Cn%3A5%2Cn%3A549646%2Cn%3A3654&page=1&ie=UTF8'

# Each search result sits in a tag whose id looks like 'result_0', 'result_1', ...
res_regex = re.compile(r'result_[0-9]+')


def check_robots(base_url, ext_url):
    '''
    Check the robots.txt.
    Prints a note saying whether base_url + ext_url is legal for crawling.
    '''
    bot = RobotFileParser(base_url + '/robots.txt')
    bot.read()
    if bot.can_fetch('*', base_url + ext_url):
        print('robots.txt permits parsing')
    else:
        print('Do not parse')
    return bot


def extract(tag):
    '''
    Want to extract title, author, publication date, review stars.
    Just doing title for the moment.
    '''
    return tag.find('div', 'productTitle').a.text


def fetch_one(k=1):
    '''
    Fetch the kth page of the search results.
    '''
    response = requests.get('http://www.amazon.com/s/ref=sr_pg_{0}'
                            '?rh=n%3A283155%2Cn%3A%211000%2Cn%3A5%2Cn%3A549646%2Cn'
                            '%3A3654&page={0}&ie=UTF8'.format(k))
    soup = bs4.BeautifulSoup(response.content, 'html.parser')
    # Book tags are identified by ids matching res_regex
    booktags = soup.find_all(id=res_regex)
    return list(map(extract, booktags))


#soup = fetch_one()
pool = multiprocessing.Pool(processes=5)
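
# A minimal sketch of how the pool might be used (assumed intent; the original
# script creates the pool but never calls it): check robots.txt, fetch the
# first few result pages in parallel, and flatten the per-page title lists.
# The page range of 1-5 is an arbitrary choice for illustration.
if __name__ == '__main__':
    check_robots(base_url, ext_url)
    pages = pool.map(fetch_one, range(1, 6))
    titles = [title for page in pages for title in page]
    print(titles)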