scraper.py
import requests
import json
from lxml import html
from multiprocessing import Pool
import signal
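

# Pool workers ignore SIGINT so Ctrl-C is delivered only to the parent
# process, which can then shut the pool down cleanly.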
def init_worker():
    signal.signal(signal.SIGINT, signal.SIG_IGN)
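

# Collect the project-detail links from one page of search results and
# resolve them against the site root.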
def get_page_links(text):
    root = html.fromstring(text)
    links = root.xpath("//table[@id=\"tblPrjSummary\"]/tr[@class=\"prjSearchResult\"]/td[@class=\"tdPrjRef\"]/a/@href")
    return ["http://www.itf.gov.hk/l-eng/" + link for link in links]
def get_page_detail(pair):
    link, cookies = pair
    detail_response = requests.get(link, cookies=cookies)
    root = html.fromstring(detail_response.text)
    rows = root.xpath("//table[@id=\"tblPrjProfile\"]/tr")
    if not rows:
        print("No profile table found at %s" % link)
        raise Exception("No record found")
    d = {}
    for row in rows:
        # Each row holds one label/value pair; join the text fragments in each cell.
        key = "".join(row.xpath("td[@class=\"prjProfile1\"]//text()")).strip()
        value = "".join(v.strip() for v in row.xpath("td[@class=\"prjProfile2\"]//text()")).strip()
        d[key] = value
    return d
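

# Load the search form once to pick up the hidden form token and the session
# cookies; both are sent back with every subsequent POST.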
url = "http://www.itf.gov.hk/l-eng/Prj_Search.asp?code=108"
response = requests.get(url)
root = html.fromstring(response.text)
token = root.xpath("//input[@name=\"token\"]/@value")[0]
print(token)
print(response.cookies)
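
# Run one search per technology area listed in the form's dropdown.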
options = root.xpath("//select[@id=\"techArea\"]/option")
output = {}
for option in options:
    category = option.xpath("text()")[0]
    value = option.xpath("@value")[0]
    print(category + " [" + value + "]")
    formdata = {"techArea": value, "token": token, "submit": "Search"}
    page_response = requests.post("http://www.itf.gov.hk/l-eng/Prj_SearchResult.asp", data=formdata, cookies=response.cookies)
    root = html.fromstring(page_response.text)
    # The pager cell reads like "1 of N"; take N as the page count.
    pager_text = "".join([x.strip() for x in root.xpath("//table[@id=\"prjSearchPageTable\"]//tr[@class=\"prjSearchResult\"]/td//text()")])
    total_pages = int(pager_text.split(" of ")[1].strip())
    details = []
    for i in range(1, total_pages + 1):
        print("Page:%d" % i)
        formdata = {"techArea": value, "token": token, "submit": "Search", "page_no": str(i)}
        page_response = requests.post("http://www.itf.gov.hk/l-eng/Prj_SearchResult.asp", data=formdata, cookies=response.cookies)
        links = get_page_links(page_response.text)
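        # Fetch this page's detail links with 10 parallel workers; the workers
        # ignore SIGINT (see init_worker) so Ctrl-C lands in the parent.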
        p = Pool(10, init_worker)
        try:
            details_per_page = p.map(get_page_detail, [(link, response.cookies) for link in links])
            for detail in details_per_page:
                print(json.dumps(detail))
            details += details_per_page
        except KeyboardInterrupt:
            # Stop the workers, then abort the whole scrape.
            p.terminate()
            p.join()
            raise
        finally:
            p.close()
print "Number of Projects %d" % (len(details))
output[category] = details
print "Writing File..."
f = open("projects_uncleansed.json", "w")
f.write(json.dumps(output))
f.close()