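# scrape.py -- scrape the Google Summer of Code archives (2009-2022) for each
# organization's name, technologies, topics, and per-year project counts, and
# write the combined result to data.json.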
import json
from time import sleep

import html5lib  # not used directly; ensures the html5lib parser for BeautifulSoup is installed
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Adjust the sleep timers according to your connection speed before running the script.
# If you get an "element not found" error, re-check the class names on the website
# in case they have changed.
base = "https://summerofcode.withgoogle.com"
data = []
# Point this at your local chromedriver binary (Selenium 4 style; older Selenium
# passed executable_path= directly).
pg = webdriver.Chrome(service=Service("/Applications/driver/chromedriver"))
# Modern GSoC archive: 2016-2022, newest year first.
for year in reversed(range(2016, 2023)):
    url = base + "/archive/" + str(year) + "/organizations"
    print("Scraping", url)
    pg.maximize_window()
    pg.get(url)
    sleep(5)
    soup = BeautifulSoup(pg.page_source, "html5lib")
    org = soup.find_all("a", "content")
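    # The archive lists organizations behind a Material paginator: keep clicking
    # the "next" button and collecting links until the button reports disabled.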
    while True:
        next_page = pg.find_element(
            By.CLASS_NAME,
            "mat-focus-indicator.mat-tooltip-trigger.mat-paginator-navigation-next.mat-icon-button.mat-button-base",
        )
        if next_page.get_property("disabled"):
            break
        next_page.click()
        sleep(2)
        soup = BeautifulSoup(pg.page_source, "html5lib")
        sleep(2)
        org = org + soup.find_all("a", "content")
print("Scraping", year,"first loop")
print(len(org), 'length')
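    # Visit each organization's page and pull its details. Pages occasionally
    # fail to render in time, so retry once after another wait before giving up.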
    for o in org:
        urlo = base + o.attrs["href"]
        pg.get(urlo)
        sleep(8)
        sp = BeautifulSoup(pg.page_source, "html5lib")
        try:
            name = sp.find("span", "title").string
        except AttributeError:
            pg.get(urlo)
            sleep(8)
            sp = BeautifulSoup(pg.page_source, "html5lib")
            name = sp.find("span", "title").string
        # If this organization already appeared in a later year, just record the
        # extra year and its project count.
        seen = False
        for d in data:
            if d["name"] == name:
                d["year"].append(year)
                proj = sp.find_all("div", "contributor__content")
                d["proj"].append(len(proj))
                seen = True
        if not seen:
            dic = {}
            dic["url"] = urlo
            dic["name"] = name
            tech_string = sp.find("div", "tech__content").string
            dic["tech"] = [t.strip() for t in tech_string.split(",")]
            dic["cat"] = ""
            topic_string = sp.find("div", "topics__content").string
            dic["top"] = [t.strip() for t in topic_string.split(",")]
            dic["year"] = [year]
            proj = sp.find_all("div", "contributor__content")
            dic["proj"] = [len(proj)]
            data.append(dic)
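# Older archive (Google Melange): 2009-2015. Note that this pass only appends
# years and project counts to organizations already collected above; an org that
# appears only in the Melange archive is not added as a new entry.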
for year in reversed(range(2009, 2016)):
    base = "https://www.google-melange.com"
    url = base + "/archive/gsoc/" + str(year)
    pg.get(url)
    sleep(2)
    soup = BeautifulSoup(pg.page_source, "html5lib")
    org = soup.find_all("span", class_="mdl-list__item-primary-content")
    print("Scraping", year, "second loop")
    for o in org:
        link = o.find("a")
        urlo = base + link.attrs["href"]
        try:
            pg.get(urlo)
        except Exception:
            print("org not found", urlo)
        else:
            sp = BeautifulSoup(pg.page_source, "html5lib")
            name = link.text
            for d in data:
                if d["name"] == name:
                    d["year"].append(year)
                    proj = sp.find_all("li", class_="mdl-list__item mdl-list__item--two-line")
                    d["proj"].append(len(proj))
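# Normalise the scraped counts into one fixed-length list per organization:
# 'project' gets a project count for every year from 2009 through 2022, with 0
# for any year the organization did not participate.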
for d in data:
    d["project"] = []
    for year in range(2009, 2023):
        for y, count in zip(d["year"], d["proj"]):
            if year == y:
                d["project"].append(count)
                break
        else:
            d["project"].append(0)
    d.pop("proj")
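# A rough sketch of one resulting entry (the values here are illustrative
# placeholders, not real scraped data):
# {
#     "url": "https://summerofcode.withgoogle.com/archive/2022/organizations/...",
#     "name": "Example Org",
#     "tech": ["python", "c++"],
#     "cat": "",
#     "top": ["web", "science"],
#     "year": [2022, 2021],
#     "project": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 12]  # 2009 .. 2022
# }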
with open("data.json", "w") as fout:
    json.dump(data, fout, indent=6)
print(data)