
Commit f49bee3

init commit
1 parent 0ecd647 commit f49bee3

31 files changed: +1626 -0 lines changed

AHA-selenium.py (+107 lines)

from selenium import webdriver  # powers the browser interaction
from selenium.webdriver.support.ui import Select  # selects menu options
from bs4 import BeautifulSoup  # to parse HTML
import csv  # to write CSV
import pandas as pd  # to see CSV
import time
import os
import random
import requests

driver = webdriver.Chrome()
driver.get("http://careers.historians.org/jobs/?page=1")

base_url = 'http://careers.historians.org'
all_rows = []
pages = ["http://careers.historians.org/jobs/?page=1",
         "http://careers.historians.org/jobs/?page=2"]

for p in pages:
    driver.get(p)
    soup = BeautifulSoup(driver.page_source, 'html5lib')

    # each result container holds the listing-level fields
    rows = soup.find_all('div', {'class': 'bti-ui-job-detail-container'})
    for r in rows:
        title = r.find('a').text.strip()
        link = base_url + r.find('a')['href']
        employer = r.find(
            'div',
            {'class': 'bti-ui-job-result-detail-employer'}).text.strip()
        location = r.find(
            'div',
            {'class': 'bti-ui-job-result-detail-location'}).text.strip()
        date_posted = r.find(
            'div', {'class': 'bti-ui-job-result-detail-age'}).text.strip()

        # follow the link to the full posting
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, 'html5lib')

        try:
            job_description = soup.find(
                'div', {'class': 'bti-jd-description'}).text.strip()

            details = soup.find('div', {'class': 'bti-jd-details-container'})

            details_titles = [
                x.text.replace(':', '').lower().strip()
                for x in details.find_all(
                    'div', {'class': 'bti-jd-detail-title'})]
            details_text = [
                x.text.strip()
                for x in details.find_all(
                    'div', {'class': 'bti-jd-detail-text'})]

            details_dict = {}

            # normalize the detail labels used as CSV columns
            for i in range(len(details_titles)):
                t = details_titles[i]
                if 'categories' in t:
                    t = 'category'
                elif 'required' in t:
                    t = 'preferred education'
                details_dict[t] = details_text[i]

            details_dict['title'] = title
            details_dict['link'] = link
            details_dict['employer'] = employer
            details_dict['location'] = location
            details_dict['date_posted'] = date_posted
            details_dict['job_description'] = job_description

            try:
                details_dict['employer_about'] = soup.find(
                    'div', {'class': 'bti-jd-employer-info'}).text.strip()
            except AttributeError:  # no employer-info block on this page
                details_dict['employer_about'] = ''

            all_rows.append(details_dict)

        except (AttributeError, IndexError):  # malformed posting: skip it
            pass

        time.sleep(1)  # be polite between detail-page requests

header = ["title", "employer", "location", "posted", "date_posted",
          "primary field", "category", "preferred education", "salary",
          "type", "employment type", "job_description", "employer_about",
          "link"]

with open('AHA-data.csv', 'w', newline='') as f:
    # extrasaction='ignore' drops any detail label not in the header
    # instead of raising ValueError mid-write
    w = csv.DictWriter(f, header, extrasaction='ignore')
    w.writeheader()
    w.writerows(all_rows)
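
pandas is imported "to see CSV" but never used in the script itself; a minimal follow-up sketch for inspecting the output (assuming the run has finished and AHA-data.csv exists) might look like:

import pandas as pd

# Load the scraped postings and eyeball the first few rows.
df = pd.read_csv('AHA-data.csv')
print(df.shape)
print(df[['title', 'employer', 'location']].head())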

BoardGameCapital-selenium.py (+66 lines)

from selenium import webdriver  # powers the browser interaction
from selenium.webdriver.support.ui import Select  # selects menu options
from bs4 import BeautifulSoup  # to parse HTML
import csv  # to write CSV
import pandas as pd  # to see CSV
import time
import os
import random
import requests

driver = webdriver.Chrome()
next_page = "http://www.boardgamecapital.com/board-game-rules.htm"
driver.get(next_page)

os.makedirs('pdfs', exist_ok=True)  # rule PDFs are saved under pdfs/

soup = BeautifulSoup(driver.page_source, 'html5lib')
game_cells = soup.find('tbody').find('tbody').find_all('td')[:-1]

game_dict = {}

for g in game_cells:
    game_dict[g.text] = {}
    game_dict[g.text]['link'] = ('http://www.boardgamecapital.com/'
                                 + g.find('a')['href'])

for k in game_dict.keys():
    print(k)
    driver.get(game_dict[k]['link'])

    soup = BeautifulSoup(driver.page_source, 'html5lib')

    gstats1 = [x.split(':') for x in soup.find(
        'div', {'class': 'gstats1'}).text.split('\n')]
    price = gstats1[0][1].strip()[1:]  # drop the leading dollar sign
    play_time = gstats1[1][1].strip()  # renamed from `time`, which shadowed the module

    gstats2 = [x.split(':') for x in soup.find(
        'div', {'class': 'gstats2'}).text.split('\n')]
    age = gstats2[0][1].strip()
    players = gstats2[1][1].strip()

    # {'class': 'mainbody'} -- a dict, not the set literal {'class', 'mainbody'}
    text = soup.find('div', {'class': 'mainbody'}).text

    pdf_links = [
        a for a in soup.find('div', {'class': 'mainbody'}).find_all('a')
        if 'Game Rules' in a.text]

    paths = []
    for url in pdf_links:
        path = 'pdfs/{}.pdf'.format(url.text)
        with open(path, 'wb') as f:
            f.write(requests.get(url['href']).content)

        paths.append(path)

    paths = ';'.join(paths)

    game_dict[k]['price'] = price
    game_dict[k]['time'] = play_time
    game_dict[k]['age'] = age
    game_dict[k]['players'] = players
    game_dict[k]['paths'] = paths
    game_dict[k]['web_text'] = text

    time.sleep(1)  # `time` is no longer shadowed, so the alias import is gone
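
The script collects everything into game_dict but never writes it to disk; one way to persist it, sketched with pandas (the filename boardgames.csv is an assumption, not part of the commit):

import pandas as pd

# game_dict maps game name -> {'link', 'price', 'time', 'age', 'players',
# 'paths', 'web_text'}; orient='index' turns each game into one row.
# 'boardgames.csv' is a hypothetical filename.
df = pd.DataFrame.from_dict(game_dict, orient='index')
df.index.name = 'game'
df.to_csv('boardgames.csv')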

CTSNet-selenium.py (+115 lines)

from selenium import webdriver  # powers the browser interaction
from selenium.webdriver.support.ui import Select  # selects menu options
from bs4 import BeautifulSoup  # to parse HTML
import csv  # to write CSV
import pandas as pd  # to see CSV
import time
import os
import random


driver = webdriver.PhantomJS()
next_page = "https://www.ctsnet.org/surgeons/surgeons-advanced-search?ln=&fn=&subspecialty=adult_cardiac_surgery&city=&country=gb&province=&o"

with open("IT-cardi.csv", "a") as f:
    csv_w_interv = csv.writer(f)
    csv_w_interv.writerow(["Name", "Hospital", "Phone", "Interests",
                           "Practice-Areas", "City-Region", "Country",
                           "Street", "URL"])


for i in range(1000):

    driver.get(next_page)

    soup = BeautifulSoup(driver.page_source, "html5lib")

    try:
        next_page = ("https://www.ctsnet.org"
                     + soup.find('a', {'title': 'Go to next page'})['href'])
    except TypeError:  # find() returned None: this is the last page
        next_page = ""

    td_a = soup.find_all(
        "td", {"class": "views-field views-field-field-contact-last-name"})

    if i == 0:
        # skip the first 48 results on the first pass
        links = ["https://www.ctsnet.org" + x.find("a")['href']
                 for x in td_a[48:]]
    else:
        links = ["https://www.ctsnet.org" + x.find("a")['href'] for x in td_a]

    for l in links:

        driver.get(l)
        soup = BeautifulSoup(driver.page_source, "html5lib")

        # name, hospital, and phone are required; skip the profile otherwise
        try:
            name = soup.find('h1', {"class": 'page-title'}).text.strip()
            print(name)
        except AttributeError:
            continue

        try:
            hospital = soup.find(
                'div', {"class": 'contact-institution'}).text.strip()
        except AttributeError:
            continue

        try:
            country = soup.find(
                'div', {"class": 'contact-country'}).text.strip()
        except AttributeError:
            country = ''

        try:
            street = soup.find(
                'div', {"class": 'contact-street'}).text.strip()
        except AttributeError:
            street = ''

        try:
            city = soup.find(
                'div', {"class": 'contact-city-province-code'}).text.strip()
        except AttributeError:
            city = ''

        try:
            phone = soup.find(
                'div', {"class": 'contact-numbers'}).text.strip()
        except AttributeError:
            continue

        try:
            fields = soup.find(
                'div',
                {"class": 'views-field views-field-field-contact-subspecialty'}
            ).text.strip().replace('\n', '; ')
        except AttributeError:
            fields = ''

        try:
            interests = soup.find(
                'div',
                {"class": 'field field--name-field-contact-interest '
                          'field--type-text-long field--label-hidden'}
            ).text.strip().replace('\n', '; ')
        except AttributeError:
            interests = ''

        if len(phone) > 0:
            with open("IT-cardi.csv", "a") as f:
                csv_w_interv = csv.writer(f)
                csv_w_interv.writerow([name, hospital, phone, interests,
                                       fields, city, country, street, l])

        time.sleep(random.randint(1, 3))
    time.sleep(random.randint(1, 3))
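
webdriver.PhantomJS() has since been deprecated and removed from Selenium; a minimal sketch of swapping in headless Chrome as a replacement (an alternative, not what this commit uses):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Headless Chrome renders pages without opening a window, much like PhantomJS did.
options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)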

CrunchBase-API.py (+28 lines)

from __future__ import division  # __future__ imports must precede all others

import requests
import json
import math
import csv

# set key
key = "PUT_KEY_HERE"

# set base url
base_url = "https://api.crunchbase.com/v/3/organizations"

# set response format
response_format = ".json"

# set search parameters
search_params = {"name": "uber",
                 "user_key": key,
                 "page": "1"}

# make request
r = requests.get(base_url + response_format, params=search_params)
response_text = r.text

# Convert JSON response to a dictionary
data = json.loads(response_text)

print(data.keys())
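
print(data.keys()) only shows the top-level keys; to look further into the payload, a short follow-up sketch (the filename crunchbase-uber.json is an assumption):

# Save the raw response for offline inspection and peek at its start.
# 'crunchbase-uber.json' is a hypothetical filename, not part of the commit.
with open('crunchbase-uber.json', 'w') as f:
    json.dump(data, f, indent=2)

print(json.dumps(data, indent=2)[:500])  # first 500 characters, pretty-printed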
