
Commit f49bee3

init commit
1 parent 0ecd647 commit f49bee3

31 files changed: +1626 -0 lines changed

AHA-selenium.py (+107 lines)

from selenium import webdriver  # powers the browser interaction
from selenium.webdriver.support.ui import Select  # selects menu options
from bs4 import BeautifulSoup  # to parse HTML
import csv  # to write CSV
import pandas as pd  # to see CSV
import time
import os
import random
import requests

driver = webdriver.Chrome()
driver.get("http://careers.historians.org/jobs/?page=1")

base_url = 'http://careers.historians.org'
all_rows = []
pages = ["http://careers.historians.org/jobs/?page=1",
         "http://careers.historians.org/jobs/?page=2"]

for p in pages:
    driver.get(p)
    soup = BeautifulSoup(driver.page_source, 'html5lib')

    # each result container holds the listing-level fields
    rows = soup.find_all('div', {'class': 'bti-ui-job-detail-container'})
    for r in rows:
        title = r.find('a').text.strip()
        link = base_url + r.find('a')['href']
        employer = r.find(
            'div',
            {'class': 'bti-ui-job-result-detail-employer'}).text.strip()
        location = r.find(
            'div',
            {'class': 'bti-ui-job-result-detail-location'}).text.strip()
        date_posted = r.find(
            'div', {'class': 'bti-ui-job-result-detail-age'}).text.strip()

        # follow the link to the full posting
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, 'html5lib')

        try:
            job_description = soup.find(
                'div', {'class': 'bti-jd-description'}).text.strip()

            details = soup.find('div', {'class': 'bti-jd-details-container'})

            details_titles = [
                x.text.replace(':', '').lower().strip()
                for x in details.find_all(
                    'div', {'class': 'bti-jd-detail-title'})]
            details_text = [
                x.text.strip()
                for x in details.find_all(
                    'div', {'class': 'bti-jd-detail-text'})]

            details_dict = {}

            # normalize the detail labels used as CSV columns
            for i in range(len(details_titles)):
                t = details_titles[i]
                if 'categories' in t:
                    t = 'category'
                elif 'required' in t:
                    t = 'preferred education'
                details_dict[t] = details_text[i]

            details_dict['title'] = title
            details_dict['link'] = link
            details_dict['employer'] = employer
            details_dict['location'] = location
            details_dict['date_posted'] = date_posted
            details_dict['job_description'] = job_description

            try:
                details_dict['employer_about'] = soup.find(
                    'div', {'class': 'bti-jd-employer-info'}).text.strip()
            except AttributeError:  # no employer-info block on this page
                details_dict['employer_about'] = ''

            all_rows.append(details_dict)

        except (AttributeError, IndexError):  # malformed posting: skip it
            pass

        time.sleep(1)  # be polite between detail-page requests

header = ["title", "employer", "location", "posted", "date_posted",
          "primary field", "category", "preferred education", "salary",
          "type", "employment type", "job_description", "employer_about",
          "link"]

with open('AHA-data.csv', 'w', newline='') as f:
    # extrasaction='ignore' drops any detail label not in the header
    # instead of raising ValueError mid-write
    w = csv.DictWriter(f, header, extrasaction='ignore')
    w.writeheader()
    w.writerows(all_rows)
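
pandas is imported "to see CSV" but never used in the script itself; a minimal follow-up sketch for inspecting the output (assuming the run has finished and AHA-data.csv exists) might look like:

import pandas as pd

# Load the scraped postings and eyeball the first few rows.
df = pd.read_csv('AHA-data.csv')
print(df.shape)
print(df[['title', 'employer', 'location']].head())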

BoardGameCapital-selenium.py (+66 lines)

from selenium import webdriver  # powers the browser interaction
from selenium.webdriver.support.ui import Select  # selects menu options
from bs4 import BeautifulSoup  # to parse HTML
import csv  # to write CSV
import pandas as pd  # to see CSV
import time
import os
import random
import requests

driver = webdriver.Chrome()
next_page = "http://www.boardgamecapital.com/board-game-rules.htm"
driver.get(next_page)

os.makedirs('pdfs', exist_ok=True)  # rule PDFs are saved under pdfs/

soup = BeautifulSoup(driver.page_source, 'html5lib')
game_cells = soup.find('tbody').find('tbody').find_all('td')[:-1]

game_dict = {}

for g in game_cells:
    game_dict[g.text] = {}
    game_dict[g.text]['link'] = ('http://www.boardgamecapital.com/'
                                 + g.find('a')['href'])

for k in game_dict.keys():
    print(k)
    driver.get(game_dict[k]['link'])

    soup = BeautifulSoup(driver.page_source, 'html5lib')

    gstats1 = [x.split(':') for x in soup.find(
        'div', {'class': 'gstats1'}).text.split('\n')]
    price = gstats1[0][1].strip()[1:]  # drop the leading dollar sign
    play_time = gstats1[1][1].strip()  # renamed from `time`, which shadowed the module

    gstats2 = [x.split(':') for x in soup.find(
        'div', {'class': 'gstats2'}).text.split('\n')]
    age = gstats2[0][1].strip()
    players = gstats2[1][1].strip()

    # {'class': 'mainbody'} -- a dict, not the set literal {'class', 'mainbody'}
    text = soup.find('div', {'class': 'mainbody'}).text

    pdf_links = [
        a for a in soup.find('div', {'class': 'mainbody'}).find_all('a')
        if 'Game Rules' in a.text]

    paths = []
    for url in pdf_links:
        path = 'pdfs/{}.pdf'.format(url.text)
        with open(path, 'wb') as f:
            f.write(requests.get(url['href']).content)

        paths.append(path)

    paths = ';'.join(paths)

    game_dict[k]['price'] = price
    game_dict[k]['time'] = play_time
    game_dict[k]['age'] = age
    game_dict[k]['players'] = players
    game_dict[k]['paths'] = paths
    game_dict[k]['web_text'] = text

    time.sleep(1)  # `time` is no longer shadowed, so the alias import is gone
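
The script collects everything into game_dict but never writes it to disk; one way to persist it, sketched with pandas (the filename boardgames.csv is an assumption, not part of the commit):

import pandas as pd

# game_dict maps game name -> {'link', 'price', 'time', 'age', 'players',
# 'paths', 'web_text'}; orient='index' turns each game into one row.
# 'boardgames.csv' is a hypothetical filename.
df = pd.DataFrame.from_dict(game_dict, orient='index')
df.index.name = 'game'
df.to_csv('boardgames.csv')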

CTSNet-selenium.py (+115 lines)

from selenium import webdriver  # powers the browser interaction
from selenium.webdriver.support.ui import Select  # selects menu options
from bs4 import BeautifulSoup  # to parse HTML
import csv  # to write CSV
import pandas as pd  # to see CSV
import time
import os
import random


driver = webdriver.PhantomJS()
next_page = "https://www.ctsnet.org/surgeons/surgeons-advanced-search?ln=&fn=&subspecialty=adult_cardiac_surgery&city=&country=gb&province=&o"

with open("IT-cardi.csv", "a") as f:
    csv_w_interv = csv.writer(f)
    csv_w_interv.writerow(["Name", "Hospital", "Phone", "Interests",
                           "Practice-Areas", "City-Region", "Country",
                           "Street", "URL"])


for i in range(1000):

    driver.get(next_page)

    soup = BeautifulSoup(driver.page_source, "html5lib")

    try:
        next_page = ("https://www.ctsnet.org"
                     + soup.find('a', {'title': 'Go to next page'})['href'])
    except TypeError:  # find() returned None: this is the last page
        next_page = ""

    td_a = soup.find_all(
        "td", {"class": "views-field views-field-field-contact-last-name"})

    if i == 0:
        # skip the first 48 results on the first pass
        links = ["https://www.ctsnet.org" + x.find("a")['href']
                 for x in td_a[48:]]
    else:
        links = ["https://www.ctsnet.org" + x.find("a")['href'] for x in td_a]

    for l in links:

        driver.get(l)
        soup = BeautifulSoup(driver.page_source, "html5lib")

        # name, hospital, and phone are required; skip the profile otherwise
        try:
            name = soup.find('h1', {"class": 'page-title'}).text.strip()
            print(name)
        except AttributeError:
            continue

        try:
            hospital = soup.find(
                'div', {"class": 'contact-institution'}).text.strip()
        except AttributeError:
            continue

        try:
            country = soup.find(
                'div', {"class": 'contact-country'}).text.strip()
        except AttributeError:
            country = ''

        try:
            street = soup.find(
                'div', {"class": 'contact-street'}).text.strip()
        except AttributeError:
            street = ''

        try:
            city = soup.find(
                'div', {"class": 'contact-city-province-code'}).text.strip()
        except AttributeError:
            city = ''

        try:
            phone = soup.find(
                'div', {"class": 'contact-numbers'}).text.strip()
        except AttributeError:
            continue

        try:
            fields = soup.find(
                'div',
                {"class": 'views-field views-field-field-contact-subspecialty'}
            ).text.strip().replace('\n', '; ')
        except AttributeError:
            fields = ''

        try:
            interests = soup.find(
                'div',
                {"class": 'field field--name-field-contact-interest '
                          'field--type-text-long field--label-hidden'}
            ).text.strip().replace('\n', '; ')
        except AttributeError:
            interests = ''

        if len(phone) > 0:
            with open("IT-cardi.csv", "a") as f:
                csv_w_interv = csv.writer(f)
                csv_w_interv.writerow([name, hospital, phone, interests,
                                       fields, city, country, street, l])

        time.sleep(random.randint(1, 3))
    time.sleep(random.randint(1, 3))
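
webdriver.PhantomJS() has since been deprecated and removed from Selenium; a minimal sketch of swapping in headless Chrome as a replacement (an alternative, not what this commit uses):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Headless Chrome renders pages without opening a window, much like PhantomJS did.
options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)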

CrunchBase-API.py (+28 lines)

from __future__ import division  # __future__ imports must precede all others

import requests
import json
import math
import csv

# set key
key = "PUT_KEY_HERE"

# set base url
base_url = "https://api.crunchbase.com/v/3/organizations"

# set response format
response_format = ".json"

# set search parameters
search_params = {"name": "uber",
                 "user_key": key,
                 "page": "1"}

# make request
r = requests.get(base_url + response_format, params=search_params)
response_text = r.text

# Convert JSON response to a dictionary
data = json.loads(response_text)

print(data.keys())
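
print(data.keys()) only shows the top-level keys; to look further into the payload, a short follow-up sketch (the filename crunchbase-uber.json is an assumption):

# Save the raw response for offline inspection and peek at its start.
# 'crunchbase-uber.json' is a hypothetical filename, not part of the commit.
with open('crunchbase-uber.json', 'w') as f:
    json.dump(data, f, indent=2)

print(json.dumps(data, indent=2)[:500])  # first 500 characters, pretty-printed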
