-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_data.py
140 lines (120 loc) · 5.21 KB
/
get_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import concurrent.futures
import requests
import pandas as pd
import pickle
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Naviance endpoints used by this script.
# Student login page that is opened in the browser for sign-in.
LOGIN_URL = "https://student.naviance.com/stuyvesant"
# College search endpoint; queried once to list colleges and their UUIDs.
GET_COLLEGES_URL = "https://blue-ridge-api.naviance.com/college/search"
# Prefix of the per-college application-statistics URL; a college UUID is appended.
URL_STATS_BASE = "https://blue-ridge-api.naviance.com/application-statistics/uuid/"
def login_to_naviance_and_get_token():
    """Open Chrome on the Naviance login page and read the auth token.

    Waits up to 120 seconds for a specific heading element to become visible
    (presumably one that is only rendered after a successful sign-in -- TODO
    confirm), then minimizes the window and reads the
    ``deepLinkingAuthorizedToken`` entry from the page's ``localStorage``.

    Returns:
        A ``(driver, token)`` tuple. The browser is left open; the caller is
        responsible for calling ``driver.quit()``. ``token`` is whatever
        ``localStorage.getItem`` returned for that key.

    Raises:
        Exception: Any error raised while loading the page, waiting for the
            element, or reading the token. The browser is closed and a
            message is printed before the error is re-raised.
    """
    driver = webdriver.Chrome()
    try:
        # Navigation is inside the try block so that a failed page load
        # still closes the browser instead of leaking the process.
        driver.get(LOGIN_URL)
        WebDriverWait(driver, 120).until(
            EC.visibility_of_element_located((
                By.XPATH,
                '//*[@id="main-container"]/article/div/div/div/div/div[2]/neon-0_12_0-card-standard/div[1]/h2',
            ))
        )
        driver.minimize_window()
        token = driver.execute_script(
            "return window.localStorage.getItem('deepLinkingAuthorizedToken');"
        )
    except Exception as e:
        print(f"An error occurred during login: {e}")
        driver.quit()
        # Re-raise rather than falling through and returning None: callers
        # unpack the result as (driver, token), so a None return would only
        # surface later as an unrelated TypeError.
        raise
    return driver, token
def fetch_college_data_and_urls(token):
    """Fetch the college list and build per-college statistics URLs.

    Sends a single GET to ``GET_COLLEGES_URL`` with ``limit=99999`` and
    ``page=1`` and reads the ``data`` list from the JSON response.

    Args:
        token: Value sent verbatim in the ``Authorization`` header.

    Returns:
        A list of ``(stats_url, college_name)`` tuples, one for each college
        whose entry has a truthy ``coreMapping`` containing a truthy ``uuid``.
        ``stats_url`` is ``URL_STATS_BASE`` followed by that UUID.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    headers = {"Authorization": token, "Content-Type": "application/json"}
    params = {"limit": 99999, "page": 1}
    # A timeout keeps the script from hanging forever on a stalled
    # connection; the read timeout is generous because the listing is large.
    response = requests.get(
        GET_COLLEGES_URL, headers=headers, params=params, timeout=(10, 120)
    )
    # Fail with a clear HTTP error (e.g. an expired token) instead of an
    # opaque KeyError/JSON error further down.
    response.raise_for_status()
    college_data = response.json()
    college_urls = [
        (URL_STATS_BASE + college['coreMapping']['uuid'], college['name'])
        for college in college_data['data']
        if college.get('coreMapping') and college['coreMapping'].get('uuid')
    ]
    return college_urls
def process_data(data, school):
    """Flatten one college's scattergram payload into a list of records.

    The ``act`` and ``sat`` sections of ``data`` are visited in that order;
    a missing or falsy section is skipped. Inside a section, ``apps`` maps a
    category key to a list of application dicts.

    How the category key decides the outcome:
      * keys starting with ``"waitlist"`` count as accepted when they contain
        ``"Accepted"``, as rejected when they contain ``"Denied"``, and are
        skipped entirely otherwise;
      * every other key counts as accepted only if it starts with
        ``"accepted"``.

    Args:
        data: Mapping that may contain ``act`` and/or ``sat`` sections.
        school: Name stored in each record's ``school`` field.

    Returns:
        A list of dicts with the keys ``school``, ``gpa``, ``act``, ``sat``,
        ``accepted`` and ``appType``. ``appType`` is the last two characters
        of the category key (presumably the application-plan code -- TODO
        confirm). An application present in both sections yields two records.
    """
    rows = []
    for exam in ("act", "sat"):
        section = data.get(exam)
        if not section:
            continue
        buckets = section["apps"]
        for bucket in buckets:
            # Resolve the outcome once per category; None marks a waitlist
            # category with no final decision.
            if not bucket.startswith("waitlist"):
                admitted = bucket.startswith("accepted")
            elif "Accepted" in bucket:
                admitted = True
            elif "Denied" in bucket:
                admitted = False
            else:
                admitted = None
            plan = bucket[-2:]
            rows.extend(
                {
                    "school": school,
                    "gpa": entry["gpa"],
                    "act": entry.get("actComposite"),
                    "sat": entry.get("highestComboSat"),
                    "accepted": admitted,
                    "appType": plan,
                }
                for entry in buckets[bucket]
                if admitted is not None
            )
    return rows
def fetch_and_process_data(url, token, known_empties):
    """Download and flatten the application statistics for one college.

    Args:
        url: Sequence whose first item is the statistics URL and whose second
            item is the college name.
        token: Value sent verbatim in the ``Authorization`` header.
        known_empties: Mutable set of statistics URLs already known to have
            no usable data. URLs found to be empty are added to it.

    Returns:
        The list of records produced by ``process_data``; an empty list when
        the URL is a known empty, the college name contains ``"DO NOT USE"``,
        the response status is not 200, or the payload reports a ``gpaCount``
        of 0 (or has no GPA scattergram at all).

    Raises:
        requests.RequestException: On connection problems or timeout.
    """
    stats_url, school = url[0], url[1]
    if stats_url in known_empties:
        return []
    # Entries flagged "DO NOT USE" are always discarded, so decide that
    # before spending a network round trip on them.
    if "DO NOT USE" in school:
        known_empties.add(stats_url)
        print("x", end="")
        return []
    # The timeout prevents one stalled connection from blocking a worker
    # thread indefinitely.
    response = requests.get(
        stats_url, headers={"Authorization": token}, timeout=(10, 60)
    )
    if response.status_code != 200:
        return []
    # `or {}` also covers keys that are present but JSON null, which a plain
    # .get(key, {}) would pass through as None.
    scattergrams = response.json().get('scattergrams') or {}
    data = scattergrams.get('gpa') or {}
    if data.get('gpaCount', 0) == 0:
        known_empties.add(stats_url)
        print("x", end="")
        return []
    return process_data(data, school)
def main():
    """Log in, collect application statistics for every college, and save them.

    Steps:
      1. Obtain the auth token through a browser login, then close the
         browser.
      2. Ask whether to refresh. On "y" both cache files
         (``college_urls.pkl`` and ``known_empties.pkl``) are deleted.
         Otherwise the cached set of known-empty URLs and the cached college
         list are loaded when present.
      3. Fetch the college list if it was not loaded from cache, and cache it.
      4. Fetch every college's statistics on a pool of 50 threads.
      5. Persist the (possibly grown) set of known-empty URLs and write all
         records to ``college_data.pkl`` as a pandas DataFrame.
    """
    driver, token = login_to_naviance_and_get_token()
    # Only the token is used from here on, so the browser can go right away.
    driver.quit()

    urls_cache = "college_urls.pkl"
    empties_cache = "known_empties.pkl"

    answer = input("Do you want to refresh the data? (y/n): ")
    # strip() so that stray whitespace around the answer is not read as "no".
    refresh_data = answer.strip().lower() == "y"

    known_empties = set()
    college_urls = None
    if refresh_data:
        for stale in (urls_cache, empties_cache):
            if os.path.exists(stale):
                os.remove(stale)
    else:
        # NOTE: these pickles are local caches written by this script; pickle
        # must never be used to load files from an untrusted source.
        if os.path.exists(empties_cache):
            with open(empties_cache, "rb") as file:
                known_empties = pickle.load(file)
        # Reuse the cached college list instead of downloading it again.
        if os.path.exists(urls_cache):
            with open(urls_cache, "rb") as file:
                college_urls = pickle.load(file)

    if college_urls is None:
        college_urls = fetch_college_data_and_urls(token)
        with open(urls_cache, "wb") as file:
            pickle.dump(college_urls, file)

    data = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
        # Map each future back to its (url, name) pair for progress output.
        futures = {
            executor.submit(fetch_and_process_data, url, token, known_empties): url
            for url in college_urls
        }
        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            if result:
                print(f"\nProcessed {futures[future][1]}.")
                data.extend(result)

    if known_empties:
        with open(empties_cache, "wb") as file:
            pickle.dump(known_empties, file)

    df = pd.DataFrame(data)
    df.to_pickle("college_data.pkl")
    print("Data saved to 'college_data.pkl'.")


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()