Skip to content

Commit c9a8bd0

Browse files
committed
Includes pagination
Since results are paginated by GitHub, this script now traverses and includes results from every page. Results are capped at 1,000 per query, so this doesn't include everything, but it is still much better than the 30 per query from before. Be careful running this: since you have to sleep 10 seconds between requests, it takes a very long time to run. Generating the current package list took a few hours.
1 parent 09ee68a commit c9a8bd0

File tree

2 files changed

+18172
-75
lines changed

2 files changed

+18172
-75
lines changed

crossResourcePyScript.py

+82-41
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ def main(language, packages_filename, ghtoken=""):
2222
ghtoken = read_gh_token()
2323

2424
# Read packages list from CSV file
25-
packlist = read_pkg_csv(os.path.join("data", "import", packages_filename))
25+
packlist = read_pkg_csv(os.path.join("data", "input", packages_filename))
2626
# Query Github with the package list
27-
results = send_query(packlist, ghtoken)
27+
results = collect_data(packlist, ghtoken)
2828
# Write out the results to a csv file
2929
write_csv(results)
3030
return
@@ -38,9 +38,11 @@ def read_gh_token():
3838
"""
3939
# If you have your Github token stored in a file, read it in from the file.
4040
try:
41-
with open(os.path.join("data", "import", "gh.token"), "r") as f:
41+
print(os.listdir())
42+
with open(os.path.join("data", "input", "gh.token"), "r") as f:
4243
reader = f.readlines()
4344
ghtoken = reader[0]
45+
print(ghtoken)
4446
except FileNotFoundError:
4547
print("No Github token provided and no 'gh.token' file was found. Please provide one and try again.")
4648
sys.exit(0)
@@ -101,7 +103,7 @@ def write_csv(results):
101103
return
102104

103105

104-
def send_query(packlist, ghtoken):
106+
def collect_data(packlist, ghtoken):
105107
"""
106108
Make a github query for each package in the package list.
107109
@@ -115,44 +117,83 @@ def send_query(packlist, ghtoken):
115117
# Loop for each package
116118
for pkg in packlist:
117119
print("Querying Github: {}".format(pkg))
118-
try:
119-
# Make the github request. Look for 'import <package_name>' in code in python files
120-
r = requests.get(
121-
'https://api.github.com/search/code?q="import {}"+in:file+language:"python"+extension:"py"'.format(pkg),
122-
headers={"Authorization": "token {}".format(ghtoken), "Accept": "application/vnd.github.v3+json"})
123-
124-
# Did the query come back successful?
125-
if r.status_code == 200:
126-
try:
127-
# Load the response json as a Python dictionary
128-
r_text = json.loads(r.text)
129-
# Loop through each query result
130-
for result in r_text["items"]:
131-
try:
132-
# We don't need all the data from the results. Save the few pieces of info that
133-
# we're interested in.
134-
results.append({"package": pkg, "crossover_file": result["html_url"],
135-
"crossover_package": result["repository"]["name"]})
136-
except KeyError:
137-
# This result was missing a piece of data that we need.
138-
print("Error parsing a result for: {}".format(pkg))
139-
except Exception as e:
140-
# There was a problem trying to parse the json response, or 'items' key is not in the response
141-
# results
142-
print("Missing data from Github response object for package: {}".format(pkg))
143-
else:
144-
# There was a bad HTTP response from Github. If the error is 403,
145-
# then we likely hit the rate limiter and need to increase the sleep time between requests.
146-
print("Received error from Github API: Status Code: {}".format(r.status_code))
147-
148-
# Don't query github too fast or you'll hit the limiter and get a bad response. Give it some time.
149-
time.sleep(10)
150-
151-
except Exception as e:
152-
# Did your internet connection go out? Something is wrong with sending out the request
153-
print("Unable to make Github request. Connection issues.")
154-
120+
# Format the URL to GET
121+
next_url ='https://api.github.com/search/code?q="import {}"+in:file+language:"python"+extension:"py"'.format(pkg)
122+
# In case there are multiple pages of results, we need to keep requesting the "next" page until all results
123+
# are collected
124+
while next_url:
125+
# Send the query, and organize the results
126+
results, next_url = send_query(pkg, next_url, ghtoken, results)
127+
# Is there another page of results?
128+
if not next_url:
129+
# No, we're done querying pages for this package.
130+
break
131+
132+
# All done. Return all results.
155133
return results
156134

157135

136+
def send_query(pkg, req_url, ghtoken, results):
    """
    Send one request to the Github API. Sort the results into a list of objects, and return them.

    :param str pkg: Current package being queried
    :param str req_url: URL for the GET request
    :param str ghtoken: Github Token
    :param list results: Results (so far)
    :return list results: Results (with new additions)
    :return str next_url: URL of the next page of results, or "" if this was the last page
    """

    # Placeholder for the next page url
    next_url = ""
    try:
        # Make the github request. Look for 'import <package_name>' in code in python files
        r = requests.get(
            req_url,
            headers={"Authorization": "token {}".format(ghtoken), "Accept": "application/vnd.github.v3+json"})

        # Did the query come back successful?
        if r.status_code == 200:
            try:
                # Load the response json as a Python dictionary
                r_text = json.loads(r.text)

                # Is there another page of results? Response.links is a plain dict,
                # so use .get() — indexing r.links["next"] on the final page raised
                # a KeyError that was swallowed below, silently dropping every
                # result from the last page of each package.
                next_url = r.links.get("next", {}).get("url", "")

                # Loop each result in the response
                for result in r_text["items"]:
                    try:
                        # We don't need all the data from the results. Save the few pieces of info that
                        # we're interested in.
                        results.append({"package": pkg, "crossover_file": result["html_url"],
                                        "crossover_package": result["repository"]["name"]})
                    except KeyError:
                        # This result was missing a piece of data that we need.
                        print("Error parsing a result for: {}".format(pkg))
            except Exception:
                # There was a problem trying to parse the json response, or 'items' key is not in the response
                # results
                print("Missing data from Github response object for package: {}".format(pkg))
        else:
            # There was a bad HTTP response from Github. If the error is 403,
            # then we likely hit the rate limiter and need to increase the sleep time between requests.
            print("Received error from Github API: Status Code: {}".format(r.status_code))

        # Don't query github too fast or you'll hit the limiter and get a bad response. Give it some time.
        time.sleep(10)

    except Exception:
        # Did your internet connection go out? Something is wrong with sending out the request
        print("Unable to make Github request. Connection issues.")

    return results, next_url
158199
# Entry point. Guarded so merely importing this module doesn't kick off the
# (hours-long, rate-limited) GitHub scrape as a side effect.
if __name__ == "__main__":
    main("python", "packagesToScrape.csv", ghtoken="")

0 commit comments

Comments
 (0)