@@ -22,9 +22,9 @@ def main(language, packages_filename, ghtoken=""):
     ghtoken = read_gh_token()
 
     # Read packages list from CSV file
-    packlist = read_pkg_csv(os.path.join("data", "import", packages_filename))
+    packlist = read_pkg_csv(os.path.join("data", "input", packages_filename))
     # Query Github with the package list
-    results = send_query(packlist, ghtoken)
+    results = collect_data(packlist, ghtoken)
     # Write out the results to a csv file
     write_csv(results)
     return
@@ -38,9 +38,11 @@ def read_gh_token():
     """
     # If you have your Github token stored in a file, read it in from the file.
     try:
-        with open(os.path.join("data", "import", "gh.token"), "r") as f:
+        print(os.listdir())
+        with open(os.path.join("data", "input", "gh.token"), "r") as f:
             reader = f.readlines()
             ghtoken = reader[0]
+        print(ghtoken)
     except FileNotFoundError:
         print("No Github token provided and no 'gh.token' file was found. Please provide one and try again.")
         sys.exit(0)
@@ -101,7 +103,7 @@ def write_csv(results):
     return
 
 
-def send_query(packlist, ghtoken):
+def collect_data(packlist, ghtoken):
     """
     Make a github query for each package in the package list.
 
@@ -115,44 +117,83 @@ def send_query(packlist, ghtoken):
     # Loop for each package
     for pkg in packlist:
         print("Querying Github: {}".format(pkg))
-        try:
-            # Make the github request. Look for 'import <package_name>' in code in python files
-            r = requests.get(
-                'https://api.github.com/search/code?q="import {}"+in:file+language:"python"+extension:"py"'.format(pkg),
-                headers={"Authorization": "token {}".format(ghtoken), "Accept": "application/vnd.github.v3+json"})
-
-            # Did the query come back successful?
-            if r.status_code == 200:
-                try:
-                    # Load the response json as a Python dictionary
-                    r_text = json.loads(r.text)
-                    # Loop through each query result
-                    for result in r_text["items"]:
-                        try:
-                            # We don't need all the data from the results. Save the few pieces of info that
-                            # we're interested in.
-                            results.append({"package": pkg, "crossover_file": result["html_url"],
-                                            "crossover_package": result["repository"]["name"]})
-                        except KeyError:
-                            # This result was missing a piece of data that we need.
-                            print("Error parsing a result for: {}".format(pkg))
-                except Exception as e:
-                    # There was a problem trying to parse the json response, or 'items' key is not in the response
-                    # results
-                    print("Missing data from Github response object for package: {}".format(pkg))
-            else:
-                # There was a bad HTTP response from Github. If the error is 403,
-                # then we likely hit the rate limiter and need to increase the sleep time between requests.
-                print("Received error from Github API: Status Code: {}".format(r.status_code))
-
-            # Don't query github too fast or you'll hit the limiter and get a bad response. Give it some time.
-            time.sleep(10)
-
-        except Exception as e:
-            # Did your internet connection go out? Something is wrong with sending out the request
-            print("Unable to make Github request. Connection issues.")
-
+        # Format the URL to GET
+        next_url = 'https://api.github.com/search/code?q="import {}"+in:file+language:"python"+extension:"py"'.format(pkg)
+        # In case there are multiple pages of results, we need to keep requesting the "next" page until all results
+        # are collected
+        while next_url:
+            # Send the query, and organize the results
+            results, next_url = send_query(pkg, next_url, ghtoken, results)
+            # Is there another page of results?
+            if not next_url:
+                # No, we're done querying pages for this package.
+                break
+
+    # All done. Return all results.
     return results
 
 
+def send_query(pkg, req_url, ghtoken, results):
+    """
+    Send one request to the Github API. Sort the results into a list of objects, and return them.
+
+    :param str pkg: Current package being queried
+    :param str req_url: URL for the GET request
+    :param str ghtoken: Github Token
+    :param list results: Results (so far)
+    :return list results: Results (with new additions)
+    """
+
+    # Placeholder for the next page url
+    next_url = ""
+    try:
+        # Make the github request. Look for 'import <package_name>' in code in python files
+        r = requests.get(
+            req_url,
+            headers={"Authorization": "token {}".format(ghtoken), "Accept": "application/vnd.github.v3+json"})
+
+        # Did the query come back successful?
+        if r.status_code == 200:
+            try:
+                # Load the response json as a Python dictionary
+                r_text = json.loads(r.text)
+                # Loop through each query result
+                print("Result Items: {}".format(r_text["items"]))
+
+                # Is there another page of results?
+                if "next" in r.links:
+                    # Store the link to the next page
+                    next_url = r.links["next"]["url"]
+
+                # Loop each result in the response
+                for result in r_text["items"]:
+                    try:
+                        # We don't need all the data from the results. Save the few pieces of info that
+                        # we're interested in.
+                        results.append({"package": pkg, "crossover_file": result["html_url"],
+                                        "crossover_package": result["repository"]["name"]})
+                    except KeyError:
+                        # This result was missing a piece of data that we need.
+                        print("Error parsing a result for: {}".format(pkg))
+            except Exception as e:
+                # There was a problem trying to parse the json response, or 'items' key is not in the response
+                # results
+                print("Missing data from Github response object for package: {}".format(pkg))
+        else:
+            # There was a bad HTTP response from Github. If the error is 403,
+            # then we likely hit the rate limiter and need to increase the sleep time between requests.
+            print("Received error from Github API: Status Code: {}".format(r.status_code))
+
+        # Don't query github too fast or you'll hit the limiter and get a bad response. Give it some time.
+        time.sleep(10)
+
+    except Exception as e:
+        # Did your internet connection go out? Something is wrong with sending out the request
+        print("Unable to make Github request. Connection issues.")
+
+    return results, next_url
+
+
+
+
 main("python", "packagesToScrape.csv", ghtoken="")