Skip to content

Commit a72963e

Browse files
committed
Parameter for choosing language
- To run the scraper for R or Python, pass the language into the function and it will use the correct query string accordingly. - We mentioned possibly searching Python multi-import statements for packages as well as standard imports ("import name" vs. "import name1, name2, name3"). Multi-import statements do not follow PEP 8 and are uncommon, so I'm leaving that query commented out for now.
1 parent c9a8bd0 commit a72963e

File tree

1 file changed

+17
-7
lines changed

1 file changed

+17
-7
lines changed

crossResourcePyScript.py

+17-7
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def main(language, packages_filename, ghtoken=""):
2424
# Read packages list from CSV file
2525
packlist = read_pkg_csv(os.path.join("data", "input", packages_filename))
2626
# Query Github with the package list
27-
results = collect_data(packlist, ghtoken)
27+
results = collect_data(packlist, ghtoken, language)
2828
# Write out the results to a csv file
2929
write_csv(results)
3030
return
@@ -49,6 +49,7 @@ def read_gh_token():
4949

5050
return ghtoken
5151

52+
5253
def read_pkg_csv(filename):
5354
"""
5455
Open the CSV file and read in the package list.
@@ -103,22 +104,33 @@ def write_csv(results):
103104
return
104105

105106

106-
def collect_data(packlist, ghtoken):
107+
def collect_data(packlist, ghtoken, language):
107108
"""
108109
Make a github query for each package in the package list.
109110
110111
:param list packlist: Package list
111112
:param str ghtoken: Github token
113+
:param str language: python or r
112114
:return list results: Results from Github queries
113115
"""
114-
116+
next_url = ""
115117
results = []
116118

117119
# Loop for each package
118120
for pkg in packlist:
119121
print("Querying Github: {}".format(pkg))
120-
# Format the URL to GET
121-
next_url ='https://api.github.com/search/code?q="import {}"+in:file+language:"python"+extension:"py"'.format(pkg)
122+
# Format the URL to GET. Query format depends on the language
123+
if language.lower() == "r":
124+
next_url = 'https://api.github.com/search/code?q="library({})"+in:file+language:"r"+extension:"r"' \
125+
'+extension:"rmd"'.format(pkg)
126+
if language.lower() == "python" or language.lower() == "py":
127+
next_url = 'https://api.github.com/search/code?q="import {}"+in:file+language:"python"+' \
128+
'extension:"py"'.format(pkg)
129+
# TODO Do we also try to query this string for packages listed in multi-import statements too? This is not
130+
# a PEP8 standard, but some people still do multi-imports like this anyways. Rare
131+
# next_url = 'https://api.github.com/search/code?q=", {}"+in:file+language:"python"+' \
132+
# 'extension:"py"'.format(pkg)
133+
122134
# In case there are multiple pages of results, we need to keep requesting the "next" page until all results
123135
# are collected
124136
while next_url:
@@ -194,6 +206,4 @@ def send_query(pkg, req_url, ghtoken, results):
194206
return results, next_url
195207

196208

197-
198-
199209
main("python", "packagesToScrape.csv", ghtoken="")

0 commit comments

Comments
 (0)