Template request | Bug report | Generate Data Product
Tags: #github #repos #stars #snippet
Author: Sanjeet Attili
Description: This notebook provides a list of the most popular GitHub repositories based on the number of stars they have received.
import requests
import pandas as pd
import plotly.express as px
import naas
- The GitHub Search API returns at most 1,000 results for each search.
- Please see the GitHub REST API documentation on search to learn more about the Search API's limitations.
# Star threshold: the search query only matches repositories with more
# stars than this value
threshold = 500 # provides list of repos with stars greater than 500
# Number of top repositories to collect before the search stops
top_n = 250
# To collect every result the search returns above the threshold instead
# of only the first top_n, set top_n to 'all'
# GitHub token: replace None with a token string to use it directly;
# while it is None, the value comes from naas.secret.get("GITHUB_TOKEN")
GITHUB_TOKEN = None or naas.secret.get("GITHUB_TOKEN")
def fetch_results(top_n, threshold, token):
    """Collect the most-starred GitHub repositories into a DataFrame.

    Pages through the GitHub repository search endpoint, 100 results per
    request and sorted by stars, until ``top_n`` rows have been collected,
    a page comes back empty, or the API answers with HTTP 422.

    Args:
        top_n: Number of repositories to collect. Any value that never
            equals the running row count (for example the string
            ``'all'``) collects every result the API returns.
        threshold: Only repositories with more stars than this are queried.
        token: GitHub token sent in the ``Authorization`` header. When
            falsy, the header is omitted and the request is unauthenticated.

    Returns:
        A DataFrame with the columns ``repo_id``, ``name``, ``url``,
        ``stars``, ``forks``, ``issues_open``, ``created_at``,
        ``updated_at``, ``topics`` and ``description``. The four count/id
        columns are cast to ``int``. The frame is empty (but keeps all
        columns) when nothing was collected.

    Raises:
        requests.HTTPError: For any HTTP error status other than 422.
    """
    url = (
        f"https://api.github.com/search/repositories?q=stars:%3E{threshold}&sort=stars"
    )
    columns = [
        "repo_id",
        "name",
        "url",
        "stars",
        "forks",
        "issues_open",
        "created_at",
        "updated_at",
        "topics",
        "description",
    ]
    int_columns = ["repo_id", "stars", "forks", "issues_open"]

    # Do not send a literal "token None" header when no token is configured
    headers = {"Authorization": f"token {token}"} if token else {}

    # Rows are accumulated in a plain list and turned into a DataFrame once;
    # growing a DataFrame cell by cell is slow and yields float columns.
    rows = []
    page = 1
    limit_reached = False
    while not limit_reached:
        params = {
            "state": "open",
            "per_page": "100",
            "page": page,
        }
        # A timeout keeps a stalled connection from hanging the run forever
        res = requests.get(url, headers=headers, params=params, timeout=30)

        # HTTP 422 is treated as "no more pages available": keep what we have
        if res.status_code == 422:
            print("Github Search API limit reached!")
            print("Collecting the search results")
            break
        # Any other error (bad token, rate limit, server error) is surfaced
        # as-is instead of failing later on a missing "items" key
        res.raise_for_status()

        items = res.json().get("items") or []
        if not items:
            # Search exhausted before top_n was reached; no point paging on
            break

        for r in items:
            topics = r.get("topics") or []
            rows.append(
                {
                    "repo_id": r["id"],
                    "name": r["name"],
                    "url": r["html_url"],
                    # NOTE(review): "watchers" is used as the star count, as
                    # in the original; confirm it matches "stargazers_count"
                    "stars": r["watchers"],
                    "forks": r["forks"],
                    "issues_open": r["open_issues"],
                    "created_at": r["created_at"],
                    "updated_at": r["updated_at"],
                    # Missing topics/description are stored as the text "None"
                    "topics": ",".join(topics) if topics else "None",
                    "description": r["description"] or "None",
                }
            )
            # Equality (not >=) so that top_n='all' simply never matches
            if len(rows) == top_n:
                limit_reached = True
                break
        page += 1

    # Passing the column list keeps the schema stable even with zero rows
    df = pd.DataFrame(rows, columns=columns)
    return df.astype({col: "int" for col in int_columns})
# Run the search with the threshold, result count and token configured earlier
df_results = fetch_results(top_n, threshold, GITHUB_TOKEN)
# (rows, columns) of the collected results
df_results.shape
# Preview the first 10 rows of the results
df_results.head(10)