Template request | Bug report | Generate Data Product

Tags: #github #repos #stars #snippet

Description: This notebook provides a list of the most popular GitHub repositories based on the number of stars they have received.

Input

Import libraries

import requests
import pandas as pd
import plotly.express as px
import naas

Setup Variables

The GitHub search API returns at most 1,000 results per search query.
See the "Search" section of the GitHub REST API documentation to learn more about this limitation.

# Star threshold for the search query: only repositories with more stars
# than this value are requested.
threshold = 500  # provides list of repos with stars greater than 500

# Maximum number of top repositories to return.
top_n = 250

# To fetch every repository matching the threshold instead of only the
# first top_n, set top_n to 'all'.

# GitHub token. `None or ...` evaluates to the value stored in the naas
# secret; replace None with a token string to use that string instead.
GITHUB_TOKEN = None or naas.secret.get("GITHUB_TOKEN")

Model

Get most starred repos

def fetch_results(top_n, threshold, token):
    """Fetch the most-starred GitHub repositories above a star threshold.

    Pages through the GitHub repository search endpoint (100 results per
    page, sorted by stars) and collects one row per repository.

    Args:
        top_n: Maximum number of repositories to return. Any value that never
            equals the running row count (e.g. the string ``'all'``) disables
            the cap, so paging continues until a page comes back empty or the
            API rejects the request with HTTP 422.
        threshold: Only repositories with more stars than this are queried.
        token: GitHub token sent in the ``Authorization`` header. When falsy,
            the request is sent without that header.

    Returns:
        A ``pandas.DataFrame`` with the columns ``repo_id``, ``name``,
        ``url``, ``stars``, ``forks``, ``issues_open``, ``created_at``,
        ``updated_at``, ``topics`` and ``description``. The frame is empty
        (but still has those columns) when nothing was fetched.

    Raises:
        requests.HTTPError: For any HTTP error status other than 422.
    """
    URL = (
        f"https://api.github.com/search/repositories?q=stars:%3E{threshold}&sort=stars"
    )
    # Do not send a bogus "token None" header when no token is configured.
    headers = {"Authorization": f"token {token}"} if token else {}
    columns = [
        "repo_id",
        "name",
        "url",
        "stars",
        "forks",
        "issues_open",
        "created_at",
        "updated_at",
        "topics",
        "description",
    ]
    # Accumulate plain dicts and build the frame once at the end; growing a
    # DataFrame cell by cell re-allocates on every assignment.
    rows = []
    page = 1

    while True:
        # NOTE(review): "state" is carried over from the original request;
        # confirm whether the repository search endpoint uses it.
        params = {
            "state": "open",
            "per_page": "100",
            "page": page,
        }
        res = requests.get(URL, headers=headers, params=params, timeout=30)

        # 422 is treated as "no more pages available": stop paging and
        # return what has been collected so far.
        if res.status_code == 422:
            print("Github Search API limit reached!")
            print("Collecting the search results")
            break
        # Any other HTTP error is a real failure and must not be swallowed.
        res.raise_for_status()

        items = res.json().get("items", [])
        if not items:
            # Result set exhausted before top_n was reached; without this
            # check the loop would request empty pages forever.
            break

        for r in items:
            topics = r.get("topics")
            rows.append(
                {
                    "repo_id": r["id"],
                    "name": r["name"],
                    "url": r["html_url"],
                    "stars": r["watchers"],
                    "forks": r["forks"],
                    "issues_open": r["open_issues"],
                    "created_at": r["created_at"],
                    "updated_at": r["updated_at"],
                    # Missing values are stored as the string "None".
                    "topics": ",".join(topics) if topics else "None",
                    "description": r["description"] or "None",
                }
            )
            if len(rows) == top_n:
                break

        if len(rows) == top_n:
            break

        page += 1

    df = pd.DataFrame(rows, columns=columns)
    # Passing explicit columns keeps this cast valid for an empty frame.
    df = df.astype(
        {"stars": "int", "forks": "int", "issues_open": "int", "repo_id": "int"}
    )
    return df


# Run the search with the configured variables, then show the resulting
# frame's (rows, columns) shape as the cell output.
df_results = fetch_results(top_n, threshold, GITHUB_TOKEN)
df_results.shape

Output

Display result

# Preview the first 10 rows of the fetched repositories.
df_results.head(10)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

GitHub_Get_most_starred_repos.md

GitHub_Get_most_starred_repos.md

Input

Import libraries

Setup Variables

Model

Get most starred repos

Output

Display result

Files

GitHub_Get_most_starred_repos.md

Latest commit

History

GitHub_Get_most_starred_repos.md

File metadata and controls

Input

Import libraries

Setup Variables

Model

Get most starred repos

Output

Display result