Skip to content

Commit 3ea90c2

Browse files
committed
Fetch PyPI data from ClickHouse instead of BigQuery
1 parent 5f67ef6 commit 3ea90c2

File tree

3 files changed

+71
-6
lines changed

3 files changed

+71
-6
lines changed

.ruff.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ lint.ignore = [
2424
"E221", # Multiple spaces before operator
2525
"E226", # Missing whitespace around arithmetic operator
2626
"E241", # Multiple spaces after ','
27+
"S310", # suspicious-url-open-usage
28+
"S608", # hardcoded-sql-expression
2729
"UP038", # Makes code slower and more verbose
2830
]
2931
lint.isort.required-imports = [ "from __future__ import annotations" ]

clickhouse.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# /// script
2+
# requires-python = ">=3.10"
3+
# ///
4+
from __future__ import annotations
5+
6+
import datetime as dt
7+
import json
8+
import urllib.parse
9+
import urllib.request
10+
from pathlib import Path
11+
12+
13+
def get_clickhouse_data(timeout: float = 60.0) -> str:
    """Fetch last month's per-project PyPI download counts from ClickHouse.

    Sends a SQL query by HTTP POST to the ClickHouse SQL endpoint as the
    ``demo`` user, requesting the ``JSON`` output format. The query sums
    ``count`` per project for the previous calendar month and returns the
    15000 projects with the highest totals, in descending order.

    Args:
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        The response body decoded as UTF-8 text (not yet parsed).

    Raises:
        urllib.error.URLError: If the server cannot be reached or answers
            with an HTTP error status.
        TimeoutError: If reading the response exceeds ``timeout``.
    """
    params = {"user": "demo", "default_format": "JSON"}

    # Use UTC so the month rollover does not depend on the machine's timezone.
    today = dt.datetime.now(dt.timezone.utc)
    # Stepping back one day from the 1st always lands in the previous month.
    last_day_of_previous_month = today.replace(day=1) - dt.timedelta(days=1)
    last_month = last_day_of_previous_month.strftime("%Y-%m-01")
    print(f"{last_month=}")
    # The interpolated value is generated above, never taken from user input.
    query = f"""
SELECT SUM(count) AS download_count, project
FROM pypi.pypi_downloads_per_month
WHERE month = '{last_month}'
GROUP BY project
ORDER BY download_count DESC
LIMIT 15000"""

    url = "https://sql-clickhouse.clickhouse.com?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(url, data=query.encode("utf-8"), method="POST")
    # Without a timeout a stalled connection would block the job indefinitely.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return response.read().decode("utf-8")
34+
35+
36+
def reformat_clickhouse_json(input_data: dict) -> None:
37+
rows = [
38+
{"download_count": int(row["download_count"]), "project": row["project"]}
39+
for row in input_data["data"]
40+
]
41+
42+
reformatted_data = {
43+
"last_update": dt.datetime.now(dt.timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
44+
"source": "ClickHouse",
45+
}
46+
# Rename rows->total_rows and data->rows
47+
for k, v in input_data.items():
48+
if k == "rows":
49+
reformatted_data["total_rows"] = v
50+
elif k == "data":
51+
reformatted_data["rows"] = rows
52+
else:
53+
reformatted_data[k] = v
54+
55+
Path("top-pypi-packages.json").write_text(
56+
json.dumps(reformatted_data, indent=0) + "\n"
57+
)
58+
print("Saved to top-pypi-packages.json")
59+
60+
61+
def main() -> None:
    """Download the ClickHouse result, parse it and write the JSON file."""
    raw_response = get_clickhouse_data()
    parsed_response = json.loads(raw_response)
    reformat_clickhouse_json(parsed_response)


# Run the export only when this file is executed as a script.
if __name__ == "__main__":
    main()

generate.sh

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,15 @@ set -e
55

66
# Check versions
77
python3 -m pip --version
8-
/home/botuser/.local/bin/pypinfo --version
98

109
# Ensure newest pip and pypinfo
1110
python3 -m pip install -U pip
12-
python3 -m pip install -U pypinfo
1311

1412
# Check versions
1513
python3 -m pip --version
16-
/home/botuser/.local/bin/pypinfo --version
1714

1815
# Generate and minify
19-
days=22
20-
/home/botuser/.local/bin/pypinfo --all --json --indent 0 --limit 15000 --days $days --test "" project
21-
/home/botuser/.local/bin/pypinfo --all --json --indent 0 --limit 15000 --days $days "" project > top-pypi-packages.json
16+
python3 clickhouse.py
2217
jq -c . < top-pypi-packages.json > top-pypi-packages.min.json
2318
echo 'download_count,project' > top-pypi-packages.csv
2419
jq -r '.rows[] | [.download_count, .project] | @csv' top-pypi-packages.json >> top-pypi-packages.csv

0 commit comments

Comments
 (0)