Skip to content

Commit 791ccec

Browse files
committed
add osv_supply_chain
1 parent 71e1f86 commit 791ccec

File tree

7 files changed

+664
-1
lines changed

7 files changed

+664
-1
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
---
2+
jupyter:
3+
jupytext:
4+
text_representation:
5+
extension: .md
6+
format_name: markdown
7+
format_version: '1.3'
8+
jupytext_version: 1.16.6
9+
kernelspec:
10+
display_name: Python 3 (ipykernel)
11+
language: python
12+
name: python3
13+
---
14+
15+
```python vscode={"languageId": "python"}
16+
import pandas as pd
17+
18+
# Load the processed OSV ecosystem summary CSV (path is relative to this notebook).
df = pd.read_csv("../../data/osv/processed/osv_ecosystem_summary.csv")
19+
# Preview the first row so the cell output shows the available columns.
df.head(1)
20+
```
21+
22+
```python vscode={"languageId": "python"}
23+
from great_tables import GT, md, html, nanoplot_options
24+
import pandas as pd
25+
import numpy as np
26+
import ast
27+
28+
29+
def _trend_to_nanoplot_string(value):
    """Return ``value`` as a comma-separated string of numbers.

    Handles the three shapes ``trend_data`` can arrive in:

    * a string holding a Python list literal (e.g. ``"[0, 0, 10]"``, which is
      how the list is stored in the CSV) -- parsed with ``ast.literal_eval``;
    * an actual ``list`` / ``numpy.ndarray``;
    * anything else, which is simply passed through ``str``.
    """
    if isinstance(value, str) and value.startswith("["):
        # literal_eval only evaluates literals, so it is safe on file content.
        value = np.array(ast.literal_eval(value))
    if isinstance(value, (list, np.ndarray)):
        return ", ".join(map(str, value))
    return str(value)


# Ensure `trend_data` is properly formatted as a comma-separated string
# (the form handed to `fmt_nanoplot` below).
df["trend_data"] = df["trend_data"].apply(_trend_to_nanoplot_string)

# Map each record type to the icon name rendered by `fmt_icon`.
df["icon"] = df["type"].replace({"Vulnerability": "bug", "Malicious Code": "skull"})

# Fill colour per icon name.
color_map = {"bug": "purple", "skull": "red"}

# Create a sorted table DataFrame with the desired columns:
# largest totals first, ties broken alphabetically by ecosystem.
table_df = df[
    ["ecosystem", "icon", "total_affected", "peak_attack_year", "trend_data"]
].sort_values(["total_affected", "ecosystem"], ascending=[False, True])

# Generate the Great Table
gt_table = (
    GT(table_df)
    .tab_header(
        title=md("**OSV Security Trends**"),
        subtitle=md(
            "_Malicious Code & Vulnerability Insights Across Software Supply Chains_"
        ),
    )
    .tab_stub(rowname_col="ecosystem")
    # The stub head label is set once here; the original chain repeated this
    # call a second time further down with the same label.
    .tab_stubhead(label="Ecosystem")
    .tab_source_note(source_note=md("*Year Trends from 2014-2024*"))
    .tab_source_note(
        source_note=md(
            "Data sourced from [OSV.dev](https://osv.dev) (Open Source Vulnerability) and analyzed for vulnerability & malicious code trends. Covers PyPI, npm, Maven, Go, RubyGems, NuGet, Packagist, Pub, CRAN, Hackage, Hex, and crates.io. Last updated: February 2025."
        )
    )
    .tab_source_note(
        source_note=md("**Legend:** Bug = Vulnerability | Skull = Malicious Code")
    )
    .cols_label(
        ecosystem="Ecosystem",
        icon="Type",
        total_affected="Total",
        peak_attack_year="Peak",
        trend_data="Year Trend",
    )
    .fmt_nanoplot(
        "trend_data",
        plot_type="bar",
        reference_line="mean",
        options=nanoplot_options(
            data_bar_stroke_color="black",
            data_bar_stroke_width=2,
            data_bar_fill_color="darkgray",
            reference_line_color="pink",
        ),
    )
    .fmt_number(columns="total_affected", sep_mark=",", decimals=0)
    .cols_align(align="left", columns=["ecosystem"])
    .cols_align(
        align="center",
        columns=["icon", "total_affected", "peak_attack_year", "trend_data"],
    )
    .fmt_icon(columns="icon", fill_color=color_map)
)


# Generate the raw HTML from the table
html_output = gt_table.as_raw_html()

# Save it to an HTML file
with open(
    "../../data/osv/processed/osv_security_trends.html", "w", encoding="utf-8"
) as f:
    f.write(html_output)

# Display the table
gt_table
107+
```
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {
7+
"ExecuteTime": {
8+
"end_time": "2025-02-08T19:31:35.646100Z",
9+
"start_time": "2025-02-08T19:31:34.663998Z"
10+
},
11+
"vscode": {
12+
"languageId": "python"
13+
}
14+
},
15+
"outputs": [
16+
{
17+
"data": {
18+
"text/html": [
19+
"<div>\n",
20+
"<style scoped>\n",
21+
" .dataframe tbody tr th:only-of-type {\n",
22+
" vertical-align: middle;\n",
23+
" }\n",
24+
"\n",
25+
" .dataframe tbody tr th {\n",
26+
" vertical-align: top;\n",
27+
" }\n",
28+
"\n",
29+
" .dataframe thead th {\n",
30+
" text-align: right;\n",
31+
" }\n",
32+
"</style>\n",
33+
"<table border=\"1\" class=\"dataframe\">\n",
34+
" <thead>\n",
35+
" <tr style=\"text-align: right;\">\n",
36+
" <th></th>\n",
37+
" <th>ecosystem</th>\n",
38+
" <th>type</th>\n",
39+
" <th>total_affected</th>\n",
40+
" <th>most_affected_package</th>\n",
41+
" <th>peak_attack_year</th>\n",
42+
" <th>trend_data</th>\n",
43+
" </tr>\n",
44+
" </thead>\n",
45+
" <tbody>\n",
46+
" <tr>\n",
47+
" <th>0</th>\n",
48+
" <td>CRAN</td>\n",
49+
" <td>Vulnerability</td>\n",
50+
" <td>10</td>\n",
51+
" <td>readxl</td>\n",
52+
" <td>2023</td>\n",
53+
" <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0]</td>\n",
54+
" </tr>\n",
55+
" </tbody>\n",
56+
"</table>\n",
57+
"</div>"
58+
],
59+
"text/plain": [
60+
" ecosystem type total_affected most_affected_package \\\n",
61+
"0 CRAN Vulnerability 10 readxl \n",
62+
"\n",
63+
" peak_attack_year trend_data \n",
64+
"0 2023 [0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0] "
65+
]
66+
},
67+
"execution_count": 1,
68+
"metadata": {},
69+
"output_type": "execute_result"
70+
}
71+
],
72+
"source": [
73+
"import pandas as pd\n",
74+
"\n",
75+
"df = pd.read_csv(\"../../data/osv/processed/osv_ecosystem_summary.csv\")\n",
76+
"df.head(1)"
77+
]
78+
},
79+
{
80+
"cell_type": "code",
81+
"execution_count": null,
82+
"metadata": {
83+
"vscode": {
84+
"languageId": "python"
85+
}
86+
},
87+
"outputs": [],
88+
"source": [
89+
"from great_tables import GT, md, html, nanoplot_options\n",
90+
"import pandas as pd\n",
91+
"import numpy as np\n",
92+
"import ast\n",
93+
"\n",
94+
"\n",
95+
"# Ensure `trend_data` is properly formatted as a comma-separated string\n",
96+
"df[\"trend_data\"] = df[\"trend_data\"].apply(\n",
97+
" lambda x: \", \".join(map(str, np.array(ast.literal_eval(x))))\n",
98+
" if isinstance(x, str) and x.startswith(\"[\")\n",
99+
" else \", \".join(map(str, x))\n",
100+
" if isinstance(x, (list, np.ndarray))\n",
101+
" else str(x)\n",
102+
")\n",
103+
"\n",
104+
"df[\"icon\"] = df[\"type\"].replace({\"Vulnerability\": \"bug\", \"Malicious Code\": \"skull\"})\n",
105+
"\n",
106+
"color_map = {\"bug\": \"purple\", \"skull\": \"red\"}\n",
107+
"\n",
108+
"# Create a sorted table DataFrame with the desired columns\n",
109+
"table_df = df[\n",
110+
" [\"ecosystem\", \"icon\", \"total_affected\", \"peak_attack_year\", \"trend_data\"]\n",
111+
"].sort_values([\"total_affected\", \"ecosystem\"], ascending=[False, True])\n",
112+
"\n",
113+
"# Generate the Great Table\n",
114+
"gt_table = (\n",
115+
" GT(table_df)\n",
116+
" .tab_header(\n",
117+
" title=md(\"**OSV Security Trends**\"),\n",
118+
" subtitle=md(\n",
119+
" \"_Malicious Code & Vulnerability Insights Across Software Supply Chains_\"\n",
120+
" ),\n",
121+
" )\n",
122+
" .tab_stub(rowname_col=\"ecosystem\")\n",
123+
" .tab_stubhead(label=\"Ecosystem\")\n",
124+
" .tab_source_note(source_note=md(\"*Year Trends from 2014-2024*\"))\n",
125+
" .tab_source_note(\n",
126+
" source_note=md(\n",
127+
" \"Data sourced from [OSV.dev](https://osv.dev) (Open Source Vulnerability) and analyzed for vulnerability & malicious code trends. Covers PyPI, npm, Maven, Go, RubyGems, NuGet, Packagist, Pub, CRAN, Hackage, Hex, and crates.io. Last updated: February 2025.\"\n",
128+
" )\n",
129+
" )\n",
130+
" .tab_source_note(\n",
131+
" source_note=md(\"**Legend:** Bug = Vulnerability | Skull = Malicious Code\")\n",
132+
" )\n",
133+
" .tab_stubhead(label=\"Ecosystem\")\n",
134+
" .cols_label(\n",
135+
" ecosystem=\"Ecosystem\",\n",
136+
" icon=\"Type\",\n",
137+
" total_affected=\"Total\",\n",
138+
" peak_attack_year=\"Peak\",\n",
139+
" trend_data=\"Year Trend\",\n",
140+
" )\n",
141+
" .fmt_nanoplot(\n",
142+
" \"trend_data\",\n",
143+
" plot_type=\"bar\",\n",
144+
" reference_line=\"mean\",\n",
145+
" options=nanoplot_options(\n",
146+
" data_bar_stroke_color=\"black\",\n",
147+
" data_bar_stroke_width=2,\n",
148+
" data_bar_fill_color=\"darkgray\",\n",
149+
" reference_line_color=\"pink\",\n",
150+
" ),\n",
151+
" )\n",
152+
" .fmt_number(columns=\"total_affected\", sep_mark=\",\", decimals=0)\n",
153+
" .cols_align(align=\"left\", columns=[\"ecosystem\"])\n",
154+
" .cols_align(\n",
155+
" align=\"center\",\n",
156+
" columns=[\"icon\", \"total_affected\", \"peak_attack_year\", \"trend_data\"],\n",
157+
" )\n",
158+
" .fmt_icon(columns=\"icon\", fill_color=color_map)\n",
159+
")\n",
160+
"\n",
161+
"\n",
162+
"# Generate the raw HTML from the table\n",
163+
"html_output = gt_table.as_raw_html()\n",
164+
"\n",
165+
"# Save it to an HTML file\n",
166+
"with open(\n",
167+
" \"../../data/osv/processed/osv_security_trends.html\", \"w\", encoding=\"utf-8\"\n",
168+
") as f:\n",
169+
" f.write(html_output)\n",
170+
"\n",
171+
"# Display the table\n",
172+
"gt_table"
173+
]
174+
}
175+
],
176+
"metadata": {
177+
"kernelspec": {
178+
"display_name": "Python 3 (ipykernel)",
179+
"language": "python",
180+
"name": "python3"
181+
},
182+
"language_info": {
183+
"name": "plaintext"
184+
}
185+
},
186+
"nbformat": 4,
187+
"nbformat_minor": 2
188+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
import requests
2+
import zipfile
3+
import os
4+
from pathlib import Path
5+
import logging
6+
7+
# Set up basic logging
8+
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
9+
10+
11+
def create_directory(path: str) -> None:
    """
    Make sure ``path`` exists as a directory.

    Missing parent directories are created too, and an already existing
    directory is not an error. The path is logged at INFO level afterwards.
    """
    target = Path(path)
    target.mkdir(parents=True, exist_ok=True)
    logging.info("Directory ensured: %s", target)
18+
19+
20+
def download_file(url: str, local_path: str, timeout: float = 60.0) -> None:
    """
    Download a file from the specified URL to a local path.

    The response body is streamed to ``local_path`` in 8 KiB chunks so the
    whole file is never held in memory.

    Args:
        url: Address of the file to fetch.
        local_path: Destination file; opened in binary write mode, so an
            existing file is overwritten.
        timeout: Seconds to wait for the server (connecting, and between
            received bytes) before giving up. Without a timeout ``requests``
            waits indefinitely, so a stalled server would hang the script.

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts, or an HTTP error status (via ``raise_for_status``).
    """
    headers = {
        "User-Agent": "TypeError/vuln-data-science (https://github.com/TypeError/vuln-data-science)"
    }
    with requests.get(
        url, stream=True, headers=headers, timeout=timeout
    ) as response:
        response.raise_for_status()
        with open(local_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                # Skip empty chunks instead of issuing zero-length writes.
                if chunk:
                    f.write(chunk)
    logging.info("Downloaded: %s", local_path)
34+
35+
36+
def extract_zip(zip_path: str, extract_to: str) -> None:
    """
    Unpack the archive at ``zip_path`` into ``extract_to``, then delete the archive.

    A corrupt archive is reported at ERROR level and the ``zipfile.BadZipFile``
    is re-raised; in that case the archive file is left in place.
    """
    try:
        with zipfile.ZipFile(zip_path, "r") as archive:
            archive.extractall(extract_to)
    except zipfile.BadZipFile as exc:
        logging.error("Bad ZIP file %s: %s", zip_path, exc)
        raise

    # Only reached after a successful extraction.
    os.remove(zip_path)
    logging.info("Extracted to: %s and removed ZIP file.", extract_to)
48+
49+
50+
def download_and_extract_osv(ecosystem: str, base_dir: str = "./data") -> None:
    """
    Fetch the OSV ``all.zip`` archive for ``ecosystem`` and unpack it.

    The archive is downloaded to ``<base_dir>/<ecosystem>/all.zip`` and
    extracted into that same directory, which is created first if needed.
    """
    ecosystem_dir = Path(base_dir) / ecosystem
    archive_path = ecosystem_dir / "all.zip"
    archive_url = (
        f"https://osv-vulnerabilities.storage.googleapis.com/{ecosystem}/all.zip"
    )

    create_directory(ecosystem_dir)
    download_file(archive_url, str(archive_path))
    extract_zip(str(archive_path), str(ecosystem_dir))
62+
63+
64+
if __name__ == "__main__":
    # Root directory that receives one sub-directory per ecosystem.
    base_data_dir = "./data/osv/raw"

    # Ecosystem names are interpolated into the download URL as-is.
    ecosystems = [
        "CRAN",  # R packages
        "crates.io",  # Rust packages
        "Go",  # Go modules
        "Hackage",  # Haskell packages
        "Hex",  # Elixir/Erlang packages
        "Maven",  # Java packages
        "npm",  # JavaScript/Node.js packages
        "NuGet",  # .NET packages
        "Packagist",  # PHP packages
        "Pub",  # Dart packages
        "PyPI",  # Python packages
        "RubyGems",  # Ruby packages
    ]

    create_directory(base_data_dir)

    # A failure for one ecosystem is logged and the loop moves on to the next.
    for ecosystem in ecosystems:
        try:
            download_and_extract_osv(ecosystem, base_data_dir)
        except requests.exceptions.RequestException as exc:
            logging.error("Error downloading %s: %s", ecosystem, exc)
        except zipfile.BadZipFile as exc:
            logging.error("Error extracting %s: %s", ecosystem, exc)
        except Exception as exc:
            logging.error("Unexpected error for %s: %s", ecosystem, exc)

0 commit comments

Comments
 (0)