Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ dependencies = [
]

[project.optional-dependencies]
model_scraping = ["requests>=2.31.0", "huggingface_hub>=0.20.0"]

vllm = ["vllm>=0.4.0"]

dev = [
Expand Down
71 changes: 71 additions & 0 deletions scripts/closed_source_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import requests
import json
from pathlib import Path


def main():
    """Fetch the OpenRouter model catalog and write every eligible
    text-in/text-out generative model to configs/openrouter_llm_list.jsonl.

    Raises:
        requests.HTTPError: if the OpenRouter API returns a non-2xx status.
    """
    # Output path relative to the repository root (this file lives in scripts/).
    root = Path(__file__).resolve().parent.parent
    output_path = root / "configs" / "openrouter_llm_list.jsonl"

    url = "https://openrouter.ai/api/v1/models"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json()

    def is_generative_text_model(model):
        """Return True if the model both accepts and produces text.

        Conditions:
        - Output modalities must include "text".
        - Input modalities must include "text".
        """
        architecture = model.get("architecture", {})
        input_mods = architecture.get("input_modalities", [])
        output_mods = architecture.get("output_modalities", [])
        return "text" in output_mods and "text" in input_mods

    models = []
    for m in data["data"]:
        if not is_generative_text_model(m):
            continue

        # Human-readable modality string, e.g. "text|image -> text".
        architecture = m.get("architecture", {})
        input_mods = architecture.get("input_modalities", [])
        output_mods = architecture.get("output_modalities", [])
        modality = f"{'|'.join(input_mods)} -> {'|'.join(output_mods)}"

        models.append({
            "model_id": m["id"],
            "provider": "openrouter",
            # Closed-source providers do not expose architecture details.
            "architecture": "unknown",
            "context_length": m.get("context_length"),
            "modality": modality,
        })

    # Write one JSON object per line (JSONL). Create the target directory
    # if missing and pin the encoding so output is portable across platforms.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for m in models:
            f.write(json.dumps(m) + "\n")

    print(f"Number of eligible generative text models: {len(models)}")
    print("Sample models:")
    for m in models[:5]:
        print(f"  {m['model_id']} (modality: {m['modality']})")


if __name__ == "__main__":
    main()
57 changes: 57 additions & 0 deletions scripts/open_source_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from huggingface_hub import HfApi
import json
from pathlib import Path

# Number of models to collect before stopping the Hub listing.
TARGET = 12000


def main():
    """Fetch the most-downloaded generation models from the HuggingFace Hub
    and write them to configs/huggingface_llm_list.jsonl (one JSON per line).
    """
    # Output path relative to the repository root (this file lives in scripts/).
    root = Path(__file__).resolve().parent.parent
    output_path = root / "configs" / "huggingface_llm_list.jsonl"

    api = HfApi()

    def is_valid_model(m):
        """Keep only generation pipelines (a None pipeline_tag is dropped)."""
        return m.pipeline_tag in ("text-generation", "text2text-generation")

    print("Fetching models from HuggingFace...")

    # Sorted by download count — presumably descending (most popular first);
    # NOTE(review): confirm the default sort direction of HfApi.list_models.
    models = api.list_models(sort="downloads")

    selected = []
    for m in models:
        if not is_valid_model(m):
            continue

        selected.append({
            "model_id": m.modelId,
            "provider": "huggingface",
            "task": m.pipeline_tag,
            "downloads": m.downloads,
        })

        # list_models yields lazily; stop as soon as we have enough.
        if len(selected) >= TARGET:
            break

    print(f"Collected {len(selected)} models")

    # Create the target directory if missing and pin the encoding so the
    # output is portable across platforms.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for model in selected:
            f.write(json.dumps(model) + "\n")

    print(f"Saved to {output_path}")


if __name__ == "__main__":
    main()