5 changes: 4 additions & 1 deletion .gitignore
@@ -4,4 +4,7 @@ runpod.toml
 .env
 test/*
 vllm-base/vllm-*
-.DS_Store
+.DS_Store
+build/
+*.lock
+*.egg-info
52 changes: 52 additions & 0 deletions Dockerfile.custom
@@ -0,0 +1,52 @@
FROM nvidia/cuda:12.1.0-base-ubuntu22.04

RUN apt-get update -y \
    && apt-get install -y python3-pip

RUN ldconfig /usr/local/cuda-12.1/compat/

# Install Python dependencies
COPY builder/requirements.txt /requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install --upgrade pip && \
    python3 -m pip install --upgrade -r /requirements.txt

# Install vLLM and FlashInfer. Back to plain pip installs: the issues that required building a fork are fixed, and image-size optimization matters less now that the pip cache is mounted.
RUN python3 -m pip install vllm==0.10.0 && \
    python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3

RUN pip install --extra-index-url https://miropsota.github.io/torch_packages_builder flash-attn==2.8.3+pt2.7.0cu126

# Setup for Option 2: Building the Image with the Model included
ARG MODEL_NAME=""
ARG TOKENIZER_NAME=""
ARG BASE_PATH="/runpod-volume"
ARG QUANTIZATION=""
ARG MODEL_REVISION=""
ARG TOKENIZER_REVISION=""

ENV MODEL_NAME=$MODEL_NAME \
    MODEL_REVISION=$MODEL_REVISION \
    TOKENIZER_NAME=$TOKENIZER_NAME \
    TOKENIZER_REVISION=$TOKENIZER_REVISION \
    BASE_PATH=$BASE_PATH \
    QUANTIZATION=$QUANTIZATION \
    HF_DATASETS_CACHE="${BASE_PATH}/huggingface-cache/datasets" \
    HUGGINGFACE_HUB_CACHE="${BASE_PATH}/huggingface-cache/hub" \
    HF_HOME="${BASE_PATH}/huggingface-cache/hub" \
    HF_HUB_ENABLE_HF_TRANSFER=0

ENV PYTHONPATH="/:/vllm-workspace"


COPY src /src
RUN --mount=type=secret,id=HF_TOKEN,required=false \
    if [ -f /run/secrets/HF_TOKEN ]; then \
        export HF_TOKEN=$(cat /run/secrets/HF_TOKEN); \
    fi && \
    if [ -n "$MODEL_NAME" ]; then \
        python3 /src/download_model.py; \
    fi

# Start the handler
CMD ["python3", "/src/handler_custom.py"]
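
For orientation, a minimal sketch (not part of the PR) of what the ENV block above buys: with HUGGINGFACE_HUB_CACHE pointing at the network volume, huggingface_hub writes model snapshots there instead of ~/.cache. The repo id is illustrative; the variable must be set before huggingface_hub is imported, since its cache path is resolved at import time.

import os
os.environ["HUGGINGFACE_HUB_CACHE"] = "/runpod-volume/huggingface-cache/hub"

from huggingface_hub import snapshot_download

local_path = snapshot_download(
    repo_id="facebook/opt-125m",  # illustrative repo id, not from the PR
)
print(local_path)  # .../huggingface-cache/hub/models--facebook--opt-125m/snapshots/<sha>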
1 change: 1 addition & 0 deletions VERSION
@@ -0,0 +1 @@
1.0.0
69 changes: 69 additions & 0 deletions pyproject.toml
@@ -0,0 +1,69 @@
[build-system]
requires = ["setuptools>=77.0.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "vllm-worker"
dynamic = ["version"]
description = "OpenAI-compatible vLLM worker for serverless inference. Forked from https://github.com/runpod-workers/worker-vllm"
readme = "README.md"
requires-python = ">=3.10"
license = "MIT"
authors = [
    {name = "Arief Wijaya", email = "[email protected]"}
]
keywords = ["vllm", "llm", "inference", "openai", "serverless"]
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
]

dependencies = [
    "ray",
    "pandas",
    "pyarrow",
    "runpod~=1.7.7",
    "huggingface-hub",
    "packaging",
    "typing-extensions>=4.8.0",
    "pydantic",
    "pydantic-settings",
    "hf-transfer",
    "transformers>=4.51.3",
    "bitsandbytes>=0.45.0",
    "kernels",
    "torch==2.6.0",
]

[project.optional-dependencies]
dev = [
    "pytest",
    "pytest-asyncio",
    "black",
    "flake8",
    "mypy",
]

[project.urls]
Homepage = "https://github.com/ariefwijaya/worker-vllm"
Repository = "https://github.com/ariefwijaya/worker-vllm"
Documentation = "https://github.com/ariefwijaya/worker-vllm/blob/main/README.md"

[tool.setuptools]
package-dir = {"vllm_worker" = "src"}
packages = ["vllm_worker"]

[tool.setuptools.dynamic]
version = {file = "VERSION"}

[tool.black]
line-length = 100
target-version = ['py310', 'py311', 'py312']

[tool.pytest.ini_options]
testpaths = ["tests"]
asyncio_mode = "auto"
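
The dynamic version above is read from the VERSION file at build time and exposed through package metadata once installed. A quick sketch of the resulting lookup order, mirroring what src/__init__.py does below:

from importlib.metadata import version, PackageNotFoundError

try:
    v = version("vllm-worker")   # installed: resolved from package metadata
except PackageNotFoundError:
    with open("VERSION") as f:   # source checkout: read the file directly
        v = f.read().strip()
print(v)  # "1.0.0"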
37 changes: 37 additions & 0 deletions src/__init__.py
@@ -0,0 +1,37 @@
"""
vLLM Worker - OpenAI-compatible vLLM inference engine

Usage:
from vllm_worker import vLLMEngine, OpenAIvLLMEngine, JobInput
"""

try:
    from importlib.metadata import version, PackageNotFoundError
except ImportError:
    # Python < 3.8
    from importlib_metadata import version, PackageNotFoundError

try:
    __version__ = version("vllm-worker")
except PackageNotFoundError:
    # Package is not installed; fall back to reading the VERSION file
    from pathlib import Path
    _version_file = Path(__file__).parent.parent / "VERSION"
    __version__ = _version_file.read_text().strip()

# Import main classes for easy access
from .engine import vLLMEngine, OpenAIvLLMEngine
from .utils import JobInput, DummyRequest, BatchSize, create_error_response
from .tokenizer import TokenizerWrapper
from .engine_args import get_engine_args

__all__ = [
    "vLLMEngine",
    "OpenAIvLLMEngine",
    "JobInput",
    "DummyRequest",
    "BatchSize",
    "TokenizerWrapper",
    "get_engine_args",
    "create_error_response",
]
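
A usage sketch of the import surface the docstring advertises, wired up the same way handler.py does below (assumes an installed package, a GPU, and engine configuration via env vars such as MODEL_NAME):

from vllm_worker import vLLMEngine, OpenAIvLLMEngine

vllm_engine = vLLMEngine()                     # engine args come from env vars
openai_engine = OpenAIvLLMEngine(vllm_engine)  # OpenAI-compatible layer on top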
8 changes: 7 additions & 1 deletion src/download_model.py
@@ -4,7 +4,13 @@
 import glob
 from shutil import rmtree
 from huggingface_hub import snapshot_download
-from utils import timer_decorator
+
+try:
+    # Try relative imports (when installed as package)
+    from .utils import timer_decorator
+except ImportError:
+    # Fall back to absolute imports (when running directly)
+    from utils import timer_decorator
 
 BASE_DIR = "/"
 TOKENIZER_PATTERNS = [["*.json", "tokenizer*"]]
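
The rest of this file is collapsed in the diff. Plausibly, TOKENIZER_PATTERNS is fed to snapshot_download as allow_patterns for tokenizer-only downloads — a hedged sketch of that pattern, not the file's verbatim code:

from huggingface_hub import snapshot_download

tokenizer_path = snapshot_download(
    repo_id="facebook/opt-125m",              # illustrative repo id
    allow_patterns=["*.json", "tokenizer*"],  # TOKENIZER_PATTERNS[0]
)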
10 changes: 5 additions & 5 deletions src/engine.py
@@ -15,10 +15,10 @@
 from vllm.entrypoints.openai.serving_models import BaseModelPath, LoRAModulePath, OpenAIServingModels
 
 
-from utils import DummyRequest, JobInput, BatchSize, create_error_response
-from constants import DEFAULT_MAX_CONCURRENCY, DEFAULT_BATCH_SIZE, DEFAULT_BATCH_SIZE_GROWTH_FACTOR, DEFAULT_MIN_BATCH_SIZE
-from tokenizer import TokenizerWrapper
-from engine_args import get_engine_args
+from .utils import DummyRequest, JobInput, BatchSize, create_error_response
+from .constants import DEFAULT_MAX_CONCURRENCY, DEFAULT_BATCH_SIZE, DEFAULT_BATCH_SIZE_GROWTH_FACTOR, DEFAULT_MIN_BATCH_SIZE
+from .tokenizer import TokenizerWrapper
+from .engine_args import get_engine_args
 
 class vLLMEngine:
     def __init__(self, engine = None):
@@ -204,7 +204,7 @@ def _load_lora_adapters(self):
     async def _initialize_engines(self):
         self.model_config = await self.llm.get_model_config()
         self.base_model_paths = [
-            BaseModelPath(name=self.engine_args.model, model_path=self.engine_args.model)
+            BaseModelPath(name=self.served_model_name, model_path=self.engine_args.model)
         ]
 
         self.serving_models = OpenAIServingModels(
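
The BaseModelPath change means the OpenAI-compatible layer now advertises and matches the served model name rather than the raw model path. Illustratively, a client would address the worker as in the sketch below; base_url, api_key, and the model name are placeholders, only the name-matching behavior is the point:

from openai import OpenAI

client = OpenAI(base_url="https://api.example.com/v1", api_key="sk-placeholder")
resp = client.chat.completions.create(
    model="my-served-name",  # must equal served_model_name, not engine_args.model
    messages=[{"role": "user", "content": "ping"}],
)
print(resp.choices[0].message.content)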
8 changes: 7 additions & 1 deletion src/engine_args.py
@@ -4,7 +4,13 @@
 from torch.cuda import device_count
 from vllm import AsyncEngineArgs
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
-from src.utils import convert_limit_mm_per_prompt
+
+try:
+    # Try relative imports (when installed as package)
+    from .utils import convert_limit_mm_per_prompt
+except ImportError:
+    # Fall back to absolute imports (when running directly)
+    from utils import convert_limit_mm_per_prompt
 
 RENAME_ARGS_MAP = {
     "MODEL_NAME": "model",
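
RENAME_ARGS_MAP translates worker env-var names to AsyncEngineArgs field names; the full map and get_engine_args are collapsed in this diff. A minimal sketch of the pattern (illustrative only — the real function handles many more keys plus type conversion):

import os

def env_to_engine_kwargs(rename_map: dict) -> dict:
    # Collect only the env vars that are actually set, renaming them to
    # the corresponding engine-arg names.
    kwargs = {}
    for env_key, arg_name in rename_map.items():
        value = os.environ.get(env_key)
        if value is not None:
            kwargs[arg_name] = value
    return kwargs

# e.g. AsyncEngineArgs(**env_to_engine_kwargs({"MODEL_NAME": "model"}))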
11 changes: 9 additions & 2 deletions src/handler.py
@@ -1,7 +1,14 @@
 import os
 import runpod
-from utils import JobInput
-from engine import vLLMEngine, OpenAIvLLMEngine
+
+try:
+    # Try relative imports (when installed as package)
+    from .utils import JobInput
+    from .engine import vLLMEngine, OpenAIvLLMEngine
+except ImportError:
+    # Fall back to absolute imports (when running directly)
+    from utils import JobInput
+    from engine import vLLMEngine, OpenAIvLLMEngine
 
 vllm_engine = vLLMEngine()
 OpenAIvLLMEngine = OpenAIvLLMEngine(vllm_engine)
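
The remainder of handler.py is collapsed in this diff. For orientation, the usual shape of a RunPod entrypoint built on these two engines is sketched below; the handler body and the openai_route attribute are assumptions based on the common worker pattern, not this file's verbatim contents:

async def handler(job):
    # Route OpenAI-style jobs to the OpenAI-compatible engine (assumed flag),
    # everything else to the raw vLLM engine, and stream results back.
    job_input = JobInput(job["input"])
    engine = OpenAIvLLMEngine if job_input.openai_route else vllm_engine
    async for batch in engine.generate(job_input):
        yield batch

runpod.serverless.start({
    "handler": handler,               # async generator handler
    "return_aggregate_stream": True,  # aggregate streamed chunks for /run
})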