5 changes: 4 additions & 1 deletion .gitignore
@@ -4,4 +4,7 @@ runpod.toml
 .env
 test/*
 vllm-base/vllm-*
-.DS_Store
+.DS_Store
+build/
+*.lock
+*.egg-info
52 changes: 52 additions & 0 deletions Dockerfile.custom
@@ -0,0 +1,52 @@
FROM nvidia/cuda:12.1.0-base-ubuntu22.04

RUN apt-get update -y \
    && apt-get install -y python3-pip

RUN ldconfig /usr/local/cuda-12.1/compat/

# Install Python dependencies
COPY builder/requirements.txt /requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install --upgrade pip && \
    python3 -m pip install --upgrade -r /requirements.txt

# Install vLLM and FlashInfer. Back to plain pip installs: the issues that required building a fork are fixed, and image-size optimization matters less now that the pip cache is mounted.
RUN python3 -m pip install vllm==0.10.0 && \
    python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3

RUN pip install --extra-index-url https://miropsota.github.io/torch_packages_builder flash-attn==2.8.3+pt2.7.0cu126

# Setup for Option 2: Building the Image with the Model included
ARG MODEL_NAME=""
ARG TOKENIZER_NAME=""
ARG BASE_PATH="/runpod-volume"
ARG QUANTIZATION=""
ARG MODEL_REVISION=""
ARG TOKENIZER_REVISION=""

ENV MODEL_NAME=$MODEL_NAME \
    MODEL_REVISION=$MODEL_REVISION \
    TOKENIZER_NAME=$TOKENIZER_NAME \
    TOKENIZER_REVISION=$TOKENIZER_REVISION \
    BASE_PATH=$BASE_PATH \
    QUANTIZATION=$QUANTIZATION \
    HF_DATASETS_CACHE="${BASE_PATH}/huggingface-cache/datasets" \
    HUGGINGFACE_HUB_CACHE="${BASE_PATH}/huggingface-cache/hub" \
    HF_HOME="${BASE_PATH}/huggingface-cache/hub" \
    HF_HUB_ENABLE_HF_TRANSFER=0

ENV PYTHONPATH="/:/vllm-workspace"


COPY src /src
RUN --mount=type=secret,id=HF_TOKEN,required=false \
    if [ -f /run/secrets/HF_TOKEN ]; then \
        export HF_TOKEN=$(cat /run/secrets/HF_TOKEN); \
    fi && \
    if [ -n "$MODEL_NAME" ]; then \
        python3 /src/download_model.py; \
    fi

# Start the handler
CMD ["python3", "/src/handler_custom.py"]
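
For orientation, a minimal sketch (not part of the PR) of what the ENV block above buys: with HUGGINGFACE_HUB_CACHE pointing at the network volume, huggingface_hub writes model snapshots there instead of ~/.cache. The repo id is illustrative; the variable must be set before huggingface_hub is imported, since its cache path is resolved at import time.

import os
os.environ["HUGGINGFACE_HUB_CACHE"] = "/runpod-volume/huggingface-cache/hub"

from huggingface_hub import snapshot_download

local_path = snapshot_download(
    repo_id="facebook/opt-125m",  # illustrative repo id, not from the PR
)
print(local_path)  # .../huggingface-cache/hub/models--facebook--opt-125m/snapshots/<sha>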
1 change: 1 addition & 0 deletions VERSION
@@ -0,0 +1 @@
1.0.0
69 changes: 69 additions & 0 deletions pyproject.toml
@@ -0,0 +1,69 @@
[build-system]
requires = ["setuptools>=77.0.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "vllm-worker"
dynamic = ["version"]
description = "OpenAI-compatible vLLM worker for serverless inference. Forked from https://github.com/runpod-workers/worker-vllm"
readme = "README.md"
requires-python = ">=3.10"
license = "MIT"
authors = [
    {name = "Arief Wijaya", email = "[email protected]"}
]
keywords = ["vllm", "llm", "inference", "openai", "serverless"]
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
]

dependencies = [
    "ray",
    "pandas",
    "pyarrow",
    "runpod~=1.7.7",
    "huggingface-hub",
    "packaging",
    "typing-extensions>=4.8.0",
    "pydantic",
    "pydantic-settings",
    "hf-transfer",
    "transformers>=4.51.3",
    "bitsandbytes>=0.45.0",
    "kernels",
    "torch==2.6.0",
]

[project.optional-dependencies]
dev = [
    "pytest",
    "pytest-asyncio",
    "black",
    "flake8",
    "mypy",
]

[project.urls]
Homepage = "https://github.com/ariefwijaya/worker-vllm"
Repository = "https://github.com/ariefwijaya/worker-vllm"
Documentation = "https://github.com/ariefwijaya/worker-vllm/blob/main/README.md"

[tool.setuptools]
package-dir = {"vllm_worker" = "src"}
packages = ["vllm_worker"]

[tool.setuptools.dynamic]
version = {file = "VERSION"}

[tool.black]
line-length = 100
target-version = ['py310', 'py311', 'py312']

[tool.pytest.ini_options]
testpaths = ["tests"]
asyncio_mode = "auto"
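
The dynamic version above is read from the VERSION file at build time and exposed through package metadata once installed. A quick sketch of the resulting lookup order, mirroring what src/__init__.py does below:

from importlib.metadata import version, PackageNotFoundError

try:
    v = version("vllm-worker")   # installed: resolved from package metadata
except PackageNotFoundError:
    with open("VERSION") as f:   # source checkout: read the file directly
        v = f.read().strip()
print(v)  # "1.0.0"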
37 changes: 37 additions & 0 deletions src/__init__.py
@@ -0,0 +1,37 @@
"""
vLLM Worker - OpenAI-compatible vLLM inference engine

Usage:
from vllm_worker import vLLMEngine, OpenAIvLLMEngine, JobInput
"""

try:
    from importlib.metadata import version, PackageNotFoundError
except ImportError:
    # Python < 3.8
    from importlib_metadata import version, PackageNotFoundError

try:
    __version__ = version("vllm-worker")
except PackageNotFoundError:
    # Package is not installed; fall back to reading the VERSION file
    from pathlib import Path
    _version_file = Path(__file__).parent.parent / "VERSION"
    __version__ = _version_file.read_text().strip()

# Import main classes for easy access
from .engine import vLLMEngine, OpenAIvLLMEngine
from .utils import JobInput, DummyRequest, BatchSize, create_error_response
from .tokenizer import TokenizerWrapper
from .engine_args import get_engine_args

__all__ = [
    "vLLMEngine",
    "OpenAIvLLMEngine",
    "JobInput",
    "DummyRequest",
    "BatchSize",
    "TokenizerWrapper",
    "get_engine_args",
    "create_error_response",
]
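
A usage sketch of the import surface the docstring advertises, wired up the same way handler.py does below (assumes an installed package, a GPU, and engine configuration via env vars such as MODEL_NAME):

from vllm_worker import vLLMEngine, OpenAIvLLMEngine

vllm_engine = vLLMEngine()                     # engine args come from env vars
openai_engine = OpenAIvLLMEngine(vllm_engine)  # OpenAI-compatible layer on top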
8 changes: 7 additions & 1 deletion src/download_model.py
@@ -4,7 +4,13 @@
 import glob
 from shutil import rmtree
 from huggingface_hub import snapshot_download
-from utils import timer_decorator
+
+try:
+    # Try relative imports (when installed as package)
+    from .utils import timer_decorator
+except ImportError:
+    # Fall back to absolute imports (when running directly)
+    from utils import timer_decorator
 
 BASE_DIR = "/"
 TOKENIZER_PATTERNS = [["*.json", "tokenizer*"]]
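
The rest of this file is collapsed in the diff. Plausibly, TOKENIZER_PATTERNS is fed to snapshot_download as allow_patterns for tokenizer-only downloads — a hedged sketch of that pattern, not the file's verbatim code:

from huggingface_hub import snapshot_download

tokenizer_path = snapshot_download(
    repo_id="facebook/opt-125m",              # illustrative repo id
    allow_patterns=["*.json", "tokenizer*"],  # TOKENIZER_PATTERNS[0]
)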
10 changes: 5 additions & 5 deletions src/engine.py
@@ -15,10 +15,10 @@
 from vllm.entrypoints.openai.serving_models import BaseModelPath, LoRAModulePath, OpenAIServingModels
 
 
-from utils import DummyRequest, JobInput, BatchSize, create_error_response
-from constants import DEFAULT_MAX_CONCURRENCY, DEFAULT_BATCH_SIZE, DEFAULT_BATCH_SIZE_GROWTH_FACTOR, DEFAULT_MIN_BATCH_SIZE
-from tokenizer import TokenizerWrapper
-from engine_args import get_engine_args
+from .utils import DummyRequest, JobInput, BatchSize, create_error_response
+from .constants import DEFAULT_MAX_CONCURRENCY, DEFAULT_BATCH_SIZE, DEFAULT_BATCH_SIZE_GROWTH_FACTOR, DEFAULT_MIN_BATCH_SIZE
+from .tokenizer import TokenizerWrapper
+from .engine_args import get_engine_args
 
 class vLLMEngine:
     def __init__(self, engine = None):
@@ -204,7 +204,7 @@ def _load_lora_adapters(self):
     async def _initialize_engines(self):
         self.model_config = await self.llm.get_model_config()
         self.base_model_paths = [
-            BaseModelPath(name=self.engine_args.model, model_path=self.engine_args.model)
+            BaseModelPath(name=self.served_model_name, model_path=self.engine_args.model)
         ]
 
         self.serving_models = OpenAIServingModels(
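
The BaseModelPath change means the OpenAI-compatible layer now advertises and matches the served model name rather than the raw model path. Illustratively, a client would address the worker as in the sketch below; base_url, api_key, and the model name are placeholders, only the name-matching behavior is the point:

from openai import OpenAI

client = OpenAI(base_url="https://api.example.com/v1", api_key="sk-placeholder")
resp = client.chat.completions.create(
    model="my-served-name",  # must equal served_model_name, not engine_args.model
    messages=[{"role": "user", "content": "ping"}],
)
print(resp.choices[0].message.content)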
8 changes: 7 additions & 1 deletion src/engine_args.py
@@ -4,7 +4,13 @@
 from torch.cuda import device_count
 from vllm import AsyncEngineArgs
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
-from src.utils import convert_limit_mm_per_prompt
+
+try:
+    # Try relative imports (when installed as package)
+    from .utils import convert_limit_mm_per_prompt
+except ImportError:
+    # Fall back to absolute imports (when running directly)
+    from utils import convert_limit_mm_per_prompt
 
 RENAME_ARGS_MAP = {
     "MODEL_NAME": "model",
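
RENAME_ARGS_MAP translates worker env-var names to AsyncEngineArgs field names; the full map and get_engine_args are collapsed in this diff. A minimal sketch of the pattern (illustrative only — the real function handles many more keys plus type conversion):

import os

def env_to_engine_kwargs(rename_map: dict) -> dict:
    # Collect only the env vars that are actually set, renaming them to
    # the corresponding engine-arg names.
    kwargs = {}
    for env_key, arg_name in rename_map.items():
        value = os.environ.get(env_key)
        if value is not None:
            kwargs[arg_name] = value
    return kwargs

# e.g. AsyncEngineArgs(**env_to_engine_kwargs({"MODEL_NAME": "model"}))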
11 changes: 9 additions & 2 deletions src/handler.py
@@ -1,7 +1,14 @@
 import os
 import runpod
-from utils import JobInput
-from engine import vLLMEngine, OpenAIvLLMEngine
+
+try:
+    # Try relative imports (when installed as package)
+    from .utils import JobInput
+    from .engine import vLLMEngine, OpenAIvLLMEngine
+except ImportError:
+    # Fall back to absolute imports (when running directly)
+    from utils import JobInput
+    from engine import vLLMEngine, OpenAIvLLMEngine
 
 vllm_engine = vLLMEngine()
 OpenAIvLLMEngine = OpenAIvLLMEngine(vllm_engine)
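
The remainder of handler.py is collapsed in this diff. For orientation, the usual shape of a RunPod entrypoint built on these two engines is sketched below; the handler body and the openai_route attribute are assumptions based on the common worker pattern, not this file's verbatim contents:

async def handler(job):
    # Route OpenAI-style jobs to the OpenAI-compatible engine (assumed flag),
    # everything else to the raw vLLM engine, and stream results back.
    job_input = JobInput(job["input"])
    engine = OpenAIvLLMEngine if job_input.openai_route else vllm_engine
    async for batch in engine.generate(job_input):
        yield batch

runpod.serverless.start({
    "handler": handler,               # async generator handler
    "return_aggregate_stream": True,  # aggregate streamed chunks for /run
})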