Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 22 additions & 10 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -209,16 +209,7 @@ ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0

# Flag to control whether to use pre-built vLLM wheels
ARG VLLM_USE_PRECOMPILED
# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", this should be fixed
ENV VLLM_USE_PRECOMPILED=""
RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \
export VLLM_USE_PRECOMPILED=1 && \
echo "Using precompiled wheels"; \
else \
unset VLLM_USE_PRECOMPILED && \
echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \
fi
ARG VLLM_USE_PRECOMPILED=""

# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/uv \
Expand All @@ -235,6 +226,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
&& export SCCACHE_IDLE_TIMEOUT=0 \
&& export CMAKE_BUILD_TYPE=Release \
&& export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
&& export VLLM_DOCKER_BUILD_CONTEXT=1 \
&& sccache --show-stats \
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
&& sccache --show-stats; \
Expand All @@ -248,9 +241,22 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
# Clean any existing CMake artifacts
rm -rf .deps && \
mkdir -p .deps && \
export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi

# When using precompiled wheels, keep only the newest manylinux1 wheel and delete others
RUN if [ "$VLLM_USE_PRECOMPILED" = "1" ]; then \
echo "Cleaning up extra wheels in dist/..." && \
# Identify the most recent manylinux1_x86_64 wheel
KEEP_WHEEL=$(ls -t dist/*manylinux1_x86_64.whl 2>/dev/null | head -n1) && \
if [ -n "$KEEP_WHEEL" ]; then \
echo "Keeping wheel: $KEEP_WHEEL"; \
find dist/ -type f -name "*.whl" ! -path "${KEEP_WHEEL}" -delete; \
fi; \
fi

# Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py
Expand Down Expand Up @@ -370,9 +376,15 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--pre pytorch_triton==3.3.0+gitab727c40 ; \
fi

# List the wheels available from the build stage before installing (debug aid).
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/uv \
ls -l dist/

# Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/uv \
ls -l dist/ && \
uv pip install --system dist/*.whl --verbose \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

Expand Down
74 changes: 52 additions & 22 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import re
import subprocess
import sys
import shutil
from pathlib import Path
from shutil import which

Expand Down Expand Up @@ -297,6 +298,10 @@
]).decode("utf-8")
upstream_main_commit = json.loads(resp_json)["sha"]

# In Docker build context, .git may be immutable or missing.
if envs.VLLM_DOCKER_BUILD_CONTEXT:
return upstream_main_commit

# Check if the upstream_main_commit exists in the local repo
try:
subprocess.check_output(
Expand Down Expand Up @@ -357,19 +362,49 @@
# create a temporary directory to store the wheel
temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
wheel_path = os.path.join(temp_dir, wheel_filename)

print(f"Downloading wheel from {wheel_location} to {wheel_path}")

from urllib.request import urlretrieve

try:
urlretrieve(wheel_location, filename=wheel_path)
except Exception as e:
from setuptools.errors import SetupError

raise SetupError(
f"Failed to get vLLM wheel from {wheel_location}") from e
f"Failed to get vLLM wheel from {wheel_location}"
) from e

Check failure on line 374 in setup.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (E501)

setup.py:374:81: E501 Line too long (87 > 80)
# Docker build shortcut: make sure dist/ exists, derive the wheel
# filename from the wheel's METADATA, then copy the wheel there.
if envs.VLLM_DOCKER_BUILD_CONTEXT:
dist_dir = "/workspace/dist"
os.makedirs(dist_dir, exist_ok=True)
# Determine correct wheel filename from METADATA
with zipfile.ZipFile(wheel_path, "r") as z:
metadata_file = next(
(n for n in z.namelist() if n.endswith(".dist-info/METADATA")),
None,
)
if not metadata_file:
raise RuntimeError("Could not find METADATA in precompiled wheel.")
metadata = z.read(metadata_file).decode()
version_line = next(
(l for l in metadata.splitlines() if l.startswith("Version: ")), None

Check failure on line 389 in setup.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (E741)

setup.py:389:44: E741 Ambiguous variable name: `l`
)
if not version_line:
raise RuntimeError("Could not determine version from METADATA.")
version = version_line.split(": ")[1].strip()

# Build correct filename using internal version
arch_tag = "cp38-abi3-manylinux1_x86_64"
corrected_wheel_name = f"vllm-{version}-{arch_tag}.whl"
final_wheel_path = os.path.join(dist_dir, corrected_wheel_name)

print(
f"Docker build context detected, copying precompiled wheel "
f"({version}) to {final_wheel_path}"
)
shutil.copy2(wheel_path, final_wheel_path)
return

# Unzip the wheel when not in Docker context
with zipfile.ZipFile(wheel_path) as wheel:
files_to_copy = [
"vllm/_C.abi3.so",
Expand All @@ -378,36 +413,28 @@
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
"vllm/cumem_allocator.abi3.so",
# "vllm/_version.py", # not available in nightly wheels yet
]

file_members = list(
filter(lambda x: x.filename in files_to_copy, wheel.filelist))

# vllm_flash_attn python code:
# Regex from
# `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
filter(lambda x: x.filename in files_to_copy, wheel.filelist)
)
compiled_regex = re.compile(
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
)
file_members += list(
filter(lambda x: compiled_regex.match(x.filename),
wheel.filelist))
filter(lambda x: compiled_regex.match(x.filename), wheel.filelist)
)

Check failure on line 426 in setup.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (E501)

setup.py:426:81: E501 Line too long (83 > 80)
for file in file_members:
print(f"Extracting and including {file.filename} "
"from existing wheel")
print(f"Extracting and including {file.filename} from existing wheel")
package_name = os.path.dirname(file.filename).replace("/", ".")
file_name = os.path.basename(file.filename)

if package_name not in package_data:
package_data[package_name] = []

wheel.extract(file)
if file_name.endswith(".py"):
# python files shouldn't be added to package_data
continue

package_data[package_name].append(file_name)
if not file_name.endswith(".py"):
package_data[package_name].append(file_name)


def _is_hpu() -> bool:
Expand Down Expand Up @@ -438,6 +465,9 @@


def _is_cuda() -> bool:
    """Report whether this build should be treated as a CUDA build.

    Returns True immediately when both ``envs.VLLM_USE_PRECOMPILED`` and
    ``envs.VLLM_DOCKER_BUILD_CONTEXT`` are set, without consulting
    ``torch.version.cuda``. Otherwise the target device must be ``"cuda"``,
    torch must report a CUDA version, and none of the Neuron / TPU / HPU
    checks may match.
    """
    # Forced CUDA for precompiled wheels inside a Docker build.
    if envs.VLLM_USE_PRECOMPILED and envs.VLLM_DOCKER_BUILD_CONTEXT:
        return True

    cuda_available = torch.version.cuda is not None
    if VLLM_TARGET_DEVICE != "cuda" or not cuda_available:
        return False

    # Any other accelerator backend takes precedence over CUDA.
    return not (_is_neuron() or _is_tpu() or _is_hpu())
Expand Down
10 changes: 8 additions & 2 deletions vllm/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
MAX_JOBS: Optional[str] = None
NVCC_THREADS: Optional[str] = None
VLLM_USE_PRECOMPILED: bool = False
VLLM_DOCKER_BUILD_CONTEXT: bool = False
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
VLLM_NO_DEPRECATION_WARNING: bool = False
VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
Expand Down Expand Up @@ -219,8 +220,13 @@

# If set, vllm will use precompiled binaries (*.so)
"VLLM_USE_PRECOMPILED":
lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool(
os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
lambda: os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in ("1", "true")
or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),

# Used to mark that setup.py is running in a Docker build context, in order to force

Check failure on line 226 in vllm/envs.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (E501)

vllm/envs.py:226:81: E501 Line too long (88 > 80)
# the use of precompiled binaries.
"VLLM_DOCKER_BUILD_CONTEXT":
lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "").strip().lower() in ("1", "true"),

# Whether to force using nightly wheel in python build.
# This is used for testing the nightly wheel in python build.
Expand Down
Loading