Commit 2ac5db2

docker: precompiled wheel support + docker-aware setup tweaks

The main goal is to avoid building wheels in CI when they are unnecessary and to speed up CI builds overall.

- added VLLM_DOCKER_BUILD_CONTEXT to envs to skip the git + unzip logic in setup.py
- normalized VLLM_USE_PRECOMPILED: only "1" or "true" are treated as true
- setup.py now copies a contextually-named precompiled wheel into dist/ during docker builds
- smoother precompiled wheel flow in docker overall

Signed-off-by: dougbtv <[email protected]>
1 parent 9fb2d22 commit 2ac5db2

File tree

3 files changed (+82 lines, -34 lines):

- docker/Dockerfile
- setup.py
- vllm/envs.py

docker/Dockerfile

Lines changed: 22 additions & 10 deletions
@@ -209,16 +209,7 @@ ARG SCCACHE_REGION_NAME=us-west-2
 ARG SCCACHE_S3_NO_CREDENTIALS=0
 
 # Flag to control whether to use pre-built vLLM wheels
-ARG VLLM_USE_PRECOMPILED
-# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", this should be fixed
-ENV VLLM_USE_PRECOMPILED=""
-RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \
-        export VLLM_USE_PRECOMPILED=1 && \
-        echo "Using precompiled wheels"; \
-    else \
-        unset VLLM_USE_PRECOMPILED && \
-        echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \
-    fi
+ARG VLLM_USE_PRECOMPILED=""
 
 # if USE_SCCACHE is set, use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -235,6 +226,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
         && export SCCACHE_IDLE_TIMEOUT=0 \
         && export CMAKE_BUILD_TYPE=Release \
+        && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
+        && export VLLM_DOCKER_BUILD_CONTEXT=1 \
         && sccache --show-stats \
         && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
         && sccache --show-stats; \
@@ -248,9 +241,22 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
         # Clean any existing CMake artifacts
         rm -rf .deps && \
         mkdir -p .deps && \
+        export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
+        export VLLM_DOCKER_BUILD_CONTEXT=1 && \
         python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
     fi
 
+# When using precompiled wheels, keep only the newest manylinux1 wheel and delete others
+RUN if [ "$VLLM_USE_PRECOMPILED" = "1" ]; then \
+        echo "Cleaning up extra wheels in dist/..." && \
+        # Identify the most recent manylinux1_x86_64 wheel
+        KEEP_WHEEL=$(ls -t dist/*manylinux1_x86_64.whl 2>/dev/null | head -n1) && \
+        if [ -n "$KEEP_WHEEL" ]; then \
+            echo "Keeping wheel: $KEEP_WHEEL"; \
+            find dist/ -type f -name "*.whl" ! -path "${KEEP_WHEEL}" -delete; \
+        fi; \
+    fi
+
 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
 # sync the default value with .buildkite/check-wheel-size.py
@@ -370,9 +376,15 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         --pre pytorch_triton==3.3.0+gitab727c40 ; \
     fi
 
+# Install vllm wheel first, so that torch etc will be installed.
+RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
+    --mount=type=cache,target=/root/.cache/uv \
+    ls -l dist/
+
 # Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/uv \
+    ls -l dist/ && \
     uv pip install --system dist/*.whl --verbose \
         --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
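
Note: the new cleanup RUN step boils down to "find the newest manylinux1 wheel in dist/ and delete the rest". The following is only an illustrative Python sketch of that same behavior (the dist/ directory and the manylinux1_x86_64 tag are taken from the Dockerfile above; the helper name is hypothetical and does not exist in the repo):

    # Illustrative sketch only: the same "keep the newest manylinux1 wheel"
    # cleanup that the RUN step above performs in shell.
    import glob
    import os

    def keep_newest_precompiled_wheel(dist_dir: str = "dist") -> None:
        candidates = glob.glob(os.path.join(dist_dir, "*manylinux1_x86_64.whl"))
        if not candidates:
            return
        # Newest by modification time, matching `ls -t ... | head -n1`.
        keep = max(candidates, key=os.path.getmtime)
        print(f"Keeping wheel: {keep}")
        for whl in glob.glob(os.path.join(dist_dir, "*.whl")):
            if whl != keep:
                os.remove(whl)

Keeping a single wheel in dist/ matters because the later `uv pip install --system dist/*.whl` step would otherwise try to install every wheel left behind.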

setup.py

Lines changed: 52 additions & 22 deletions
@@ -9,6 +9,7 @@
 import re
 import subprocess
 import sys
+import shutil
 from pathlib import Path
 from shutil import which
 
@@ -297,6 +298,10 @@ def get_base_commit_in_main_branch(self) -> str:
            ]).decode("utf-8")
            upstream_main_commit = json.loads(resp_json)["sha"]
 
+            # In Docker build context, .git may be immutable or missing.
+            if envs.VLLM_DOCKER_BUILD_CONTEXT:
+                return upstream_main_commit
+
            # Check if the upstream_main_commit exists in the local repo
            try:
                subprocess.check_output(
@@ -357,19 +362,49 @@ def run(self) -> None:
         # create a temporary directory to store the wheel
         temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
         wheel_path = os.path.join(temp_dir, wheel_filename)
-
         print(f"Downloading wheel from {wheel_location} to {wheel_path}")
-
         from urllib.request import urlretrieve
-
         try:
             urlretrieve(wheel_location, filename=wheel_path)
         except Exception as e:
             from setuptools.errors import SetupError
-
             raise SetupError(
-                f"Failed to get vLLM wheel from {wheel_location}") from e
+                f"Failed to get vLLM wheel from {wheel_location}"
+            ) from e
+
+        # During a docker build shortcut: clean dist/, determine correct filename, copy
+        if envs.VLLM_DOCKER_BUILD_CONTEXT:
+            dist_dir = "/workspace/dist"
+            os.makedirs(dist_dir, exist_ok=True)
+            # Determine correct wheel filename from METADATA
+            with zipfile.ZipFile(wheel_path, "r") as z:
+                metadata_file = next(
+                    (n for n in z.namelist() if n.endswith(".dist-info/METADATA")),
+                    None,
+                )
+                if not metadata_file:
+                    raise RuntimeError("Could not find METADATA in precompiled wheel.")
+                metadata = z.read(metadata_file).decode()
+            version_line = next(
+                (l for l in metadata.splitlines() if l.startswith("Version: ")), None
+            )
+            if not version_line:
+                raise RuntimeError("Could not determine version from METADATA.")
+            version = version_line.split(": ")[1].strip()
+
+            # Build correct filename using internal version
+            arch_tag = "cp38-abi3-manylinux1_x86_64"
+            corrected_wheel_name = f"vllm-{version}-{arch_tag}.whl"
+            final_wheel_path = os.path.join(dist_dir, corrected_wheel_name)
+
+            print(
+                f"Docker build context detected, copying precompiled wheel "
+                f"({version}) to {final_wheel_path}"
+            )
+            shutil.copy2(wheel_path, final_wheel_path)
+            return
 
+        # Unzip the wheel when not in Docker context
         with zipfile.ZipFile(wheel_path) as wheel:
             files_to_copy = [
                 "vllm/_C.abi3.so",
@@ -378,36 +413,28 @@ def run(self) -> None:
                 "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
                 "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
                 "vllm/cumem_allocator.abi3.so",
-                # "vllm/_version.py", # not available in nightly wheels yet
             ]
-
             file_members = list(
-                filter(lambda x: x.filename in files_to_copy, wheel.filelist))
-
-            # vllm_flash_attn python code:
-            # Regex from
-            # `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
+                filter(lambda x: x.filename in files_to_copy, wheel.filelist)
+            )
             compiled_regex = re.compile(
-                r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
+                r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
+            )
             file_members += list(
-                filter(lambda x: compiled_regex.match(x.filename),
-                       wheel.filelist))
+                filter(lambda x: compiled_regex.match(x.filename), wheel.filelist)
+            )
 
             for file in file_members:
-                print(f"Extracting and including {file.filename} "
-                      "from existing wheel")
+                print(f"Extracting and including {file.filename} from existing wheel")
                 package_name = os.path.dirname(file.filename).replace("/", ".")
                 file_name = os.path.basename(file.filename)
 
                 if package_name not in package_data:
                     package_data[package_name] = []
 
                 wheel.extract(file)
-                if file_name.endswith(".py"):
-                    # python files shouldn't be added to package_data
-                    continue
-
-                package_data[package_name].append(file_name)
+                if not file_name.endswith(".py"):
+                    package_data[package_name].append(file_name)
 
 
 def _is_hpu() -> bool:
@@ -438,6 +465,9 @@ def _no_device() -> bool:
 
 
 def _is_cuda() -> bool:
+    # Allow forced CUDA in Docker/precompiled builds, even without torch.cuda
+    if envs.VLLM_USE_PRECOMPILED and envs.VLLM_DOCKER_BUILD_CONTEXT:
+        return True
     has_cuda = torch.version.cuda is not None
     return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
             and not (_is_neuron() or _is_tpu() or _is_hpu()))
vllm/envs.py

Lines changed: 8 additions & 2 deletions
@@ -67,6 +67,7 @@
     MAX_JOBS: Optional[str] = None
     NVCC_THREADS: Optional[str] = None
     VLLM_USE_PRECOMPILED: bool = False
+    VLLM_DOCKER_BUILD_CONTEXT: bool = False
     VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
     VLLM_NO_DEPRECATION_WARNING: bool = False
     VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
@@ -219,8 +220,13 @@ def get_vllm_port() -> Optional[int]:
 
     # If set, vllm will use precompiled binaries (*.so)
     "VLLM_USE_PRECOMPILED":
-    lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool(
-        os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
+    lambda: os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in ("1", "true")
+    or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
+
+    # Used to mark that setup.py is running in a Docker build context, in order to force
+    # the use of precompiled binaries.
+    "VLLM_DOCKER_BUILD_CONTEXT":
+    lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "").strip().lower() in ("1", "true"),
 
     # Whether to force using nightly wheel in python build.
     # This is used for testing the nightly wheel in python build.
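
Note: the practical effect of the normalization is what the removed Dockerfile TODO complained about: the old `bool(os.environ.get(...))` check treated any non-empty value, including "0", as true. A small sketch of the difference (`is_enabled` is an illustrative helper, not part of vllm/envs.py):

    from typing import Optional

    def is_enabled(raw: Optional[str]) -> bool:
        # New-style check: only "1" or "true" (case-insensitive, stripped) count.
        return (raw or "").strip().lower() in ("1", "true")

    assert is_enabled("1") and is_enabled("true") and is_enabled(" TRUE ")
    assert not is_enabled("0") and not is_enabled("") and not is_enabled(None)
    assert bool("0")  # old-style bool() check: a literal "0" was still truthy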
