
Commit 06ec486

Install flash_attn in Docker image (vllm-project#3396)

1 parent 8fe8386

File tree: 1 file changed (Dockerfile), +24 -0 lines

Diff for: Dockerfile

@@ -57,6 +57,22 @@ ENV VLLM_INSTALL_PUNICA_KERNELS=1
 RUN python3 setup.py build_ext --inplace
 #################### EXTENSION Build IMAGE ####################

+#################### FLASH_ATTENTION Build IMAGE ####################
+FROM dev as flash-attn-builder
+# max jobs used for build
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
+# flash attention version
+ARG flash_attn_version=v2.5.6
+ENV FLASH_ATTN_VERSION=${flash_attn_version}
+
+WORKDIR /usr/src/flash-attention-v2
+
+# Download the wheel or build it if a pre-compiled release doesn't exist
+RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
+    --no-build-isolation --no-deps --no-cache-dir
+
+#################### FLASH_ATTENTION Build IMAGE ####################

 #################### TEST IMAGE ####################
 # image to run unit testing suite
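The new stage exposes two build args, max_jobs and flash_attn_version, so the compile parallelism (flash-attn's build reads the MAX_JOBS environment variable to cap parallel jobs) and the pinned flash-attn release can both be changed without editing the Dockerfile. The pip wheel command fetches a matching pre-built wheel when one exists and falls back to compiling from source otherwise. A minimal sketch of overriding the args at build time; the image tag is hypothetical, and BuildKit must be enabled because later steps in this diff use RUN --mount:

    # Sketch only: the tag "vllm-custom" is not part of this commit;
    # max_jobs and flash_attn_version are the ARGs added above.
    DOCKER_BUILDKIT=1 docker build . \
        --build-arg max_jobs=8 \
        --build-arg flash_attn_version=v2.5.6 \
        -t vllm-custom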
@@ -68,6 +84,9 @@ WORKDIR /vllm-workspace
 # ADD is used to preserve directory structure
 ADD . /vllm-workspace/
 COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
+# Install flash attention (from pre-built wheel)
+RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
+    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
 # ignore build dependencies installation because we are using pre-compiled extensions
 RUN rm pyproject.toml
 RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
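Because the wheel arrives through a bind mount that exists only for the duration of that single RUN instruction, the test image gains the installed flash_attn package without baking the wheel file, or anything else from the builder stage, into one of its layers. A quick sanity check after building, with a hypothetical image tag standing in for whatever the test image is actually tagged as:

    # Sketch: confirm the pre-built wheel installed cleanly.
    # "vllm-test" is a placeholder tag, not defined by this commit.
    docker run --rm vllm-test \
        python3 -c "import flash_attn; print(flash_attn.__version__)"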
@@ -88,6 +107,11 @@ WORKDIR /workspace
 COPY requirements.txt requirements.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements.txt
+
+# Install flash attention (from pre-built wheel)
+RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
+    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
+
 #################### RUNTIME BASE IMAGE ####################
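The same bind-mount install is repeated for the runtime base image, so the wheel is compiled once in flash-attn-builder and consumed by both the test and runtime stages; changes to vLLM's own sources do not force a flash-attn rebuild. Since the mount vanishes after its RUN step, the wheel directory itself should be absent from the finished image, which can be spot-checked roughly as follows (again with a placeholder tag):

    # Sketch: the builder's wheel directory is mounted only during the
    # install step, so it should not exist in the final runtime image.
    # "vllm-runtime" is a hypothetical tag.
    docker run --rm vllm-runtime ls /usr/src/flash-attention-v2 || true
    # ls is expected to fail with "No such file or directory"; only the
    # installed flash_attn package remains in the image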
