Skip to content

Commit a6760f6

Browse files
sanketkaleoss (Sanket Kale) and mgoin
authored
[Feature] vLLM ARM Enablement for AARCH64 CPUs (vllm-project#9228)
Signed-off-by: Sanket Kale <[email protected]> Co-authored-by: Sanket Kale <[email protected]> Co-authored-by: mgoin <[email protected]>
1 parent 45ac4ff commit a6760f6

9 files changed

+678
-16
lines changed

Dockerfile.arm

+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platforms.
2+
3+
FROM ubuntu:22.04 AS cpu-test-arm
4+
5+
ENV CCACHE_DIR=/root/.cache/ccache
6+
7+
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
8+
9+
RUN --mount=type=cache,target=/var/cache/apt \
10+
apt-get update -y \
11+
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
12+
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
13+
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
14+
15+
# tcmalloc (installed above as libtcmalloc-minimal4 and preloaded via LD_PRELOAD below) provides better memory allocation efficiency, e.g., by holding memory in caches to speed up access to commonly used objects.
16+
RUN --mount=type=cache,target=/root/.cache/pip \
17+
pip install py-cpuinfo # Use this to gather CPU info and optimize based on ARM Neoverse cores
18+
19+
# Set LD_PRELOAD for tcmalloc on ARM
20+
ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"
21+
22+
RUN echo 'ulimit -c 0' >> ~/.bashrc
23+
24+
WORKDIR /workspace
25+
26+
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
27+
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
28+
RUN --mount=type=cache,target=/root/.cache/pip \
29+
--mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
30+
pip install --upgrade pip && \
31+
pip install -r requirements-build.txt
32+
33+
FROM cpu-test-arm AS build
34+
35+
WORKDIR /workspace/vllm
36+
37+
RUN --mount=type=cache,target=/root/.cache/pip \
38+
--mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
39+
--mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
40+
pip install -v -r requirements-cpu.txt
41+
42+
COPY . .
43+
ARG GIT_REPO_CHECK=0
44+
RUN --mount=type=bind,source=.git,target=.git \
45+
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
46+
47+
# Disable AVX512-specific optimizations on ARM
48+
ARG VLLM_CPU_DISABLE_AVX512="true"
49+
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
50+
51+
RUN --mount=type=cache,target=/root/.cache/pip \
52+
--mount=type=cache,target=/root/.cache/ccache \
53+
--mount=type=bind,source=.git,target=.git \
54+
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
55+
pip install dist/*.whl && \
56+
rm -rf dist
57+
58+
WORKDIR /workspace/
59+
60+
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
61+
62+
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

cmake/cpu_extension.cmake

+24-9
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,15 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc")
1616
#
1717
# Check the compile flags
1818
#
19-
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "ppc64le")
20-
list(APPEND CXX_COMPILE_FLAGS
21-
"-fopenmp"
22-
"-DVLLM_CPU_EXTENSION")
23-
else()
19+
20+
if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
2421
list(APPEND CXX_COMPILE_FLAGS
25-
"-fopenmp"
2622
"-mf16c"
27-
"-DVLLM_CPU_EXTENSION")
23+
)
2824
endif()
25+
list(APPEND CXX_COMPILE_FLAGS
26+
"-fopenmp"
27+
"-DVLLM_CPU_EXTENSION")
2928

3029
execute_process(COMMAND cat /proc/cpuinfo
3130
RESULT_VARIABLE CPUINFO_RET
@@ -59,6 +58,8 @@ find_isa(${CPUINFO} "avx2" AVX2_FOUND)
5958
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
6059
find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
6160
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
61+
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
62+
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
6263

6364
if (AVX512_FOUND AND NOT AVX512_DISABLED)
6465
list(APPEND CXX_COMPILE_FLAGS
@@ -78,18 +79,32 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
7879
else()
7980
message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
8081
endif()
82+
8183
elseif (AVX2_FOUND)
8284
list(APPEND CXX_COMPILE_FLAGS "-mavx2")
8385
message(WARNING "vLLM CPU backend using AVX2 ISA")
86+
8487
elseif (POWER9_FOUND OR POWER10_FOUND)
8588
message(STATUS "PowerPC detected")
8689
# Enable PowerPC VSX and target/tune for the native CPU
8790
list(APPEND CXX_COMPILE_FLAGS
8891
"-mvsx"
8992
"-mcpu=native"
9093
"-mtune=native")
94+
95+
elseif (ASIMD_FOUND)
96+
message(STATUS "ARMv8 or later architecture detected")
97+
if(ARM_BF16_FOUND)
98+
message(STATUS "BF16 extension detected")
99+
set(MARCH_FLAGS "-march=armv8.2-a+bf16+dotprod+fp16")
100+
add_compile_definitions(ARM_BF16_SUPPORT)
101+
else()
102+
message(WARNING "BF16 functionality is not available")
103+
set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16")
104+
endif()
105+
list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})
91106
else()
92-
message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.")
107+
message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.")
93108
endif()
94109

95110
#
@@ -159,4 +174,4 @@ define_gpu_extension_target(
159174
WITH_SOABI
160175
)
161176

162-
message(STATUS "Enabling C extension.")
177+
message(STATUS "Enabling C extension.")

csrc/cpu/attention.cpp

+17-1
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,10 @@ struct KernelVecType<c10::BFloat16> {
5151
using v_load_vec_type = vec_op::BF16Vec16;
5252
};
5353
#else
54+
#ifdef __aarch64__
55+
#ifndef ARM_BF16_SUPPORT
56+
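// Intentionally empty: no KernelVecType<c10::BFloat16> specialization when ARM_BF16_SUPPORT is not defined.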
57+
#else
5458
template <>
5559
struct KernelVecType<c10::BFloat16> {
5660
using q_load_vec_type = vec_op::BF16Vec8;
@@ -60,6 +64,18 @@ struct KernelVecType<c10::BFloat16> {
6064
using qk_acc_vec_type = vec_op::FP32Vec16;
6165
using v_load_vec_type = vec_op::BF16Vec16;
6266
};
67+
#endif
68+
#else
69+
template <>
70+
struct KernelVecType<c10::BFloat16> {
71+
using q_load_vec_type = vec_op::BF16Vec8;
72+
using q_vec_type = vec_op::FP32Vec16;
73+
using k_load_vec_type = vec_op::BF16Vec16;
74+
using k_vec_type = vec_op::FP32Vec16;
75+
using qk_acc_vec_type = vec_op::FP32Vec16;
76+
using v_load_vec_type = vec_op::BF16Vec16;
77+
};
78+
#endif
6379
#endif
6480

6581
template <typename T>
@@ -779,4 +795,4 @@ void paged_attention_v2(
779795
CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t);
780796
CPU_KERNEL_GUARD_OUT(paged_attention_v2_impl)
781797
});
782-
}
798+
}

csrc/cpu/cpu_types.hpp

+4-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
#ifndef CPU_TYPES_HPP
32
#define CPU_TYPES_HPP
43

@@ -8,8 +7,11 @@
87
#elif defined(__POWER9_VECTOR__)
98
//ppc implementation
109
#include "cpu_types_vsx.hpp"
10+
#elif defined(__aarch64__)
11+
//arm implementation
12+
#include "cpu_types_arm.hpp"
1113
#else
1214
#warning "unsupported vLLM cpu implementation"
1315
#endif
1416

15-
#endif
17+
#endif

0 commit comments

Comments
 (0)