
Commit 7ecee34

[Kernel][RFC] Refactor the punica kernel based on Triton (vllm-project#5036)
1 parent 7eb0cb4 commit 7ecee34
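
For context, the CUDA kernels removed in this commit implemented punica's bgmv (batched gather matrix-vector multiply) for multi-LoRA serving: each token multiplies its hidden state by the weight matrix of whichever LoRA adapter is mapped to it. The sketch below shows the general shape of such an operation written in Triton, which is what this commit moves to. Every name, shape, and the launch grid here is an illustrative assumption for exposition, not the actual kernel added by this commit.

# Illustrative Triton sketch of a bgmv (batched gather matrix-vector) kernel:
# for each token i, y[i] += x[i] @ W[lora_index[i]]^T.
# Names, shapes, and the grid are assumptions, not vLLM's implementation.
import triton
import triton.language as tl


@triton.jit
def bgmv_sketch_kernel(
    y_ptr,                  # [num_tokens, out_dim] output (accumulated into)
    x_ptr,                  # [num_tokens, in_dim] input activations
    w_ptr,                  # [num_loras, out_dim, in_dim] stacked LoRA weights
    idx_ptr,                # [num_tokens] LoRA slot per token, -1 = no adapter
    in_dim, out_dim,
    BLOCK_IN: tl.constexpr,
    BLOCK_OUT: tl.constexpr,
):
    token = tl.program_id(0)        # one program per token ...
    out_tile = tl.program_id(1)     # ... and per tile of output features
    lora = tl.load(idx_ptr + token)
    if lora < 0:                    # token uses the base model only
        return

    out_offs = out_tile * BLOCK_OUT + tl.arange(0, BLOCK_OUT)
    out_mask = out_offs < out_dim
    acc = tl.zeros((BLOCK_OUT,), dtype=tl.float32)

    # Walk the reduction dimension in tiles; gather the weight tile of the
    # adapter selected for this token (the "gather" in bgmv).
    for k in range(0, in_dim, BLOCK_IN):
        in_offs = k + tl.arange(0, BLOCK_IN)
        in_mask = in_offs < in_dim
        x = tl.load(x_ptr + token * in_dim + in_offs, mask=in_mask, other=0.0)
        w = tl.load(
            w_ptr + lora * out_dim * in_dim
                  + out_offs[:, None] * in_dim + in_offs[None, :],
            mask=out_mask[:, None] & in_mask[None, :],
            other=0.0,
        )
        acc += tl.sum(w.to(tl.float32) * x[None, :].to(tl.float32), axis=1)

    prev = tl.load(y_ptr + token * out_dim + out_offs, mask=out_mask, other=0.0)
    tl.store(y_ptr + token * out_dim + out_offs, prev + acc, mask=out_mask)

A host-side launch would use a grid of roughly (num_tokens, triton.cdiv(out_dim, BLOCK_OUT)). Because Triton JIT-compiles such kernels at runtime, no per-dtype .cu files or ahead-of-time extension build are needed, which is what the build-system changes below reflect.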


47 files changed: +3175 −4364 lines

.github/workflows/scripts/build.sh (−2 lines)

@@ -13,8 +13,6 @@ $python_executable -m pip install -r requirements-cuda.txt
 
 # Limit the number of parallel jobs to avoid OOM
 export MAX_JOBS=1
-# Make sure punica is built for the release (for LoRA)
-export VLLM_INSTALL_PUNICA_KERNELS=1
 # Make sure release wheels are built for the following architectures
 export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
 # Build

CMakeLists.txt (−62 lines)

@@ -223,61 +223,7 @@ define_gpu_extension_target(
   USE_SABI 3
   WITH_SOABI)
 
-#
-# _punica_C extension
-#
-
-set(VLLM_PUNICA_EXT_SRC
-  "csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
-  "csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
-  "csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
-  "csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
-  "csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
-  "csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
-  "csrc/punica/punica_ops.cu"
-  "csrc/punica/torch_bindings.cpp")
-
-#
-# Copy GPU compilation flags+update for punica
-#
-set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS})
-list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS
-  "-D__CUDA_NO_HALF_OPERATORS__"
-  "-D__CUDA_NO_HALF_CONVERSIONS__"
-  "-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
-  "-D__CUDA_NO_HALF2_OPERATORS__")
-
-#
-# Filter out CUDA architectures < 8.0 for punica.
-#
-if (${VLLM_GPU_LANG} STREQUAL "CUDA")
-  set(VLLM_PUNICA_GPU_ARCHES)
-  foreach(ARCH ${VLLM_GPU_ARCHES})
-    string_to_ver(CODE_VER ${ARCH})
-    if (CODE_VER GREATER_EQUAL 8.0)
-      list(APPEND VLLM_PUNICA_GPU_ARCHES ${ARCH})
-    endif()
-  endforeach()
-  message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
-elseif(${VLLM_GPU_LANG} STREQUAL "HIP")
-  set(VLLM_PUNICA_GPU_ARCHES ${VLLM_GPU_ARCHES})
-  message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
-endif()
 
-if (VLLM_PUNICA_GPU_ARCHES)
-  define_gpu_extension_target(
-    _punica_C
-    DESTINATION vllm
-    LANGUAGE ${VLLM_GPU_LANG}
-    SOURCES ${VLLM_PUNICA_EXT_SRC}
-    COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
-    ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
-    USE_SABI 3
-    WITH_SOABI)
-else()
-  message(WARNING "Unable to create _punica_C target because none of the "
-    "requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. >= 8.0")
-endif()
 
 #
 # Add the `default` target which detects which extensions should be

@@ -301,12 +247,4 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
   message(STATUS "Enabling moe extension.")
   add_dependencies(default _moe_C)
 
-  # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
-  # VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
-  # there are supported target arches.
-  if (VLLM_PUNICA_GPU_ARCHES AND
-      (ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS))
-    message(STATUS "Enabling punica extension.")
-    add_dependencies(default _punica_C)
-  endif()
 endif()
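
With Triton-based kernels there is no _punica_C extension to compile ahead of time, so the architecture filtering removed above is no longer needed at configure time; if a gate like the old ">= 8.0" filter is still wanted, it can be expressed as a runtime check instead. A minimal sketch, assuming such a check lives in Python (the function name is hypothetical, not part of this commit):

# Hypothetical runtime replacement for the old CMake arch filter.
import torch

def _lora_kernels_supported() -> bool:
    """Return True if the current GPU meets the SM 8.0 requirement
    that the deleted CUDA bgmv kernels enforced at build time."""
    if not torch.cuda.is_available():
        return False
    major, _minor = torch.cuda.get_device_capability()
    return major >= 8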

Dockerfile (−2 lines)

@@ -88,8 +88,6 @@ ENV MAX_JOBS=${max_jobs}
 # number of threads used by nvcc
 ARG nvcc_threads=8
 ENV NVCC_THREADS=$nvcc_threads
-# make sure punica kernels are built (for LoRA)
-ENV VLLM_INSTALL_PUNICA_KERNELS=1
 
 ARG buildkite_commit
 ENV BUILDKITE_COMMIT=${buildkite_commit}

Dockerfile.rocm (+1 −2 lines)

@@ -131,8 +131,7 @@ COPY . .
 RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install --upgrade numba scipy huggingface-hub[cli]
 
-# Make sure punica kernels are built (for LoRA)
-ENV VLLM_INSTALL_PUNICA_KERNELS=1
+
 # Workaround for ray >= 2.10.0
 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
 # Silences the HF Tokenizers warning

csrc/punica/LICENSE (−217 lines)

This file was deleted.

csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu (−5 lines)

This file was deleted.

csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu (−5 lines)

This file was deleted.
