cmake_minimum_required(VERSION 3.26)

+ # When building directly using CMake, make sure you run the install step
+ # (it places the .so files in the correct location).
+ #
+ # Example:
+ # mkdir build && cd build
+ # cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_INSTALL_PREFIX=.. ..
+ # cmake --build . --target install
+ #
+ # If you want to only build one target, make sure to install it manually:
+ # cmake --build . --target _C
+ # cmake --install . --component _C
project(vllm_extensions LANGUAGES CXX)

# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
@@ -13,6 +24,9 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
# Suppress potential warnings about unused manually-specified variables
set(ignoreMe "${VLLM_PYTHON_PATH}")

+ # Prevent installation of dependencies (cutlass) by default.
+ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
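+ # install(CODE ...) fragments run at install time, in declaration order;
+ # CMAKE_INSTALL_LOCAL_ONLY makes the generated cmake_install.cmake skip
+ # subdirectory install rules, which is what keeps fetched dependencies
+ # such as cutlass out of the install tree.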
+
#
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
@@ -70,19 +84,6 @@ endif()
find_package(Torch REQUIRED)

#
- # Add the `default` target which detects which extensions should be
- # built based on platform/architecture. This is the same logic that
- # setup.py uses to select which extensions should be built and should
- # be kept in sync.
- #
- # The `default` target makes direct use of cmake easier since knowledge
- # of which extensions are supported has been factored in, e.g.
- #
- # mkdir build && cd build
- # cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
- # cmake --build . --target default
- #
- add_custom_target(default)
message(STATUS "Enabling core extension.")

# Define _core_C extension
@@ -100,8 +101,6 @@ define_gpu_extension_target(
  USE_SABI 3
  WITH_SOABI)

- add_dependencies(default _core_C)
-
#
# Forward the non-CUDA device extensions to external CMake scripts.
#
@@ -167,6 +166,8 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif()

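+ # (FetchContent was presumably moved out of the CUDA-only block below so the
+ # vllm-flash-attn fetch at the end of this file can rely on it as well.)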
+ include(FetchContent)
+
#
# Define other extension targets
#
@@ -190,7 +191,6 @@ set(VLLM_EXT_SRC
  "csrc/torch_bindings.cpp")

if(VLLM_GPU_LANG STREQUAL "CUDA")
-   include(FetchContent)
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
  FetchContent_Declare(
    cutlass
@@ -283,6 +283,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    csrc/quantization/machete/machete_pytorch.cu)
endif()

+ message(STATUS "Enabling C extension.")
define_gpu_extension_target(
  _C
  DESTINATION vllm
@@ -313,6 +314,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    "csrc/moe/marlin_moe_ops.cu")
endif()

+ message(STATUS "Enabling moe extension.")
define_gpu_extension_target(
  _moe_C
  DESTINATION vllm
@@ -323,7 +325,6 @@ define_gpu_extension_target(
  USE_SABI 3
  WITH_SOABI)

-
if(VLLM_GPU_LANG STREQUAL "HIP")
  #
  # _rocm_C extension
@@ -343,16 +344,63 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
    WITH_SOABI)
endif()

+ # vllm-flash-attn is currently only supported on CUDA
+ if(NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
+   return()
+ endif()
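+ # (return() stops processing of this top-level CMakeLists.txt, so everything
+ # below runs only for CUDA builds.)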

- if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
-   message(STATUS "Enabling C extension.")
-   add_dependencies(default _C)
+ #
+ # Build vLLM flash attention from source
+ #
+ # IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
+ # Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLM's.
+ # They should be identical, but if they aren't, this is a massive footgun.
+ #
+ # The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
+ # To install only vllm-flash-attn, use --component vllm_flash_attn_c.
+ # If no component is specified, vllm-flash-attn is still installed.
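+ # For example, to install just this component:
+ #   cmake --install . --component vllm_flash_attn_c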

-   message(STATUS "Enabling moe extension.")
-   add_dependencies(default _moe_C)
+ # If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
+ # This is to enable local development of vllm-flash-attn within vLLM.
+ # It can be set as an environment variable or passed as a cmake argument.
+ # The environment variable takes precedence.
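+ # For example (path is illustrative):
+ #   VLLM_FLASH_ATTN_SRC_DIR=/path/to/flash-attention cmake ...
+ #   cmake -DVLLM_FLASH_ATTN_SRC_DIR=/path/to/flash-attention ...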
+ if(DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
+   set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
endif()

- if(VLLM_GPU_LANG STREQUAL "HIP")
-   message(STATUS "Enabling rocm extension.")
-   add_dependencies(default _rocm_C)
+ if(VLLM_FLASH_ATTN_SRC_DIR)
+   FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR})
+ else()
+   FetchContent_Declare(
+     vllm-flash-attn
+     GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
+     GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
+     GIT_PROGRESS TRUE
+   )
endif()
+
+ # Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization.
+ set(VLLM_PARENT_BUILD ON)
+
+ # Make sure vllm-flash-attn install rules are nested under vllm/
+ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c)
+ install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
+ install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c)
+
+ # Fetch the vllm-flash-attn library
+ FetchContent_MakeAvailable(vllm-flash-attn)
+ message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
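+ # Note (assuming policy CMP0082 is NEW, CMake >= 3.14): install rules from the
+ # fetched subproject are interleaved with this file's install(CODE ...)
+ # fragments in declaration order, which is what lets the prefix override above
+ # apply exactly while vllm-flash-attn's own install rules run.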
+
+ # Restore the install prefix
+ install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
+ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c)
+ # Copy over the vllm-flash-attn python files
+ install(
+   DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
+   DESTINATION vllm/vllm_flash_attn
+   COMPONENT vllm_flash_attn_c
+   FILES_MATCHING PATTERN "*.py"
+ )
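+ # (The trailing slash on the DIRECTORY argument installs the directory's
+ # contents rather than the directory itself; FILES_MATCHING limits the copy
+ # to the .py files.)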
+
+ # Nothing after vllm-flash-attn, see the comment about macros above