
Commit 8132aba
TODO: temp commit for vllm-fa
Luka Govedič committed Feb 4, 2025
1 parent acb3dc7 commit 8132aba
Showing 5 changed files with 16 additions and 9 deletions.
CMakeLists.txt: 4 changes (2 additions & 2 deletions)
@@ -574,8 +574,8 @@ if(VLLM_FLASH_ATTN_SRC_DIR)
 else()
   FetchContent_Declare(
     vllm-flash-attn
-    GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-    GIT_TAG d4e09037abf588af1ec47d0e966b237ee376876c
+    GIT_REPOSITORY https://github.com/neuralmagic/flash-attention.git
+    GIT_TAG 3810d6a7d4a079ae195e66f39c96805c5ecabc43
     GIT_PROGRESS TRUE
     # Don't share the vllm-flash-attn build between build types
     BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

csrc/attention/paged_attention_v1.cu: 6 changes (5 additions & 1 deletion)
@@ -30,7 +30,7 @@
 #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
 
 #define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE)                                \
-  VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(                     \
+  err = VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(               \
       ((void*)vllm::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE,        \
                                               BLOCK_SIZE, NUM_THREADS,      \
                                               KV_DTYPE, IS_BLOCK_SPARSE>),  \
@@ -96,6 +96,7 @@ void paged_attention_v1_launcher(
   dim3 block(NUM_THREADS);
   const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  error_t err;
   switch (head_size) {
     // NOTE(woosuk): To reduce the compilation time, we only compile for the
     // head sizes that we use in the model. However, we can easily extend this
@@ -161,6 +162,9 @@ void paged_attention_v1_launcher(
     case 32:                                                        \
       CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE);          \
       break;                                                        \
+    case 128:                                                       \
+      CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 128, KV_DTYPE);         \
+      break;                                                        \
     default:                                                        \
       TORCH_CHECK(false, "Unsupported block size: ", block_size);   \
       break;                                                        \

csrc/attention/paged_attention_v2.cu: 3 changes (3 additions & 0 deletions)
@@ -168,6 +168,9 @@ void paged_attention_v2_launcher(
     case 32:                                                        \
       CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE);          \
       break;                                                        \
+    case 128:                                                       \
+      CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 128, KV_DTYPE);         \
+      break;                                                        \
     default:                                                        \
       TORCH_CHECK(false, "Unsupported block size: ", block_size);   \
       break;                                                        \

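The two case-128 hunks above extend the launcher macros' switch on block_size, which maps a runtime block size onto a compile-time template argument. A rough, self-contained illustration of that dispatch pattern in plain C++ (not the vLLM macros themselves; apart from 32 and 128, the block sizes and function names below are made up):

#include <cstdio>
#include <stdexcept>

// Hypothetical stand-in for a kernel launcher templated on the block size.
template <int BLOCK_SIZE>
void launch_paged_attention() {
  std::printf("launching with BLOCK_SIZE=%d\n", BLOCK_SIZE);
}

// Runtime block size -> compile-time constant, mirroring the switch that the
// commit extends with a new case for 128.
void dispatch_block_size(int block_size) {
  switch (block_size) {
    case 8:   launch_paged_attention<8>();   break;
    case 16:  launch_paged_attention<16>();  break;
    case 32:  launch_paged_attention<32>();  break;
    case 128: launch_paged_attention<128>(); break;  // newly supported size
    default:  throw std::invalid_argument("Unsupported block size");
  }
}

int main() {
  dispatch_block_size(128);
  return 0;
}
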
csrc/cuda_utils_kernels.cu: 6 changes (3 additions & 3 deletions)
@@ -5,12 +5,12 @@
 int64_t get_device_attribute(int64_t attribute, int64_t device_id) {
   int device, value;
   if (device_id < 0) {
-    cudaGetDevice(&device);
+    auto ignoreErr = cudaGetDevice(&device);
   } else {
     device = device_id;
   }
-  cudaDeviceGetAttribute(&value, static_cast<cudaDeviceAttr>(attribute),
-                         device);
+  auto ignoreErr = cudaDeviceGetAttribute(
+      &value, static_cast<cudaDeviceAttr>(attribute), device);
   return value;
 }
 
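The cuda_utils_kernels.cu hunk assigns the cudaError_t returned by cudaGetDevice and cudaDeviceGetAttribute to a throwaway variable, which looks like a way to quiet unused-result warnings (presumably the same motive behind the new err = capture in the LAUNCH_PAGED_ATTENTION_V1 macro above). A minimal standalone sketch of those two runtime calls that logs failures instead of discarding them; the checked_cuda helper is illustrative and not part of vLLM:

#include <cuda_runtime.h>
#include <cstdio>

// Illustrative helper (not from the commit): report a failed CUDA runtime call.
static void checked_cuda(cudaError_t err, const char* what) {
  if (err != cudaSuccess) {
    std::fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
  }
}

int main() {
  int device = 0;
  int value = 0;
  checked_cuda(cudaGetDevice(&device), "cudaGetDevice");
  checked_cuda(cudaDeviceGetAttribute(
                   &value, cudaDevAttrMaxSharedMemoryPerBlockOptin, device),
               "cudaDeviceGetAttribute");
  std::printf("max opt-in shared memory per block on device %d: %d bytes\n",
              device, value);
  return 0;
}
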
setup.py: 6 changes (3 additions & 3 deletions)
@@ -592,13 +592,13 @@ def _read_requirements(filename: str) -> List[str]:
 
 if _is_cuda() or _is_hip():
     ext_modules.append(CMakeExtension(name="vllm._moe_C"))
+    ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
 
 if _is_hip():
     ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
 
-if _is_cuda() or _is_hip():
-    ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
-    if _is_cuda() and (envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.0")):
+if _is_cuda():
+    if (envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.0")):
         # FA3 requires CUDA 12.0 or later
         ext_modules.append(
             CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
