
Commit 8132aba
TODO: temp commit for vllm-fa
Luka Govedič committed Feb 4, 2025
1 parent acb3dc7 commit 8132aba
Showing 5 changed files with 16 additions and 9 deletions.
CMakeLists.txt: 4 changes (2 additions & 2 deletions)
@@ -574,8 +574,8 @@ if(VLLM_FLASH_ATTN_SRC_DIR)
 else()
   FetchContent_Declare(
     vllm-flash-attn
-    GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-    GIT_TAG d4e09037abf588af1ec47d0e966b237ee376876c
+    GIT_REPOSITORY https://github.com/neuralmagic/flash-attention.git
+    GIT_TAG 3810d6a7d4a079ae195e66f39c96805c5ecabc43
     GIT_PROGRESS TRUE
     # Don't share the vllm-flash-attn build between build types
     BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

csrc/attention/paged_attention_v1.cu: 6 changes (5 additions & 1 deletion)
@@ -30,7 +30,7 @@
 #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
 
 #define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE)                                \
-  VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(                     \
+  err = VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(               \
       ((void*)vllm::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE,        \
                                               BLOCK_SIZE, NUM_THREADS,      \
                                               KV_DTYPE, IS_BLOCK_SPARSE>),  \
@@ -96,6 +96,7 @@ void paged_attention_v1_launcher(
   dim3 block(NUM_THREADS);
   const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  error_t err;
   switch (head_size) {
     // NOTE(woosuk): To reduce the compilation time, we only compile for the
     // head sizes that we use in the model. However, we can easily extend this
@@ -161,6 +162,9 @@ void paged_attention_v1_launcher(
     case 32:                                                        \
       CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE);          \
       break;                                                        \
+    case 128:                                                       \
+      CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 128, KV_DTYPE);         \
+      break;                                                        \
     default:                                                        \
       TORCH_CHECK(false, "Unsupported block size: ", block_size);   \
       break;                                                        \

csrc/attention/paged_attention_v2.cu: 3 changes (3 additions & 0 deletions)
@@ -168,6 +168,9 @@ void paged_attention_v2_launcher(
     case 32:                                                        \
       CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE);          \
       break;                                                        \
+    case 128:                                                       \
+      CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 128, KV_DTYPE);         \
+      break;                                                        \
     default:                                                        \
       TORCH_CHECK(false, "Unsupported block size: ", block_size);   \
       break;                                                        \

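The two case-128 hunks above extend the launcher macros' switch on block_size, which maps a runtime block size onto a compile-time template argument. A rough, self-contained illustration of that dispatch pattern in plain C++ (not the vLLM macros themselves; apart from 32 and 128, the block sizes and function names below are made up):

#include <cstdio>
#include <stdexcept>

// Hypothetical stand-in for a kernel launcher templated on the block size.
template <int BLOCK_SIZE>
void launch_paged_attention() {
  std::printf("launching with BLOCK_SIZE=%d\n", BLOCK_SIZE);
}

// Runtime block size -> compile-time constant, mirroring the switch that the
// commit extends with a new case for 128.
void dispatch_block_size(int block_size) {
  switch (block_size) {
    case 8:   launch_paged_attention<8>();   break;
    case 16:  launch_paged_attention<16>();  break;
    case 32:  launch_paged_attention<32>();  break;
    case 128: launch_paged_attention<128>(); break;  // newly supported size
    default:  throw std::invalid_argument("Unsupported block size");
  }
}

int main() {
  dispatch_block_size(128);
  return 0;
}
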
csrc/cuda_utils_kernels.cu: 6 changes (3 additions & 3 deletions)
@@ -5,12 +5,12 @@
 int64_t get_device_attribute(int64_t attribute, int64_t device_id) {
   int device, value;
   if (device_id < 0) {
-    cudaGetDevice(&device);
+    auto ignoreErr = cudaGetDevice(&device);
   } else {
     device = device_id;
   }
-  cudaDeviceGetAttribute(&value, static_cast<cudaDeviceAttr>(attribute),
-                         device);
+  auto ignoreErr = cudaDeviceGetAttribute(
+      &value, static_cast<cudaDeviceAttr>(attribute), device);
   return value;
 }
 
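The cuda_utils_kernels.cu hunk assigns the cudaError_t returned by cudaGetDevice and cudaDeviceGetAttribute to a throwaway variable, which looks like a way to quiet unused-result warnings (presumably the same motive behind the new err = capture in the LAUNCH_PAGED_ATTENTION_V1 macro above). A minimal standalone sketch of those two runtime calls that logs failures instead of discarding them; the checked_cuda helper is illustrative and not part of vLLM:

#include <cuda_runtime.h>
#include <cstdio>

// Illustrative helper (not from the commit): report a failed CUDA runtime call.
static void checked_cuda(cudaError_t err, const char* what) {
  if (err != cudaSuccess) {
    std::fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
  }
}

int main() {
  int device = 0;
  int value = 0;
  checked_cuda(cudaGetDevice(&device), "cudaGetDevice");
  checked_cuda(cudaDeviceGetAttribute(
                   &value, cudaDevAttrMaxSharedMemoryPerBlockOptin, device),
               "cudaDeviceGetAttribute");
  std::printf("max opt-in shared memory per block on device %d: %d bytes\n",
              device, value);
  return 0;
}
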
setup.py: 6 changes (3 additions & 3 deletions)
@@ -592,13 +592,13 @@ def _read_requirements(filename: str) -> List[str]:
 
 if _is_cuda() or _is_hip():
     ext_modules.append(CMakeExtension(name="vllm._moe_C"))
+    ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
 
 if _is_hip():
     ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
 
-if _is_cuda() or _is_hip():
-    ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
-    if _is_cuda() and (envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.0")):
+if _is_cuda():
+    if (envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.0")):
         # FA3 requires CUDA 12.0 or later
         ext_modules.append(
             CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
