Commit 92ae9df

Refine
1 parent 8701243 commit 92ae9df

2 files changed: +36 -4 lines changed

csrc/flash_attn/flash_api.cpp (+4 -4)

@@ -206,10 +206,10 @@ void set_params_fprop_sparse(Flash_fwd_params &params,
                              seqlenq_ngroups_swapped,
                              unpadded_lse
                              );
-    params.block_count = (int*)block_count.data_ptr();
-    params.block_offset = (int*)block_offset.data_ptr();
-    params.column_count = (int*)column_count.data_ptr();
-    params.column_index = (int*)column_index.data_ptr();
+    params.block_count = block_count.const_data_ptr<int>();
+    params.block_offset = block_offset.const_data_ptr<int>();
+    params.column_count = column_count.const_data_ptr<int>();
+    params.column_index = column_index.const_data_ptr<int>();
     TORCH_CHECK(block_count.size(2) == block_offset.size(2));
     TORCH_CHECK(column_index.size(2) == block_offset.size(2));
     TORCH_CHECK(column_count.size(2) == column_index.size(2));

vllm_flash_attn/flash_attn_interface.py (+32)

@@ -156,6 +156,38 @@ def sparse_attn_func(
     return_softmax_lse=False,
     out=None,
 ):
+    """Compute attention with vertical and slash sparsity patterns.
+    Most arguments are the same as the flash_attn_func interface, except for 4 extra args:
+    block_count and block_offset for slash sparsity patterns, and
+    column_count and column_index for vertical sparsity patterns.
+
+    Arguments:
+        q: (batch_size, seqlen, nheads, headdim)
+        k: (batch_size, seqlen, nheads_k, headdim)
+        v: (batch_size, seqlen, nheads_k, headdim)
+        block_count: (batch_size, nheads, cdiv(seqlen, BLOCK_M))
+        block_offset: (batch_size, nheads, cdiv(seqlen, BLOCK_M), NNZ_S)
+        column_count: (batch_size, nheads, cdiv(seqlen, BLOCK_M))
+        column_index: (batch_size, nheads, cdiv(seqlen, BLOCK_M), NNZ_V)
+        dropout_p: float. Dropout probability.
+        softmax_scale: float. The scaling of QK^T before applying softmax.
+            Default to 1 / sqrt(headdim).
+        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
+        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
+        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
+            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
+            is added to the attention score of query i and key j.
+        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
+            which is slightly slower and uses more memory. The forward pass is always deterministic.
+        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
+            testing only. The returned probabilities are not guaranteed to be correct
+            (they might not have the right scaling).
+    Return:
+        out: (batch_size, seqlen, nheads, headdim).
+        softmax_lse [optional, if return_softmax_lse=True]: (batch_size, nheads, seqlen). The
+            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
+            normalization factor).
+    """
     if softmax_scale is None:
         softmax_scale = q.shape[-1] ** (-0.5)
     out, softmax_lse = _sparse_attn_forward(
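
For readers of the new docstring, a minimal usage sketch follows. It is not part of the commit: the block size (BLOCK_M = 64), the toy sparsity pattern, and the tensor values are illustrative assumptions; only the argument order, the documented shapes, and the int32 dtype (implied by const_data_ptr<int>() in the C++ change above) are taken from the diff.

# Hypothetical usage sketch (not from this commit). Assumes BLOCK_M = 64 and a
# trivial sparsity pattern: one slash block and zero vertical columns per query
# block, chosen only to satisfy the shapes documented above.
import math

import torch

from vllm_flash_attn.flash_attn_interface import sparse_attn_func

batch_size, seqlen, nheads, headdim = 1, 256, 8, 128
BLOCK_M, NNZ_S, NNZ_V = 64, 1, 1            # BLOCK_M value is an assumption
num_blocks = -(-seqlen // BLOCK_M)          # cdiv(seqlen, BLOCK_M)

device, dtype = "cuda", torch.float16
q = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype)
k = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype)
v = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype)

# The C++ side reads these through const_data_ptr<int>(), so they are built as
# contiguous int32 tensors on the same device as q/k/v.
block_count = torch.ones(batch_size, nheads, num_blocks, dtype=torch.int32, device=device)
block_offset = torch.zeros(batch_size, nheads, num_blocks, NNZ_S, dtype=torch.int32, device=device)
column_count = torch.zeros(batch_size, nheads, num_blocks, dtype=torch.int32, device=device)
column_index = torch.zeros(batch_size, nheads, num_blocks, NNZ_V, dtype=torch.int32, device=device)

out = sparse_attn_func(
    q, k, v,
    block_count, block_offset, column_count, column_index,
    softmax_scale=1.0 / math.sqrt(headdim),
    causal=True,
)
print(out.shape)  # (batch_size, seqlen, nheads, headdim)

Per the Return section of the docstring, passing return_softmax_lse=True would additionally return the (batch_size, nheads, seqlen) logsumexp tensor.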
