
Commit 34ec177
fix description
1 parent cced066

6 files changed: +61, -66 lines

paddle/phi/kernels/gpu/flash_attn_v3_grad_kernel.cu

12 additions, 13 deletions

@@ -850,8 +850,7 @@ void FlashMaskV2GradBaseKernel(
     &seqused_k_,  // b. If given, only this many elements of each batch
                   // element's keys are used.
     const paddle::optional<DenseTensor> &startend_row_indices_,
-    const paddle::optional<DenseTensor>
-        &block_mask_indices_,  // ((b,h,s//128,s//128)
+    const paddle::optional<DenseTensor> &block_mask_,  // ((b,h,s//128,s//128)
     int max_seqlen_q_,
     int max_seqlen_k_,
     float const softmax_scale,
@@ -1082,9 +1081,9 @@ void FlashMaskV2GradBaseKernel(
     }
   }

-  bool const is_blockmask = block_mask_indices_.is_initialized();
-  DenseTensor block_mask_indices;
-  if (is_blockmask) block_mask_indices = block_mask_indices_.get();
+  bool const is_blockmask = block_mask_.is_initialized();
+  DenseTensor block_mask;
+  if (is_blockmask) block_mask = block_mask_.get();

   if (is_blockmask) {
     PADDLE_ENFORCE_EQ(
@@ -1093,24 +1092,24 @@ void FlashMaskV2GradBaseKernel(
         common::errors::InvalidArgument(
             "blockmask should be used with flashmask at the same time "));

-    PADDLE_ENFORCE_EQ(block_mask_indices.dims().size(),
+    PADDLE_ENFORCE_EQ(block_mask.dims().size(),
                       4,
                       common::errors::InvalidArgument(
                           "blockmask receive blockmask_indices with dim "
                           "[batch_size, num_heads, blocklen_q, blocklen_k]"));

-    PADDLE_ENFORCE_EQ(block_mask_indices.dims()[2],
+    PADDLE_ENFORCE_EQ(block_mask.dims()[2],
                       (seqlen_q + 127) / 128,
                       common::errors::InvalidArgument(
                           "blockmask only supports blockdim_q = 128 now"));

-    PADDLE_ENFORCE_EQ(block_mask_indices.dims()[3],
+    PADDLE_ENFORCE_EQ(block_mask.dims()[3],
                       (seqlen_k + 127) / 128,
                       common::errors::InvalidArgument(
                           "blockmask only supports blockdim_k = 128 now"));

     PADDLE_ENFORCE_EQ(
-        block_mask_indices.dims()[1],
+        block_mask.dims()[1],
         startend_row_indices.dims()[1],
         common::errors::InvalidArgument(
             "blockmask only supports same dim num_heads with flashmask now"));
@@ -1503,8 +1502,8 @@ void FlashMaskV2GradBaseKernel(
     dynload::flashmaskv2_bwd_params_set_m_block_dim(params_handle, 128);
     dynload::flashmaskv2_bwd_params_set_n_block_dim(params_handle, 128);
     dynload::flashmaskv2_bwd_params_set_block_mask_ptr(
-        params_handle, (block_mask_indices.data<int32_t>()));
-    auto ptr = block_mask_indices.data<int32_t>();
+        params_handle, (block_mask.data<int32_t>()));
+    auto ptr = block_mask.data<int32_t>();
     std::cout << typeid(ptr).name() << std::endl;
   }
 #ifdef FLASHATTENTION_DISABLE_LOCAL
@@ -1554,7 +1553,7 @@ void FlashMaskV2GradKernel(
     const DenseTensor &out,
     const DenseTensor &softmax_lse,
     const DenseTensor &startend_row_indices,  // TODO(xiehaoyang): remove this
-    const paddle::optional<DenseTensor> &block_mask_indices,
+    const paddle::optional<DenseTensor> &block_mask,
     const DenseTensor &out_grad,
     float const softmax_scale,
     bool is_causal,
@@ -1591,7 +1590,7 @@ void FlashMaskV2GradKernel(
       paddle::none,
       paddle::none,
       startend_row_indices,
-      block_mask_indices,
+      block_mask,
       0,  // max_seqlen_q,
       0,  // max_seqlen_k,
       softmax_scale,
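
The four PADDLE_ENFORCE_EQ checks above pin down the shape that the renamed block_mask tensor must have. The same constraint can be written out as a small Python helper; the function and argument names below are ours, purely for illustration, and are not part of this commit:

    def expected_block_mask_shape(batch_size, num_heads, seqlen_q, seqlen_k):
        # Only blockdim_q = blockdim_k = 128 is supported, so the block grid
        # is the ceiling division of each sequence length by 128.
        blocklen_q = (seqlen_q + 127) // 128
        blocklen_k = (seqlen_k + 127) // 128
        # block_mask.dims()[1] must additionally equal
        # startend_row_indices.dims()[1], i.e. the flashmask's num_heads.
        return [batch_size, num_heads, blocklen_q, blocklen_k]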

paddle/phi/kernels/gpu/flash_attn_v3_kernel.cu

12 additions, 13 deletions

@@ -1241,7 +1241,7 @@ void FlashMaskV2BaseKernel(
     const paddle::optional<DenseTensor>
         &startend_row_indices_,  // (b,h,s_1,[1,2,4])
     const paddle::optional<DenseTensor>
-        &block_mask_indices_,  // ((b,h,s// 128,s // 128)
+        &block_mask_,  // ((b,h,s// 128,s // 128)
     const int
         max_seqlen_q_,  // if max_seqlen_q_ is set to 0, it indicates that it is
                         // uninitialized and should not be referenced
@@ -1441,7 +1441,7 @@ void FlashMaskV2BaseKernel(
   }

   bool const is_flashmask = startend_row_indices_.is_initialized();
-  bool const is_blockmask = block_mask_indices_.is_initialized();
+  bool const is_blockmask = block_mask_.is_initialized();

   // This needs to go before kBlockM & kBlockN since we rely on the correct
   // window_size and is_causal to set kBlockM
@@ -2075,8 +2075,8 @@ void FlashMaskV2BaseKernel(
   // flashmask
   DenseTensor startend_row_indices;
   if (is_flashmask) startend_row_indices = startend_row_indices_.get();
-  DenseTensor block_mask_indices;
-  if (is_blockmask) block_mask_indices = block_mask_indices_.get();
+  DenseTensor block_mask;
+  if (is_blockmask) block_mask = block_mask_.get();
   DenseTensor flashmask_maxmin, lt_start_row_indices, lt_end_row_indices,
       ut_start_row_indices, ut_end_row_indices;
   if (is_flashmask) {
@@ -2158,24 +2158,24 @@ void FlashMaskV2BaseKernel(
         common::errors::InvalidArgument(
             "blockmask should be used with flashmask at the same time "));

-    PADDLE_ENFORCE_EQ(block_mask_indices.dims().size(),
+    PADDLE_ENFORCE_EQ(block_mask.dims().size(),
                       4,
                       common::errors::InvalidArgument(
                           "blockmask receive blockmask_indices with dim "
                           "[batch_size, num_heads, blocklen_q, blocklen_k]"));

-    PADDLE_ENFORCE_EQ(block_mask_indices.dims()[2],
+    PADDLE_ENFORCE_EQ(block_mask.dims()[2],
                       (seqlen_q + 127) / 128,
                       common::errors::InvalidArgument(
                           "blockmask is now only support blockdim_q = 128 "));

-    PADDLE_ENFORCE_EQ(block_mask_indices.dims()[3],
+    PADDLE_ENFORCE_EQ(block_mask.dims()[3],
                       (seqlen_k + 127) / 128,
                       common::errors::InvalidArgument(
                           "blockmask is now only support blockdim_k = 128 "));

     PADDLE_ENFORCE_EQ(
-        block_mask_indices.dims()[1],
+        block_mask.dims()[1],
         startend_row_indices.dims()[1],
         common::errors::InvalidArgument("blockmask is now only support same "
                                         "dim num_heads with flashmask "));
@@ -2186,7 +2186,7 @@ void FlashMaskV2BaseKernel(
     dynload::flashmaskv2_fwd_params_set_m_block_dim(params_handle, 128);
     dynload::flashmaskv2_fwd_params_set_n_block_dim(params_handle, 128);
     dynload::flashmaskv2_fwd_params_set_block_mask_ptr(
-        params_handle, (block_mask_indices.data<int32_t>()));
+        params_handle, (block_mask.data<int32_t>()));
   }

   if (is_flashmask) {
@@ -2302,7 +2302,7 @@ void FlashMaskV2Kernel(const Context &dev_ctx,
     const DenseTensor &k,
     const DenseTensor &v,
     const DenseTensor &startend_row_indices,
-    const paddle::optional<DenseTensor> &block_mask_indices,
+    const paddle::optional<DenseTensor> &block_mask,
     const float softmax_scale,
     bool is_causal,
     DenseTensor *out,
@@ -2333,7 +2333,7 @@ void FlashMaskV2Kernel(const Context &dev_ctx,
       paddle::none,  // v_descale_
       paddle::none,  // scheduler_metadata_
       startend_row_indices,
-      block_mask_indices,
+      block_mask,
       0,  // max_seqlen_q_
       0,  // max_seqlen_k_
       softmax_scale,
@@ -2378,6 +2378,5 @@
                    phi::FlashMaskV2Kernel,
                    phi::float16,
                    phi::bfloat16) {
-  kernel->InputAt(4).SetBackend(
-      phi::Backend::ALL_BACKEND);  // block_mask_indices
+  kernel->InputAt(4).SetBackend(phi::Backend::ALL_BACKEND);  // block_mask
 }
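
The forward kernel consumes block_mask as raw int32 data over a grid of 128 x 128 tiles. If a caller starts from a dense boolean keep-pattern at token granularity, one way to derive a block-level mask of the expected shape is to reduce each tile with "any". This is a sketch under our own assumptions (the helper name, the any-reduction rule, and the padding scheme are not part of this commit):

    import paddle

    def dense_to_block_mask(dense_keep, block=128):
        # dense_keep: bool tensor [batch, heads, seqlen_q, seqlen_k],
        # True where attention between a (query, key) pair is allowed.
        b, h, sq, sk = dense_keep.shape
        bq, bk = (sq + block - 1) // block, (sk + block - 1) // block
        # Pad up to full blocks, then view the mask as a grid of 128 x 128 tiles.
        padded = paddle.zeros([b, h, bq * block, bk * block], dtype='bool')
        padded[:, :, :sq, :sk] = dense_keep
        tiles = padded.reshape([b, h, bq, block, bk, block])
        # Keep a block (value 1) if any position inside it is kept.
        return tiles.any(axis=[3, 5]).astype('int32')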

paddle/phi/ops/yaml/backward.yaml

3 additions, 3 deletions

@@ -1228,9 +1228,9 @@
     data_type: q

 - backward_op : flashmask_attention_v2_grad
-  forward : flashmask_attention_v2 (Tensor q, Tensor k, Tensor v, Tensor startend_row_indices,Tensor block_mask_indices, float softmax_scale, bool is_causal) -> Tensor(out), Tensor(softmax_lse)
-  args : (Tensor q, Tensor k, Tensor v, Tensor out, Tensor softmax_lse, Tensor startend_row_indices, Tensor block_mask_indices, Tensor out_grad, float softmax_scale, bool is_causal)
-  optional : block_mask_indices
+  forward : flashmask_attention_v2 (Tensor q, Tensor k, Tensor v, Tensor startend_row_indices,Tensor block_mask, float softmax_scale, bool is_causal) -> Tensor(out), Tensor(softmax_lse)
+  args : (Tensor q, Tensor k, Tensor v, Tensor out, Tensor softmax_lse, Tensor startend_row_indices, Tensor block_mask, Tensor out_grad, float softmax_scale, bool is_causal)
+  optional : block_mask
   output : Tensor(q_grad), Tensor(k_grad), Tensor(v_grad)
   infer_meta :
     func : FlashAttnGradInferMeta

paddle/phi/ops/yaml/ops.yaml

2 additions, 2 deletions

@@ -2153,9 +2153,9 @@
   interfaces : paddle::dialect::InferSymbolicShapeInterface

 - op : flashmask_attention_v2
-  args : (Tensor q, Tensor k, Tensor v, Tensor startend_row_indices, Tensor block_mask_indices, float softmax_scale, bool is_causal)
+  args : (Tensor q, Tensor k, Tensor v, Tensor startend_row_indices, Tensor block_mask, float softmax_scale, bool is_causal)
   output : Tensor(out), Tensor(softmax_lse)
-  optional : block_mask_indices
+  optional : block_mask
   infer_meta :
     func : FlashMaskV2InferMeta
   param : [q, k, v]
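
Because block_mask is declared optional in this op signature, callers can pass None when only the flashmask is used. The internal call in flash_attention.py (last hunk of that file below) follows the positional order given here; schematically it looks like the sketch below, where the _C_ops module path is assumed from Paddle's usual codegen layout and is not shown in this diff:

    from paddle import _C_ops

    # block_mask may be None; the remaining arguments follow the ops.yaml order.
    out, softmax_lse = _C_ops.flashmask_attention_v2(
        q, k, v, startend_row_indices, block_mask, softmax_scale, is_causal
    )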

python/paddle/nn/functional/flash_attention.py

31 additions, 34 deletions

@@ -1575,7 +1575,7 @@ def flashmask_attention(
     training: bool = True,
     name: str | None = None,
     softmax_scale: float | None = None,
-    block_mask_indices: Tensor | None = None,
+    block_mask: Tensor | None = None,
 ):
     r"""
     FlashMask: Official Implementation
@@ -1636,25 +1636,24 @@ def flashmask_attention(
         training (bool): Whether the module is in training mode. Default is True.
         name (str, optional): Name of the operation. Default is None. Normally, users do not need to set this property.
             For more information, refer to :ref:`api_guide_Name` .
-        block_mask_indices (tensor, optional):
-        block_mask_indices (Tensor, optional):
-            A 4-D integer mask tensor indicating whether each block in the attention matrix should be kept or masked. Must be used together with flashmask.
-            The shape should be [batch_size, num_heads, blocklen_q, blocklen_k], where:
+        block_mask (tensor, optional):
+            A 4-D integer mask tensor indicating whether each block in the attention matrix should be kept or masked. Must be used together with flashmask.
+            The shape should be [batch_size, num_heads, blocklen_q, blocklen_k], where:

-            blocklen_q = ceil(seqlen_q / 128), i.e., block_mask_indices.shape[2] must be (seqlen_q + 127) // 128
-            blocklen_k = ceil(seqlen_k / 128), i.e., block_mask_indices.shape[3] must be (seqlen_k + 127) // 128
-            block_mask_indices.shape[1] (number of heads) must match the num_heads dimension of the flashmask
-            Both seqlen_q and seqlen_k must be less than or equal to 128 * 1024
-            The dtype should be int32, and each element should be either 0 or 1.
-            A value of 1 indicates that the corresponding block is kept (not masked), while 0 means the block is masked.
+            blocklen_q = ceil(seqlen_q / 128), i.e., block_mask.shape[2] must be (seqlen_q + 127) // 128
+            blocklen_k = ceil(seqlen_k / 128), i.e., block_mask.shape[3] must be (seqlen_k + 127) // 128
+            block_mask.shape[1] (number of heads) must match the num_heads dimension of the flashmask
+            Both seqlen_q and seqlen_k must be less than or equal to 128 * 1024
+            The dtype should be int32, and each element should be either 0 or 1.
+            A value of 1 indicates that the corresponding block is kept (not masked), while 0 means the block is masked.

-            Usage Notes:
+            Usage Notes:

-            Only supported when blockdim_q = blockdim_k = 128 now.
-            Only supported when headdim = 128 now.
-            This argument must be provided together with flashmask.
-            The mask will be applied at the block level: each [i, j] position in block_mask_indices controls whether the corresponding [128 x 128] block in the attention matrix is masked.
-            Any mismatch in expected shape or head dimension will raise an error.
+            Only supported when blockdim_q = blockdim_k = 128 now.
+            Only supported when headdim = 128 now.
+            This argument must be provided together with flashmask.
+            The mask will be applied at the block level: each [i, j] position in block_mask controls whether the corresponding [128 x 128] block in the attention matrix is masked.
+            Any mismatch in expected shape or head dimension will raise an error.

@@ -2228,7 +2227,7 @@ def flashmask_attention(
         startend_row_indices, min=0, max=sq
     ).repeat_interleave(bsz, 0)

-    if block_mask_indices is not None:
+    if block_mask is not None:
         # xhy: can set a full startend_row_indices for block_mask_attn when using block_mask_attn?
         assert startend_row_indices is not None, (
             "must provide startend_row_indices when using block_mask_attn"
@@ -2275,26 +2274,24 @@ def flashmask_attention(
             "startend_row_indices head_num must be equal to 1(broadcast) or head_num_k."
         )

-    if block_mask_indices is not None:
-        assert block_mask_indices.dtype == paddle.int32, (
-            f"block_mask_indices.dtype must be paddle.int32, but got {block_mask_indices.dtype}"
+    if block_mask is not None:
+        assert block_mask.dtype == paddle.int32, (
+            f"block_mask.dtype must be paddle.int32, but got {block_mask.dtype}"
         )

-        assert block_mask_indices.shape[0] == key.shape[0], (
-            f"block_mask_indices.shape[0] must be equal to batch_size, but got {block_mask_indices.shape[0]} and {key.shape[0]}"
+        assert block_mask.shape[0] == key.shape[0], (
+            f"block_mask.shape[0] must be equal to batch_size, but got {block_mask.shape[0]} and {key.shape[0]}"
         )

-        assert (
-            block_mask_indices.shape[1] == startend_row_indices.shape[1]
-        ), (
-            f"block_mask_indices.shape[1] must be equal to startend_row_indices.shape[1], but got {block_mask_indices.shape[1]} and {key.shape[2]}"
+        assert block_mask.shape[1] == startend_row_indices.shape[1], (
+            f"block_mask.shape[1] must be equal to startend_row_indices.shape[1], but got {block_mask.shape[1]} and {key.shape[2]}"
         )

-        assert (
-            block_mask_indices.shape[2] == (query.shape[1] + 127) // 128
-        ), "block_size must be 128 when using block_mask_attn"
+        assert block_mask.shape[2] == (query.shape[1] + 127) // 128, (
+            "block_size must be 128 when using block_mask_attn"
+        )

-        assert block_mask_indices.shape[3] == (key.shape[1] + 127) // 128, (
+        assert block_mask.shape[3] == (key.shape[1] + 127) // 128, (
             "block_size must be 128 when using block_mask_attn"
         )
@@ -2326,7 +2323,7 @@ def flashmask_attention(
     elif paddle.get_flags(["FLAGS_cudnn_deterministic"])[
         "FLAGS_cudnn_deterministic"
     ]:
-        assert block_mask_indices is None, (
+        assert block_mask is None, (
            " blockmask attention no supports deterministic now ."
        )
        fa_version = 2
@@ -2340,7 +2337,7 @@ def flashmask_attention(
            "flashmask_attention does not support setting softmax_scale, use flashmask_attention_v2 instead"
        )

-        assert block_mask_indices is None, (
+        assert block_mask is None, (
            " blockmask attention only supports sm >= 90 now."
        )
@@ -2394,7 +2391,7 @@ def flashmask_attention(
            key,
            value,
            startend_row_indices,
-           block_mask_indices,
+           block_mask,
            softmax_scale,
            causal,
        )
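
Taken together with the docstring constraints above, a call through the public API with the renamed keyword could look like the sketch below. The tensor shapes, dtype, the all-ones block mask, and the full-length startend_row_indices are illustrative assumptions on our part; only the keyword names and the shape rules come from this file:

    import paddle
    import paddle.nn.functional as F

    b, sq, sk, h, d = 2, 1024, 1024, 8, 128   # headdim must be 128 for block_mask
    q = paddle.randn([b, sq, h, d], dtype='float16')
    k = paddle.randn([b, sk, h, d], dtype='float16')
    v = paddle.randn([b, sk, h, d], dtype='float16')

    # Flashmask row indices, shape [b, h, s, 1]; filling with sk here masks
    # nothing beyond the causal pattern (illustrative only).
    startend_row_indices = paddle.full([b, h, sk, 1], sk, dtype='int32')

    # Block-level mask over ceil(sq/128) x ceil(sk/128) tiles: 1 keeps the
    # corresponding 128 x 128 block, 0 masks it out.
    block_mask = paddle.ones(
        [b, h, (sq + 127) // 128, (sk + 127) // 128], dtype='int32'
    )

    out = F.flashmask_attention(
        q, k, v,
        startend_row_indices=startend_row_indices,
        causal=True,
        block_mask=block_mask,
    )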

test/legacy_test/test_flashmask.py

1 addition, 1 deletion

@@ -260,7 +260,7 @@ def test_dot_scale_product(self):
             startend_row_indices=startend_row_indices,
             dropout=self.dropout,
             causal=self.causal,
-            block_mask_indices=blockmask,
+            block_mask=blockmask,
         )
         out_ = attention_naive_with_mask(q_, k_, v_, mask)
         out.backward(ograd)
