Commit ff69e59

Varlen mode for NSA layer
1 parent 31e1806 commit ff69e59

2 files changed: +58 -30 lines changed

fla/layers/nsa.py

Lines changed: 41 additions & 22 deletions
@@ -13,6 +13,7 @@
 from fla.modules import RotaryEmbedding
 from fla.ops.nsa.parallel import parallel_nsa
 from fla.ops.utils.index import prepare_lens_from_mask
+from fla.layers.utils import pad_input, unpad_input

 if TYPE_CHECKING:
     from fla.models.utils import Cache
@@ -80,26 +81,24 @@ def forward(
                 "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
             )

-        batch_size, seq_len, _ = hidden_states.size()
+        batch_size, q_len, _ = hidden_states.size()

         q = rearrange(self.q_proj(hidden_states), '... (h d) -> ... h d', d=self.head_dim)
         k = rearrange(self.k_proj(hidden_states), '... (h d) -> ... h d', d=self.head_dim)
         v = rearrange(self.v_proj(hidden_states), '... (h d) -> ... h d', d=self.head_dim)
         g = rearrange(self.g_proj(hidden_states), '... (h d) -> ... h d', d=3)
-        g_cmp, g_slc, g_swa = g.sigmoid().unbind(-1)

         cu_seqlens = kwargs.get('cu_seqlens', None)

-        seqlen_offset, max_seqlen = 0, seq_len
+        seqlen_offset, max_seqlen = 0, q_len
         if past_key_values is not None:
             seqlen_offset = past_key_values.get_seq_length(self.layer_idx)
             max_seqlen = q.shape[1] + seqlen_offset

-        # Disable for now; varlen is not supported yet, and the "correct" RoPE offsets will disturb outputs
-        # if attention_mask is not None:
-        #     # to deliminate the offsets of padding tokens
-        #     seqlen_offset = seqlen_offset + prepare_lens_from_mask(attention_mask) - attention_mask.shape[-1]
-        #     max_seqlen = q.shape[1] + max(seqlen_offset)
+        if attention_mask is not None:
+            # to deliminate the offsets of padding tokens
+            seqlen_offset = seqlen_offset + prepare_lens_from_mask(attention_mask) - attention_mask.shape[-1]
+            max_seqlen = q.shape[1] + max(seqlen_offset)

         if self.max_position_embeddings is not None:
             max_seqlen = max(max_seqlen, self.max_position_embeddings)
@@ -110,26 +109,46 @@ def forward(
             k_cached, v_cached = past_key_values.update(
                 attn_state=(k.flatten(-2, -1), v.flatten(-2, -1)),
                 layer_idx=self.layer_idx,
-                offset=seq_len,
+                offset=q_len,
             )['attn_state']
             if cache_has_content:
                 k, v = k_cached, v_cached
                 k = rearrange(k, '... (h d) -> ... h d', d=self.head_dim)
                 v = rearrange(v, '... (h d) -> ... h d', d=self.head_dim)

-        o = parallel_nsa(
-            q=q,
-            k=k,
-            v=v,
-            g_cmp=g_cmp,
-            g_slc=g_slc,
-            g_swa=g_swa,
-            block_size=self.block_size,
-            block_counts=self.block_counts,
-            window_size=self.window_size,
-            cu_seqlens=cu_seqlens,
-        )
-        o = o.reshape(batch_size, seq_len, -1)
+        if attention_mask is not None:
+            (q, g), (k, v), indices_q, cu_seqlens, max_seq_lens = unpad_input(
+                (q, g), (k, v), attention_mask, q_len, keepdim=True)
+            g_cmp, g_slc, g_swa = g.sigmoid().unbind(-1)
+            o = parallel_nsa(
+                q=q,
+                k=k,
+                v=v,
+                g_cmp=g_cmp,
+                g_slc=g_slc,
+                g_swa=g_swa,
+                block_size=self.block_size,
+                block_counts=self.block_counts,
+                window_size=self.window_size,
+                cu_seqlens=cu_seqlens,
+            ).squeeze(0)
+            o = pad_input(o, indices_q, batch_size, q_len)
+        else:
+            g_cmp, g_slc, g_swa = g.sigmoid().unbind(-1)
+            o = parallel_nsa(
+                q=q,
+                k=k,
+                v=v,
+                g_cmp=g_cmp,
+                g_slc=g_slc,
+                g_swa=g_swa,
+                block_size=self.block_size,
+                block_counts=self.block_counts,
+                window_size=self.window_size,
+                cu_seqlens=cu_seqlens,
+            )
+
+        o = o.reshape(batch_size, q_len, -1)
         o = self.o_proj(o)

         if not output_attentions:
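
What the new forward path does when an attention_mask is present: the padded batch is flattened into one packed sequence (unpad_input), the kernel runs once over the packed tokens with cu_seqlens marking the sequence boundaries, and the output is scattered back to [batch_size, q_len, ...] (pad_input). Below is a minimal plain-PyTorch sketch of that unpad -> compute -> pad round trip; toy_unpad and toy_pad are illustrative stand-ins written for this note, not the library's unpad_input/pad_input, and the NSA kernel itself is elided.

import torch

def toy_unpad(x, attention_mask):
    # Keep only non-pad positions, flattening [batch, seq_len, ...] to [total_tokens, ...];
    # this mirrors the indices_q / cu_seqlens bookkeeping done by unpad_input.
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    cu_seqlens = torch.nn.functional.pad(
        attention_mask.sum(-1).cumsum(0, dtype=torch.int32), (1, 0))
    return x.flatten(0, 1)[indices], indices, cu_seqlens

def toy_pad(x, indices, batch_size, seq_len):
    # Scatter unpadded tokens back into a zero-filled padded tensor (pad_input's job).
    out = x.new_zeros(batch_size * seq_len, *x.shape[1:])
    out[indices] = x
    return out.view(batch_size, seq_len, *x.shape[1:])

batch_size, seq_len, hidden = 2, 5, 8
hidden_states = torch.randn(batch_size, seq_len, hidden)
# Left-padded batch: 3 valid tokens in the first sequence, 5 in the second.
attention_mask = torch.tensor([[0, 0, 1, 1, 1],
                               [1, 1, 1, 1, 1]])

x, indices, cu_seqlens = toy_unpad(hidden_states, attention_mask)
print(x.shape, cu_seqlens)  # torch.Size([8, 8]) tensor([0, 3, 8], dtype=torch.int32)

# A varlen kernel (parallel_nsa in this commit) would consume the packed x and cu_seqlens here.
y = toy_pad(x, indices, batch_size, seq_len)
assert torch.equal(y[attention_mask.bool()], hidden_states[attention_mask.bool()])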

fla/layers/utils.py

Lines changed: 17 additions & 8 deletions
@@ -3,7 +3,7 @@

 # Code is adapted from flash-attn.bert_padding.py

-from typing import Tuple
+from typing import Tuple, Union

 import torch
 from einops import rearrange, repeat
@@ -99,7 +99,7 @@ def get_unpad_data(


 def unpad_input(
-    q: torch.Tensor,
+    q: Union[torch.Tensor, Tuple[torch.Tensor]],
     states: Tuple[torch.Tensor],
     attention_mask: torch.Tensor,
     q_len: int,
@@ -111,8 +111,9 @@ def unpad_input(


     Arguments:
-        q (`torch.Tensor`):
+        q (`torch.Tensor` or `Tuple[torch.Tensor]`):
             Query state with padding. Shape: [batch_size, q_len, ...].
+            When it is a tuple, do unpadding for each tensor in the tuple.
         states (`Tuple[torch.Tensor]`):
             Attention state with padding. Shape: [batch_size, seq_len, ...].
         attention_mask (`torch.Tensor`):
@@ -123,9 +124,10 @@
             Whether to keep the batch dimension. Default: `False`.

     Return:
-        q (`torch.Tensor`):
+        q (`torch.Tensor` or `Tuple[torch.Tensor]`):
             Query state without padding.
             Shape: [1, total_target_length, ...] if `keepdim=True` else [total_target_length, ...].
+            When the `q` passed in is a tuple, return a tuple of such unpadded tensors.
         states (`Tuple[torch.Tensor]`):
             Attention state without padding.
             Shape: [1, total_source_length, ...] if `keepdim=True` else [total_source_length, ...].
@@ -146,23 +148,30 @@ def unpad_input(
         index_first_axis(rearrange(s, "b s ... -> (b s) ..."), indices_k)
         for s in states
     )
+    if isinstance(q, torch.Tensor):
+        q = (q,)
+        cast_tuple = True
+    else:
+        cast_tuple = False

     if q_len == seq_len:
-        q = index_first_axis(rearrange(q, "b s ... -> (b s) ..."), indices_k)
+        q = tuple(index_first_axis(rearrange(q_, "b s ... -> (b s) ..."), indices_k) for q_ in q)
         cu_seqlens_q = cu_seqlens_k
         max_seqlen_in_batch_q = max_seqlen_in_batch_k
         indices_q = indices_k
     elif q_len == 1:
         max_seqlen_in_batch_q = 1
-        cu_seqlens_q = torch.arange(batch_size + 1, dtype=torch.int32, device=q.device)
+        cu_seqlens_q = torch.arange(batch_size + 1, dtype=torch.int32, device=q[0].device)
         indices_q = cu_seqlens_q[:-1]
-        q = q.squeeze(1)
+        q = tuple(q_.squeeze(1) for q_ in q)
     else:
         raise NotImplementedError("We only support either q_len == k_len (prefilling) or q_len == 1 (decoding)")

     if keepdim:
-        q = q.unsqueeze(0)
+        q = tuple(q_.unsqueeze(0) for q_ in q)
         state = tuple(s.unsqueeze(0) for s in state)
+    if cast_tuple:
+        q = q[0]

     return (
         q,