
Commit 0c08f88

refactor(camb)

1 parent a7a044b

7 files changed: +10 -25 lines

lmdeploy/pytorch/backends/dlinfer/apply_rotary_emb.py (+1 -2)

@@ -14,7 +14,6 @@ def forward(self,
                 key: Tensor,
                 cos: Tensor,
                 sin: Tensor,
-                cu_seqlens: Tensor,
                 inplace: bool = True):
         """forward."""
         if inplace:
@@ -23,7 +22,7 @@ def forward(self,
         else:
             q_embed = torch.empty_like(query)
             k_embed = torch.empty_like(key)
-        return apply_rotary_pos_emb(query, key, cos, sin, q_embed, k_embed, cu_seqlens)
+        return apply_rotary_pos_emb(query, key, cos, sin, q_embed, k_embed)
 
 
 class DlinferApplyRotaryEmbBuilder(ApplyRotaryEmbBuilder):
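
Note: a minimal sketch of the call path after this change, with a pure-PyTorch stand-in for the dlinfer kernel (the real op is apply_rotary_pos_emb from lmdeploy.pytorch.kernels.dlinfer). The stand-in, tensor layout, and sample shapes are assumptions for illustration only; the point is that cu_seqlens no longer threads through the rotary-embedding forward.

import torch
from torch import Tensor


def apply_rotary_pos_emb_stub(query, key, cos, sin, q_embed, k_embed):
    # Stand-in for the dlinfer kernel: mirrors the new 6-argument call
    # (no cu_seqlens) and returns the output buffers unchanged.
    if q_embed is not query:
        q_embed.copy_(query)
    if k_embed is not key:
        k_embed.copy_(key)
    return q_embed, k_embed


def forward_sketch(query: Tensor, key: Tensor, cos: Tensor, sin: Tensor,
                   inplace: bool = True):
    """Mirrors the dlinfer ApplyRotaryEmb implementation's forward after this commit."""
    if inplace:
        q_embed, k_embed = query, key      # rotate into the input buffers
    else:
        q_embed = torch.empty_like(query)  # fresh output buffers
        k_embed = torch.empty_like(key)
    return apply_rotary_pos_emb_stub(query, key, cos, sin, q_embed, k_embed)


# assumed layout: [num_tokens, num_heads, head_dim]
q = torch.randn(6, 8, 64)
k = torch.randn(6, 2, 64)
cos = torch.randn(6, 1, 64)
sin = torch.randn(6, 1, 64)
q_out, k_out = forward_sketch(q, k, cos, sin, inplace=False)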

lmdeploy/pytorch/backends/dlinfer/attention.py (+5 -1)

@@ -17,6 +17,7 @@ class DlinferAttentionMetadata(AttentionMetadata):
     max_kv_seq_len: int = 1
     cu_seqlens: Optional[Tensor] = None
     is_flash_attn_support_inplace: bool = True
+    is_mock_q_start_loc: bool = False
 
 class DlinferAttentionImpl(AttentionImpl[DlinferAttentionMetadata]):
     """dlinfer attention implementation."""
@@ -76,6 +77,7 @@ def forward(
         max_q_seq_len = attn_metadata.max_q_seq_len
         max_kv_seq_len = attn_metadata.max_kv_seq_len
         cu_seqlens = attn_metadata.cu_seqlens
+        is_mock_q_start_loc = attn_metadata.is_mock_q_start_loc
 
         # fill kv cache
         k_cache, v_cache = self.fill_kv_cache(key, value, k_cache, v_cache,
@@ -85,6 +87,9 @@ def forward(
         inplace = inplace if attn_metadata.is_flash_attn_support_inplace \
             else False
 
+        if is_mock_q_start_loc:
+            q_start_loc = cu_seqlens
+
         if inplace:
             attn_output = query[..., :self.v_head_size]
         else:
@@ -107,7 +112,6 @@ def forward(
             max_kv_seq_len=max_kv_seq_len,
             is_decoding=is_decoding,
             block_size=block_size,
-            cu_seqlens=cu_seqlens,
             attn_mask=attn_mask,
             is_unpaged_prefill=is_unpaged_prefill,
         )
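
Note: what the new flag does in the forward path, sketched with stand-ins. When a backend sets is_mock_q_start_loc, the cumulative sequence lengths are handed downstream in place of q_start_loc. The dataclass and helper below are illustrative, not the real DlinferAttentionImpl, and the assumption that cu_seqlens is the zero-prefixed cumulative query length is stated in the comments.

from dataclasses import dataclass
from typing import Optional

import torch
from torch import Tensor


@dataclass
class AttnMetadataSketch:
    # only the fields this commit touches
    cu_seqlens: Optional[Tensor] = None
    is_flash_attn_support_inplace: bool = True
    is_mock_q_start_loc: bool = False


def pick_q_start_loc(q_start_loc: Tensor, attn_metadata: AttnMetadataSketch) -> Tensor:
    """Mirrors the new `if is_mock_q_start_loc: q_start_loc = cu_seqlens` branch."""
    if attn_metadata.is_mock_q_start_loc:
        return attn_metadata.cu_seqlens
    return q_start_loc


# Two requests with 3 and 2 query tokens (values are illustrative).
q_seqlens = torch.tensor([3, 2])
q_start_loc = torch.tensor([0, 3])  # per-request start offsets
# Assumption: cu_seqlens is the zero-prefixed cumulative query length.
cu_seqlens = torch.cat([q_seqlens.new_zeros(1), q_seqlens.cumsum(0)])  # [0, 3, 5]

meta = AttnMetadataSketch(cu_seqlens=cu_seqlens, is_mock_q_start_loc=True)
print(pick_q_start_loc(q_start_loc, meta))  # tensor([0, 3, 5]) goes downstream as q_start_loc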

lmdeploy/pytorch/backends/dlinfer/camb/op_backend.py (+1)

@@ -99,6 +99,7 @@ def update_step_context(cls, step_context):
             max_kv_seq_len=max_kv_seq_len,
             cu_seqlens=cu_seqlens,
             is_flash_attn_support_inplace=False,
+            is_mock_q_start_loc=True,
         )
 
         step_context.attn_metadata = attn_metadata
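
Note: the two camb-specific switches taken together: is_flash_attn_support_inplace=False makes the attention impl fall back to a separate output buffer instead of writing into query[..., :v_head_size], and is_mock_q_start_loc=True makes it pass cu_seqlens as q_start_loc. The sketch below shows the buffer choice; the diff only shows the inplace branch, so the non-inplace allocation here is an assumption.

import torch
from torch import Tensor


def choose_attn_output(query: Tensor, v_head_size: int,
                       is_flash_attn_support_inplace: bool) -> Tensor:
    """Illustrates the inplace/non-inplace branch in the dlinfer attention forward."""
    if is_flash_attn_support_inplace:
        # reuse the leading v_head_size channels of the query buffer
        return query[..., :v_head_size]
    # camb path: flash attention cannot write in place; assumed fresh allocation
    return query.new_empty(*query.shape[:-1], v_head_size)


q = torch.randn(5, 8, 128)
out = choose_attn_output(q, v_head_size=128, is_flash_attn_support_inplace=False)
print(out.shape)  # torch.Size([5, 8, 128])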

lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py (+1 -2)

@@ -10,11 +10,10 @@ def apply_rotary_pos_emb(
     sin: Tensor,
     q_embed: Tensor = None,
     k_embed: Tensor = None,
-    cu_seqlens=None,
 ):
     query_states = query_states.contiguous()
     key_states = key_states.contiguous()
-    query_states, key_states = ext_ops.apply_rotary_pos_emb(query_states, key_states, cos, sin, None, cu_seqlens)
+    query_states, key_states = ext_ops.apply_rotary_pos_emb(query_states, key_states, cos, sin, None)
 
     if q_embed is None:
         q_embed = query_states
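
Note: for readers unfamiliar with the op behind ext_ops.apply_rotary_pos_emb, the usual rotate-half formulation of rotary position embedding is sketched below in plain PyTorch. The layout and half-rotation convention of the dlinfer extension are assumptions; this only illustrates the math, not the vendor kernel.

import torch
from torch import Tensor


def rotate_half(x: Tensor) -> Tensor:
    """Split the last dimension in half and map (x1, x2) -> (-x2, x1)."""
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)


def rotary_reference(q: Tensor, k: Tensor, cos: Tensor, sin: Tensor):
    """Classic RoPE: q' = q*cos + rotate_half(q)*sin, and likewise for k."""
    q_embed = q * cos + rotate_half(q) * sin
    k_embed = k * cos + rotate_half(k) * sin
    return q_embed, k_embed


# assumed layout: [num_tokens, num_heads, head_dim], cos/sin broadcast over heads
q = torch.randn(5, 8, 64)
k = torch.randn(5, 2, 64)
cos = torch.randn(5, 1, 64)
sin = torch.randn(5, 1, 64)
q_rot, k_rot = rotary_reference(q, k, cos, sin)
assert q_rot.shape == q.shape and k_rot.shape == k.shape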

lmdeploy/pytorch/kernels/dlinfer/pagedattention.py (+1 -16)

@@ -17,23 +17,17 @@ def prefill_attention(
     kv_seq_len: Tensor,
     max_q_seq_len: int,
     block_size: int,
-    cu_seqlens: Tensor,
     attn_mask: Sequence[Optional[Tensor]],
     is_unpaged_prefill: Optional[bool],
 ):
-    num_q_heads = query_states.shape[1]
-    num_kv_heads = value_states.shape[1]
-
     if is_unpaged_prefill:
         return ext_ops.prefill_attention(
             query_states,
             key_states,
             value_states,
-            cu_seqlens,
+            q_start_loc,
             q_seq_len,
             max_q_seq_len,
-            num_q_heads,
-            num_kv_heads,
             attn_mask,
             attn_output=attn_output,
         )
@@ -56,11 +50,6 @@ def prefill_attention(
 
 def paged_token_attention(q, k_cache, v_cache, attn_output, kv_seq_len,
                           max_kv_seq_len, block_offsets, block_size):
-    num_q_heads = q.shape[1]
-    num_kv_heads = k_cache.shape[1]
-    q = q.unsqueeze(1)
-    attn_output = attn_output.unsqueeze(1)
-
     return ext_ops.paged_decode_attention(
         q,
         k_cache,
@@ -69,8 +58,6 @@ def paged_token_attention(q, k_cache, v_cache, attn_output, kv_seq_len,
         block_size,
         kv_seq_len,
         max_kv_seq_len,
-        num_q_heads,
-        num_kv_heads,
         attn_output=attn_output,
     )
 
@@ -90,7 +77,6 @@ def paged_attention_fwd(
     max_kv_seq_len: int,
     is_decoding: bool,
     block_size: int,
-    cu_seqlens: Tensor,
     attn_mask: Sequence[Optional[Tensor]] = (),
     is_unpaged_prefill: Optional[bool] = None,
 ):
@@ -108,7 +94,6 @@ def paged_attention_fwd(
         kv_seqlens,
         max_q_seq_len,
         block_size,
-        cu_seqlens,
         attn_mask,
         is_unpaged_prefill,
     )
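
Note: the removed lines computed num_q_heads and num_kv_heads from tensor shapes (query_states.shape[1] and value_states.shape[1] for prefill, k_cache.shape[1] for decode) and forwarded them to ext_ops; dropping them suggests the updated dlinfer op signatures recover head counts from the tensors themselves. A minimal sketch of that recovery, under the layout implied by the removed code (an assumption about the actual vendor kernels):

import torch
from torch import Tensor


def infer_head_counts(query_states: Tensor, value_states: Tensor):
    """Recompute what the removed prefill code passed explicitly:
    num_q_heads from the query tensor, num_kv_heads from the value tensor."""
    num_q_heads = query_states.shape[1]
    num_kv_heads = value_states.shape[1]
    return num_q_heads, num_kv_heads


# Assumed unpaged-prefill layout: [num_tokens, num_heads, head_dim].
q = torch.randn(10, 32, 128)
v = torch.randn(10, 8, 128)
print(infer_head_counts(q, v))  # (32, 8) -- derivable inside the op, so no extra arguments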

lmdeploy/pytorch/models/internlm2.py (-2)

@@ -75,15 +75,13 @@ def forward(
         query_states, key_states, value_states = self.wqkv.split_qkv(
             qkv_states)
 
-        cu_seqlens = attn_metadata.cu_seqlens
         # apply rotary embedding
         cos, sin = rotary_pos_emb
         query_states, key_states = self.apply_rotary_pos_emb(
             query_states,
             key_states,
             cos,
             sin,
-            cu_seqlens,
             inplace=True,
         )
 
lmdeploy/pytorch/nn/rotary_embedding.py (+1 -2)

@@ -43,7 +43,6 @@ def forward(self,
                 key: Tensor,
                 cos: Tensor,
                 sin: Tensor,
-                cu_seqlens: Tensor,
                 inplace: bool = True):
         """forward."""
-        return self.impl.forward(query, key, cos, sin, cu_seqlens, inplace)
+        return self.impl.forward(query, key, cos, sin, inplace)
