Commit bce6863

Add flash mla (InternLM#3218)

* Add flash mla
* fix
* Add comment
* get_mla_metadata outside graph
* remove [:]
* refine
* refine
* remove useless and update attn meta inside cuda backend
* SM90 check
1 parent: 20770be

File tree: 15 files changed, +286 −14 lines

lmdeploy/pytorch/backends/attention.py (+3)
@@ -35,6 +35,7 @@ def __init__(
         sliding_window: int = None,
         logit_softcapping: float = None,
         causal: bool = True,
+        use_flash_mla: bool = False,
         **kwargs,
     ) -> None:
         if scale is None:
@@ -55,6 +56,7 @@ def __init__(
         self.sliding_window = sliding_window
         self.logit_softcapping = logit_softcapping
         self.causal = causal
+        self.use_flash_mla = use_flash_mla

     @abstractmethod
     def forward(
@@ -85,6 +87,7 @@ def build(
         sliding_window: int = None,
         logical_softcapping: float = None,
         causal: bool = True,
+        use_flash_mla: bool = False,
         **kwargs,
     ) -> AttentionImpl[T]:
         """build."""

lmdeploy/pytorch/backends/cuda/attention.py (+153)
@@ -22,6 +22,9 @@ class TritonAttentionMetadata(AttentionMetadata):
     fill_seqlens: torch.Tensor = None
     quant_policy: Literal[0, 4, 8] = 0
     kv_flatten_size: int = None
+    # flash mla
+    tile_scheduler_metadata: torch.Tensor = None
+    num_splits: torch.Tensor = None


 def _cdiv(a, b):
@@ -196,6 +199,144 @@ def forward(
         return attn_output


+class FlashMLAImpl(TritonAttentionImpl):
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float = None,
+        num_kv_heads: int = None,
+        v_head_size: int = None,
+        alibi: bool = False,
+        sliding_window: int = None,
+        logit_softcapping: float = None,
+        causal: bool = True,
+        **kwargs,
+    ):
+        assert sliding_window is None, 'sliding window not supported for FlashMLA'
+        assert alibi is False, 'alibi not supported for FlashMLA'
+        assert logit_softcapping is None, 'logit_softcapping not supported for FlashMLA'
+        super().__init__(
+            num_heads=num_heads,
+            head_size=head_size,
+            scale=scale,
+            num_kv_heads=num_kv_heads,
+            v_head_size=v_head_size,
+            alibi=alibi,
+            sliding_window=sliding_window,
+            logit_softcapping=logit_softcapping,
+            causal=causal,
+            **kwargs,
+        )
+
+        from lmdeploy.pytorch.kernels.cuda import flash_mla_fwd
+        self.flash_mla_fwd = flash_mla_fwd
+        assert num_kv_heads == 1, 'MLA requires num kv heads equal to 1'
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        k_cache: torch.Tensor,
+        v_cache: torch.Tensor,
+        attn_metadata: TritonAttentionMetadata,
+        k_scales_zeros: torch.Tensor = None,
+        v_scales_zeros: torch.Tensor = None,
+        inplace: bool = True,
+    ) -> torch.Tensor:
+        """forward."""
+
+        block_offsets = attn_metadata.block_offsets
+        q_start_loc = attn_metadata.q_start_loc
+        fill_q_start_loc = q_start_loc
+        q_seqlens = attn_metadata.q_seqlens
+        fill_seqlens = q_seqlens
+        kv_start_loc = attn_metadata.kv_start_loc
+        kv_seqlens = attn_metadata.kv_seqlens
+        kv_flatten_size = attn_metadata.kv_flatten_size
+        quant_policy = attn_metadata.quant_policy
+        if attn_metadata.is_decoding:
+            max_q_seqlen = 1
+        else:
+            max_q_seqlen = query.numel() // (query.size(-1) * query.size(-2))
+        fill_max_q_seqlen = max_q_seqlen
+        if attn_metadata.fill_seqlens is not None:
+            fill_seqlens = attn_metadata.fill_seqlens
+            fill_max_q_seqlen = key.numel() // (key.size(-1) * key.size(-2))
+            fill_q_start_loc = fill_seqlens.cumsum(0) - fill_seqlens

+        # fill kv cache
+        if key is not None and value is not None:
+            self.fill_kv_cache(
+                key,
+                value,
+                k_cache,
+                v_cache,
+                fill_q_start_loc,
+                fill_seqlens,
+                kv_seq_length=kv_seqlens,
+                max_q_seq_length=fill_max_q_seqlen,
+                block_offsets=block_offsets,
+                k_scales_zeros=k_scales_zeros,
+                v_scales_zeros=v_scales_zeros,
+                quant_policy=quant_policy,
+            )
+
+        q_shape = query.shape
+        o_shape = q_shape[:-1] + (self.v_head_size, )
+        attn_output = query.new_empty(o_shape)
+
+        is_decoding = attn_metadata.is_decoding
+        if is_decoding:
+            query = query.unsqueeze(1)
+            if kv_seqlens.dtype == torch.int64:
+                kv_seqlens = kv_seqlens.to(torch.int32)
+            attn_output = self.flash_mla_fwd(query,
+                                             k_cache=k_cache,
+                                             block_table=block_offsets,
+                                             cache_seqlens=kv_seqlens,
+                                             head_dim_v=self.v_head_size,
+                                             softmax_scale=self.scale,
+                                             tile_scheduler_metadata=attn_metadata.tile_scheduler_metadata,
+                                             num_splits=attn_metadata.num_splits,
+                                             causal=True)
+
+        else:
+            BLOCK_BS = k_cache.size(1)
+            # pad one more block to avoid invalid kv visit
+            out_size = (_cdiv(kv_flatten_size, BLOCK_BS) * BLOCK_BS + BLOCK_BS)
+            flatten_k, flatten_v = self.flatten_kv_cache(
+                k_cache,
+                v_cache,
+                kv_seqlens,
+                block_offsets,
+                start_loc=kv_start_loc,
+                out_size=out_size,
+                out_dtype=query.dtype,
+                k_scales_zeros=k_scales_zeros,
+                v_scales_zeros=v_scales_zeros,
+                quant_policy=quant_policy,
+            )
+            self.flash_attention_fwd(
+                query,
+                flatten_k,
+                flatten_v,
+                attn_output,
+                q_start_loc=q_start_loc,
+                q_seqlens=q_seqlens,
+                kv_start_loc=kv_start_loc,
+                kv_seqlens=kv_seqlens,
+                max_seqlen=max_q_seqlen,
+                window_size=self.sliding_window,
+                sm_scale=self.scale,
+                logit_softcapping=self.logit_softcapping,
+                causal=self.causal,
+            )
+        return attn_output
+
+
 class TritonAttentionBuilder(AttentionBuilder[TritonAttentionMetadata]):
     """triton attention builder."""

@@ -210,9 +351,21 @@ def build(
         sliding_window: int = None,
         logical_softcapping: float = None,
         causal: bool = True,
+        use_flash_mla: bool = False,
         **kwargs,
     ) -> TritonAttentionImpl:
         """build."""
+        if use_flash_mla is True:
+            return FlashMLAImpl(num_heads,
+                                head_size,
+                                scale=scale,
+                                num_kv_heads=num_kv_heads,
+                                v_head_size=v_head_size,
+                                alibi=alibi,
+                                sliding_window=sliding_window,
+                                logical_softcapping=logical_softcapping,
+                                causal=causal,
+                                **kwargs)
         return TritonAttentionImpl(num_heads,
                                    head_size,
                                    scale=scale,
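
For orientation, here is a standalone sketch of the decode path FlashMLAImpl.forward takes above: it asks flash_mla_cuda for scheduler metadata and then calls the flash_mla_fwd wrapper on the paged latent-KV cache. This is not part of the commit; it assumes FlashMLA is installed on an SM90+ GPU, and all sizes below (head count, 576 = 512 latent + 64 rope dims, block size 64) are placeholders chosen to resemble a DeepSeek-V2-style MLA layout.

# Hedged sketch, not part of this commit: exercising the decode path in isolation.
import torch
import flash_mla_cuda
from lmdeploy.pytorch.kernels.cuda import flash_mla_fwd

batch, num_heads = 2, 128                       # placeholder head count
head_dim, head_dim_v = 576, 512                 # placeholder MLA dims: 512 latent + 64 rope
block_size, num_blocks, blocks_per_seq = 64, 16, 8

# one query token per sequence, matching the is_decoding branch (query.unsqueeze(1))
query = torch.randn(batch, 1, num_heads, head_dim, dtype=torch.bfloat16, device='cuda')
k_cache = torch.randn(num_blocks, block_size, 1, head_dim, dtype=torch.bfloat16, device='cuda')
block_table = torch.arange(batch * blocks_per_seq, dtype=torch.int32,
                           device='cuda').view(batch, blocks_per_seq)
cache_seqlens = torch.tensor([70, 200], dtype=torch.int32, device='cuda')  # int32, as required above

# normally produced once per decode step in op_backend.update_step_context (next file)
tile_scheduler_metadata, num_splits = flash_mla_cuda.get_mla_metadata(cache_seqlens, num_heads, 1)

out = flash_mla_fwd(query,
                    k_cache=k_cache,
                    block_table=block_table,
                    cache_seqlens=cache_seqlens,
                    head_dim_v=head_dim_v,
                    softmax_scale=head_dim**-0.5,  # placeholder scale
                    tile_scheduler_metadata=tile_scheduler_metadata,
                    num_splits=num_splits,
                    causal=True)
print(out.shape)  # last dimension is head_dim_v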

lmdeploy/pytorch/backends/cuda/op_backend.py (+7)
@@ -125,6 +125,13 @@ def update_step_context(cls, step_context):
             kv_flatten_size=kv_flatten_size,
             quant_policy=step_context.kv_quant_policy,
         )
+        if getattr(step_context.model_config, 'use_flash_mla', False) is True:
+            if step_context.is_decoding is True:
+                import flash_mla_cuda
+                tile_scheduler_metadata, num_splits = flash_mla_cuda.get_mla_metadata(
+                    attn_metadata.kv_seqlens.to(torch.int32), step_context.model_config.num_attention_heads, 1)
+                attn_metadata.tile_scheduler_metadata = tile_scheduler_metadata
+                attn_metadata.num_splits = num_splits

         cross_seqlens = step_context.cross_seqlens
         cross_kv_seqlens = step_context.cross_kv_seqlens

lmdeploy/pytorch/config.py (+1)
@@ -108,6 +108,7 @@ class ModelConfig:
     hf_config: Any = None
     cogvlm_style: bool = False
     custom_module_map: Dict[str, setattr] = None
+    use_flash_mla: bool = False

     def get_head_size(self):
         """get head size."""

lmdeploy/pytorch/configurations/deepseek_v2.py (+4 −1)
@@ -2,6 +2,7 @@
 from lmdeploy.pytorch.config import ModelConfig

 from .builder import AutoModelConfigBuilder
+from .utils import flash_mla_available


 class DeepseekV2ModelConfigBuilder(AutoModelConfigBuilder):
@@ -23,6 +24,7 @@ def build(cls, hf_config, model_path: str = None, **kwargs):
         tp = kwargs.get('tp', 1)
         # update num_kv_heads for tp mode
         num_key_value_heads = cls.update_num_kv_heads(hf_config, tp, num_key_value_heads)
+        hf_config.use_flash_mla = flash_mla_available()

         return ModelConfig(hidden_size=hf_config.hidden_size,
                            num_layers=hf_config.num_hidden_layers,
@@ -33,4 +35,5 @@ def build(cls, hf_config, model_path: str = None, **kwargs):
                            head_dim=head_dim,
                            k_head_dim=k_head_dim,
                            v_head_dim=v_head_dim,
-                           vocab_size=hf_config.vocab_size)
+                           vocab_size=hf_config.vocab_size,
+                           use_flash_mla=hf_config.use_flash_mla)
lmdeploy/pytorch/configurations/utils.py (new file, +19)

@@ -0,0 +1,19 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from lmdeploy.utils import get_logger
+
+logger = get_logger('lmdeploy')
+
+
+def flash_mla_available():
+    """Check if flash mla is available."""
+    # use flash_mla by default if it is installed
+    use_flash_mla = False
+    try:
+        import flash_mla_cuda  # noqa
+        if torch.cuda.get_device_properties(0).major >= 9:
+            use_flash_mla = True
+    except ImportError:
+        logger.warning('For higher performance, please install flash_mla https://github.com/deepseek-ai/FlashMLA')
+    return use_flash_mla
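
A possible usage sketch (not part of the commit), assuming the helper above lives in lmdeploy/pytorch/configurations/utils.py next to the deepseek_v2.py builder that imports it:

from lmdeploy.pytorch.configurations.utils import flash_mla_available

# True only when flash_mla_cuda imports successfully and device 0 is SM90+ (Hopper);
# otherwise the helper logs an install hint and the Triton attention path is used.
if flash_mla_available():
    print('FlashMLA decode kernels will be used for MLA models')
else:
    print('Falling back to the Triton attention implementation')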

lmdeploy/pytorch/engine/engine.py (+10 −3)
@@ -40,10 +40,10 @@ class InferOutput:
     logits: torch.Tensor = None


-def _tensorlize_block_offsets(block_offsets):
+def _tensorlize_block_offsets(block_offsets, dtype=torch.int32):
     """tensorlize block_offsets."""
     from torch.nn.utils.rnn import pad_sequence
-    block_offsets = [torch.from_numpy(off) for off in block_offsets]
+    block_offsets = [torch.from_numpy(off).to(dtype) for off in block_offsets]
     block_offsets = pad_sequence(block_offsets, batch_first=True)
     return block_offsets

@@ -371,6 +371,13 @@ def model_config(self) -> ModelConfig:
     def gpu_count(self):
         return self.tp * self.dp

+    @property
+    def torch_int_dtype(self):
+        """return int32 for cuda, int64 for others."""
+        if self.executor.device_type == 'cuda':
+            return torch.int32
+        return torch.int64
+
     @logging_timer('CreateModelInputs', logger)
     def create_model_inputs(self, messages: SeqList, is_prefill: bool):
         """create model inputs from messages.
@@ -398,7 +405,7 @@ def create_model_inputs(self, messages: SeqList, is_prefill: bool):
         max_q_seq_length = seq_length.max().item()

         block_offsets = self.scheduler.get_block_tables(messages)
-        block_offsets = _tensorlize_block_offsets(block_offsets)
+        block_offsets = _tensorlize_block_offsets(block_offsets, dtype=self.torch_int_dtype)

         local_adapter_ids = None
         if self.adapter_manager.num_adapters() > 1:
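
A small illustration (not part of the commit) of what the updated _tensorlize_block_offsets produces: ragged per-sequence block tables are cast to the requested integer dtype and right-padded into one tensor, which is how CUDA backends end up with the int32 block tables FlashMLA expects.

import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence

block_offsets = [np.array([0, 1, 2]), np.array([3])]   # placeholder per-sequence block tables
tensors = [torch.from_numpy(off).to(torch.int32) for off in block_offsets]
print(pad_sequence(tensors, batch_first=True))
# tensor([[0, 1, 2],
#         [3, 0, 0]], dtype=torch.int32)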

lmdeploy/pytorch/engine/executor/base.py (+10 −5)
@@ -101,11 +101,16 @@ def _get_runtime_size(self, num_free_gpu_mem: int, cache_block_size: int, vocal_

     def _adjust_block_size(self):
         """adjust block_size."""
+        if self.model_config.use_flash_mla is True:
+            if self.cache_config.block_size != 64:
+                raise ValueError('Please set block_size to 64 for flash_mla.')
+            return
         # TODO: support kernel with both large head dim and large block size.
         if self.model_config.k_head_dim >= 512 and self.cache_config.block_size > 32:
             self.cache_config.block_size = 32
-            logger.warning(f'Update `block_size={self.cache_config.block_size}`'
-                           f' for large `head_dim={self.model_config.k_head_dim}`.')
+            logger.warning(
+                f'Update `block_size={self.cache_config.block_size}` for large `head_dim={self.model_config.k_head_dim}`.'  # noqa
+            )

     def update_configs(self):
         """update cache config."""
@@ -114,7 +119,7 @@ def update_configs(self):
         model_config = self.model_config
         free_mems = self.gather_free_mem()
         free_mem = min(free_mems)
-        logger.debug(f'minimal free gpu memory: {free_mem>>20} mb')
+        logger.debug(f'minimal free gpu memory: {free_mem >> 20} mb')
         vocal_size = self.model_config.vocab_size

         cache_block_size = CacheEngine.get_cache_block_size(cache_config.block_size, model_config, self.tp,
@@ -126,7 +131,7 @@ def update_configs(self):
             cache_config.max_prefill_token_num = max_prefill_token_num
             logger.warning(f'No enough memory. Update max_prefill_token_num={max_prefill_token_num}')
         free_mem -= runtime_mem
-        logger.debug(f'estimated max runtime memory: {runtime_mem>>20} mb')
+        logger.debug(f'estimated max runtime memory: {runtime_mem >> 20} mb')
         available_mem = free_mem * cache_config.cache_max_entry_count

         if cache_config.num_gpu_blocks == 0:
@@ -144,5 +149,5 @@ def init(self):
         self.update_configs()
         logger.info('Building GraphRunner.')
         self.build_graph_runner()
-        logger.info(f'Building CacheEngine with config:\n{self.cache_config}.')
+        logger.info(f'Building CacheEngine with config: \n{self.cache_config}.')
         self.build_cache_engine()
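
Since _adjust_block_size now refuses to silently change the block size when FlashMLA is active, the paged-KV block size has to be 64 up front. A hedged sketch of how that might look when constructing the engine; PytorchEngineConfig and its block_size field are assumed from lmdeploy's public PyTorch engine options, and the model path is a placeholder.

# Hedged sketch, not part of this commit: keep block_size at 64 so the FlashMLA
# check above passes; any other value raises the ValueError shown in the diff.
from lmdeploy import PytorchEngineConfig, pipeline

backend_config = PytorchEngineConfig(block_size=64)
pipe = pipeline('deepseek-ai/DeepSeek-V2-Lite',       # placeholder MLA model
                backend_config=backend_config)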

lmdeploy/pytorch/kernels/cuda/__init__.py (+2)
@@ -3,6 +3,7 @@
 from .alibi_pagedattention import alibi_paged_attention_fwd
 from .apply_rotary_pos_emb import apply_rotary_pos_emb
 from .fill_kv_cache import fill_kv_cache
+from .flash_mla import flash_mla_fwd
 from .flashattention import flash_attention_fwd
 from .flatten_kv_cache import flatten_kv_cache
 from .fused_moe import fused_moe
@@ -30,4 +31,5 @@
     'flash_attention_fwd',
     'flatten_kv_cache',
     'fused_moe_w8a8',
+    'flash_mla_fwd',
 ]
