Commit e13e2a1

Autotuner (#1020)
1 parent a771412 commit e13e2a1

39 files changed, +2044 -17 lines changed


lightllm/common/basemodel/basemodel.py

Lines changed: 77 additions & 2 deletions
@@ -1,6 +1,7 @@
 import os
 
 # os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+import gc
 import copy
 import json
 import torch
@@ -24,8 +25,8 @@
 from lightllm.distributed.communication_op import dist_group_manager
 from lightllm.common.basemodel.batch_objs import ModelInput, ModelOutput
 from lightllm.utils.custom_kernel_utis import pad2dim_tensor_to_new_batch
-from lightllm.utils.envs_utils import set_model_init_status
-
+from lightllm.utils.envs_utils import set_model_init_status, is_triton_autotune_enabled, disable_triton_autotune
+from lightllm.utils.infer_utils import post_empty_cache
 
 logger = init_logger(__name__)
 
@@ -100,6 +101,7 @@ def __init__(self, kvargs):
         self._init_some_value()
         self._init_custom()
         self._init_inferstate_cls()
+        self._autotune_warmup()
         self._init_padded_req()
         self._init_cudagraph()
         self._check_max_len_infer()
@@ -721,6 +723,79 @@ def _check_max_len_infer(self):
             raise Exception(exception_str)
         return
 
+    def autotune_layers(self):
+        # Controls how many layers take part in autotuning, so the warmup can adapt to different models.
+        return self.config.get("first_k_dense_replace", 0) + 1
+
+    @final
+    @torch.no_grad()
+    @post_empty_cache
+    def _autotune_warmup(self):
+        if not is_triton_autotune_enabled():
+            return
+
+        torch.distributed.barrier()
+
+        warmup_lengths = [1, 8, 16, 64, 128, 256, 1024, 2048, 4096]
+
+        if self.batch_max_tokens not in warmup_lengths:
+            warmup_lengths.append(self.batch_max_tokens)
+
+        warmup_lengths = [e for e in warmup_lengths if e <= self.batch_max_tokens]
+
+        warmup_lengths.sort(reverse=True)
+
+        layer_num_bak = self.layers_num
+        self.layers_num = self.autotune_layers()
+        for input_len in warmup_lengths:
+            try:
+                logger.info(f"autotune warmup for length {input_len}")
+                rand_gen = torch.Generator(device="cuda")
+                rand_gen.manual_seed(input_len)
+                dummy_input_ids = torch.randint(
+                    0, 10000, (input_len,), dtype=torch.int32, device="cuda", generator=rand_gen
+                )
+                b_req_idx = torch.tensor([self.req_manager.alloc()], dtype=torch.int32, device="cuda")
+                mem_indexes = self.mem_manager.alloc(len(dummy_input_ids)).cuda()
+                b_seq_len = torch.ones(1, dtype=torch.int32, device="cuda")
+                b_seq_len[:] = input_len
+                b_ready_cache_len = torch.zeros(1, dtype=torch.int32, device="cuda")
+                total_token_num = input_len
+                b_mtp_index = torch.zeros(1, dtype=torch.int32, device="cuda")
+                model_input = ModelInput(
+                    batch_size=1,
+                    total_token_num=total_token_num,
+                    max_len_in_batch=input_len,
+                    input_ids=dummy_input_ids,
+                    mem_indexes=mem_indexes,
+                    b_req_idx=b_req_idx,
+                    b_seq_len=b_seq_len,
+                    b_mtp_index=b_mtp_index,
+                    is_prefill=True,
+                    b_ready_cache_len=b_ready_cache_len,
+                    multimodal_params=[],
+                    **self._gen_special_model_input(total_token_num),
+                )
+                model_output = self.forward(
+                    model_input,
+                )
+                del model_output
+                self.req_manager.free_all()
+                self.mem_manager.free_all()
+                gc.collect()
+                torch.cuda.empty_cache()
+                logger.info(f"autotune warmup for length {input_len} ok")
+            except Exception as e:
+                logger.warning(f"autotune warmup for length {input_len} failed: {str(e)}")
+                logger.exception(str(e))
+                self.req_manager.free_all()
+                self.mem_manager.free_all()
+                gc.collect()
+                torch.cuda.empty_cache()
+        self.layers_num = layer_num_bak
+        torch.distributed.barrier()
+        disable_triton_autotune()
+
     @final
     @torch.no_grad()
     def _init_padded_req(self):
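Note: the helpers is_triton_autotune_enabled() and disable_triton_autotune() live in lightllm/utils/envs_utils.py and their bodies are not part of this diff. A minimal, hypothetical sketch of how such a process-wide toggle could work, assuming an environment-variable switch (the variable name LIGHTLLM_TRITON_AUTOTUNE below is an assumption, not the real flag):

import os

# Hypothetical sketch only; the real helpers in lightllm.utils.envs_utils may use a
# different variable name or an in-process flag.
def is_triton_autotune_enabled() -> bool:
    return os.getenv("LIGHTLLM_TRITON_AUTOTUNE", "0") == "1"

def disable_triton_autotune() -> None:
    # Called once the warmup finishes so later forward passes skip tuning.
    os.environ["LIGHTLLM_TRITON_AUTOTUNE"] = "0"

Whatever the exact mechanism, the call at the end of _autotune_warmup has the same effect: tuning happens once, up front, and normal inference then reuses whatever configurations were selected.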

lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_ep.py

Lines changed: 10 additions & 0 deletions
@@ -17,6 +17,7 @@
 )
 from lightllm.common.fused_moe.deepep_scatter_gather import ep_scatter, ep_gather
 from lightllm.common.basemodel.triton_kernel.redundancy_topk_ids_repair import redundancy_topk_ids_repair
+from lightllm.utils.envs_utils import is_triton_autotune_enabled
 from lightllm.utils.log_utils import init_logger
 
 logger = init_logger(__name__)
@@ -353,6 +354,15 @@ def prefilled_group_gemm(
             )
             # gather and local reduce
             ep_gather(gemm_out_b, recv_topk_idx, recv_topk_weights, output_index, gather_out)
+        else:
+            ######################################## warning ##################################################
+            # This branch exists for the autotune feature: every rank must launch the same triton kernels.
+            # In some special cases a rank receives 0 tokens, so feed one dummy token through the kernel.
+            if is_triton_autotune_enabled():
+                _gemm_out_a = torch.zeros((1, N), device=device, dtype=hidden_dtype)
+                _silu_out = torch.zeros((1, N // 2), device=device, dtype=hidden_dtype)
+                silu_and_mul_fwd(_gemm_out_a.view(-1, N), _silu_out)
+                _gemm_out_a, _silu_out = None, None
 
         return gather_out
 

lightllm/common/fused_moe/grouped_fused_moe.py

Lines changed: 46 additions & 0 deletions
@@ -35,6 +35,7 @@
 from .moe_sum_reduce import moe_sum_reduce
 from lightllm.common.quantization.triton_quant.fp8.fp8act_quant_kernel import per_token_group_quant_fp8
 from lightllm.utils.torch_ops_utils import direct_register_custom_op
+from lightllm.common.triton_utils.autotuner import autotune
 
 FFN_MOE_CHUNK_SIZE = 32 * 1024
 
@@ -449,6 +450,51 @@ def grouped_matmul_kernel(
     return
 
 
+def _get_grouped_matmul_static_key(
+    expert_weights: torch.Tensor,
+    topk_num: int,
+    out: torch.Tensor,
+    mul_routed_weight: bool,
+    use_fp8_w8a8: bool,
+) -> dict:
+    expert_num, n, k = expert_weights.shape
+    return {
+        "N": n,
+        "K": k,
+        "topk_num": topk_num,
+        "expert_num": expert_num,
+        "mul_routed_weight": mul_routed_weight,
+        "use_fp8_w8a8": use_fp8_w8a8,
+        "out_dtype": str(out.dtype),
+    }
+
+
+def _get_grouped_matmul_configs():
+    return [
+        {
+            "BLOCK_SIZE_M": bm,
+            "BLOCK_SIZE_N": bn,
+            "BLOCK_SIZE_K": bk,
+            "GROUP_SIZE_M": gm,
+            "num_warps": nw,
+            "num_stages": ns,
+        }
+        for ns in [1, 2, 3, 4, 5]
+        for gm in [1, 2, 4, 8]
+        for nw in [2, 4, 8]
+        for bm in [16, 32, 64, 128]
+        for bn in [16, 32, 64, 128]
+        for bk in [16, 32, 64, 128]
+    ]
+
+
+@autotune(
+    kernel_name="grouped_matmul:v1",
+    configs_gen_func=_get_grouped_matmul_configs,
+    static_key_func=_get_grouped_matmul_static_key,
+    run_key_func=lambda token_inputs: token_inputs.shape[0],
+    mutates_args=["out"],
+)
 def grouped_matmul(
     token_num_mul_topk_num: int,
     token_inputs: torch.Tensor,
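The autotuner itself lives in lightllm/common/triton_utils/autotuner.py, which this excerpt does not show (only the package __init__.py appears below). Judging purely from how @autotune is invoked here, a decorator of this shape would cache a best run_config per (static key, run key) pair and benchmark the candidate configs the first time a new key is seen. The sketch below is a hypothetical reading under those assumptions, not the commit's actual implementation:

import functools
import inspect
import time

import torch


def autotune(kernel_name, configs_gen_func, static_key_func, run_key_func, mutates_args=()):
    # Hypothetical sketch: the real decorator may persist tuned configs to disk, share them
    # across ranks, and use mutates_args to restore output buffers between benchmark runs.
    def decorator(fn):
        fn_sig = inspect.signature(fn)
        best_configs = {}  # (static_key, run_key) -> best run_config

        def _select_args(key_fn, bound_args):
            # Key functions name the subset of kernel arguments they need.
            wanted = inspect.signature(key_fn).parameters
            return key_fn(**{name: bound_args[name] for name in wanted})

        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            bound = fn_sig.bind(*args, **kwargs)
            bound.apply_defaults()
            static_key = tuple(sorted(_select_args(static_key_func, bound.arguments).items()))
            run_key = _select_args(run_key_func, bound.arguments)
            key = (static_key, run_key)

            if key not in best_configs:
                best_cfg, best_ms = None, float("inf")
                for cfg in configs_gen_func():
                    try:
                        torch.cuda.synchronize()
                        start = time.perf_counter()
                        fn(*args, **{**kwargs, "run_config": dict(cfg)})
                        torch.cuda.synchronize()
                        elapsed_ms = (time.perf_counter() - start) * 1e3
                    except Exception:
                        continue  # configs that fail to compile or launch are skipped
                    if elapsed_ms < best_ms:
                        best_cfg, best_ms = cfg, elapsed_ms
                best_configs[key] = best_cfg  # may stay None, letting the kernel use its default lookup

            return fn(*args, **{**kwargs, "run_config": best_configs[key]})

        return wrapper

    return decorator

Under this reading, the _autotune_warmup pass in basemodel.py exists to drive these decorated kernels with realistic prefill shapes while tuning is enabled, and disable_triton_autotune() afterwards freezes the cached choices for serving.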

lightllm/common/fused_moe/grouped_fused_moe_ep.py

Lines changed: 11 additions & 0 deletions
@@ -14,6 +14,7 @@
 )
 from lightllm.common.fused_moe.deepep_scatter_gather import ep_scatter, ep_gather
 from lightllm.utils.envs_utils import get_deepep_num_max_dispatch_tokens_per_rank
+from lightllm.utils.envs_utils import is_triton_autotune_enabled
 import numpy as np
 
 logger = init_logger(__name__)
@@ -186,6 +187,16 @@ def fused_experts_impl(
 
             # gather and local reduce
             ep_gather(gemm_out_b, recv_topk_idx, recv_topk_weights, output_index, gather_out)
+        else:
+            ######################################## warning ##################################################
+            # This branch exists for the autotune feature: every rank must launch the same triton kernels.
+            # In some special cases a rank receives 0 tokens, so feed one dummy token through the kernel.
+            if is_triton_autotune_enabled():
+                _gemm_out_a = torch.zeros((1, N), device=hidden_states.device, dtype=hidden_states.dtype)
+                _silu_out = torch.zeros((1, N // 2), device=hidden_states.device, dtype=hidden_states.dtype)
+                silu_and_mul_fwd(_gemm_out_a.view(-1, N), _silu_out)
+                _gemm_out_a, _silu_out = None, None
+
         # normal combine
         combined_x, _, event = buffer.combine(
             gather_out,

lightllm/common/fused_moe/moe_silu_and_mul.py

Lines changed: 23 additions & 1 deletion
@@ -3,6 +3,7 @@
 import triton
 import triton.language as tl
 from .moe_silu_and_mul_config import MoeSiluAndMulKernelConfig
+from lightllm.common.triton_utils.autotuner import autotune
 
 
 @triton.jit
@@ -62,7 +63,28 @@ def _silu_and_mul_kernel_fast(
     )
 
 
-def silu_and_mul_fwd(input: torch.Tensor, output: torch.Tensor, **run_config):
+def _get_silu_and_mul_configs():
+    return [
+        {"BLOCK_M": bm, "BLOCK_N": bn, "num_warps": nw, "NUM_STAGES": ns}
+        for ns in [1, 2, 4]
+        for nw in [1, 4, 8]
+        for bm in [32, 64, 128, 256]
+        for bn in [32, 64, 128, 256]
+    ]
+
+
+def _get_silu_and_mul_static_key(input: torch.Tensor, output: torch.Tensor):
+    return {"N": input.shape[-1] // 2, "out_dtype": str(output.dtype)}
+
+
+@autotune(
+    kernel_name="silu_and_mul_fwd:v1",
+    configs_gen_func=_get_silu_and_mul_configs,
+    static_key_func=_get_silu_and_mul_static_key,
+    run_key_func=lambda input: input.shape[0],
+    mutates_args=["output"],
+)
+def silu_and_mul_fwd(input: torch.Tensor, output: torch.Tensor, run_config=None):
     assert input.is_contiguous()
     assert output.is_contiguous()
 
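After this change callers no longer pass tuning parameters via **run_config; the decorator is expected to supply run_config itself. A usage sketch, with shapes and dtype chosen purely for illustration:

import torch
from lightllm.common.fused_moe.moe_silu_and_mul import silu_and_mul_fwd

# input packs [gate, up] along the last dim: (token_num, 2 * N); output is (token_num, N).
x = torch.randn(64, 2 * 4096, device="cuda", dtype=torch.bfloat16).contiguous()
out = torch.empty(64, 4096, device="cuda", dtype=torch.bfloat16)
silu_and_mul_fwd(x, out)  # with autotune enabled, the first call per shape benchmarks configs; later calls reuse the cache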

lightllm/common/fused_moe/moe_sum_reduce.py

Lines changed: 24 additions & 1 deletion
@@ -3,6 +3,8 @@
 import triton
 import triton.language as tl
 from .moe_sum_recude_config import MoeSumReduceKernelConfig
+from typing import Any, Callable, Dict, Optional, Tuple
+from lightllm.common.triton_utils.autotuner import autotune
 
 
 @triton.jit
@@ -46,7 +48,28 @@ def _moe_sum_reduce_kernel(
     tl.store(store_t_ptr, accumulator.to(input_ptr.dtype.element_ty), mask=offs_dim < dim_end)
 
 
-def moe_sum_reduce(input: torch.Tensor, output: torch.Tensor, **run_config):
+def _get_moe_sum_reduce_static_key(input: torch.Tensor, output: torch.Tensor):
+    return {"topk_num": input.shape[1], "hidden_dim": input.shape[2], "out_dtype": str(output.dtype)}
+
+
+def _get_moe_sum_reduce_configs():
+    return [
+        {"BLOCK_M": bm, "BLOCK_DIM": bd, "NUM_STAGE": ns, "num_warps": nw}
+        for ns in [1, 2, 4]
+        for nw in [1, 2, 4, 8, 16]
+        for bm in [1, 2, 4, 8, 16, 32]
+        for bd in [64, 128, 256, 512, 1024]
+    ]
+
+
+@autotune(
+    kernel_name="moe_sum_reduce:v1",
+    configs_gen_func=_get_moe_sum_reduce_configs,
+    static_key_func=_get_moe_sum_reduce_static_key,
+    run_key_func=lambda input: input.shape[0],
+    mutates_args=["output"],
+)
+def moe_sum_reduce(input: torch.Tensor, output: torch.Tensor, run_config: Dict = None):
    assert input.is_contiguous()
    assert output.is_contiguous()
 
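For a sense of how large the generated search spaces above are, plain arithmetic over the list-comprehension loops (nothing here comes from the commit itself beyond the lists):

# Candidate counts implied by the config generators in this commit:
grouped_matmul_cfgs = 5 * 4 * 3 * 4 * 4 * 4  # num_stages x GROUP_SIZE_M x num_warps x BLOCK_M x BLOCK_N x BLOCK_K = 3840
silu_and_mul_cfgs = 3 * 3 * 4 * 4            # = 144
moe_sum_reduce_cfgs = 3 * 5 * 6 * 5          # = 450
print(grouped_matmul_cfgs, silu_and_mul_cfgs, moe_sum_reduce_cfgs)

Spaces this large are presumably why the warmup in basemodel.py restricts itself to autotune_layers() layers and a handful of input lengths rather than tuning during the full model's normal forward passes.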

lightllm/common/quantization/triton_quant/fp8/fp8w8a8_block_gemm_kernel.py

Lines changed: 45 additions & 2 deletions
@@ -7,6 +7,7 @@
 from functools import lru_cache
 from typing import Any, Dict, List, Optional, Tuple
 from triton import Config
+from lightllm.common.triton_utils.autotuner import autotune
 
 
 class Fp8BlockMMKernelConfig(KernelConfigs):
@@ -142,6 +143,46 @@ def _block_scaled_block_gemm(
     tl.store(c_ptrs, acc, mask=mask)
 
 
+def get_test_configs():
+    fp8_gemm_configs = [
+        {"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 3, "num_warps": 8},
+        {"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 4, "num_warps": 4},
+        {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 4, "num_warps": 4},
+        {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 4, "num_warps": 4},
+        {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 4, "num_warps": 4},
+        {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 4, "num_warps": 4},
+        {"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2},
+        {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2},
+        {"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8},
+        {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8},
+        {"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 4, "num_warps": 4},
+        {"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 4, "num_warps": 4},
+        {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 4, "num_warps": 4},
+        {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4},
+        {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4},
+        {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4},
+    ]
+    return fp8_gemm_configs
+
+
+def _get_static_key(A, B, block_size, dtype):
+    M, K = A.shape
+    _, N = B.shape
+    return {
+        "N": N,
+        "K": K,
+        "block_size": block_size,
+        "out_dtype": str(dtype),
+    }
+
+
+@autotune(
+    kernel_name="w8a8_block_fp8_matmul:v1",
+    configs_gen_func=get_test_configs,
+    static_key_func=_get_static_key,
+    run_key_func=lambda A: A.shape[0],
+    mutates_args=["C"],
+)
 def w8a8_block_fp8_matmul(
     A: torch.Tensor,
     B: torch.Tensor,
@@ -150,7 +191,7 @@ def w8a8_block_fp8_matmul(
     C: torch.Tensor,
     block_size: List[int],
     dtype: torch.dtype = torch.bfloat16,
-    **run_config,
+    run_config=None,
 ) -> torch.Tensor:
     """w8a8fp8 block-wise quantization mm.
 
@@ -174,7 +215,9 @@
     assert triton.cdiv(K, block_k) == Ascale.shape[-1] and Ascale.shape[-1] == Bscale.shape[0]
     assert triton.cdiv(N, block_n) == Bscale.shape[1]
     if not run_config:
-        run_config = Fp8BlockMMKernelConfig.try_to_get_best_config(M, N, K, block_size, dtype)
+        run_config = Fp8BlockMMKernelConfig.try_to_get_best_config(
+            M=M, N=N, K=K, block_size=block_size, out_dtype=dtype
+        )
     grid = (triton.cdiv(M, run_config["BLOCK_M"]) * triton.cdiv(N, run_config["BLOCK_N"]),)
     _block_scaled_block_gemm[grid](
         A,
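The asserts in the last hunk pin down the scale layouts this kernel expects. A small sketch of the implied shapes; the concrete M, N, K and block sizes below are made up for illustration, and Ascale's leading dimension (per-token) is an assumption since the shown asserts only constrain its last dimension:

import triton

# Illustrative sizes only; block_n / block_k come from the block_size argument.
M, N, K = 256, 4096, 7168
block_n, block_k = 128, 128

a_scale_shape = (M, triton.cdiv(K, block_k))                        # one scale per token per K-block (leading dim assumed)
b_scale_shape = (triton.cdiv(K, block_k), triton.cdiv(N, block_n))  # one scale per (K-block, N-block) weight tile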

lightllm/common/triton_utils/__init__.py

Whitespace-only changes.
