Commit c595cbb

Enable cutlass fp8 kernels
Signed-off-by: Amir Klein <[email protected]>
1 parent: 90515e5

File tree: 2 files changed (+21, -17)

vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py

Lines changed: 6 additions & 3 deletions
```diff
@@ -140,10 +140,12 @@ def apply(
         expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool | None,
     ):
-        assert activation == "silu", (
-            "Only activation silu is supported in FlashInferExperts"
-        )
+        from flashinfer.fused_moe.core import ActivationType
 
+        activation_str_to_value_map = {
+            "silu": ActivationType.Swiglu,  # This is the default
+            "relu2_no_mul": ActivationType.Relu2,
+        }
         if self.quant_dtype == torch.float8_e4m3fn:
             quant_scales = [
                 self.g1_alphas,
@@ -193,6 +195,7 @@ def apply(
             ep_size=self.ep_size,
             ep_rank=self.ep_rank,
             output=output,
+            activation_type=activation_str_to_value_map[activation],
         )
```
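Instead of asserting that the activation is `silu`, `FlashInferExperts.apply` now maps the activation string to a FlashInfer `ActivationType` and forwards it to the kernel; an unsupported name fails the dict lookup with a `KeyError`. A minimal sketch of the dispatch — the `resolve_activation` helper is hypothetical, not part of the diff, but the import and enum members are taken verbatim from it:

```python
from flashinfer.fused_moe.core import ActivationType

# Mirrors the map added in FlashInferExperts.apply.
_ACTIVATION_MAP = {
    "silu": ActivationType.Swiglu,        # the default, gated activation
    "relu2_no_mul": ActivationType.Relu2,
}

def resolve_activation(name: str) -> ActivationType:
    # Hypothetical helper: same lookup, with a clearer error than a bare KeyError.
    try:
        return _ACTIVATION_MAP[name]
    except KeyError as exc:
        raise ValueError(
            f"FlashInferExperts supports {sorted(_ACTIVATION_MAP)}, got {name!r}"
        ) from exc
```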

vllm/model_executor/layers/quantization/modelopt.py

Lines changed: 15 additions & 14 deletions
```diff
@@ -354,11 +354,7 @@ def __init__(
 
         self.cutlass_fp8_supported = cutlass_fp8_supported()
         self.flashinfer_moe_backend: FlashinferMoeBackend | None = None
-        if (
-            envs.VLLM_USE_FLASHINFER_MOE_FP8
-            and has_flashinfer_moe()
-            and self.moe.is_act_and_mul
-        ):
+        if envs.VLLM_USE_FLASHINFER_MOE_FP8 and has_flashinfer_moe():
             self.flashinfer_moe_backend = get_flashinfer_moe_backend()
             logger.info_once(
                 f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels"
```
```diff
@@ -557,7 +553,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             )
 
         if self.flashinfer_moe_backend is not None:
-            layer.w13_weight.data = swap_w13_to_w31(layer.w13_weight.data)
+            if self.moe.is_act_and_mul:
+                layer.w13_weight.data = swap_w13_to_w31(layer.w13_weight.data)
             register_moe_scaling_factors(layer)
             if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
                 rotate_flashinfer_fp8_moe_weights(layer.w13_weight, layer.w2_weight)
```
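The gate/up swap is now skipped for non-gated MoE layers, where the fused `w13` weight holds only a single projection. A minimal sketch of what the swap does, assuming `w13` stores the two halves concatenated along the intermediate dimension with the gate (`w1`) half first:

```python
import torch

def swap_w13_to_w31_sketch(w13: torch.Tensor) -> torch.Tensor:
    # Assumed layout: (num_experts, 2 * intermediate_size, hidden_size).
    # FlashInfer's kernels are assumed to expect the w3 (up) half first,
    # hence the one-time reorder at load time.
    w1, w3 = w13.chunk(2, dim=1)
    return torch.cat([w3, w1], dim=1)
```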
```diff
@@ -570,13 +567,21 @@ def get_fused_moe_quant_config(
 
         return fp8_w8a8_moe_quant_config(
             w1_scale=layer.w13_weight_scale,
-            g1_alphas=(layer.w13_weight_scale * layer.w13_input_scale).squeeze(),
+            g1_alphas=layer.output1_scales_gate_scalar.squeeze()
+            if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS
+            else None,
             w2_scale=layer.w2_weight_scale,
-            g2_alphas=(layer.w2_weight_scale * layer.w2_input_scale).squeeze(),
+            g2_alphas=layer.output2_scales_scalar.squeeze()
+            if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS
+            else None,
             a1_scale=layer.w13_input_scale,
-            a1_gscale=layer.w13_input_scale,
+            a1_gscale=layer.w13_input_scale
+            if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS
+            else None,
             a2_scale=layer.w2_input_scale,
-            a2_gscale=1.0 / layer.w2_input_scale,
+            a2_gscale=layer.w2_input_scale_inv
+            if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS
+            else None,
             per_act_token_quant=False,
         )
```
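The CUTLASS-only alphas are no longer computed inline from weight and input scales; they are read from layer attributes that are assumed to be precomputed at load time (e.g. by `register_moe_scaling_factors(layer)`), and `None` is passed on the non-CUTLASS path. The correspondence implied by the replaced expressions:

```python
import torch

# Illustrative scalars; in vLLM these are per-expert tensors on the layer.
w13_weight_scale = torch.tensor([0.5])
w13_input_scale = torch.tensor([0.25])
w2_weight_scale = torch.tensor([0.75])
w2_input_scale = torch.tensor([0.125])

# What the old code computed inline on every config build:
g1_alphas = (w13_weight_scale * w13_input_scale).squeeze()
g2_alphas = (w2_weight_scale * w2_input_scale).squeeze()
a2_gscale = 1.0 / w2_input_scale

# The new attributes are assumed to hold the same values, computed once:
#   layer.output1_scales_gate_scalar ~ g1_alphas
#   layer.output2_scales_scalar      ~ g2_alphas
#   layer.w2_input_scale_inv         ~ a2_gscale
```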

```diff
@@ -660,10 +665,6 @@ def apply(
                 apply_router_weight_on_input=apply_router_weight_on_input,
             )
         elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
-            assert not renormalize
-            assert activation == "silu", (
-                f"Expected 'silu' activation but got {activation}"
-            )
             return flashinfer_cutlass_moe_fp8(
                 x,
                 layer,
```
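The CUTLASS branch no longer rejects `renormalize=True` or non-silu activations: the activation is now resolved through the `ActivationType` map in the first file, and renormalized routing weights are passed through. For context, a sketch of what the previously rejected `renormalize` flag refers to, assuming it means the usual per-token renormalization of top-k router weights in vLLM's fused MoE:

```python
import torch

def renormalize_topk_weights(router_logits: torch.Tensor, k: int):
    # After selecting the top-k experts per token, rescale their routing
    # weights so they sum to 1 for each token.
    weights, ids = torch.topk(torch.softmax(router_logits, dim=-1), k, dim=-1)
    weights = weights / weights.sum(dim=-1, keepdim=True)
    return weights, ids
```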
