
Commit 8042ac3

moe quant with dedicated kernels [wip]
Summary: Extends torchao MoE quantization support with more performant dedicated kernels. This PR supports both torch._scaled_grouped_mm and fbgemm's grouped_gemm_fp8_rowwise, though grouped_gemm_fp8_rowwise seems to be a bit buggy (a clear repro still needs to be made).

Todo: run benchmarks, debug the fbgemm kernel, add unit tests.

Test Plan:
Reviewers:
Subscribers:
Tasks:
Tags:
1 parent f0f1f6c commit 8042ac3
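
For orientation, here is a minimal usage sketch of the new fused path, condensed from the tests added in this commit. It is illustrative only: the import locations for MoEQuantConfig, cond_ffn_filter, and the quantization configs are assumed from the test's import block and the usual torchao layout, not confirmed by this diff.

import torch
import torch.nn.functional as F
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow, quantize_
# assumed locations for the prototype helpers used by the tests
from torchao.prototype.moe_quant.utils import MoEQuantConfig, cond_ffn_filter
from torchao.prototype.moe_quant.quantizable_moe_modules import MOEFeedForwardAOQuantizable
from torchao.prototype.moe_quant.kernels import fp8_dq_moe_op

# toy MoE block: hidden_dim=512, expert_dim=256, 8 experts, top_k=2 (the tests' DEFAULT_PARAMS)
model = MOEFeedForwardAOQuantizable(512, 256, 8, 2, empty_init=False).to(torch.bfloat16).to("cuda")
x = torch.randn(8, 512, dtype=torch.bfloat16, device="cuda")

# quantize expert weights to fp8 with rowwise scales and dynamic activation quantization
config = MoEQuantConfig(Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))
quantize_(model, config, cond_ffn_filter)

# route tokens the same way the module's forward does, then call the fused op directly
scores = F.softmax(model.router(x), dim=-1)
scores, expert_indices = torch.topk(scores, model.top_k, dim=-1)
scores /= scores.sum(dim=-1, keepdim=True).to(x.dtype)

experts = model.experts
out = fp8_dq_moe_op(x, experts.w1, experts.w2, experts.w3, expert_indices, scores)  # torch._scaled_grouped_mm path
out_fb = fp8_dq_moe_op(x, experts.w1, experts.w2, experts.w3, expert_indices, scores, use_fbgemm_kernel=True)  # fbgemm path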

File tree: 4 files changed (+290, -9 lines)


test/quantization/test_moe_quant.py

Lines changed: 112 additions & 1 deletion
@@ -25,20 +25,25 @@
     Int8WeightOnlyConfig,
     LinearActivationQuantizedTensor,
     quantize_,
+    PerRow,
+    PerTensor,
 )
 from torchao.quantization.utils import compute_error
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_5,
     TORCH_VERSION_AT_LEAST_2_6,
     is_sm_at_least_90,
 )
+from torchao.quantization.utils import compute_error
 
 if torch.version.hip is not None:
     pytest.skip(
         "ROCm support for MoE quantization is under development",
         allow_module_level=True,
     )
+from torchao.prototype.moe_quant.kernels import fp8_dq_moe_op
 
+torch.manual_seed(0)
 
 class TestMoEQuantCompile(unittest.TestCase):
     DEFAULT_PARAMS = (512, 256, 8, 2)  # hidden_dim, expert_dim, num_experts, top_k
@@ -68,7 +73,6 @@ def _test_impl_moe_quant(
             .to(device)
         )
         input = torch.randn(input_shape, dtype=torch.bfloat16, device=device)
-
         out = model(input)
 
         quantize_(model, config, cond_ffn_filter)
@@ -363,6 +367,113 @@ def test_fp8dq_base(self, name, num_tokens, fullgraph):
             fullgraph=fullgraph,
         )
 
+class TestFusedMoEQuant(unittest.TestCase):
+    DEFAULT_PARAMS = (512, 256, 8, 2)  # hidden_dim, expert_dim, num_experts, top_k
+
+    @parameterized.expand(
+        [
+            ("multiple_tokens", 8),
+        ]
+    )
+    def test_pytorch_scaled_grouped_gemm(self, name, num_tokens):
+        if not torch.cuda.is_available():
+            self.skipTest("Need CUDA available")
+        if not is_sm_at_least_90():
+            self.skipTest("Requires CUDA capability >= 9.0")
+
+        device = "cuda"
+        dtype = torch.bfloat16
+
+        config = MoEQuantConfig(Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))
+
+        model_params = self.DEFAULT_PARAMS
+
+        input_shape = (num_tokens, model_params[0])
+        input = torch.randn(input_shape, dtype=torch.bfloat16, device=device)
+
+        model = (
+            MOEFeedForwardAOQuantizable(*model_params, empty_init=False)
+        )
+        model = model.to(dtype).to(device)
+
+        out_orig = model(input)
+
+        quantize_(model, config, cond_ffn_filter)
+
+        w1 = model.experts.w1
+        w2 = model.experts.w2
+        w3 = model.experts.w3
+
+        router = model.router
+        top_k = model.top_k
+
+        # preprocess
+        scores = router(input)  # [T, E]
+        scores = torch.nn.functional.softmax(scores, dim=-1)
+        scores, expert_indices = torch.topk(
+            scores, top_k, dim=-1
+        )  # [T, A], [T, A]
+        scores /= scores.sum(dim=-1, keepdim=True).to(input.dtype)  # [T, A]
+
+        out = fp8_dq_moe_op(input, w1, w2, w3, expert_indices, scores)
+        out2 = model(input)
+
+        self.assertTrue(compute_error(out_orig, out) > 20)
+        self.assertTrue(compute_error(out_orig, out2) > 20)
+
+class TestFusedMoEQuantFBGEMM(unittest.TestCase):
+    DEFAULT_PARAMS = (512, 256, 8, 2)  # hidden_dim, expert_dim, num_experts, top_k
+
+    @parameterized.expand(
+        [
+            ("multiple_tokens", 8),
+        ]
+    )
+    def test_fbgemm_scaled_grouped_gemm(self, name, num_tokens):
+        if not torch.cuda.is_available():
+            self.skipTest("Need CUDA available")
+        if not is_sm_at_least_90():
+            self.skipTest("Requires CUDA capability >= 9.0")
+
+        device = "cuda"
+        dtype = torch.bfloat16
+
+        config = MoEQuantConfig(Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))
+
+        model_params = self.DEFAULT_PARAMS
+
+        input_shape = (num_tokens, model_params[0])
+        input = torch.randn(input_shape, dtype=torch.bfloat16, device=device)
+
+        model = (
+            MOEFeedForwardAOQuantizable(*model_params, empty_init=False)
+        )
+        model = model.to(dtype).to(device)
+
+        out_orig = model(input)
+
+        quantize_(model, config, cond_ffn_filter)
+
+        w1 = model.experts.w1
+        w2 = model.experts.w2
+        w3 = model.experts.w3
+
+        router = model.router
+        top_k = model.top_k
+
+        # preprocess
+        scores = router(input)  # [T, E]
+        scores = torch.nn.functional.softmax(scores, dim=-1)
+        scores, expert_indices = torch.topk(
+            scores, top_k, dim=-1
+        )  # [T, A], [T, A]
+        scores /= scores.sum(dim=-1, keepdim=True).to(input.dtype)  # [T, A]
+
+        out = fp8_dq_moe_op(input, w1, w2, w3, expert_indices, scores, use_fbgemm_kernel=True)
+        out2 = model(input)
+
+        self.assertTrue(compute_error(out_orig, out) > 20)
+        self.assertTrue(compute_error(out_orig, out2) > 20)
 
 if __name__ == "__main__":
     unittest.main()
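
Both new tests accept the fused output when compute_error(out_orig, out) > 20. compute_error is torchao's "basic SQNR" helper (it sits right next to the utils.py hunk at the bottom of this commit); assuming it follows the standard SQNR-in-dB definition, the 20 dB threshold corresponds roughly to a 10% relative error budget, as this sketch illustrates:

import torch

def sqnr_db(ref, test):
    # signal-to-quantization-noise ratio in dB: 20 * log10(||ref|| / ||ref - test||)
    # (assumed to match torchao.quantization.utils.compute_error)
    return 20 * torch.log10(torch.linalg.norm(ref) / torch.linalg.norm(ref - test))

ref = torch.randn(8, 512)
test = ref + 0.01 * torch.randn_like(ref)  # ~1% relative error
print(sqnr_db(ref, test))  # ~40 dB; the tests above require > 20 dB (~10% error)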
torchao/prototype/moe_quant/kernels.py

Lines changed: 155 additions & 0 deletions
@@ -0,0 +1,155 @@
+import torch
+import torch.nn.functional as F
+from torchao.quantization.utils import _torchtitan_available, _fbgemm_available
+
+grouped_gemm_fp8_rowwise = None
+if _fbgemm_available:
+    try:
+        from fbgemm_gpu.experimental.gemm.triton_gemm.grouped_gemm import grouped_gemm_fp8_rowwise
+    except ImportError:
+        pass
+
+__all__ = ["fp8_dq_moe_op",
+    "manual_pad",
+    "torchtitan_pad",
+]
+
+
+def fp8_dq_moe_op(input, w1, w2, w3, expert_indices, scores, fast_accum=True, use_fbgemm_kernel=False):
+    # parameters
+    orig_in_shape = input.shape
+    input = input.reshape(-1, orig_in_shape[-1])
+    num_tokens, dim = input.shape
+    num_experts, expert_dim, _ = w1.shape
+    scores = scores.view(-1, scores.shape[-1])
+    top_k = scores.shape[-1]
+    total_activations = num_tokens * top_k
+
+    # preprocess indices
+    expert_indices = expert_indices.view(-1)
+    activation_shuffle = expert_indices.argsort(stable=True)
+    token_shuffle = activation_shuffle.div(top_k).floor().to(torch.int64)
+    num_tokens_per_expert = torch.histc(expert_indices, bins=num_experts, min=0, max=num_experts)
+
+    # padding
+    alignment = 16
+    if _torchtitan_available:
+        num_ranks = 1
+        padded_indices, m_offsets = torchtitan_pad(num_tokens_per_expert, alignment, num_ranks)
+    else:
+        padded_indices, m_offsets = manual_pad(num_tokens_per_expert, alignment)
+
+    pad_len = padded_indices.shape[0]
+    valid_values = padded_indices >= 0
+
+    # get data for weights
+    w1_fp8 = w1.original_weight_tensor.tensor_impl.float8_data
+    w1_scale = w1.original_weight_tensor.tensor_impl.scale.squeeze()
+    w1_qfunc = w1.input_quant_func
+    w1_quant_kwargs = w1.quant_kwargs
+
+    w3_fp8 = w3.original_weight_tensor.tensor_impl.float8_data
+    w3_scale = w3.original_weight_tensor.tensor_impl.scale.squeeze()
+
+    w2_fp8 = w2.original_weight_tensor.tensor_impl.float8_data
+    w2_scale = w2.original_weight_tensor.tensor_impl.scale.squeeze()
+    w2_qfunc = w2.input_quant_func
+    w2_quant_kwargs = w2.quant_kwargs
+
+
+    # quantize then shuffle input
+    q_input = w1_qfunc(input, **w1_quant_kwargs)
+    q_input_data = q_input.tensor_impl.float8_data
+    q_input_scale = q_input.tensor_impl.scale.squeeze()
+    input_fp8 = torch.zeros((pad_len, q_input_data.shape[-1]), dtype=q_input_data.dtype, device=q_input_data.device)
+    input_scale = torch.zeros(pad_len, dtype=q_input_scale.dtype, device=q_input_scale.device)
+    input_fp8[valid_values] = q_input_data[token_shuffle]
+    input_scale[valid_values] = q_input_scale[token_shuffle] if q_input_scale.numel() > 1 else q_input_scale
+
+    if use_fbgemm_kernel:
+        assert grouped_gemm_fp8_rowwise is not None, "fbgemm kernel requires fbgemm-gpu-genai to be installed: https://github.com/pytorch/FBGEMM/blob/main/fbgemm_gpu/experimental/gen_ai/README.md"
+        y1 = grouped_gemm_fp8_rowwise(input_fp8, w1_fp8.reshape(-1, w1_fp8.shape[-1]), m_offsets, input_scale, w1_scale.reshape(-1), use_fast_accum=True)
+        y3 = grouped_gemm_fp8_rowwise(input_fp8, w3_fp8.reshape(-1, w3_fp8.shape[-1]), m_offsets, input_scale, w3_scale.reshape(-1), use_fast_accum=True)
+
+        y = F.silu(y1) * y3
+
+        y_q = w2_qfunc(y, **w2_quant_kwargs)
+
+        y_fp8 = y_q.tensor_impl.float8_data
+        y_scale = y_q.tensor_impl.scale.squeeze()
+        out = grouped_gemm_fp8_rowwise(y_fp8, w2_fp8.view(-1, w1_fp8.shape[-1]), m_offsets, y_scale, w2_scale.view(-1), use_fast_accum=fast_accum)
+        # unpad and combine output with weights
+        out = out[valid_values]
+        sorted_scores = scores.reshape(-1, 1)[activation_shuffle]
+        out = out * sorted_scores
+
+        # sum weighted outputs
+        final_out = torch.zeros_like(input)
+        final_out = final_out.scatter_add(
+            dim=0,
+            index=token_shuffle.unsqueeze(-1).expand(total_activations, dim).to(torch.int64),
+            src=out,
+        )
+        final_out = final_out.reshape(orig_in_shape)
+        return final_out
+
+    else:
+        y1 = torch._scaled_grouped_mm(input_fp8, w1_fp8.transpose(-2, -1), input_scale, w1_scale, offs=m_offsets, out_dtype=torch.bfloat16, use_fast_accum=fast_accum)
+        y3 = torch._scaled_grouped_mm(input_fp8, w3_fp8.transpose(-2, -1), input_scale, w3_scale, offs=m_offsets, out_dtype=torch.bfloat16, use_fast_accum=fast_accum)
+        y = F.silu(y1) * y3
+        y_q = w2_qfunc(y, **w2_quant_kwargs)
+
+        y_fp8 = y_q.tensor_impl.float8_data
+        y_scale = y_q.tensor_impl.scale.squeeze()
+        out = torch._scaled_grouped_mm(y_fp8, w2_fp8.transpose(-2, -1), y_scale, w2_scale, offs=m_offsets, out_dtype=torch.bfloat16, use_fast_accum=fast_accum)
+
+        # unpad and combine output with weights
+        out = out[valid_values]
+        sorted_scores = scores.reshape(-1, 1)[activation_shuffle]
+        out = out * sorted_scores
+
+        # sum weighted outputs
+        final_out = torch.zeros_like(input)
+        final_out = final_out.scatter_add(
+            dim=0,
+            index=token_shuffle.unsqueeze(-1).expand(total_activations, dim).to(torch.int64),
+            src=out,
+        )
+        final_out = final_out.reshape(orig_in_shape)
+        return final_out
+
+def torchtitan_pad(num_tokens_per_expert, alignment, num_ranks):
+    from torchtitan.experiments.kernels.moe.indices import generate_permute_indices
+    num_experts = num_tokens_per_expert.shape[0]
+
+    # pad to nearest multiple of alignment that's greater than 0
+    padded_sizes = (((num_tokens_per_expert + (num_tokens_per_expert == 0)) / alignment).ceil() * alignment)
+    pad_len = int(padded_sizes.sum().item())
+
+    padded_indices, _, m_offsets = generate_permute_indices(
+        num_tokens_per_expert,
+        num_experts,
+        num_ranks,
+        pad_len,
+        alignment,
+        use_cpu=False,
+    )
+    return padded_indices, m_offsets
+
+def manual_pad(num_tokens_per_expert, alignment):
+    num_experts = num_tokens_per_expert.shape[0]
+
+    padded_sizes = (((num_tokens_per_expert + (num_tokens_per_expert == 0)) / alignment).ceil() * alignment)
+    pad_len = int(padded_sizes.sum().item())
+
+    padded_indices = torch.zeros(pad_len, dtype=torch.int32, device=num_tokens_per_expert.device) - 1
+    start_tok_index = 0
+    start_pad_index = 0
+    for i in range(num_experts):
+        end_tok_index = int(start_tok_index + num_tokens_per_expert[i].item())
+        end_pad_index = int(start_pad_index + num_tokens_per_expert[i].item())
+        padded_indices[start_pad_index:end_pad_index] = torch.arange(start_tok_index, end_tok_index, dtype=torch.int32, device=num_tokens_per_expert.device)
+        start_tok_index = end_tok_index
+        start_pad_index = start_pad_index + int(padded_sizes[i].item())
+    m_offsets = padded_sizes.cumsum(0).to(torch.int32)
+    return padded_indices, m_offsets
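
The padding above exists because both grouped GEMMs take one flattened activation matrix whose rows are grouped by expert, with group boundaries given by m_offsets and each group padded up to the 16-row alignment. A small worked example of manual_pad (alignment lowered to 4 for readability; the -1 entries mark pad rows that valid_values masks back out):

import torch
from torchao.prototype.moe_quant.kernels import manual_pad

# three experts receive 3, 0, and 5 activations respectively
num_tokens_per_expert = torch.tensor([3, 0, 5])
padded_indices, m_offsets = manual_pad(num_tokens_per_expert, alignment=4)

# padded group sizes are [4, 4, 8] (each count rounded up to a positive multiple of 4), so:
# padded_indices -> [0, 1, 2, -1,  -1, -1, -1, -1,  3, 4, 5, 6, 7, -1, -1, -1]
# m_offsets      -> [4, 8, 16]  (cumulative group ends, passed as offs= to torch._scaled_grouped_mm)
valid_values = padded_indices >= 0  # mask used by fp8_dq_moe_op to scatter real rows in and strip pad rows out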

torchao/prototype/moe_quant/quantizable_moe_modules.py

Lines changed: 20 additions & 8 deletions
@@ -1,9 +1,11 @@
 import torch
+import torchao
 import torch.nn.functional as F
 from torch import Tensor, nn
 
 from torchao.prototype.moe_quant.utils import FakeExtraDimTensor
-
+from torchao.quantization.utils import _torchtitan_available
+from torchao.prototype.moe_quant.kernels import fp8_dq_moe_op
 
 class MOEFeedForwardAOQuantizable(nn.Module):
     def __init__(
@@ -28,7 +30,7 @@ def __init__(
         self.return_scores = return_scores
 
     def forward(self, x: Tensor) -> Tensor:
-        batch_size = x.shape[0]
+        shape_no_dim = x.shape[:-1]
         x = x.view(-1, self.hidden_dim)  # x: [T, D]
         scores = self.router(x)  # [T, E]
         scores = F.softmax(scores, dim=-1)
@@ -40,11 +42,12 @@
         out = self.experts(x, expert_indices, scores, self.top_k)
         if self.shared_expert:
             out += self.shared_expert(x)
-
+        out = out.reshape(*shape_no_dim, -1)
+
         if self.return_scores:
-            return out.reshape(batch_size, -1, self.hidden_dim), scores
+            return out, scores
         else:
-            return out.reshape(batch_size, -1, self.hidden_dim)
+            return out
 
 
 class ConditionalFeedForwardAOQuantizable(nn.Module):
@@ -79,7 +82,7 @@ def forward(
         self,
         x: Tensor,  # T, D
         expert_indices: Tensor,  # T, A
-        expert_weights: Tensor,  # T, A
+        scores: Tensor,  # T, A
         top_k: int,
     ) -> Tensor:
         num_tokens, _hidden_dim = x.shape
@@ -105,11 +108,20 @@
 
             # combine outputs
             final_out = (
-                (torch.cat(outs, dim=0) * expert_weights.view(-1, 1))
+                (torch.cat(outs, dim=0) * scores.view(-1, 1))
                 .sum(dim=0)
                 .reshape(x.shape)
             )
             return final_out
+
+        # fp8 dq moe
+        elif (
+            isinstance(self.w1, torchao.quantization.linear_activation_quantized_tensor.LinearActivationQuantizedTensor)
+            and isinstance(self.w1.original_weight_tensor._layout, torchao.dtypes.floatx.float8_layout.Float8Layout)
+        ):
+            final_out = fp8_dq_moe_op(x, self.w1, self.w2, self.w3, expert_indices, scores)
+            return final_out
+
         else:
             expert_list = [x for x in range(self.num_experts)]
 
@@ -172,7 +184,7 @@ def group_tokens_by_expert(
 
         # weigh outputs
         ordered_outs = torch.cat(outs, dim=0)  # [T*A, D]
-        ordered_token_activation_weights = expert_weights.view(-1, 1)[
+        ordered_token_activation_weights = scores.view(-1, 1)[
            ordered_token_activations
         ].view(-1, 1)  # [T*A, 1]
         weighted_ordered_outs = (
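
The forward() change in this file replaces the hard-coded [batch, -1, hidden_dim] reshape with shape_no_dim = x.shape[:-1], so the module now returns whatever leading shape it was given (which is what lets the new tests feed it plain 2D token batches). A quick sketch of the intent, assuming the constructor takes the same positional arguments as DEFAULT_PARAMS in the tests:

import torch
from torchao.prototype.moe_quant.quantizable_moe_modules import MOEFeedForwardAOQuantizable

moe = MOEFeedForwardAOQuantizable(512, 256, 8, 2, empty_init=False)

# 2D input (tokens, hidden) now comes back as (tokens, hidden) instead of being forced to 3D
assert moe(torch.randn(8, 512)).shape == (8, 512)

# 3D input (batch, seq, hidden) has its leading dims restored via out.reshape(*shape_no_dim, -1)
assert moe(torch.randn(2, 4, 512)).shape == (2, 4, 512)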

torchao/quantization/utils.py

Lines changed: 3 additions & 0 deletions
@@ -54,6 +54,9 @@
 
 _lm_eval_available = importlib.util.find_spec("lm_eval") is not None
 
+_torchtitan_available = importlib.util.find_spec("torchtitan") is not None
+
+_fbgemm_available = importlib.util.find_spec("fbgemm_gpu") is not None
 
 # basic SQNR
 def compute_error(x, y):
