import torch
-from typing import Optional, Tuple
+from typing import List, Optional, Tuple

import pplx_kernels as pplx
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize


+# Note: use layer.get_all_to_all() to obtain an AllToAll instance.
+# The max_num_tokens, world_size and dp_size arguments must match the
+# values used to create the AllToAll; unfortunately, there is no way(?)
+# to extract this info from the AllToAll object itself.
class PplxDispatchCombine(mk.FusedMoEQuantizeDispatchCombine):
-    def __init__(self, a2a: pplx.AllToAll):
+    def __init__(
+            self,
+            a2a: pplx.AllToAll,
+            max_num_tokens: int,
+            world_size: int,
+            dp_size: int,
+            block_shape: Optional[List[int]] = None):
        super().__init__()
        self.a2a = a2a
+        self.block_shape = block_shape
+        self.dp_num_tokens = max_num_tokens * (world_size // dp_size)

    def dispatch(
        self,
        a1: torch.Tensor,
        a1_scale: Optional[torch.Tensor],
        a2_scale: Optional[torch.Tensor],
-        topk_ids: torch.Tensor,
+        rank_topk_ids: torch.Tensor,
        num_experts: int,
        expert_map: Optional[torch.Tensor],
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        # Is this always going to be a1.device?
+        device = a1.device
+
+        per_act_token = a1_scale.numel() != 1 if a1_scale is not None else (
+            a2_scale.numel() != 1 if a2_scale is not None else False)
+
+        a1q, a1q_scale = _fp8_quantize(
+            a1,
+            a1_scale,
+            self.block_shape,
+            per_act_token,
+        )
+
+        expert_num_tokens = torch.empty(
+            num_experts,
+            dtype=torch.int32,
+            device=device,
+        )
+
+        expert_x = torch.empty(
+            (num_experts, self.dp_num_tokens, a1q.shape[-1]),
+            dtype=a1q.dtype,
+            device=device,
+        )
+
+        expert_x_scale: Optional[torch.Tensor] = None
+        if a1q.dtype.itemsize == 1:
+            float32_size = torch.float32.itemsize
+            block_size = (self.block_shape[0]
+                          if self.block_shape is not None else 1) * float32_size
+            expert_x_scale = torch.empty(
+                (
+                    num_experts,
+                    expert_x.size(1),
+                    (expert_x.size(2) + block_size - 1) // block_size,
+                ),
+                dtype=torch.float32,
+                device=device,
+            )
+
+        # This argument is optional
+        bound_m = torch.tensor([a1q.shape[0]],
+                               dtype=torch.uint32,
+                               device=device)
        self.a2a.dispatch(
-            out_expert_num_tokens,  # torch.Tensor,
-            out_expert_x,  # torch.Tensor,
-            out_expert_x_scale,  # torch.Tensor | None,
-            dp_x,  # torch.Tensor,
-            dp_x_scale,  # torch.Tensor | None,
-            indices,  # torch.Tensor,
-            bound_m,  # torch.Tensor | None,
-            do_send,  # bool = True,
-            do_recv,  # bool = True,
+            out_expert_num_tokens=expert_num_tokens,
+            out_expert_x=expert_x,
+            out_expert_x_scale=expert_x_scale,
+            dp_x=a1q,
+            dp_x_scale=a1q_scale,
+            indices=rank_topk_ids,
+            bound_m=bound_m,
        )
-        return a1q, a1q_scale, topk_ids
+        return expert_x, expert_x_scale

    def combine(
        self,
        output: torch.Tensor,
        fused_expert_output: torch.Tensor,
        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
    ) -> None:
-        self.a2a.combine(
-            out_tokens,  #: torch.Tensor,
-            indices,  #: torch.Tensor,
-            weights,  #: torch.Tensor,
-            expert_y,  #: torch.Tensor,
-            bound_m,  #: torch.Tensor | None,
-            do_send,  #: bool = True,
-            do_recv,  #: bool = True,
-        )
+        # This argument is optional
+        bound_m = torch.tensor([output.shape[0]],
+                               dtype=torch.uint32,
+                               device=output.device)

+        # TODO: assert output is the proper size

-# singleton-ish
-def get_a2a(
-    max_num_tokens: int,
-    num_experts: int,
-    experts_per_token: int,
-    rank: int,
-    world_size: int,
-    dp_size: int,
-    hidden_dim: int,
-    hidden_dim_bytes: int,
-    hidden_dim_scale_bytes: int,
-) -> pplx.AllToAll:
-    pass
+        self.a2a.combine(
+            out_tokens=output,
+            indices=topk_ids,
+            weights=topk_weights,
+            expert_y=fused_expert_output,
+            bound_m=bound_m,
+        )
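
For orientation, a minimal usage sketch of the wrapper above, not part of the diff. All names other than PplxDispatchCombine (layer, max_num_tokens, world_size, dp_size, the activation and top-k tensors, num_experts, output) are hypothetical placeholders, and the AllToAll is assumed to have been created elsewhere with the same max_num_tokens / world_size / dp_size, as the note in the diff requires.

# Hypothetical wiring; assumes layer.get_all_to_all() returns the pplx.AllToAll
# that was built with the same sizing parameters passed below.
a2a = layer.get_all_to_all()
dispatch_combine = PplxDispatchCombine(
    a2a,
    max_num_tokens=max_num_tokens,
    world_size=world_size,
    dp_size=dp_size,
    block_shape=None,
)

# dispatch() fp8-quantizes the rank-local activations and scatters them into
# per-expert receive buffers; combine() reduces the weighted expert outputs
# back into `output`.
expert_x, expert_x_scale = dispatch_combine.dispatch(
    a1, a1_scale, a2_scale, rank_topk_ids, num_experts, expert_map=None)
# ... run the fused experts on (expert_x, expert_x_scale) ...
dispatch_combine.combine(output, fused_expert_output, topk_weights, topk_ids)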