
Commit 76f1931

[WIP] expert parallel dp2ep
1 parent f4048f8 commit 76f1931

File tree

11 files changed: +483 −158 lines

run_train.sh

Lines changed: 2 additions & 2 deletions
@@ -11,8 +11,8 @@ set -ex
 # e.g.
 # LOG_RANK=0,1 NGPU=4 ./run_train.sh
 NGPU=${NGPU:-"8"}
-export LOG_RANK=${LOG_RANK:-0}
-CONFIG_FILE=${CONFIG_FILE:-"./torchtitan/models/llama3/train_configs/debug_model.toml"}
+export LOG_RANK=${LOG_RANK:-0,1}
+CONFIG_FILE=${CONFIG_FILE:-"./torchtitan/experiments/llama4/train_configs/debug_model.toml"}

 overrides=""
 if [ $# -ne 0 ]; then

torchtitan/components/checkpoint.py

Lines changed: 13 additions & 12 deletions
@@ -41,6 +41,10 @@
 LR_SCHEDULER = "lr_scheduler"
 DATALOADER = "dataloader"
 TRAIN_STATE = "train_state"
+# For now, we will manually pop the freqs_cis buffer, as we made this permanent
+# temporarily and we don't want to include it in the exported state_dict.
+# Context: https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/llama3/model.py#L404
+excluded_parameters_for_model_only = {"freqs_cis"}


 class AsyncMode(str, enum.Enum):
@@ -53,7 +57,10 @@ class ModelWrapper(Stateful):
     def __init__(self, model: nn.Module | list[nn.Module]) -> None:
         self.model = [model] if isinstance(model, nn.Module) else model
         self.cache_state_dict = {
-            k: v for sd in map(get_model_state_dict, self.model) for k, v in sd.items()
+            k: v
+            for sd in map(get_model_state_dict, self.model)
+            for k, v in sd.items()
+            if k not in excluded_parameters_for_model_only
         }

     def state_dict(self) -> dict[str, Any]:
@@ -69,7 +76,10 @@ def load_state_dict(self, state_dict: dict[str, Any]) -> None:
         # `set_model_state_dict()` does change the keys of the input state_dict,
         # we will need to reinitialize the cache_state_dict.
         self.cache_state_dict = {
-            k: v for sd in map(get_model_state_dict, self.model) for k, v in sd.items()
+            k: v
+            for sd in map(get_model_state_dict, self.model)
+            for k, v in sd.items()
+            if k not in excluded_parameters_for_model_only
         }


@@ -81,12 +91,6 @@ class SaveDone:
     pass


-# For now, we will manually pop the freqs_cis buffer, as we made this permanent
-# temporarily and we don't want to include it in the exported state_dict.
-# Context: https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/llama3/model.py#L404
-excluded_parameters_for_model_only = {"freqs_cis"}
-
-
 @torch.no_grad()
 def save_with_gc(state, checkpoint_id):
     dcp.save(state, checkpoint_id=checkpoint_id)
@@ -568,10 +572,7 @@ def _states_to_load(self, model_only: bool) -> dict[str, Any]:
         """
         # For the first step, we will only load the model weights.
         if model_only:
-            sd = self.states[MODEL].state_dict()
-            for k in excluded_parameters_for_model_only:
-                sd.pop(k, None)
-            return sd
+            return {MODEL: self.states[MODEL]}

         for exclude_key in self.exclude_from_loading:
             if exclude_key not in self.states:
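Note on the change above: excluded_parameters_for_model_only now filters freqs_cis out of the cached state dict at construction time instead of popping it later in _states_to_load. A minimal standalone sketch (toy module of my own, not torchtitan code) of why a persistent buffer appears in a state dict and how the comprehension drops it:

    import torch
    import torch.nn as nn

    excluded_parameters_for_model_only = {"freqs_cis"}

    class ToyModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.proj = nn.Linear(4, 4)
            # a persistent buffer is saved in state_dict() just like a parameter
            self.register_buffer("freqs_cis", torch.randn(4), persistent=True)

    model = ToyModel()
    cached = {
        k: v
        for k, v in model.state_dict().items()
        if k not in excluded_parameters_for_model_only
    }
    print(sorted(cached))  # ['proj.bias', 'proj.weight'] -- no 'freqs_cis'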

torchtitan/config_manager.py

Lines changed: 7 additions & 0 deletions
@@ -363,6 +363,13 @@ class Parallelism:
     The default value is 'allgather'.
     """

+    expert_parallel_degree: int = 1
+    """
+    Expert parallelism degree. 1 means disabled.
+    Currently, only "dp2ep" is supported.
+    EP degree has to be k * context_parallel_degree, where k >= 1 and data_parallel_shard_degree % k == 0.
+    """
+

 @dataclass
 class Checkpoint:
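To make the constraint on expert_parallel_degree concrete, here is a small sketch of the same divisibility check that ParallelDims._validate performs, with example degrees of my own choosing (not from this commit):

    def ep_degree_is_valid(dp_shard: int, cp: int, ep: int) -> bool:
        # ep must equal k * cp with k >= 1, and dp_shard must be divisible by k,
        # which is equivalent to: ep % cp == 0 and (dp_shard * cp) % ep == 0
        return ep % cp == 0 and (dp_shard * cp) % ep == 0

    # with data_parallel_shard_degree=4 and context_parallel_degree=2:
    assert ep_degree_is_valid(4, 2, 4)      # k = 2, and 4 % 2 == 0
    assert not ep_degree_is_valid(4, 2, 6)  # k = 3, but 4 % 3 != 0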

torchtitan/distributed/parallel_dims.py

Lines changed: 79 additions & 2 deletions
@@ -23,21 +23,23 @@ class ParallelDims:
     cp: int
     tp: int
     pp: int
+    ep: int
     world_size: int
     enable_loss_parallel: bool

     def __post_init__(self):
         self._validate()

     def _validate(self):
-        dp_replicate, dp_shard, cp, tp, pp = (
+        dp_replicate, dp_shard, cp, tp, pp, ep = (
             self.dp_replicate,
             self.dp_shard,
             self.cp,
             self.tp,
             self.pp,
+            self.ep,
         )
-        for d in (dp_replicate, cp, tp, pp):
+        for d in (dp_replicate, cp, tp, pp, ep):
             assert d >= 1, "Parallelism degree should be >= 1, except for dp_shard"

         assert dp_shard == -1 or dp_shard >= 1, " dp_shard must -1 or >=1."
@@ -50,7 +52,78 @@ def _validate(self):
                 f"cp({cp}) * tp({tp}) * pp({pp}) != WORLD_SIZE({self.world_size})"
             )

+        if ep > 1:
+            # EP would borrow all cp and some dp_shard degree
+            assert ep % cp == 0 and (dp_shard * cp) % ep == 0
+
+    def _build_mesh_with_ep(self, device_type):
+        # With ep, dp_shard and ep are derived submeshes:
+        # dp_shard = dp_shard_mod_ep * dp_shard_in_ep
+        # ep = dp_shard_in_ep * cp
+        dp_shard_mod_ep = self.dp_shard * self.cp // self.ep
+        dp_shard_in_ep = self.ep // self.cp
+
+        dims = []
+        names = []
+        for d, name in zip(
+            [
+                self.pp,
+                self.dp_replicate,
+                dp_shard_mod_ep,
+                dp_shard_in_ep,
+                self.cp,
+                self.tp,
+            ],
+            ["pp", "dp_replicate", "dp_shard_mod_ep", "dp_shard_in_ep", "cp", "tp"],
+        ):
+            # dp_shard_mod_ep is needed even if it's 1, whose FSDP wrapping
+            # helps the MoE layers do mixed precision training
+            if d > 1 or name == "dp_shard_mod_ep":
+                dims.append(d)
+                names.append(name)
+
+        logger.info(f"Building {len(dims)}-D device mesh with {names}, {dims}")
+        mesh = init_device_mesh(device_type, dims, mesh_dim_names=names)
+
+        # Create all the submesh here to ensure all required process groups are
+        # initialized:
+        # Mesh for data loading (no communication on this mesh)
+        dp_mesh_dim_names = []
+        # Mesh for param sharding
+        dp_shard_cp_mesh_dim_names = []
+        # Mesh for loss all-reduce
+        dp_cp_mesh_dim_names = []
+        # Mesh for ep
+        ep_mesh_dim_names = []
+
+        if self.dp_replicate_enabled:
+            dp_mesh_dim_names.append("dp_replicate")
+            dp_cp_mesh_dim_names.append("dp_replicate")
+        # dp_shard_mod_ep is always needed, even if it's 1
+        dp_mesh_dim_names.append("dp_shard_mod_ep")
+        dp_shard_cp_mesh_dim_names.append("dp_shard_mod_ep")
+        dp_cp_mesh_dim_names.append("dp_shard_mod_ep")
+        if "dp_shard_in_ep" in names:
+            dp_mesh_dim_names.append("dp_shard_in_ep")
+            dp_shard_cp_mesh_dim_names.append("dp_shard_in_ep")
+            dp_cp_mesh_dim_names.append("dp_shard_in_ep")
+            ep_mesh_dim_names.append("dp_shard_in_ep")
+        if self.cp_enabled:
+            dp_shard_cp_mesh_dim_names.append("cp")
+            dp_cp_mesh_dim_names.append("cp")
+            ep_mesh_dim_names.append("cp")
+
+        mesh[tuple(dp_mesh_dim_names)]._flatten(mesh_dim_name="dp")
+        mesh[tuple(dp_shard_cp_mesh_dim_names)]._flatten(mesh_dim_name="dp_shard_cp")
+        mesh[tuple(dp_cp_mesh_dim_names)]._flatten(mesh_dim_name="dp_cp")
+        mesh[tuple(ep_mesh_dim_names)]._flatten(mesh_dim_name="ep")
+
+        return mesh
+
     def build_mesh(self, device_type: str) -> DeviceMesh:
+        if self.ep > 1:
+            return self._build_mesh_with_ep(device_type)
+
         dims = []
         names = []
         for d, name in zip(
@@ -143,3 +216,7 @@ def loss_parallel_enabled(self):
     @cached_property
     def non_data_parallel_size(self):
         return self.cp * self.tp * self.pp
+
+    @property
+    def ep_enabled(self):
+        return self.ep > 1
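A worked example of the submesh arithmetic above, with degrees of my own choosing (not from this commit): on 8 GPUs with dp_shard=4, cp=2, ep=4, we get dp_shard_mod_ep = 4 * 2 // 4 = 2 and dp_shard_in_ep = 4 // 2 = 2, so the mesh is built over ["dp_shard_mod_ep", "dp_shard_in_ep", "cp"] with shape [2, 2, 2] (pp, dp_replicate, tp are 1 and dropped), and the flattened "ep" mesh spans dp_shard_in_ep * cp = 4 ranks. A sketch of just the dimension-selection logic:

    def derived_mesh_dims(pp, dp_replicate, dp_shard, cp, tp, ep):
        # mirrors _build_mesh_with_ep:
        #   dp_shard = dp_shard_mod_ep * dp_shard_in_ep, ep = dp_shard_in_ep * cp
        dp_shard_mod_ep = dp_shard * cp // ep
        dp_shard_in_ep = ep // cp
        dims, names = [], []
        for d, name in zip(
            [pp, dp_replicate, dp_shard_mod_ep, dp_shard_in_ep, cp, tp],
            ["pp", "dp_replicate", "dp_shard_mod_ep", "dp_shard_in_ep", "cp", "tp"],
        ):
            if d > 1 or name == "dp_shard_mod_ep":
                dims.append(d)
                names.append(name)
        return dims, names

    print(derived_mesh_dims(pp=1, dp_replicate=1, dp_shard=4, cp=2, tp=1, ep=4))
    # ([2, 2, 2], ['dp_shard_mod_ep', 'dp_shard_in_ep', 'cp'])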

torchtitan/experiments/llama4/infra/expert_parallel.py

Lines changed: 147 additions & 5 deletions
@@ -6,10 +6,12 @@


 from functools import partial
-from typing import Optional, Tuple
+from typing import Callable

 import torch
+import torch.distributed as dist
 import torch.nn as nn
+from torch.distributed._functional_collectives import all_to_all_single_autograd
 from torch.distributed.tensor import (
     DeviceMesh,
     distribute_module,
@@ -27,8 +29,8 @@ class TensorParallel(ParallelStyle):
     def __init__(
         self,
         *,
-        input_layouts: Optional[Tuple[Optional[Placement]]] = None,
-        output_layout: Optional[Placement] = None,
+        input_layouts: tuple[Placement | None] | None = None,
+        output_layout: Placement | None = None,
         use_local_output: bool = True,
     ):
         super().__init__()
@@ -99,8 +101,8 @@ class NoParallel(ParallelStyle):
     def __init__(
         self,
         *,
-        input_layout: Optional[Placement] = None,
-        output_layout: Optional[Placement] = None,
+        input_layout: Placement | None = None,
+        output_layout: Placement | None = None,
         use_local_output: bool = True,
     ):
         super().__init__()
@@ -141,3 +143,143 @@ def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
             ),
             partial(self._prepare_output_fn, self.output_layout, self.use_local_output),
         )
+
+
+class ExpertParallel(ParallelStyle):
+    def __init__(
+        self,
+        *,
+        input_layouts: Placement | None = None,
+        output_layouts: Placement | None = None,
+        use_local_output: bool = True,
+    ):
+        super().__init__()
+        self.input_layouts = (input_layouts or Shard(0),)
+        self.output_layouts = (output_layouts or Shard(0),)
+        self.use_local_output = use_local_output
+        self.input_splits = None
+        self.output_splits = None
+
+    # performing all-to-all dispatch on the input
+    def _prepare_input_fn(self, mod, inputs, device_mesh):
+        # annotate module input placements/sharding with input_layouts
+        routed_input, num_tokens_per_expert = inputs
+
+        # generate the input splits and output splits for all-to-all
+        with torch.no_grad():
+            num_tokens_per_expert_group = num_tokens_per_expert.new_empty(
+                num_tokens_per_expert.shape[0]
+            )
+            dist.all_to_all_single(
+                num_tokens_per_expert_group,
+                num_tokens_per_expert,
+                group=device_mesh.get_group(),
+            )
+            # NOTE: this would incur a device-to-host sync
+            self.input_splits = (
+                num_tokens_per_expert.view(device_mesh.shape[0], -1).sum(dim=1).tolist()
+            )
+            self.output_splits = (
+                num_tokens_per_expert_group.view(device_mesh.shape[0], -1)
+                .sum(dim=1)
+                .tolist()
+            )
+
+        # perform all-to-all
+        routed_input = all_to_all_single_autograd(
+            routed_input,
+            self.output_splits,
+            self.input_splits,
+            device_mesh.get_group(),
+        )
+
+        # NOTE: After this all-to-all, the routed input is put on proper EP rank.
+        # However, the num_tokens_per_expert_group is not of the final target format
+        # [#tokens for local expert 0, #tokens for local expert 1, ...]
+        # Rather, it is of the format
+        # [#tokens for local expert 0 from EP rank 0, #tokens for local expert 1 from EP rank 0, ...,
+        #  #tokens for local expert 0 from EP rank 1, #tokens for local expert 1 from EP rank 1, ...]
+        # We need to perform another shuffle to get the correct format -- this is done via the function
+        # generate_permute_indices in moe.py, which also does padding to make sure the number of tokens
+        # each expert gets locally is a multiple of ALIGN_SIZE_M.
+
+        return routed_input, num_tokens_per_expert_group
+
+    def _partition_fn(self, name, module, device_mesh):
+        # shard on the expert dimension
+        for name, param in module.named_parameters(recurse=False):
+            dist_param = nn.Parameter(distribute_tensor(param, device_mesh, [Shard(0)]))
+            module.register_parameter(name, dist_param)

+    # performing all-to-all combine on the output
+    def _prepare_output_fn(self, mod, routed_output, device_mesh):
+        routed_output = all_to_all_single_autograd(
+            routed_output,
+            self.input_splits,
+            self.output_splits,
+            device_mesh.get_group(),
+        )
+        return routed_output
+
+    def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
+        return distribute_module(
+            module,
+            device_mesh,
+            self._partition_fn,
+            self._prepare_input_fn,
+            self._prepare_output_fn,
+        )
+
+
+def expert_parallel(func: Callable) -> Callable:
+    def wrapper(
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        w3: torch.Tensor,
+        x: torch.Tensor,
+        num_tokens_per_expert: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        if isinstance(w1, DTensor):
+            w1 = w1.to_local()
+            w2 = w2.to_local()
+            w3 = w3.to_local()
+
+        if num_tokens_per_expert is not None:
+            # NOTE: In order to use torch._grouped_mm, we need to make sure
+            # the number of tokens each expert gets is a multiple of 16.
+            # The following kernel helps achieve this via padding, without
+            # incurring synchronization between device and host.
+            from torchtitan.experiments.kernels.moe.indices import (
+                generate_permute_indices,
+            )
+
+            experts_per_ep_rank = w1.shape[0]
+            num_ep_ranks = num_tokens_per_expert.shape[0] // experts_per_ep_rank
+
+            ALIGN_SIZE_M = 16
+            with torch.no_grad():
+                (
+                    permuted_indices,
+                    num_tokens_per_expert,
+                    _,  # offsets,
+                ) = generate_permute_indices(
+                    num_tokens_per_expert,
+                    experts_per_ep_rank,
+                    num_ep_ranks,
+                    ALIGN_SIZE_M,
+                )
+
+            x = torch.vstack((x, x.new_zeros((x.shape[-1]))))
+            input_shape = x.shape
+            x = x[permuted_indices, :]
+
+        out = func(w1, w2, w3, x, num_tokens_per_expert)
+
+        if num_tokens_per_expert is not None:
+            out_unpermuted = out.new_empty(input_shape)
+            out_unpermuted[permuted_indices, :] = out
+            out = out_unpermuted[:-1]
+
+        return out
+
+    return wrapper
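To illustrate how _prepare_input_fn derives the all-to-all splits, here is a tiny numeric sketch with invented token counts (2 EP ranks, 2 local experts per rank; the numbers are mine, not from this commit). input_splits sums, per destination EP rank, the tokens this rank routed to that rank's local experts; output_splits does the same over the exchanged counts, i.e. over what every rank routed to this rank's local experts:

    import torch

    ep_size = 2  # 2 EP ranks, each owning 2 of the 4 global experts

    # on EP rank 0: tokens routed to global experts 0..3
    num_tokens_per_expert = torch.tensor([3, 1, 2, 4])
    input_splits = num_tokens_per_expert.view(ep_size, -1).sum(dim=1).tolist()
    print(input_splits)  # [4, 6] -> 4 tokens stay local, 6 tokens go to EP rank 1

    # what the all_to_all of the counts would leave on rank 0, assuming EP rank 1
    # routed [5, 2] tokens to rank 0's local experts 0 and 1
    num_tokens_per_expert_group = torch.tensor([3, 1, 5, 2])
    output_splits = num_tokens_per_expert_group.view(ep_size, -1).sum(dim=1).tolist()
    print(output_splits)  # [4, 7] -> rank 0 receives 4 own tokens and 7 from rank 1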
