Commit fe22e79

moe check_vma true

committed · 1 parent adf511a · commit fe22e79

File tree: 11 files changed (+39, -36 lines)

src/MaxText/configs/models/deepseek3-test.yml

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ num_experts_per_tok: 8
 shared_experts: 1
 routed_scaling_factor: 2.5
 routed_score_func: "sigmoid"
-routed_bias: True
+routed_bias: False
 decoder_block: "deepseek"
 # MLA
 attention_type: "mla"

src/MaxText/configs/models/deepseek3-tiny.yml

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ num_experts_per_tok: 8
 shared_experts: 1
 routed_scaling_factor: 2.5
 routed_score_func: "sigmoid"
-routed_bias: True
+routed_bias: False
 decoder_block: "deepseek"
 # MLA
 attention_type: "mla"
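Both test configs flip routed_bias from True to False. For orientation only: in a DeepSeek-V3-style router (sigmoid scores plus routed_scaling_factor), a routing bias is commonly added to the scores for expert selection while the combine weights stay unbiased. The sketch below illustrates that general pattern; the function and argument names are invented and it is not a copy of MaxText's router.

import jax
import jax.numpy as jnp

def select_experts(scores, bias, num_experts_per_tok, use_routed_bias):
  # When enabled, the bias only influences *which* experts are chosen ...
  selection_scores = scores + bias if use_routed_bias else scores
  _, top_idx = jax.lax.top_k(selection_scores, num_experts_per_tok)
  # ... while the combine weights come from the unbiased scores.
  top_weights = jnp.take_along_axis(scores, top_idx, axis=-1)
  return top_idx, top_weights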

src/MaxText/kernels/megablox/backend.py

Lines changed: 2 additions & 2 deletions
@@ -522,7 +522,7 @@ def out_transform_indices(n_i, grid_id, k_i, group_metadata, group_offset):
   }
   call_gmm = qpl.pallas_call(
       kernel,
-      out_shape=jax.ShapeDtypeStruct((m, n), preferred_element_type),
+      out_shape=jax.ShapeDtypeStruct((m, n), preferred_element_type, vma=set(["fsdp"])),
       grid_spec=pltpu.PrefetchScalarGridSpec(
           num_scalar_prefetch=2,
           in_specs=[

@@ -775,7 +775,7 @@ def out_transform_indices(n_i, k_i, grid_id, group_metadata, group_offset):
   }
   call_gmm = qpl.pallas_call(
       kernel,
-      out_shape=jax.ShapeDtypeStruct((num_actual_groups, k, n), preferred_element_type),
+      out_shape=jax.ShapeDtypeStruct((num_actual_groups, k, n), preferred_element_type, vma=set()),
       grid_spec=pltpu.PrefetchScalarGridSpec(
           num_scalar_prefetch=2,
           in_specs=[
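These two out_shape edits declare, for each pallas_call result, the set of mesh axes the output varies over (its "vma"), which shard_map's check_vma uses to type-check values inside the manual region: the (m, n) result is declared as varying along the "fsdp" mesh axis, while the (num_actual_groups, k, n) result is declared as varying along no manual axis, i.e. replicated. A minimal sketch of the same idea, assuming a recent JAX where jax.ShapeDtypeStruct accepts the vma keyword (as this diff relies on); the shapes and dtype below are illustrative only.

import jax
import jax.numpy as jnp

# Output that is different on every "fsdp" shard inside the shard_map region.
per_shard_out = jax.ShapeDtypeStruct((512, 1024), jnp.bfloat16, vma=set(["fsdp"]))

# Output that is identical (replicated) across all manual mesh axes.
replicated_out = jax.ShapeDtypeStruct((8, 1024, 1024), jnp.bfloat16, vma=set())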

src/MaxText/layers/attention_mla.py

Lines changed: 2 additions & 3 deletions
@@ -24,7 +24,6 @@
 import jax.numpy as jnp

 from flax import nnx
-from flax import linen as nn

 from MaxText.common_types import (
     Array,

@@ -65,7 +64,7 @@
 from MaxText.layers.linears import DenseGeneral
 from MaxText.layers.normalizations import RMSNorm
 from MaxText.layers.quantizations import AqtQuantization as Quant
-from MaxText.sharding import maybe_shard_with_logical
+from MaxText.sharding import maybe_shard_with_logical, create_sharding


 @dataclasses.dataclass(frozen=True)

@@ -314,7 +313,7 @@ def __init__(
   def _create_sharding(self, axis_names):
     """Creates NamedSharding if shard_mode is EXPLICIT, otherwise None."""
     if self.config.shard_mode == ShardMode.EXPLICIT:
-      return NamedSharding(self.mesh, nn.logical_to_mesh_axes(axis_names))
+      return create_sharding(self.mesh, axis_names)
     return None

   def _get_logical_names(self, model_mode):
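This file shows the substitution that repeats through the rest of the commit: the inlined NamedSharding(mesh, nn.logical_to_mesh_axes(...)) is replaced by MaxText.sharding.create_sharding(mesh, axis_names). Reading the two sides of the diff, the helper behaves like the pattern it replaces; the sketch below is that inferred equivalence, not MaxText's actual implementation, which may add details such as filtering logical rules against the mesh.

from flax import linen as nn
from jax.sharding import Mesh, NamedSharding

def create_sharding(mesh: Mesh, logical_axes) -> NamedSharding:
  # Resolve logical axis names (e.g. "activation_batch") to physical mesh axes
  # via the active flax logical-axis rules, then wrap them in a NamedSharding.
  return NamedSharding(mesh, nn.logical_to_mesh_axes(logical_axes))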

src/MaxText/layers/attentions.py

Lines changed: 1 addition & 1 deletion
@@ -474,7 +474,7 @@ def _init_projections(self, inputs_q_shape: Tuple, inputs_kv_shape: Tuple) -> No
   def _create_sharding(self, axis_names):
     """Creates NamedSharding if shard_mode is EXPLICIT, otherwise None."""
     if self.config.shard_mode == ShardMode.EXPLICIT:
-      return NamedSharding(self.mesh, nn.logical_to_mesh_axes(axis_names))
+      return create_sharding(self.mesh, axis_names)
     return None

   def _get_logical_names(self, model_mode):

src/MaxText/layers/deepseek.py

Lines changed: 7 additions & 7 deletions
@@ -19,7 +19,7 @@
 from functools import partial

 from jax.ad_checkpoint import checkpoint_name
-from jax.sharding import Mesh, NamedSharding
+from jax.sharding import Mesh
 import jax.numpy as jnp

 from flax import linen as nn

@@ -33,7 +33,7 @@
 from MaxText.layers import quantizations
 from MaxText.layers.quantizations import AqtQuantization as Quant
 from MaxText.inference import page_manager
-from MaxText.sharding import maybe_shard_with_logical
+from MaxText.sharding import maybe_shard_with_logical, create_sharding
 from MaxText.common_types import MODEL_MODE_PREFILL

 # -----------------------------------------

@@ -74,7 +74,7 @@ def self_attention_with_norm(
       mesh=mesh,
       shard_mode=cfg.shard_mode,
   )
-  lnx_out_sharding = NamedSharding(mesh, nn.logical_to_mesh_axes(logical_axis_names))
+  lnx_out_sharding = create_sharding(mesh, logical_axis_names)

   lnx = _maybe_shard_with_logical(lnx, logical_axis_names)


@@ -184,8 +184,8 @@ def __call__(
     logical_axis_names = ("activation_batch", "activation_norm_length", "activation_embed")
     mlp_logical_axis_names = ("activation_batch", "activation_norm_length", "activation_mlp")
     _maybe_shard_with_logical = partial(maybe_shard_with_logical, mesh=self.mesh, shard_mode=self.config.shard_mode)
-    lnx_out_sharding = NamedSharding(self.mesh, nn.logical_to_mesh_axes(logical_axis_names))
-    mlp_intermediate_sharding = NamedSharding(self.mesh, nn.logical_to_mesh_axes(mlp_logical_axis_names))
+    lnx_out_sharding = create_sharding(self.mesh, logical_axis_names)
+    mlp_intermediate_sharding = create_sharding(self.mesh, mlp_logical_axis_names)
     inputs = _maybe_shard_with_logical(inputs, logical_axis_names)
     inputs = checkpoint_name(inputs, "decoder_layer_input")


@@ -263,8 +263,8 @@ def __call__(
     logical_axis_names = ("activation_batch", "activation_norm_length", "activation_embed")
     mlp_logical_axis_names = ("activation_batch", "activation_norm_length", "activation_mlp")
     _maybe_shard_with_logical = partial(maybe_shard_with_logical, mesh=self.mesh, shard_mode=self.config.shard_mode)
-    lnx_out_sharding = NamedSharding(self.mesh, nn.logical_to_mesh_axes(logical_axis_names))
-    lnx_intermediate_sharding = NamedSharding(self.mesh, nn.logical_to_mesh_axes(mlp_logical_axis_names))
+    lnx_out_sharding = create_sharding(self.mesh, logical_axis_names)
+    lnx_intermediate_sharding = create_sharding(self.mesh, mlp_logical_axis_names)
     inputs = _maybe_shard_with_logical(inputs, logical_axis_names)
     inputs = checkpoint_name(inputs, "decoder_layer_input")
src/MaxText/layers/embeddings.py

Lines changed: 3 additions & 3 deletions
@@ -26,7 +26,7 @@

 from MaxText import max_logging
 from MaxText import max_utils
-from MaxText.sharding import logical_to_mesh_axes
+from MaxText.sharding import logical_to_mesh_axes, create_sharding
 from MaxText.common_types import ShardMode, MODEL_MODE_PREFILL, MODEL_MODE_TRAIN, Array, Config, DType
 from MaxText.layers import nnx_wrappers
 from MaxText.layers.initializers import Initializer, default_embed_init, variable_to_logically_partitioned

@@ -745,7 +745,7 @@ def __init__(
     self.mesh = mesh
     self.shard_mode = shard_mode
     self.freqs_sharding = (
-        NamedSharding(mesh, nn.logical_to_mesh_axes(("activation_batch", "activation_length_no_exp", "q_heads")))
+        create_sharding(mesh, ("activation_batch", "activation_length_no_exp", "q_heads"))
         if shard_mode == ShardMode.EXPLICIT
         else None
     )

@@ -873,7 +873,7 @@ def __call__(self, inputs: Array, position: None | Array = None) -> Array:
     inputs_complex = first_half + 1j * second_half  # shape: [B, S, N, half_dim]
     # Apply the rotary transformation via complex multiplication.
     rotated_sharding = (
-        NamedSharding(self.mesh, nn.logical_to_mesh_axes(("activation_batch", "activation_length_no_exp", None, None)))
+        create_sharding(self.mesh, ("activation_batch", "activation_length_no_exp", None, None))
         if self.shard_mode == ShardMode.EXPLICIT
         else None
     )

src/MaxText/layers/llama2.py

Lines changed: 1 addition & 1 deletion
@@ -139,7 +139,7 @@ def __init__(
   def _create_sharding(self, axis_names):
     """Creates NamedSharding if shard_mode is EXPLICIT, otherwise None."""
     if self.config.shard_mode == ShardMode.EXPLICIT:
-      return NamedSharding(self.mesh, nn.logical_to_mesh_axes(axis_names))
+      return create_sharding(self.mesh, axis_names)
     return None

   def _get_logical_names(self, model_mode):

src/MaxText/layers/moe.py

Lines changed: 14 additions & 16 deletions
@@ -33,7 +33,7 @@
 from MaxText import max_logging
 from MaxText import max_utils
 from MaxText.common_types import ShardMode
-from MaxText.sharding import maybe_shard_with_logical
+from MaxText.sharding import maybe_shard_with_logical, create_sharding
 from MaxText.kernels import megablox as mblx
 from MaxText.sharding import logical_to_mesh_axes
 from MaxText.layers import attentions, linears, nnx_wrappers, quantizations

@@ -264,9 +264,7 @@ def __call__(self, inputs: jax.Array, _initializing: bool = False) -> Tuple[jax.

     # [B, S, E] -> [B, S, num_exp]
     output_sharding = (
-        NamedSharding(
-            self.mesh, nn.logical_to_mesh_axes(("activation_batch_no_exp", "activation_length_no_exp", "activation_exp"))
-        )
+        create_sharding(self.mesh, ("activation_batch_no_exp", "activation_length_no_exp", "activation_exp"))
         if self.shard_mode == ShardMode.EXPLICIT
         else None
     )

@@ -505,7 +503,7 @@ def _get_logical_names(self, model_mode):
   def _create_sharding(self, axis_names):
     """Creates NamedSharding if shard_mode is EXPLICIT, otherwise None."""
     if self.config.shard_mode == ShardMode.EXPLICIT:
-      return NamedSharding(self.mesh, nn.logical_to_mesh_axes(axis_names))
+      return create_sharding(self.mesh, axis_names)
     return None

   def setup_sharding(self, model_mode):

@@ -1015,15 +1013,15 @@ def gmm(inputs, kernel, tiling, group_sizes, expert_assignments):
       output = output[: hs_shape[0]]
       return output

-    input_partition_pspec = nn.logical_to_mesh_axes(self.logical_names.inputs)
-    w0_bias_pspec = nn.logical_to_mesh_axes(self.logical_names.wi_bias)
-    w1_bias_pspec = nn.logical_to_mesh_axes(self.logical_names.wi_bias)
-    wo_bias_pspec = nn.logical_to_mesh_axes(self.logical_names.wo_bias)
-    gate_logits_pspec = nn.logical_to_mesh_axes(self.logical_names.gate)
-    pre_bias_logits_pspec = nn.logical_to_mesh_axes(self.logical_names.pre_bias)
-    w0_pspec = nn.logical_to_mesh_axes(self.logical_names.wi_kernel_sp)
-    w1_pspec = nn.logical_to_mesh_axes(self.logical_names.wi_kernel_sp)
-    wo_pspec = nn.logical_to_mesh_axes(self.logical_names.wo_kernel_sp)
+    input_partition_pspec = logical_to_mesh_axes(self.logical_names.inputs, self.mesh)
+    w0_bias_pspec = logical_to_mesh_axes(self.logical_names.wi_bias, self.mesh)
+    w1_bias_pspec = logical_to_mesh_axes(self.logical_names.wi_bias, self.mesh)
+    wo_bias_pspec = logical_to_mesh_axes(self.logical_names.wo_bias, self.mesh)
+    gate_logits_pspec = logical_to_mesh_axes(self.logical_names.gate, self.mesh)
+    pre_bias_logits_pspec = logical_to_mesh_axes(self.logical_names.pre_bias, self.mesh)
+    w0_pspec = logical_to_mesh_axes(self.logical_names.wi_kernel_sp, self.mesh)
+    w1_pspec = logical_to_mesh_axes(self.logical_names.wi_kernel_sp, self.mesh)
+    wo_pspec = logical_to_mesh_axes(self.logical_names.wo_kernel_sp, self.mesh)

     if isinstance(w0_kernel, aqt.QTensor):
       w0_pspec = aqt.partition_spec(w0_pspec, (1,), w0_kernel.dtype, use_bias=False)

@@ -1047,8 +1045,8 @@ def gmm(inputs, kernel, tiling, group_sizes, expert_assignments):
             wo_bias_pspec,
             None,
         ),
-        out_specs=(nn.logical_to_mesh_axes(self.logical_names.out)),
-        check_vma=False,
+        out_specs=(logical_to_mesh_axes(self.logical_names.out, self.mesh)),
+        check_vma=True,
     )
     def wrapper(x, logits, pre_bias_logits, w0, w1, wo, w0_bias, w1_bias, wo_bias, rngs):
       batch_size, sequence_length, _ = x.shape
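This is the change the commit title refers to: the shard_map that wraps the grouped matmul now runs with check_vma=True, so JAX tracks which mesh axes every value in the manual region varies over and verifies that out_specs is consistent with that, instead of skipping the check. Below is a self-contained toy example of the mechanism, assuming a JAX version where jax.shard_map exposes check_vma (older releases call the flag check_rep); it is not MaxText's gmm wrapper.

import numpy as np
import jax
import jax.numpy as jnp
from jax.sharding import Mesh, PartitionSpec as P

mesh = Mesh(np.array(jax.devices()).reshape(-1), ("fsdp",))

def block_sum(x):
  # psum removes the "fsdp" variance: every shard ends up with the same total.
  return jax.lax.psum(x, "fsdp")

# With check_vma=True, JAX verifies that out_specs=P() (fully replicated) is
# consistent with the result really being invariant along "fsdp".
f = jax.shard_map(block_sum, mesh=mesh, in_specs=P("fsdp"), out_specs=P(), check_vma=True)
print(f(jnp.arange(2.0 * mesh.size)))  # same values on every shard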

src/MaxText/sharding.py

Lines changed: 2 additions & 0 deletions
@@ -51,6 +51,8 @@ def maybe_shard_with_logical(inputs, logical_axes, mesh, shard_mode):
   """
   A wrapper of maybe_shard_with_name when logical axes are inputs
   """
+  if inputs is None:
+    return None
   named_sharding = create_sharding(mesh, logical_axes)
   return maybe_shard_with_name(inputs, named_sharding, shard_mode)

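The new early return lets callers hand an optional tensor to maybe_shard_with_logical without a special case at the call site. A hedged usage sketch; the logical axis names and shard mode below are illustrative, not taken from this commit.

import numpy as np
import jax
from jax.sharding import Mesh
from MaxText.common_types import ShardMode
from MaxText.sharding import maybe_shard_with_logical

mesh = Mesh(np.array(jax.devices()).reshape(-1), ("fsdp",))

# An optional tensor that is None (e.g. disabled by config) now passes
# straight through instead of failing inside create_sharding.
out = maybe_shard_with_logical(None, ("exp", "mlp"), mesh, ShardMode.EXPLICIT)
assert out is None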
