mlcommons
diff --git a/algoperf/workloads/cifar/cifar_jax/workload.py b/algoperf/workloads/cifar/cifar_jax/workload.py
Lines changed: 0 additions & 11 deletions
diff --git a/algoperf/workloads/lm/input_pipeline.py b/algoperf/workloads/lm/input_pipeline.py
Lines changed: 1 addition & 1 deletion
diff --git a/algoperf/workloads/lm/lm_jax/models.py b/algoperf/workloads/lm/lm_jax/models.py
Lines changed: 3 additions & 2 deletions
diff --git a/algoperf/workloads/lm/lm_jax/workload.py b/algoperf/workloads/lm/lm_jax/workload.py
Lines changed: 71 additions & 11 deletions
diff --git a/algoperf/workloads/lm/lm_pytorch/workload.py b/algoperf/workloads/lm/lm_pytorch/workload.py
Lines changed: 70 additions & 59 deletions
@@ -71,17 +71,6 @@ def _build_input_queue(
                                      cache,
                                      repeat_final_dataset)
 
-  def sync_batch_stats(
-      self, model_state: spec.ModelAuxiliaryState) -> spec.ModelAuxiliaryState:
-    """Sync the batch statistics across replicas."""
-    # An axis_name is passed to pmap which can then be used by pmean.
-    # In this case each device has its own version of the batch statistics
-    # and we average them.
-    avg_fn = jax.pmap(lambda x: lax.pmean(x, 'x'), 'x')
-    new_model_state = model_state.copy()
-    new_model_state['batch_stats'] = avg_fn(model_state['batch_stats'])
-    return new_model_state
-
   def init_model_fn(
       self,
       rng: spec.RandomState,
 
@@ -87,7 +87,7 @@ def batch_iterator():
         tokens = jax.nn.one_hot(token_ids, num_classes=vocab_size)
         inputs, targets = tokens[:, :-1], tokens[:, 1:]
         inputs, targets = jax.device_put(inputs), jax.device_put(targets)
-      yield inputs, targets
+      yield {'inputs': inputs, 'targets': targets}
 
   return batch_iterator()
 
 
@@ -7,12 +7,13 @@ class LinearModel(nn.Module):
     @nn.compact
     def __call__(self, inputs: jnp.ndarray) -> jnp.ndarray:
         x = nn.Dense(
-            512,
+            10,  # feature width of the first (hidden) Dense layer
             kernel_init=nn.initializers.normal(0.02),
             bias_init=nn.initializers.zeros
         )(inputs)
         return nn.Dense(
             self.vocab_size,
             kernel_init=nn.initializers.normal(0.02),
-            bias_init=nn.initializers.zeros
+            bias_init=nn.initializers.zeros,
+            name="output"  # explicit name for the final projection layer
         )(x)
@@ -2,33 +2,57 @@
 
 from typing import Dict, Optional, Tuple
 
+import jax
 import jax.numpy as jnp
+import optax
 from flax import jax_utils
 from algoperf import param_utils
+from algoperf import sharding_utils
 from algoperf import spec
 from algoperf.workloads.lm.workload import BaseLmWorkload
 from algoperf.workloads.lm.lm_jax.models import LinearModel
+from algoperf.workloads.lm.input_pipeline import get_hf_dataloader
 
 
 class LmWorkload(BaseLmWorkload):
   """LM JAX workload."""
 
+  def _build_input_queue(self,
+                         data_rng: jax.random.PRNGKey,
+                         split: str,
+                         data_dir: str,
+                         global_batch_size: int,
+                         num_batches: Optional[int] = None,
+                         repeat_final_dataset: bool = False):
+    """Return the HuggingFace FineWeb dataloader for `split`, set up for JAX.
+
+    `num_batches` and `repeat_final_dataset` are accepted but ignored."""
+    del num_batches, repeat_final_dataset
+    return get_hf_dataloader(
+        cache_dir=data_dir,
+        data_rng=data_rng,
+        batch_size=global_batch_size,
+        seq_len=self._seq_len,
+        framework="jax",
+        split=split)
+
   def init_model_fn(
       self,
       rng: spec.RandomState,
       dropout_rate: Optional[float] = None,
       aux_dropout_rate: Optional[float] = None) -> spec.ModelInitState:
 
-    model = LinearModel(vocab_size=self._vocab_size)
+    self._model = LinearModel(vocab_size=self._vocab_size)
     input_shape = (1, self._seq_len, self._vocab_size)
-    variables = model.init(rng, jnp.ones(input_shape, jnp.float32))
-    model_state, params = variables.pop('params')
-    
+    # Split so the parameter key is derived from `rng`; the second key is unused.
+    params_rng, _ = jax.random.split(rng)
+    variables = jax.jit(self._model.init)(
+        {'params': params_rng}, jnp.ones(input_shape, jnp.float32))
+    params = variables['params']
     self._param_shapes = param_utils.jax_param_shapes(params)
     self._param_types = param_utils.jax_param_types(self._param_shapes)
-    model_state = jax_utils.replicate(model_state)
-    params = jax_utils.replicate(params)
-    
+    params = sharding_utils.shard_replicated(params)
+    model_state = None  # only 'params' is used; no auxiliary state is returned
     return params, model_state
 
   def model_fn(
@@ -40,15 +64,51 @@ def model_fn(
       rng: spec.RandomState,
       update_batch_norm: bool) -> Tuple[spec.Tensor, spec.ModelAuxiliaryState]:
 
-    del mode, rng, update_batch_norm  # Not used for linear model
+    del mode, rng, update_batch_norm, model_state
     inputs = batch['inputs']
-    logits = self._model.apply({'params': params, **model_state}, inputs)
-    return logits, model_state
+    logits = self._model.apply({'params': params}, inputs)
+    return logits, None
+
+  def loss_fn(
+      self,
+      label_batch: spec.Tensor,
+      logits_batch: spec.Tensor,
+      mask_batch: Optional[spec.Tensor] = None,
+      label_smoothing: float = 0.0) -> Dict[str, spec.Tensor]:
+    """Cross-entropy loss; labels may be one-hot or dense class indices."""
+    # NOTE: label_smoothing is accepted for interface parity but not applied.
+    log_probs = jax.nn.log_softmax(logits_batch, axis=-1)
+    if len(label_batch.shape) == len(logits_batch.shape):
+      # One-hot labels: reduce only the vocab axis so the loss stays per-position.
+      loss = -jnp.sum(label_batch * log_probs, axis=-1)
+    else:
+      # Dense labels: gather each target id's log-prob along the vocab axis.
+      picked = jnp.take_along_axis(log_probs, label_batch[..., None], axis=-1)
+      loss = -picked[..., 0]
+
+    if mask_batch is not None:
+      loss = loss * mask_batch
+    n_valid = mask_batch.sum() if mask_batch is not None else label_batch.shape[0]
+    return {
+        'summed': loss.sum(),
+        'n_valid_examples': n_valid,
+        'per_example': loss
+    }
 
+  def is_output_params(self, param_name: str) -> bool:
+    """Return True when `param_name` contains the substring 'output'."""
+    return 'output' in param_name  # str has no .contains(); use the `in` operator
+    
  def _eval_batch(self,
                   params: spec.ParameterContainer,
                   batch: Dict[str, spec.Tensor],
                   model_state: spec.ModelAuxiliaryState,
                   rng: spec.RandomState) -> spec.Tensor:
     """Evaluate the model on a single batch."""
-    pass
+    logits, _ = self.model_fn(
+        params, batch, model_state, spec.ForwardPassMode.EVAL, rng, False)
+    targets = batch['targets']
+    
+    # Summed (not averaged) cross-entropy; presumes one-hot targets - TODO confirm.
+    loss = -jnp.sum(targets * jax.nn.log_softmax(logits, axis=-1))
+    return loss
@@ -66,68 +66,38 @@ def _build_input_queue(
       global_batch_size: int,
       num_batches: Optional[int] = None,
       repeat_final_dataset: bool = False) -> Iterator[Dict[str, spec.Tensor]]:
-    not_train = split != 'train'
-    per_device_batch_size = int(global_batch_size / N_GPUS)
-
-    seq_len = self._seq_len  # TODO: define it somewehere else?
-    dtype = torch.int32  # TODO: decide between int32 and int64.
-
-    # Only create and iterate over tf input pipeline in one Python process to
-    # avoid creating too many threads.
-    if RANK == 0:
-      np_iter = super()._build_input_queue(
-          data_rng=data_rng,
-          split=split,
-          data_dir=data_dir,
-          global_batch_size=global_batch_size,
-          num_batches=num_batches,
-          repeat_final_dataset=repeat_final_dataset)
+    """Build an input queue for the given split."""
+    from algoperf.workloads.lm.input_pipeline import get_hf_dataloader
+    
+    loader = get_hf_dataloader(
+        cache_dir=data_dir,
+        data_rng=data_rng,
+        batch_size=global_batch_size,
+        seq_len=self._seq_len,
+        framework="torch",
+        split=split)
+    seq_len = self._seq_len 
     weights = None
-
-    while True:
-      # Only iterate over tf input pipeline in one Python process to
-      # avoid creating too many threads.
-      if RANK == 0:
-        batch = next(np_iter)  # pylint: disable=stop-iteration-return
-        inputs = torch.as_tensor(
-            batch['inputs'], dtype=dtype,
-            device=DEVICE)  # (N_GPUS, global_batch_size, seq_len)
-        targets = torch.as_tensor(
-            batch['targets'], dtype=dtype,
-            device=DEVICE)  # (N_GPUS, global_batch_size, seq_len)
-
-        # Send batch to other devices when using DDP.
-        if USE_PYTORCH_DDP:
-          if not_train:
-            # During eval, the batch size of the remainder might be different.
-            per_device_batch_size = torch.tensor(
-                len(targets[0]), dtype=dtype, device=DEVICE)
-            dist.broadcast(per_device_batch_size, src=0)
-          # We don't broadcast the shard for RANK 0.
-          dist.broadcast(inputs[1:], src=0)
-          dist.broadcast(targets[1:], src=0)
-
-        # RANK 0 extracts his shard. If not DDP, this just flattens.
-        inputs, targets = inputs[0], targets[0]
-
-      else:
-        # Receive batch from rank 0.
-        if not_train:
-          # During eval, the batch size of the remainder might be different.
-          per_device_batch_size = torch.empty((1,), dtype=dtype, device=DEVICE)
+    
+    dtype = torch.long
+    is_train = split == 'train'
+    
+    for batch in loader:
+      inputs, targets = batch
+      
+      if USE_PYTORCH_DDP:
+        if not is_train:
+          # During eval, the batch size of the remainder might be different
+          per_device_batch_size = torch.tensor(
+              len(targets[0]), dtype=dtype, device=DEVICE)
           dist.broadcast(per_device_batch_size, src=0)
-
-        # N_GPUS - 1 since we don't broadcast the shard for RANK 0.
-        inputs = torch.empty((N_GPUS - 1, per_device_batch_size, seq_len),
-                             dtype=dtype,
-                             device=DEVICE)
-        targets = torch.empty((N_GPUS - 1, per_device_batch_size, seq_len),
-                              dtype=dtype,
-                              device=DEVICE)
+        
+        # Broadcast to all devices
         dist.broadcast(inputs, src=0)
         dist.broadcast(targets, src=0)
-        # RANK - 1 since we don't broadcast the shard for RANK 0.
-        inputs, targets = inputs[RANK - 1], targets[RANK - 1]
+      
+      if weights is None:
+        weights = torch.ones(inputs.shape[0], device=DEVICE)
 
       if weights is None:
         weights = torch.ones(per_device_batch_size, device=DEVICE)
@@ -138,10 +108,51 @@ def _build_input_queue(
       }
       yield batch
 
+  def is_output_params(self, param_name: str) -> bool:
+    """Tell whether `param_name` names the output layer's weight or bias."""
+    return any(key in param_name for key in ('output.weight', 'output.bias'))
+    
  def _eval_batch(self,
                   params: spec.ParameterContainer,
                   batch: Dict[str, spec.Tensor],
                   model_state: spec.ModelAuxiliaryState,
                   rng: spec.RandomState) -> spec.Tensor:
     """Evaluate the model on a single batch."""
-    pass
+    model = params  # alias only; forwarded unchanged as model_fn's first argument
+    logits, _ = self.model_fn(
+        model, batch, model_state, spec.ForwardPassMode.EVAL, rng, False)
+    targets = batch['targets']
+    
+    # Summed (not averaged) cross-entropy; presumes one-hot targets - TODO confirm.
+    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
+    loss = -torch.sum(targets * log_probs)
+    return loss
+  def loss_fn(
+      self,
+      label_batch: spec.Tensor,
+      logits_batch: spec.Tensor,
+      mask_batch: Optional[spec.Tensor] = None,
+      label_smoothing: float = 0.0) -> Dict[str, spec.Tensor]:
+    """Cross-entropy loss; labels may be one-hot or dense class indices."""
+    # NOTE: label_smoothing is accepted for interface parity but not applied.
+    vocab_size = logits_batch.shape[-1]
+    if len(label_batch.shape) == len(logits_batch.shape):
+      # One-hot labels: reduce only the vocab axis so the loss stays per-position.
+      log_probs = torch.nn.functional.log_softmax(logits_batch, dim=-1)
+      loss = -torch.sum(label_batch * log_probs, dim=-1)
+    else:
+      # Dense labels: cross_entropy expects the class axis at dim 1, so
+      # flatten to (N, vocab) and restore the label shape afterwards.
+      loss = torch.nn.functional.cross_entropy(
+          logits_batch.reshape(-1, vocab_size),
+          label_batch.reshape(-1),
+          reduction='none').reshape(label_batch.shape)
+
+    if mask_batch is not None:
+      loss = loss * mask_batch
+    n_valid = mask_batch.sum() if mask_batch is not None else label_batch.shape[0]
+    return {
+        'summed': loss.sum(),
+        'n_valid_examples': n_valid,
+        'per_example': loss
+    }