[WIP] LM Workload #860
Draft: rka97 wants to merge 88 commits into dev from lm_workload (base: dev)
Changes from all commits (88 commits):
1d81455  Merge pull request #847 from mlcommons/dev (priyakasimbeg)
da5f85a  first LM commit (Niccolo-Ajroldi)
a12a364  lm data pipeline (Niccolo-Ajroldi)
ca83ab8  testing (Niccolo-Ajroldi)
e3e78dc  LM workload tested torch pipeline (Niccolo-Ajroldi)
e619495  LM workload - fix torch tests (Niccolo-Ajroldi)
d8e9c56  add LM tests, remove dev files (Niccolo-Ajroldi)
6b4ff12  add LM tests, remove dev files (Niccolo-Ajroldi)
3c5c847  Stop tracking .gitignore (Niccolo-Ajroldi)
20d841b  Remove dev/ from repo, keep locally (Niccolo-Ajroldi)
f3ba059  fix comments (Niccolo-Ajroldi)
381451f  add class specifications (Niccolo-Ajroldi)
f111d2e  add workload LM info (Niccolo-Ajroldi)
808d398  restore data_utils.py tree map (Niccolo-Ajroldi)
35f8f89  fixed NFS bug (Niccolo-Ajroldi)
cbb6ee6  train/val split before concat (Niccolo-Ajroldi)
868987c  renamed datasets to avoid conflict with HF (Niccolo-Ajroldi)
8191f6d  Merge remote-tracking branch 'upstream/lm_workload' into lm_workload (Niccolo-Ajroldi)
dd59ded  renamed datasets to dataset (Niccolo-Ajroldi)
496b9c3  fix style (Niccolo-Ajroldi)
50989eb  fix formatting (Niccolo-Ajroldi)
5af0fdc  fix style (Niccolo-Ajroldi)
2683099  fix style (Niccolo-Ajroldi)
6b7ee29  fix yapf (Niccolo-Ajroldi)
46b645b  fix style (Niccolo-Ajroldi)
b3ae647  HF datasets pipeline (rka97)
f095d4b  Testing with linear model (rka97)
4189ae0  Merge branch 'jit_switch' into lm_workload (rka97)
0c22f3d  lm workload with linear model (rka97)
99c7b9b  add nanodo model (rka97)
706d9f7  torch model (rka97)
c335e34  lm workload dataset integration in jax (rka97)
2d54365  lm workload dataset integration in jax (rka97)
af8cce4  set package versions for transformers and datasets (priyakasimbeg)
d68c54e  use train_test_split method to shuffle and split fineweb-edu dataset (priyakasimbeg)
9737367  modifications to fwedu datasetup (priyakasimbeg)
1bf0750  rename fwedu data dir (priyakasimbeg)
a333391  fix (priyakasimbeg)
05dc4dd  add back batch mapping in tokenization for fwedu (priyakasimbeg)
b374cf8  debugging (priyakasimbeg)
c0c1e3c  debugging (priyakasimbeg)
f76dc39  debugging (priyakasimbeg)
e805fa7  use tfds to shuffle and split dataset (priyakasimbeg)
362cbda  Merge remote-tracking branch 'origin/dev' into lm_workload (rka97)
c9e9abc  add command for fineweb-edu (priyakasimbeg)
e4323de  fix (priyakasimbeg)
f0c6e75  update calls to sharing utils (priyakasimbeg)
f4ffbe7  Fix torch sharding issue, update input pipeline and workload classes … (rka97)
5c85c7e  test working, lm workload training not working (debugging) (rka97)
a59dfda  updates to input_pipeline and model spec (priyakasimbeg)
1c3cb66  add defaults for lm workload (priyakasimbeg)
af91b12  refactor eval pipeline and loss fn for lm (priyakasimbeg)
6b55adf  refactor evaluation pipeline for lm (priyakasimbeg)
210d671  remove temporary flag for hlo dumps (priyakasimbeg)
0ad7788  fix in workload target condition check (priyakasimbeg)
01921d5  fix in mlp for glu (priyakasimbeg)
e420450  Fix OOM error in weighted cross entropy calculation (rka97)
3b31ad5  fix issue with checkpointing bool (rka97)
bbc114f  increase buffer size (priyakasimbeg)
f531b35  Merge branch 'lm_workload_priya' of github.com:mlcommons/algorithmic-… (priyakasimbeg)
2b162e8  remove _eval_batch from jax workload (priyakasimbeg)
617e1a3  add todo for pytorch _eval_batch cleanup (priyakasimbeg)
bebc80a  Merge pull request #891 from mlcommons/lm_workload_priya (rka97)
64ea658  add target setting algorithm for fineweb edu lm workload (priyakasimbeg)
b38ade0  update step hint for lm workload (priyakasimbeg)
65369f2  update target (priyakasimbeg)
6171b2d  update eval split sizes for lm workload and target setting point (priyakasimbeg)
d7a885c  Porting workload input pipeline to torch (rka97)
f111aea  Merge branch 'lm_workload' of github.com:mlcommons/algorithmic-effici… (rka97)
1f0439a  Fix OOM bug in lm eval (rka97)
b11c193  repeat dataset (rka97)
42d1d1a  label smoothing default fix (priyakasimbeg)
c334c97  finish merge (priyakasimbeg)
d95f2bf  Make sure to take the correct number of batches in lm (rka97)
7deb070  Merge branch 'lm_workload' of github.com:mlcommons/algorithmic-effici… (rka97)
0dc16db  Properly handle repetition in LM training and evaluation splits (rka97)
7edb702  move eval_batch from shared class to framework specific classes since… (priyakasimbeg)
0879e68  finish merge (priyakasimbeg)
73e3ea6  Refactor imports and clean up unused code in LM workload and related … (rka97)
91988af  pass linter checks (rka97)
bb4a380  Refactor loss function in LM workloads to unify label handling and im… (rka97)
a58fbd5  Fix init in both models to be the same, add lm model diff test (rka97)
b59afa0  Refactor model configuration classes to make them consistent between … (rka97)
d35cdde  Add query-key normalization to CausalAttn and Attention classes, incl… (rka97)
ffb8163  update target (priyakasimbeg)
2cc9dff  Merge branch 'lm_workload' of github.com:mlcommons/algorithmic-effici… (priyakasimbeg)
202e5cb  add pytorch nadamw_target_setting (priyakasimbeg)
98e491a  docker updates for a100 (priyakasimbeg)
The diff adds a single new file (+153 lines), the TF input pipeline for the LM dataset:

```python
"""Input pipeline for an LM dataset."""

import functools
import os
from typing import Optional

import jax
import tensorflow as tf

from algoperf import data_utils

AUTOTUNE = tf.data.experimental.AUTOTUNE
PAD_ID = tf.constant(-1, dtype=tf.int64)

TFDS_SPLIT_NAME = {'train': 'train', 'eval_train': 'train', 'validation': 'val'}

SEQUENCE_LENGTH = 1024
MAX_CORPUS_CHARS = 1_000_000_000
SHUFFLE_BUFFER_SIZE = 1000
VOCAB_SIZE = 50_257


def batch_with_padding(
    dataset: tf.data.Dataset,
    batch_size,
    padded_shapes=None,
    padding_id=PAD_ID,
):
  """Batches a tf.data.Dataset and pads the final batch if len(dataset) is
  not divisible by the batch size.

  Args:
    dataset: tf.data.Dataset
    batch_size: batch size of the resulting batched dataset
    padded_shapes: shapes to which the batches are padded
    padding_id: value used to pad elements in the new batch

  Returns:
    The batched dataset, with the remainder batch padded to `batch_size`.
  """
  batched_dataset = dataset.batch(batch_size, drop_remainder=False)

  # tf.data.Dataset.padded_batch pads elements within each batch, so we call
  # it again with batch_size=1 to pad each element of the original batch.
  padded_batched_dataset = batched_dataset.padded_batch(
      1, padded_shapes=padded_shapes, padding_values=padding_id
  )

  # Remove the extra dimension resulting from batch_size=1.
  padded_batched_dataset = padded_batched_dataset.unbatch()

  return padded_batched_dataset


def get_data_iter(
    data_rng: jax.random.PRNGKey,
    split: str,
    data_dir: str,
    batch_size: int,
    num_batches: Optional[int] = None,
):
  ds = get_lm_dataset(data_rng, split, data_dir, batch_size, num_batches)

  it = map(
      functools.partial(
          data_utils.shard_and_maybe_pad_np, global_batch_size=batch_size
      ),
      ds,
  )

  return iter(it)


def get_lm_dataset(
    data_rng: jax.random.PRNGKey,
    split: str,
    data_dir: str,
    batch_size: int,
    num_batches: Optional[int] = None,
):
  """Load a preprocessed TF dataset."""
  if split not in TFDS_SPLIT_NAME:
    raise NotImplementedError

  shuffle_seed = jax.random.randint(data_rng, (), -(2**31), 2**31 - 1)

  data_dir = os.path.join(data_dir, TFDS_SPLIT_NAME[split])
  tokens_ds = tf.data.Dataset.load(data_dir)

  # Flatten the stored documents into a stream of tokens.
  tokens_ds = tokens_ds.flat_map(tf.data.Dataset.from_tensor_slices)

  # Chunk the token stream into fixed-length sequences.
  sequences_ds = tokens_ds.batch(SEQUENCE_LENGTH + 1, drop_remainder=True)

  # Split each sequence into inputs and next-token targets.
  sequences_ds = sequences_ds.map(
      lambda x: {
          'inputs': x['input_ids'][:SEQUENCE_LENGTH],
          'targets': x['input_ids'][1:],
      },
      num_parallel_calls=AUTOTUNE,
  )

  if split == 'train':
    ds = sequences_ds.shuffle(SHUFFLE_BUFFER_SIZE, seed=shuffle_seed)
    ds = ds.batch(batch_size, drop_remainder=False)
    ds = ds.take(num_batches) if num_batches is not None else ds
    ds = ds.repeat()
    ds = ds.map(
        lambda x: {
            'inputs': x['inputs'],
            'targets': x['targets'],
            'weights': None,
        }
    )
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
  elif split == 'eval_train':
    ds = batch_with_padding(
        sequences_ds,
        batch_size,
        padded_shapes={
            'inputs': (batch_size, None),
            'targets': (batch_size, None),
        },
    )
    ds = ds.take(num_batches) if num_batches is not None else ds
    ds = ds.repeat()
    ds = ds.map(
        lambda x: {
            'inputs': x['inputs'],
            'targets': x['targets'],
            'weights': tf.where(tf.equal(x['inputs'], PAD_ID), 0.0, 1.0),
        }
    )
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
  elif split == 'validation':
    ds = batch_with_padding(
        sequences_ds,
        batch_size,
        padded_shapes={
            'inputs': (batch_size, None),
            'targets': (batch_size, None),
        },
    )
    ds = ds.take(num_batches) if num_batches is not None else ds
    ds = ds.repeat()
    ds = ds.map(
        lambda x: {
            'inputs': x['inputs'],
            'targets': x['targets'],
            'weights': tf.where(tf.equal(x['inputs'], PAD_ID), 0.0, 1.0),
        }
    )
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
  return ds
```
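The double-batching trick in `batch_with_padding` is easy to miss: `padded_batch` pads elements within a batch, so wrapping each already-formed batch into a batch of size 1 pads the short remainder batch itself. A toy sketch of the behavior (illustrative values only, assuming the `batch_with_padding` defined above is in scope):

```python
import tensorflow as tf

PAD_ID = tf.constant(-1, dtype=tf.int64)

# Five rows of length 3; with batch_size=2 the last batch has only one row.
toy = tf.data.Dataset.from_tensor_slices(
    tf.reshape(tf.range(15, dtype=tf.int64), (5, 3)))

padded = batch_with_padding(toy, batch_size=2, padded_shapes=(2, None))
for batch in padded:
  print(batch.numpy())
# The first two batches pass through unchanged; the final batch becomes
# [[12 13 14]
#  [-1 -1 -1]]: the missing row is filled with PAD_ID, so every batch has
# the same leading dimension. Downstream, the eval splits zero out padded
# positions via the 'weights' field.
```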
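For context, a minimal consumption sketch, assuming the functions above are in scope and that `data_dir` points at a dataset saved with `tf.data.Dataset.save` under `train/` and `val/` subdirectories with an `'input_ids'` feature (the path and batch size below are hypothetical):

```python
import jax

data_rng = jax.random.PRNGKey(0)
train_iter = get_data_iter(
    data_rng,
    split='train',
    data_dir='/data/fineweb_edu',  # hypothetical location of the saved data
    batch_size=8,
)

batch = next(train_iter)
# data_utils.shard_and_maybe_pad_np is expected to shard each array across
# local devices, giving (num_devices, per_device_batch, SEQUENCE_LENGTH).
print(batch['inputs'].shape, batch['targets'].shape)
```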