
Adding CPU training support to AxoNN #39

Open · wants to merge 28 commits into base: develop

Changes shown are from 17 of 28 commits.

Commits
e3bfa64
Rebased cpu changes onto latest axonn
Feb 2, 2024
472d355
Args should default to true, removed tensor_list operator for cpus
Feb 5, 2024
9c237b1
Made changes conditional for CPUs/GPUs
Feb 6, 2024
676640d
Gpu runs do not make Slurm calls
Feb 8, 2024
e84c64d
Revamped cpu flag with cpu hard set in comm.py
Feb 23, 2024
ae74e70
Set device before init
Mar 4, 2024
9514d65
Added initial pytest parameters for cpu tests
Mar 13, 2024
0a22f88
removed mpi4py dependency (#63)
S-Mahua Feb 20, 2024
52b0aef
adding parallelize context for opt (#65)
jwendlan Feb 27, 2024
7728f1a
Removing the drop and gathers in depth tensor parallelism for the eas…
siddharth9820 Feb 28, 2024
5087268
change parallelize context to use AutoConfig (#67)
siddharth9820 Feb 28, 2024
5faec5b
Bugfix: Initialize grad_input, grad_weight to None (#68)
adityaranjan Mar 6, 2024
e831180
Removed distributed communication class
Mar 24, 2024
0444e50
Merge branch 'develop' into axonn-cpu
siddharth9820 May 1, 2024
70a5a37
formatting
siddharth9820 May 1, 2024
e185214
fix CI
siddharth9820 May 1, 2024
e9f21ce
format
siddharth9820 May 1, 2024
f3a5dfc
Matched changes for intra_layer_conv tests
May 8, 2024
88176e9
Removed unused env variables and format
May 8, 2024
c63e489
Merge branch 'develop' into axonn-cpu
Avuxon May 8, 2024
f933edc
Fixed CI to return for depth > 1
May 8, 2024
20af63a
Merge branch 'axonn-cpu' of https://github.com/axonn-ai/axonn into ax…
May 8, 2024
8249add
Added depth check to fw pass in fc test
May 8, 2024
1a677ff
Merge develop into this branch
May 24, 2024
f39883f
Added xfail to convolution tests
May 24, 2024
d24fa4c
remove xfail
siddharth9820 Jun 10, 2024
ad779c3
Merge branch 'develop' into axonn-cpu
siddharth9820 Jun 18, 2024
1f9e54b
Merge branch 'develop' into axonn-cpu
siddharth9820 Jun 26, 2024
8 changes: 8 additions & 0 deletions .github/workflows/nvidia-rtx-3090-tests.yaml
@@ -17,6 +17,10 @@ jobs:
ginter: [ 1, 2 ]
memopt: [ '0', '1' ]

env:
SLURM_NTASKS: 0
SLURM_PROCID: 0

steps:
- uses: actions/checkout@v3
- name: Install AxoNN
@@ -40,6 +44,10 @@
intra-layer:
runs-on: [ nvidia ]

env:
SLURM_NTASKS: 0
SLURM_PROCID: 0

steps:
- uses: actions/checkout@v3
- name: Install AxoNN
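The CI now pins SLURM_NTASKS and SLURM_PROCID so rank and world-size discovery sees deterministic values outside a real Slurm allocation. A minimal sketch of how such variables are typically consumed; the exact handling below is illustrative, not AxoNN's internal logic:

```python
import os

# Hedged sketch: read Slurm-style environment variables with explicit defaults,
# matching the values exported by the CI jobs above.
world_size = int(os.getenv("SLURM_NTASKS", "0"))
rank = int(os.getenv("SLURM_PROCID", "0"))
print(f"rank {rank} of {world_size}")
```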
17 changes: 14 additions & 3 deletions axonn/axonn.py
@@ -111,6 +111,7 @@ def init(
mixed_precision=False,
fp16_allreduce=True,
cpu_offload=False,
device="cuda",
) -> None:
"""
Initialize AxoNN's 2D parallelism with G_inter-way inter-layer
@@ -135,8 +136,15 @@
global comm_handle, is_initialized, computation_dtype, _fp16_all_reduce
global _cpu_offload
comm_handle = communication_handle(
G_inter, G_data, G_intra_r, G_intra_c, G_intra_d, gpus_per_node=gpus_per_node
G_inter,
G_data,
G_intra_r,
G_intra_c,
G_intra_d,
gpus_per_node=gpus_per_node,
device=device,
)
config.device = device
config.G_inter = G_inter
config.G_data = G_data
config.G_intra = G_intra_r * G_intra_c * G_intra_d
@@ -152,6 +160,9 @@
comm_handle.intra_layer_column_parallel_rank
)
is_initialized = True
if device == "cuda" and not torch.cuda.is_available():
raise ValueError("CUDA is not available. Please choose a different device.")

if mixed_precision:
computation_dtype = torch.float16
else:
@@ -537,7 +548,7 @@ def _post_fw_recv_requests():
if (requests["fw"] is None) and config.inter_layer_parallel_rank > 0:
tensor = torch.empty(
size=_fill_shape(model.get_input_shape()),
device="cuda",
device=config.device,
dtype=computation_dtype,
)
tensor.requires_grad = True
@@ -556,7 +567,7 @@ def _post_bw_recv_requests():
):
tensor = torch.empty(
size=_fill_shape(model.get_output_shape()),
device="cuda",
device=config.device,
dtype=computation_dtype,
)
requests["bw"] = [
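Taken together, these changes let callers pick the compute device at initialization. A minimal usage sketch, assuming a single-process run; the parallelism degrees below are purely illustrative:

```python
from axonn import axonn as ax

# Hedged sketch: initialize AxoNN for a CPU run using the new `device` argument.
ax.init(
    G_inter=1,
    G_data=1,
    G_intra_r=1,
    G_intra_c=1,
    G_intra_d=1,
    mixed_precision=False,
    fp16_allreduce=False,
    device="cpu",  # defaults to "cuda" when omitted
)
```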
26 changes: 17 additions & 9 deletions axonn/communication.py
@@ -32,6 +32,7 @@ def __init__(
G_intra_c=1,
G_intra_d=1,
gpus_per_node=None,
device="cuda",
):
"""Constructor for the communication handle

@@ -44,6 +45,11 @@
G_intra_c (int): number of GPUs in the column intra-layer parallel dimension
G_intra_d (int): number of GPUs in the depth intra-layer parallel dimension
"""
if device == "cpu":
self.backend = "gloo"
else:
self.backend = "nccl"

if not torch.distributed.is_initialized():
assert MPI4PY, "either install mpi4py and launch via mpirun/srun"
"or initialize torch.distributed outside axonn"
@@ -71,8 +77,10 @@ def __init__(
self.gpus_per_node = (
gpus_per_node if gpus_per_node is not None else torch.cuda.device_count()
)
self.local_rank = self.world_rank % self.gpus_per_node
torch.cuda.set_device(self.local_rank)

if device == "cuda":
self.local_rank = self.world_rank % self.gpus_per_node
torch.cuda.set_device(self.local_rank)
self.intra_layer_parallel_rank = self.world_rank % G_intra
self.intra_layer_column_parallel_rank = (
self.intra_layer_parallel_rank % G_intra_c
@@ -115,7 +123,7 @@ def __init__(
master_port = os.getenv("MASTER_PORT", "6000")
init_method += master_ip + ":" + master_port
torch.distributed.init_process_group(
backend="nccl",
backend=self.backend,
world_size=self.world_size,
rank=self.world_rank,
init_method=init_method,
@@ -130,7 +138,7 @@ def __init__(
for k in range(self.G_data)
]
ith_jth_data_parallel_group = torch.distributed.new_group(
ranks=ranks_in_ith_jth_data_parallel_group, backend="nccl"
ranks=ranks_in_ith_jth_data_parallel_group, backend=self.backend
)
if self.world_rank in ranks_in_ith_jth_data_parallel_group:
self.coll_nccl_comm = ith_jth_data_parallel_group
@@ -142,7 +150,7 @@ def __init__(
i_ * G_inter * G_intra + j_ * G_intra + k for k in range(G_intra)
]
ith_jth_intra_layer_group = torch.distributed.new_group(
ranks=ranks_in_ith_jth_intra_layer_group, backend="nccl"
ranks=ranks_in_ith_jth_intra_layer_group, backend=self.backend
)
if self.world_rank in ranks_in_ith_jth_intra_layer_group:
self.intra_layer_group = ith_jth_intra_layer_group
@@ -165,7 +173,7 @@ def __init__(
ranks_in_ith_jth_intra_layer_group[i, j, :]
)
group = torch.distributed.new_group(
ranks=group_members, backend="nccl"
ranks=group_members, backend=self.backend
)
if self.world_rank in group_members:
self.inner_intra_layer_parallel_group = group
@@ -177,7 +185,7 @@ def __init__(
ranks_in_ith_jth_intra_layer_group[i, :, j]
)
group = torch.distributed.new_group(
ranks=group_members, backend="nccl"
ranks=group_members, backend=self.backend
)
if self.world_rank in group_members:
self.outer_intra_layer_parallel_group = group
@@ -189,7 +197,7 @@ def __init__(
ranks_in_ith_jth_intra_layer_group[:, i, j]
)
group = torch.distributed.new_group(
ranks=group_members, backend="nccl"
ranks=group_members, backend=self.backend
)
if self.world_rank in group_members:
self.depth_intra_layer_parallel_group = group
@@ -200,7 +208,7 @@ def __init__(
ranks_in_ith_jth_intra_layer_group[i, :, :].flatten()
)
group = torch.distributed.new_group(
ranks=group_members, backend="nccl"
ranks=group_members, backend=self.backend
)
if self.world_rank in group_members:
self.outer_inner_intra_layer_parallel_group = group
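The communication handle now derives the torch.distributed backend from the requested device: gloo for CPU runs, NCCL otherwise. A condensed sketch of that selection, assuming a TCP rendezvous via MASTER_ADDR/MASTER_PORT as in the diff; the helper name is illustrative:

```python
import os
import torch.distributed as dist

# Hedged sketch: pick the backend from the device and initialize the default
# process group the same way the communication handle does.
def init_distributed(device: str, world_size: int, rank: int) -> None:
    backend = "gloo" if device == "cpu" else "nccl"
    master_ip = os.getenv("MASTER_ADDR", "localhost")
    master_port = os.getenv("MASTER_PORT", "6000")
    dist.init_process_group(
        backend=backend,
        world_size=world_size,
        rank=rank,
        init_method=f"tcp://{master_ip}:{master_port}",
    )
```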
1 change: 1 addition & 0 deletions axonn/config.py
@@ -8,3 +8,4 @@
G_data = 0
micro_batch_size = 0
batch_size = 0
device = "cuda"
29 changes: 22 additions & 7 deletions axonn/intra_layer/__init__.py
@@ -98,13 +98,22 @@ def trigger_async_all_gathers(model):
handle = None
else:
assert weight.ndim == 1
output_shape = weight.shape[0] * world_size
all_gathered_weight = torch.empty(
output_shape, dtype=weight.dtype, device=weight.device
)
handle = dist.all_gather_into_tensor(
all_gathered_weight, weight, group=process_group, async_op=True
)

if torch.distributed.get_backend() == "nccl":
output_shape = weight.shape[0] * world_size
all_gathered_weight = torch.empty(
output_shape, dtype=weight.dtype, device=weight.device
)
handle = dist.all_gather_into_tensor(
all_gathered_weight,
weight,
group=process_group,
async_op=True,
)

elif torch.distributed.get_backend() == "gloo":
raise NotImplementedError

weights_cache[weight] = [all_gathered_weight, handle]
yield

@@ -173,6 +182,12 @@ def optimize_communication(
"for_overlapping_allgathers=model, ...)"
"if overlap_all_gather is True"
)

if torch.distributed.get_backend() == "gloo":
raise ValueError(
"overlap_all_gather does not work with gloo" "please set it to false"
)

ALL_GATHER_ITERATOR = trigger_async_all_gathers(
model_object_for_overlapping_allgathers
)
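Because the cached all-gather path is NCCL-only, CPU callers have to keep the overlap knobs off. A small guard sketch; the flag names mirror optimize_communication in this file, while the helper itself is illustrative:

```python
import torch.distributed as dist

# Hedged sketch: enable communication overlap only when the backend supports it.
def overlap_flags() -> dict:
    on_nccl = dist.is_initialized() and dist.get_backend() == "nccl"
    return {
        "overlap_all_reduce": on_nccl,
        "overlap_reduce_scatter": on_nccl,
        "cache_weights": on_nccl,
        "overlap_all_gather": on_nccl,
    }
```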
4 changes: 3 additions & 1 deletion axonn/intra_layer/communication.py
@@ -1,6 +1,7 @@
import torch.distributed as dist
import torch
import axonn
from axonn import config


def _all_reduce(input_, process_group=None, overlap_comm=False):
@@ -48,7 +49,8 @@ def _gather(input_, dim, process_group=None, cache=False):
tensor_list = [
torch.empty_like(input_) for _ in range(dist.get_world_size(process_group))
]
tensor_list[rank] = input_
if config.device == "cuda":
tensor_list[rank] = input_
dist.all_gather(tensor_list, input_, group=process_group)

# Note: torch.cat already creates a contiguous tensor.
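The change above skips pre-seeding tensor_list[rank] on CPU, since gloo's all_gather fills every slot itself. A standalone sketch of that pattern; the function name is illustrative:

```python
import torch
import torch.distributed as dist

# Hedged sketch of the gather pattern used above: pre-place the local shard
# only on CUDA/NCCL, and let gloo populate the list on CPU.
def gather_along_last_dim(input_, device, process_group=None):
    world_size = dist.get_world_size(process_group)
    rank = dist.get_rank(process_group)
    tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
    if device == "cuda":
        tensor_list[rank] = input_
    dist.all_gather(tensor_list, input_, group=process_group)
    return torch.cat(tensor_list, dim=-1)
```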
5 changes: 3 additions & 2 deletions axonn/intra_layer/fully_connected.py
@@ -6,6 +6,7 @@

from axonn import axonn as ax
import axonn
from axonn import config
from .communication import (
Drop,
Gather,
@@ -38,9 +39,9 @@ def initialize_params(
in_features_group,
depth_group,
init_method,
init_device="cuda",
init_device=config.device,
):
params = torch.empty((out_features, in_features), device=init_device)
params = torch.empty((out_features, in_features), device=config.device)
init_method(params)
params = extract_local_params_from_full_params(
params, out_features_group, in_features_group, depth_group
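Parameter initialization now follows the globally configured device instead of a hard-coded "cuda". A minimal sketch of the idea; the sharding step is omitted and the default init_method is an assumption:

```python
import torch
from axonn import config

# Hedged sketch: allocate and initialize the full weight on config.device
# before the local shard is extracted, as the diff above does.
def make_full_params(out_features, in_features,
                     init_method=torch.nn.init.xavier_uniform_):
    params = torch.empty((out_features, in_features), device=config.device)
    init_method(params)
    return params
```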
56 changes: 32 additions & 24 deletions axonn/tests/test_intra_layer_fc.py
@@ -18,18 +18,28 @@
)
@pytest.mark.parametrize("easy_tp", [False, True])
@pytest.mark.parametrize("bias", [False, True])
def test_fw_pass(G_intra_r, G_intra_c, G_intra_d, B, H, easy_tp, bias):
@pytest.mark.parametrize("device", ["cuda", "cpu"])
def test_fw_pass(G_intra_r, G_intra_c, G_intra_d, B, H, easy_tp, bias, device):
# These tests are in fp-32
torch.manual_seed(42)

# CPU runs currently do not work with mixed_precision or fp16_allreduce
# if set_device == "cpu":
# bool set_mixed_precision = False
# bool set_fp16_allreduce = False

ax.init(
G_data=1,
G_inter=1,
G_intra_r=G_intra_r,
G_intra_c=G_intra_c,
G_intra_d=G_intra_d,
mixed_precision=False,
fp16_allreduce=False,
device=device,
)

X = torch.randn(B, H).cuda() * 0.01
X = torch.randn(B, H).to(device) * 0.01

inner_group = ax.comm_handle.inner_intra_layer_parallel_group
outer_group = ax.comm_handle.outer_intra_layer_parallel_group
@@ -44,8 +54,10 @@ def test_fw_pass(G_intra_r, G_intra_c, G_intra_d, B, H, easy_tp, bias):
) # divide columns of X along the inner tensor group
# manually divide input

layer = Linear(in_features=H, out_features=H, bias=bias).cuda()
layer_sequential = torch.nn.Linear(in_features=H, out_features=H, bias=bias).cuda()
layer = Linear(in_features=H, out_features=H, bias=bias).to(device)
layer_sequential = torch.nn.Linear(in_features=H, out_features=H, bias=bias).to(
device
)

# test if load state dict works with a sequential checkpoint
layer.load_state_dict(layer_sequential.state_dict())
@@ -72,6 +84,7 @@ def test_fw_pass(G_intra_r, G_intra_c, G_intra_d, B, H, easy_tp, bias):
@pytest.mark.parametrize("easy_tp", [False, True])
@pytest.mark.parametrize("clip_grad_norm", [-1, 1e-3])
@pytest.mark.parametrize("bias", [False, True])
@pytest.mark.parametrize("device", ["cuda", "cpu"])
def test_bw_pass(
G_intra_r,
G_intra_c,
@@ -82,18 +95,25 @@ def test_bw_pass(
easy_tp,
clip_grad_norm,
bias,
device,
):
# These tests are in fp-32
if device == "cpu" and G_intra_d > 1:
return  # Gloo doesn't support reduce-scatter

torch.manual_seed(42)
ax.init(
G_data=1,
G_inter=1,
G_intra_r=G_intra_r,
G_intra_c=G_intra_c,
G_intra_d=G_intra_d,
mixed_precision=False,
fp16_allreduce=False,
device=device,
)
X = torch.randn(B, H).cuda() * 0.01
Y_grad = torch.randn(B, H).cuda() * 0.01
X = torch.randn(B, H).to(device) * 0.01
Y_grad = torch.randn(B, H).to(device) * 0.01

inner_group = ax.comm_handle.inner_intra_layer_parallel_group
outer_group = ax.comm_handle.outer_intra_layer_parallel_group
@@ -104,8 +124,10 @@ def test_bw_pass(
in_features=H,
out_features=H,
bias=bias,
).cuda()
layer_sequential = torch.nn.Linear(in_features=H, out_features=H, bias=bias).cuda()
).to(device)
layer_sequential = torch.nn.Linear(in_features=H, out_features=H, bias=bias).to(
device
)

# test if load state dict works with a sequential checkpoint
layer.load_state_dict(layer_sequential.state_dict())
Expand All @@ -128,9 +150,9 @@ def test_bw_pass(

with optimize_communication(
overlap_all_reduce=comm_opt_level >= 1,
overlap_reduce_scatter=comm_opt_level >= 2,
overlap_reduce_scatter=comm_opt_level >= 2 and device != "cpu",
cache_weights=comm_opt_level >= 3,
overlap_all_gather=comm_opt_level == 4,
overlap_all_gather=comm_opt_level == 4 and device != "cpu",
model_object_for_overlapping_allgathers=layer,
):
Y_local = layer(X_local, scatter_input=easy_tp, gather_output=easy_tp)
@@ -175,17 +197,3 @@ def test_bw_pass(
assert torch.allclose(
bias_grad_parallel, layer_sequential.bias.grad
), "BW Pass - gradients of bias do not match"


if __name__ == "__main__":
test_bw_pass(
G_intra_r=1,
G_intra_c=1,
G_intra_d=2,
B=2,
H=256,
comm_opt_level=0,
easy_tp=False,
clip_grad_norm=-1,
bias=True,
)
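With the new device parametrization, the same tests can run on machines without GPUs. A hedged sketch of how the "cuda" variants might be skipped locally; the test body below is illustrative and not part of this PR:

```python
import pytest
import torch

# Hedged sketch: skip the "cuda" parametrization on GPU-less machines.
@pytest.mark.parametrize("device", ["cpu", "cuda"])
def test_tensor_on_device(device):
    if device == "cuda" and not torch.cuda.is_available():
        pytest.skip("CUDA not available")
    x = torch.randn(4, 4, device=device)
    assert x.device.type == device
```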