axonn-ai · Avuxon · Feb 2, 2024 · Feb 5, 2024 · Feb 6, 2024 · Feb 8, 2024
diff --git a/axonn/axonn.py b/axonn/axonn.py
@@ -111,6 +111,7 @@ def init(
     mixed_precision=False,
     fp16_allreduce=True,
     cpu_offload=False,
+    device="cuda",
 ) -> None:
     """
     Initialize AxoNN's 2D parallelism with G_inter-way inter-layer
@@ -135,8 +136,15 @@ def init(
     global comm_handle, is_initialized, computation_dtype, _fp16_all_reduce
     global _cpu_offload
     comm_handle = communication_handle(
-        G_inter, G_data, G_intra_r, G_intra_c, G_intra_d, gpus_per_node=gpus_per_node
+        G_inter,
+        G_data,
+        G_intra_r,
+        G_intra_c,
+        G_intra_d,
+        gpus_per_node=gpus_per_node,
+        device=device,
     )
+    config.device = device
     config.G_inter = G_inter
     config.G_data = G_data
     config.G_intra = G_intra_r * G_intra_c * G_intra_d
@@ -152,6 +160,14 @@ def init(
         comm_handle.intra_layer_column_parallel_rank
     )
     is_initialized = True
+    if device == "cuda" and not torch.cuda.is_available():
+        raise ValueError("CUDA is not available. Please choose a different device.")
+
+    if device == "cpu":
+        assert (
+            G_intra_d == 1
+        ), "G_intra_d > 1: Intra_d uses reduce-scatters which gloo(cpu) doesn't support"
+
     if mixed_precision:
         computation_dtype = torch.float16
     else:
@@ -542,7 +558,7 @@ def _post_fw_recv_requests():
     if (requests["fw"] is None) and config.inter_layer_parallel_rank > 0:
         tensor = torch.empty(
             size=_fill_shape(model.get_input_shape()),
-            device="cuda",
+            device=config.device,
             dtype=computation_dtype,
         )
         tensor.requires_grad = True
@@ -561,7 +577,7 @@ def _post_bw_recv_requests():
     ):
         tensor = torch.empty(
             size=_fill_shape(model.get_output_shape()),
-            device="cuda",
+            device=config.device,
             dtype=computation_dtype,
         )
         requests["bw"] = [

diff --git a/axonn/communication.py b/axonn/communication.py
@@ -32,6 +32,7 @@ def __init__(
         G_intra_c=1,
         G_intra_d=1,
         gpus_per_node=None,
+        device="cuda",
     ):
         """Constructor for the communication handle
 
@@ -44,6 +45,11 @@ def __init__(
             G_intra_c (int): number of GPUs in the column intra-layer parallel dimension
             G_intra_d (int): number of GPUs in the depth intra-layer parallel dimension
         """
+        if device == "cpu":
+            self.backend = "gloo"
+        else:
+            self.backend = "nccl"
+
         if not torch.distributed.is_initialized():
             assert MPI4PY, "either install mpi4py and launch via mpirun/srun"
             "or initialize torch.distributed outside axonn"
@@ -71,8 +77,10 @@ def __init__(
         self.gpus_per_node = (
             gpus_per_node if gpus_per_node is not None else torch.cuda.device_count()
         )
-        self.local_rank = self.world_rank % self.gpus_per_node
-        torch.cuda.set_device(self.local_rank)
+
+        if device == "cuda":
+            self.local_rank = self.world_rank % self.gpus_per_node
+            torch.cuda.set_device(self.local_rank)
         self.intra_layer_parallel_rank = self.world_rank % G_intra
         self.intra_layer_column_parallel_rank = (
             self.intra_layer_parallel_rank % G_intra_c
@@ -115,7 +123,7 @@ def __init__(
             master_port = os.getenv("MASTER_PORT", "6000")
             init_method += master_ip + ":" + master_port
             torch.distributed.init_process_group(
-                backend="nccl",
+                backend=self.backend,
                 world_size=self.world_size,
                 rank=self.world_rank,
                 init_method=init_method,
@@ -130,7 +138,7 @@ def __init__(
                     for k in range(self.G_data)
                 ]
                 ith_jth_data_parallel_group = torch.distributed.new_group(
-                    ranks=ranks_in_ith_jth_data_parallel_group, backend="nccl"
+                    ranks=ranks_in_ith_jth_data_parallel_group, backend=self.backend
                 )
                 if self.world_rank in ranks_in_ith_jth_data_parallel_group:
                     self.coll_nccl_comm = ith_jth_data_parallel_group
@@ -142,7 +150,7 @@ def __init__(
                     i_ * G_inter * G_intra + j_ * G_intra + k for k in range(G_intra)
                 ]
                 ith_jth_intra_layer_group = torch.distributed.new_group(
-                    ranks=ranks_in_ith_jth_intra_layer_group, backend="nccl"
+                    ranks=ranks_in_ith_jth_intra_layer_group, backend=self.backend
                 )
                 if self.world_rank in ranks_in_ith_jth_intra_layer_group:
                     self.intra_layer_group = ith_jth_intra_layer_group
@@ -165,7 +173,7 @@ def __init__(
                             ranks_in_ith_jth_intra_layer_group[i, j, :]
                         )
                         group = torch.distributed.new_group(
-                            ranks=group_members, backend="nccl"
+                            ranks=group_members, backend=self.backend
                         )
                         if self.world_rank in group_members:
                             self.inner_intra_layer_parallel_group = group
@@ -177,7 +185,7 @@ def __init__(
                             ranks_in_ith_jth_intra_layer_group[i, :, j]
                         )
                         group = torch.distributed.new_group(
-                            ranks=group_members, backend="nccl"
+                            ranks=group_members, backend=self.backend
                         )
                         if self.world_rank in group_members:
                             self.outer_intra_layer_parallel_group = group
@@ -189,7 +197,7 @@ def __init__(
                             ranks_in_ith_jth_intra_layer_group[:, i, j]
                         )
                         group = torch.distributed.new_group(
-                            ranks=group_members, backend="nccl"
+                            ranks=group_members, backend=self.backend
                         )
                         if self.world_rank in group_members:
                             self.depth_intra_layer_parallel_group = group
@@ -200,7 +208,7 @@ def __init__(
                         ranks_in_ith_jth_intra_layer_group[i, :, :].flatten()
                     )
                     group = torch.distributed.new_group(
-                        ranks=group_members, backend="nccl"
+                        ranks=group_members, backend=self.backend
                     )
                     if self.world_rank in group_members:
                         self.outer_inner_intra_layer_parallel_group = group

diff --git a/axonn/config.py b/axonn/config.py
@@ -8,3 +8,4 @@
 G_data = 0
 micro_batch_size = 0
 batch_size = 0
+device = "cuda"
diff --git a/axonn/intra_layer/__init__.py b/axonn/intra_layer/__init__.py
@@ -98,13 +98,22 @@ def trigger_async_all_gathers(model):
                     handle = None
                 else:
                     assert weight.ndim == 1
-                    output_shape = weight.shape[0] * world_size
-                    all_gathered_weight = torch.empty(
-                        output_shape, dtype=weight.dtype, device=weight.device
-                    )
-                    handle = dist.all_gather_into_tensor(
-                        all_gathered_weight, weight, group=process_group, async_op=True
-                    )
+
+                    if torch.distributed.get_backend() == "nccl":
+                        output_shape = weight.shape[0] * world_size
+                        all_gathered_weight = torch.empty(
+                            output_shape, dtype=weight.dtype, device=weight.device
+                        )
+                        handle = dist.all_gather_into_tensor(
+                            all_gathered_weight,
+                            weight,
+                            group=process_group,
+                            async_op=True,
+                        )
+
+                    elif torch.distributed.get_backend() == "gloo":
+                        raise NotImplementedError
+
                 weights_cache[weight] = [all_gathered_weight, handle]
             yield
 
@@ -173,6 +182,12 @@ def optimize_communication(
                 "for_overlapping_allgathers=model, ...)"
                 "if overlap_all_gather is True"
             )
+
+        if torch.distributed.get_backend() == "gloo":
+            raise ValueError(
+                "overlap_all_gather does not work with gloo" "please set it to false"
+            )
+
         ALL_GATHER_ITERATOR = trigger_async_all_gathers(
             model_object_for_overlapping_allgathers
         )

diff --git a/axonn/intra_layer/communication.py b/axonn/intra_layer/communication.py
@@ -1,6 +1,7 @@
 import torch.distributed as dist
 import torch
 import axonn
+from axonn import config
 
 
 def _all_reduce(input_, process_group=None, overlap_comm=False):
@@ -48,7 +49,8 @@ def _gather(input_, dim, process_group=None, cache=False):
         tensor_list = [
             torch.empty_like(input_) for _ in range(dist.get_world_size(process_group))
         ]
-        tensor_list[rank] = input_
+        if config.device == "cuda":
+            tensor_list[rank] = input_
         dist.all_gather(tensor_list, input_, group=process_group)
 
         # Note: torch.cat already creates a contiguous tensor.

diff --git a/axonn/intra_layer/fully_connected.py b/axonn/intra_layer/fully_connected.py
@@ -6,6 +6,7 @@
 
 from axonn import axonn as ax
 import axonn
+from axonn import config
 from .communication import (
     Drop,
     Gather,
@@ -38,9 +39,9 @@ def initialize_params(
     in_features_group,
     depth_group,
     init_method,
-    init_device="cuda",
+    init_device=config.device,
 ):
-    params = torch.empty((out_features, in_features), device=init_device)
+    params = torch.empty((out_features, in_features), device=config.device)
     init_method(params)
     params = extract_local_params_from_full_params(
         params, out_features_group, in_features_group, depth_group

diff --git a/axonn/tests/test_intra_layer_conv.py b/axonn/tests/test_intra_layer_conv.py
@@ -42,6 +42,7 @@ def norm_allclose(X, Y):
 )
 @pytest.mark.parametrize("easy_tp", [True, False])
 @pytest.mark.parametrize("bias", [True, False])
+@pytest.mark.parametrize("device", ["cuda", "cpu"])
 @pytest.mark.skip(reason="torch.all_close does not work with conv")
 def test_fw_pass(G_intra_r, G_intra_c, G_intra_d, B, H, W, C, easy_tp, bias):
     # These tests are in fp-32
@@ -54,15 +55,21 @@ def test_fw_pass(G_intra_r, G_intra_c, G_intra_d, B, H, W, C, easy_tp, bias):
     # This is required because TF32 cores only look at the first 10 bits of mantissa
     torch.backends.cudnn.allow_tf32 = False
 
+    if device == "cpu" and G_intra_d > 1:
+        return  # Gloo doesn't support reduce scatter
+
     ax.init(
         G_data=1,
         G_inter=1,
         G_intra_r=G_intra_r,
         G_intra_c=G_intra_c,
         G_intra_d=G_intra_d,
+        mixed_precision=False,
+        fp16_allreduce=False,
+        device=device,
     )
 
-    X = torch.randn(B, C, H, W).cuda() * 0.01
+    X = torch.randn(B, C, H, W).to(device) * 0.01
 
     inner_group = ax.comm_handle.inner_intra_layer_parallel_group
     outer_group = ax.comm_handle.outer_intra_layer_parallel_group
@@ -78,7 +85,9 @@ def test_fw_pass(G_intra_r, G_intra_c, G_intra_d, B, H, W, C, easy_tp, bias):
     else:
         X_local = X
 
-    layer = Conv2d(in_channels=C, out_channels=2 * C, kernel_size=5, bias=bias).cuda()
+    layer = Conv2d(in_channels=C, out_channels=2 * C, kernel_size=5, bias=bias).to(
+        device
+    )
 
     with torch.no_grad():
         # parallel FW pass
@@ -95,7 +104,7 @@ def test_fw_pass(G_intra_r, G_intra_c, G_intra_d, B, H, W, C, easy_tp, bias):
             out_channels=C * 2,
             kernel_size=5,
             bias=bias,
-        ).cuda()
+        ).to(device)
         weight_sequential = _gather(
             _gather(
                 _gather(layer.weight, 0, depth_group).reshape(
@@ -126,12 +135,25 @@ def test_fw_pass(G_intra_r, G_intra_c, G_intra_d, B, H, W, C, easy_tp, bias):
 )
 @pytest.mark.parametrize("easy_tp", [True, False])
 @pytest.mark.parametrize("bias", [True, False])
+@pytest.mark.parametrize("device", ["cuda", "cpu"])
 @pytest.mark.parametrize("comm_opt_level", [0, 3])
 @pytest.mark.skip(reason="torch.all_close does not work with conv")
 def test_bw_pass(
-    G_intra_r, G_intra_c, G_intra_d, B, H, W, C, easy_tp, bias, comm_opt_level
+    G_intra_r,
+    G_intra_c,
+    G_intra_d,
+    B,
+    H,
+    W,
+    C,
+    easy_tp,
+    bias,
+    comm_opt_level,
+    device,
 ):
     # These tests are in fp-32
+    if device == "cpu" and G_intra_d > 1:
+        return  # Gloo doesn't support reduce scatter
     # Need to remove all non-determinism from convolutions
     torch.manual_seed(42)
     torch.cuda.manual_seed(42)
@@ -147,16 +169,21 @@ def test_bw_pass(
         G_intra_r=G_intra_r,
         G_intra_c=G_intra_c,
         G_intra_d=G_intra_d,
+        mixed_precision=False,
+        fp16_allreduce=False,
+        device=device,
     )
-    X = torch.randn(B, C, H, W).cuda() * 0.01
-    Y_grad = torch.randn(B, 2 * C, H - 4, W - 4).cuda() * 0.01
+    X = torch.randn(B, C, H, W).to(device) * 0.01
+    Y_grad = torch.randn(B, 2 * C, H - 4, W - 4).to(device) * 0.01
 
     inner_group = ax.comm_handle.inner_intra_layer_parallel_group
     outer_group = ax.comm_handle.outer_intra_layer_parallel_group
     depth_group = ax.comm_handle.depth_intra_layer_parallel_group
 
     # parallel backward pass
-    layer = Conv2d(in_channels=C, out_channels=2 * C, kernel_size=5, bias=bias).cuda()
+    layer = Conv2d(in_channels=C, out_channels=2 * C, kernel_size=5, bias=bias).to(
+        device
+    )
 
     if not easy_tp:
         X_local = (
@@ -176,9 +203,9 @@ def test_bw_pass(
         Y_local_grad = Y_grad
 
     with optimize_communication(
-        overlap_reduce_scatter=comm_opt_level >= 1,
+        overlap_reduce_scatter=comm_opt_level >= 1 and device != "cpu",
         cache_weights=comm_opt_level >= 2,
-        overlap_all_gather=comm_opt_level == 3,
+        overlap_all_gather=comm_opt_level == 3 and device != "cpu",
         model_object_for_overlapping_allgathers=layer,
     ):
         Y_local = layer(X_local, scatter_input=easy_tp, gather_output=easy_tp)
@@ -195,7 +222,7 @@ def test_bw_pass(
         out_channels=C * 2,
         kernel_size=5,
         bias=bias,
-    ).cuda()
+    ).to(device)
     with torch.no_grad():
         weight_sequential = _gather(
             _gather(