Large data count support for MPI Communication (#1765)

JuanPedroGHM · Hoppe · mrfh92 · web-flow · commit d2afadff0858 · 2025-02-24T15:55:14.000+01:00
* trick to send large data

* added tests

* Fixes for allreduce

* fixed large counts for allreduce, now trying to fix non-contiguous data types

* Custom operations for allreduce

* bench fixes

* perun fix

* correct inplace contiguous (sorry fabian)

* remove print statements

* benchmark fixes and debug output

* Incorrect move to acc if not CUDA_AWARE_MPI

* added tests for the Allreduce case

* tests were too large

* Update test_communication.py

---------

Co-authored-by: Hoppe <mrhf92@gmail.com>
Co-authored-by: Fabian Hoppe <112093564+mrfh92@users.noreply.github.com>
diff --git a/.perun.ini b/.perun.ini
@@ -1,16 +1,35 @@
+[post-processing]
+power_overhead = 100
+pue = 1.05
+emissions_factor = 417.8
+price_factor = 0.3251
+price_unit = €
+
+[monitor]
+sampling_period = 0.1
+include_backends =
+include_sensors =
+exclude_backends =
+exclude_sensors = CPU_FREQ_\d
+
 [output]
+app_name
+run_id
 format = bench
 data_out = ./bench_data
 
 [benchmarking]
 rounds = 10
 warmup_rounds = 1
-metrics=runtime
-region_metrics=runtime
+metrics = runtime,energy
+region_metrics = runtime,power
 
 [benchmarking.units]
 joule = k
 second =
 percent =
 watt =
 byte = G
+
+[debug]
+log_lvl = WARNING
diff --git a/benchmarks/cb/main.py b/benchmarks/cb/main.py
@@ -6,6 +6,10 @@
 ht.use_device(os.environ["HEAT_DEVICE"] if os.environ["HEAT_DEVICE"] else "cpu")
 ht.random.seed(12345)
 
+world_size = ht.MPI_WORLD.size
+rank = ht.MPI_WORLD.rank
+print(f"{rank}/{world_size}: Working on {ht.get_device()}")
+
 from linalg import run_linalg_benchmarks
 from cluster import run_cluster_benchmarks
 from manipulations import run_manipulation_benchmarks
diff --git a/heat/core/communication.py b/heat/core/communication.py
@@ -5,6 +5,8 @@
 from __future__ import annotations
 
 import numpy as np
+import math
+import ctypes
 import os
 import subprocess
 import torch
@@ -123,6 +125,8 @@ class MPICommunication(Communication):
         Handle for the mpi4py Communicator
     """
 
+    COUNT_LIMIT = torch.iinfo(torch.int32).max
+
     __mpi_type_mappings = {
         torch.bool: MPI.BOOL,
         torch.uint8: MPI.UNSIGNED_CHAR,
@@ -288,7 +292,33 @@ def mpi_type_and_elements_of(
 
         if is_contiguous:
             if counts is None:
-                return mpi_type, elements
+                if elements > cls.COUNT_LIMIT:
+                    # Uses vector type to get around the MAX_INT limit on certain MPI implementations
+                    # This is currently only applied when sending contiguous data, as the construction of data types to get around non-contiguous data naturally alleviates the problem to a certain extent.
+                    # Thanks to: J. R. Hammond, A. Schäfer and R. Latham, "To INT_MAX... and Beyond! Exploring Large-Count Support in MPI," 2014 Workshop on Exascale MPI at Supercomputing Conference, New Orleans, LA, USA, 2014, pp. 1-8, doi: 10.1109/ExaMPI.2014.5. keywords: {Vectors;Standards;Libraries;Optimization;Context;Memory management;Open area test sites},
+
+                    new_count = elements // cls.COUNT_LIMIT
+                    left_over = elements % cls.COUNT_LIMIT
+
+                    if new_count > cls.COUNT_LIMIT:
+                        raise ValueError("Tensor is too large")
+                    vector_type = mpi_type.Create_vector(
+                        new_count, cls.COUNT_LIMIT, cls.COUNT_LIMIT
+                    )
+                    if left_over > 0:
+                        left_over_mpi_type = mpi_type.Create_contiguous(left_over).Commit()
+                        _, old_type_extent = mpi_type.Get_extent()
+                        disp = cls.COUNT_LIMIT * new_count * old_type_extent
+                        struct_type = mpi_type.Create_struct(
+                            [1, 1], [0, disp], [vector_type, left_over_mpi_type]
+                        ).Commit()
+                        vector_type.Free()
+                        left_over_mpi_type.Free()
+                        return struct_type, 1
+                    else:
+                        return vector_type, 1
+                else:
+                    return mpi_type, elements
             factor = np.prod(obj.shape[1:], dtype=np.int32)
             return (
                 mpi_type,
@@ -317,7 +347,7 @@ def mpi_type_and_elements_of(
         return mpi_type, elements
 
     @classmethod
-    def as_mpi_memory(cls, obj) -> MPI.memory:
+    def as_mpi_memory(cls, obj: torch.Tensor) -> MPI.memory:
         """
         Converts the passed ``torch.Tensor`` into an MPI compatible memory view.
 
@@ -327,7 +357,8 @@ def as_mpi_memory(cls, obj) -> MPI.memory:
             The tensor to be converted into a MPI memory view.
         """
         # TODO: MPI.memory might be depraecated in future versions of mpi4py. The following code might need to be adapted and use MPI.buffer instead.
-        return MPI.memory.fromaddress(obj.data_ptr(), 0)
+        nbytes = obj.dtype.itemsize * obj.numel()
+        return MPI.memory.fromaddress(obj.data_ptr(), nbytes)
 
     @classmethod
     def as_buffer(
@@ -782,11 +813,71 @@ def Ibcast(self, buf: Union[DNDarray, torch.Tensor, Any], root: int = 0) -> MPIR
 
     Ibcast.__doc__ = MPI.Comm.Ibcast.__doc__
 
+    def __derived_op(
+        self, tensor: torch.Tensor, datatype: MPI.Datatype, operation: MPI.Op
+    ) -> Callable[[MPI.memory, MPI.memory, MPI.Datatype], None]:
+
+        # Based on this discussion in the mpi4py Google group: https://groups.google.com/g/mpi4py/c/UkDT_9pp4V4?pli=1
+        shape = tensor.shape
+        dtype = tensor.dtype
+        stride = tensor.stride()
+        offset = tensor.storage_offset()
+        count = tensor.numel()
+
+        mpiOp2torch = {
+            MPI.SUM.handle: torch.add,
+            MPI.PROD.handle: torch.mul,
+            MPI.MIN.handle: torch.min,
+            MPI.MAX.handle: torch.max,
+            MPI.LAND.handle: torch.logical_and,
+            MPI.LOR.handle: torch.logical_or,
+            MPI.LXOR.handle: torch.logical_xor,
+            MPI.BAND.handle: torch.bitwise_and,
+            MPI.BOR.handle: torch.bitwise_or,
+            MPI.BXOR.handle: torch.bitwise_xor,
+            # MPI.MINLOC.handle: torch.argmin, Not supported, seems to be an invalid inplace operation
+            # MPI.MAXLOC.handle: torch.argmax
+        }
+        mpiDtype2Ctype = {
+            torch.bool: ctypes.c_bool,
+            torch.uint8: ctypes.c_uint8,
+            torch.uint16: ctypes.c_uint16,
+            torch.uint32: ctypes.c_uint32,
+            torch.uint64: ctypes.c_uint64,
+            torch.int8: ctypes.c_int8,
+            torch.int16: ctypes.c_int16,
+            torch.int32: ctypes.c_int32,
+            torch.int64: ctypes.c_int64,
+            torch.float32: ctypes.c_float,
+            torch.float64: ctypes.c_double,
+            torch.complex64: ctypes.c_double,
+            torch.complex128: ctypes.c_longdouble,
+        }
+        ctype_size = mpiDtype2Ctype[dtype]
+        torch_op = mpiOp2torch[operation.handle]
+
+        def op(sendbuf: MPI.memory, recvbuf: MPI.memory, datatype):
+            send_arr = (ctype_size * (count + offset)).from_address(sendbuf.address)
+            recv_arr = (ctype_size * (count + offset)).from_address(recvbuf.address)
+
+            send_tensor = torch.as_strided(
+                torch.frombuffer(send_arr, dtype=dtype, count=count, offset=offset), shape, stride
+            )
+            recv_tensor = torch.as_strided(
+                torch.frombuffer(recv_arr, dtype=dtype, count=count, offset=offset), shape, stride
+            )
+            torch_op(send_tensor, recv_tensor, out=recv_tensor)
+
+        op = MPI.Op.Create(op)
+
+        return op
+
     def __reduce_like(
         self,
         func: Callable,
         sendbuf: Union[DNDarray, torch.Tensor, Any],
         recvbuf: Union[DNDarray, torch.Tensor, Any],
+        op: MPI.Op,
         *args,
         **kwargs,
     ) -> Tuple[Optional[DNDarray, torch.Tensor]]:
@@ -801,6 +892,8 @@ def __reduce_like(
             Buffer address of the send message
         recvbuf: Union[DNDarray, torch.Tensor, Any]
             Buffer address where to store the result of the reduction
+        op: MPI.Op
+            Operation to apply during the reduction.
         """
         sbuf = None
         rbuf = None
@@ -815,56 +908,59 @@ def __reduce_like(
         # harmonize the input and output buffers
         # MPI requires send and receive buffers to be of same type and length. If the torch tensors are either not both
         # contiguous or differently strided, they have to be made matching (if possible) first.
-        if isinstance(sendbuf, torch.Tensor):
-            # convert the send buffer to a pointer, number of elements and type are identical to the receive buffer
-            dummy = (
-                sendbuf.contiguous()
-            )  # make a contiguous copy and reassign the storage, old will be collected
-            # In PyTorch Version >= 2.0.0 we can use untyped_storage() instead of storage
-            # to keep backward compatibility with earlier PyTorch versions (where no untyped_storage() exists) we use a try/except
-            # (this applies to all places of Heat where untyped_storage() is used without further comment)
-            try:
-                sendbuf.set_(
-                    dummy.untyped_storage(),
-                    dummy.storage_offset(),
-                    size=dummy.shape,
-                    stride=dummy.stride(),
-                )
-            except AttributeError:
-                sendbuf.set_(
-                    dummy.storage(),
-                    dummy.storage_offset(),
-                    size=dummy.shape,
-                    stride=dummy.stride(),
-                )
-            sbuf = sendbuf if CUDA_AWARE_MPI else sendbuf.cpu()
-            sendbuf = self.as_buffer(sbuf)
+        if sendbuf is not MPI.IN_PLACE:
+            # Send and recv buffer need the same number of elements.
+            if sendbuf.numel() != recvbuf.numel():
+                raise ValueError("Send and recv buffers need the same number of elements.")
+
+            # Stride and offset should be the same to create the same datatype and operation. If they differ, they should be made contiguous (at the expense of memory)
+            if (
+                sendbuf.stride() != recvbuf.stride()
+                or sendbuf.storage_offset() != recvbuf.storage_offset()
+            ):
+                if not sendbuf.is_contiguous():
+                    tmp = sendbuf.contiguous()
+                    try:
+                        sendbuf.set_(
+                            tmp.untyped_storage(),
+                            tmp.storage_offset(),
+                            size=tmp.shape,
+                            stride=tmp.stride(),
+                        )
+                    except AttributeError:
+                        sendbuf.set_(
+                            tmp.storage(), tmp.storage_offset(), size=tmp.shape, stride=tmp.stride()
+                        )
+                if not recvbuf.is_contiguous():
+                    tmp = recvbuf.contiguous()
+                    try:
+                        recvbuf.set_(
+                            tmp.untyped_storage(),
+                            tmp.storage_offset(),
+                            size=tmp.shape,
+                            stride=tmp.stride(),
+                        )
+                    except AttributeError:
+                        recvbuf.set_(
+                            tmp.storage(), tmp.storage_offset(), size=tmp.shape, stride=tmp.stride()
+                        )
+
         if isinstance(recvbuf, torch.Tensor):
+            # Datatype and count shall be derived from the recv buffer, and applied to both, as they should match after the last code block
             buf = recvbuf
-            # nothing matches, the buffers have to be made contiguous
-            dummy = recvbuf.contiguous()
-            try:
-                recvbuf.set_(
-                    dummy.untyped_storage(),
-                    dummy.storage_offset(),
-                    size=dummy.shape,
-                    stride=dummy.stride(),
-                )
-            except AttributeError:
-                recvbuf.set_(
-                    dummy.storage(),
-                    dummy.storage_offset(),
-                    size=dummy.shape,
-                    stride=dummy.stride(),
-                )
             rbuf = recvbuf if CUDA_AWARE_MPI else recvbuf.cpu()
-            if sendbuf is MPI.IN_PLACE:
-                recvbuf = self.as_buffer(rbuf)
-            else:
-                recvbuf = (self.as_mpi_memory(rbuf), sendbuf[1], sendbuf[2])
+            recvbuf: Tuple[MPI.memory, int, MPI.Datatype] = self.as_buffer(rbuf, is_contiguous=True)
+            if not recvbuf[2].is_predefined:
+                # If using a derived datatype, we need to define a custom reduce operation that is able to handle it.
+                derived_op = self.__derived_op(rbuf, recvbuf[2], op)
+                op = derived_op
+
+        if isinstance(sendbuf, torch.Tensor):
+            sbuf = sendbuf if CUDA_AWARE_MPI else sendbuf.cpu()
+            sendbuf = (self.as_mpi_memory(sbuf), recvbuf[1], recvbuf[2])
 
         # perform the actual reduction operation
-        return func(sendbuf, recvbuf, *args, **kwargs), sbuf, rbuf, buf
+        return func(sendbuf, recvbuf, op, **kwargs), sbuf, rbuf, buf
 
     def Allreduce(
         self,
diff --git a/heat/core/tests/test_communication.py b/heat/core/tests/test_communication.py
@@ -1,6 +1,7 @@
 import numpy as np
 import torch
 import heat as ht
+import unittest
 
 from .test_suites.basic_test import TestCase
 
@@ -2492,3 +2493,39 @@ def test_alltoallSorting(self):
             test4.comm.Alltoallv(test4.larray, redistributed4, send_axis=2, recv_axis=2)
         with self.assertRaises(NotImplementedError):
             test4.comm.Alltoallv(test4.larray, redistributed4, send_axis=None)
+
+    # The following test is only for the bool data type to save memory
+    # memory requirement: ~4 GiB per process (send and receive tensors of 2**31 one-byte bool elements, ~2 GiB each)
+    def test_largecount_workaround_IsendRecv(self):
+        shape = (2**15, 2**16)
+        data = (
+            torch.zeros(shape, dtype=torch.bool)
+            if ht.MPI_WORLD.rank % 2 == 0
+            else torch.ones(shape, dtype=torch.bool)
+        )
+        buf = torch.empty(shape, dtype=torch.bool)
+        req = ht.MPI_WORLD.Isend(
+            data, ht.MPI_WORLD.rank - 1 if ht.MPI_WORLD.rank > 0 else ht.MPI_WORLD.size - 1
+        )
+        ht.MPI_WORLD.Recv(
+            buf, ht.MPI_WORLD.rank + 1 if ht.MPI_WORLD.rank < ht.MPI_WORLD.size - 1 else 0
+        )
+        req.Wait()
+        self.assertTrue(
+            buf.all()
+            if (ht.MPI_WORLD.rank % 2 == 0 and ht.MPI_WORLD.rank != ht.MPI_WORLD.size - 1)
+            else not buf.all()
+        )
+
+    # the following test is only for two processes to save memory
+    # memory requirement: at least ~2 GiB per process (one tensor of 2**31 one-byte bool elements)
+    @unittest.skipIf(ht.MPI_WORLD.size != 2, "Only for two processes")
+    def test_largecount_workaround_Allreduce(self):
+        shape = (2**10, 2**11, 2**10)
+        data = (
+            torch.zeros(shape, dtype=torch.bool)
+            if ht.MPI_WORLD.rank % 2 == 0
+            else torch.ones(shape, dtype=torch.bool)
+        )
+        ht.MPI_WORLD.Allreduce(ht.MPI.IN_PLACE, data, op=ht.MPI.SUM)
+        self.assertTrue(data.all())