merged

microsoft · Nov 13, 2023 · 1a04933 · 1a04933
2 parents 1289984 + 2f6c8a9
commit 1a04933
Show file tree

Hide file tree

Showing 3 changed files with 15 additions and 8 deletions.
diff --git a/python/benchmark/allreduce.cu b/python/benchmark/allreduce.cu
@@ -123,7 +123,7 @@ __forceinline__ __device__ void vectorSum(TYPE* dst, TYPE* src, size_t nElem) {
 #endif
 
 extern "C" __global__ void __launch_bounds__(1024, 1)
-    allreduce1(mscclpp::SmChannelDeviceHandle* smChans, TYPE* buff, int rank, int nranks, int nelems) {
+    allreduce1(mscclpp::SmChannelDeviceHandle* smChans, TYPE* buff, int rank, int nranks, size_t nelems) {
   const size_t chunkSize = nelems / nranks;
   if (nranks == 1) return;
   const int nPeer = nranks - 1;
@@ -664,7 +664,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1) __global__
     allreduce4(mscclpp::SmChannelDeviceHandle* smChans,
                mscclpp::SimpleProxyChannelDeviceHandle* reduceScatterProxyChans,
                mscclpp::SimpleProxyChannelDeviceHandle* allGatherProxyChans, TYPE* buff, TYPE* scratch, int rank,
-               int nRanksPerNode, int worldSize, int nelems, int pipelineDepth) {
+               int nRanksPerNode, int worldSize, size_t nelems, int pipelineDepth) {
   nelems = nelems / (sizeof(int) / sizeof(TYPE));
   reduceScatterSm(smChans, reduceScatterProxyChans, buff, scratch, rank, nRanksPerNode, worldSize, nelems, pipelineDepth);
   deviceSyncer.sync(gridDim.x);
@@ -674,7 +674,8 @@ extern "C" __global__ void __launch_bounds__(1024, 1) __global__
 // allreduce 5 for 2-nodes
 extern "C" __global__ void __launch_bounds__(1024, 1)
     allreduce5(mscclpp::SmChannelDeviceHandle* smChans, mscclpp::SimpleProxyChannelDeviceHandle* proxyChans, TYPE* buff,
-               TYPE* scratch, TYPE* putBuff, TYPE* resultBuff, int rank, int nRanksPerNode, int worldSize, int nelems) {
+               TYPE* scratch, TYPE* putBuff, TYPE* resultBuff, int rank, int nRanksPerNode, int worldSize,
+               size_t nelems) {
   nelems = nelems / (sizeof(int) / sizeof(TYPE));
   // This version of allreduce only works for single nodes
   const int nPeersInNode = nRanksPerNode - 1;

diff --git a/python/benchmark/mscclpp_op.py b/python/benchmark/mscclpp_op.py
@@ -67,7 +67,7 @@ def __init__(
             self.memory,
             self.group.my_rank,
             self.group.nranks,
-            self.memory.size,
+            ctypes.c_size_t(self.memory.size),
         )
         self.nthreads = nthreads
         self.nblocks = nblocks
@@ -109,7 +109,7 @@ def __init__(self, group: mscclpp_comm.CommGroup, memory: cp.ndarray, memory_out
             self.memory_out,
             self.group.my_rank,
             self.group.nranks,
-            self.memory.size,
+            ctypes.c_size_t(self.memory.size),
         )
 
     def __call__(self, stream_ptr):
@@ -155,7 +155,7 @@ def __init__(self, group: mscclpp_comm.CommGroup, memory: cp.ndarray, proxy_serv
             self.scratch,
             self.group.my_rank,
             self.group.nranks,
-            self.memory.size,
+            ctypes.c_size_t(self.memory.size),
         )
 
     def __call__(self, stream_ptr):
@@ -241,7 +241,8 @@ def set_params(self, nblocks, block_size, pipeline_depth):
             self.group.my_rank,
             self.nranks_per_node,
             self.group.nranks,
-            self.memory.size,
+            bytes(4),  # padding for memory alignment
+            ctypes.c_size_t(self.memory.size),
             self.pipeline_depth,
         )
 
@@ -322,7 +323,8 @@ def __init__(
             self.group.my_rank,
             nranks_per_node,
             self.group.nranks,
-            self.memory.size,
+            bytes(4),  # padding for memory alignment
+            ctypes.c_size_t(self.memory.size),
         )
 
     def __call__(self, stream_ptr):

diff --git a/python/mscclpp/utils.py b/python/mscclpp/utils.py
@@ -139,13 +139,17 @@ def pack(*args):
     for arg in list(args):
         if isinstance(arg, int):
             res += struct.pack("i", arg)
+        elif isinstance(arg, ctypes.c_size_t):
+            res += struct.pack("N", arg.value)
         elif isinstance(arg, np.ndarray):
             res += struct.pack("P", arg.ctypes.data)
         elif isinstance(arg, cp.ndarray):
             res += struct.pack("P", arg.data.ptr)
         # use int to represent bool, which can avoid CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES error
         elif isinstance(arg, bool):
             res += struct.pack("i", arg)
+        elif isinstance(arg, bytes):
+            res += struct.pack(f"{len(arg)}s", arg)
         else:
             raise RuntimeError(f"Unsupported type: {type(arg)}")
     return res