microsoft · chhwang · Nov 29, 2023 · Nov 29, 2023 · Nov 29, 2023 · Nov 29, 2023
diff --git a/include/mscclpp/device.hpp b/include/mscclpp/device.hpp
@@ -15,9 +15,9 @@
 #define MSCCLPP_HOST_DEVICE_INLINE __forceinline__ __host__ __device__
 #if defined(__HIP_PLATFORM_AMD__)
 #define MSCCLPP_DEVICE_HIP
-#else  // !(defined(__HIP_PLATFORM_AMD__)
+#else  // !(defined(__HIP_PLATFORM_AMD__))
 #define MSCCLPP_DEVICE_CUDA
-#endif  // !(defined(__HIP_PLATFORM_AMD__))
+#endif  // defined(__HIP_PLATFORM_AMD__)
 
 #else  // !(defined(__NVCC__) || defined(__HIP_PLATFORM_AMD__))
 

diff --git a/src/connection.cc b/src/connection.cc
@@ -98,7 +98,11 @@ void CudaIpcConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset,
 
   if (!env()->cudaIpcUseDefaultStream && stream_->empty()) stream_->set(cudaStreamNonBlocking);
 
-  MSCCLPP_CUDATHROW(cudaMemcpyAsync(dstPtr, src, sizeof(uint64_t), cudaMemcpyHostToDevice, *stream_));
+  uint64_t* gpuData;  // device staging copy of *src; NOTE(review): never freed, so every call leaks this allocation
+  MSCCLPP_CUDATHROW(cudaMalloc(&gpuData, sizeof(uint64_t)));
+  MSCCLPP_CUDATHROW(cudaMemcpy(gpuData, src, sizeof(uint64_t), cudaMemcpyHostToDevice));
+
+  MSCCLPP_CUDATHROW(cudaMemcpyAsync(dstPtr, gpuData, sizeof(uint64_t), cudaMemcpyDeviceToDevice, *stream_));
   INFO(MSCCLPP_P2P, "CudaIpcConnection atomic write: from %p to %p, %lu -> %lu", src, dstPtr + dstOffset, oldValue,
        newValue);
 

diff --git a/src/context.cc b/src/context.cc
@@ -43,7 +43,20 @@ MSCCLPP_API_CPP std::shared_ptr<Connection> Context::connect(Endpoint localEndpo
     if (remoteEndpoint.transport() != Transport::CudaIpc) {
       throw mscclpp::Error("Local transport is CudaIpc but remote is not", ErrorCode::InvalidUsage);
     }
-    conn = std::make_shared<CudaIpcConnection>(localEndpoint, remoteEndpoint, pimpl_->ipcStream_);
+    // Lazily create IPC streams: up to 4 on AMD (streams past the 4th would never be selected below), 1 otherwise.
+#if defined(__HIP_PLATFORM_AMD__)
+    const bool needIpcStream = pimpl_->ipcStreams_.size() < 4;
+#else
+    const bool needIpcStream = pimpl_->ipcStreams_.empty();
+#endif
+    if (needIpcStream) pimpl_->ipcStreams_.emplace_back(std::make_shared<CudaStreamWithFlags>(cudaStreamNonBlocking));
+    if (pimpl_->ipcStreams_.size() < 4) {
+      conn = std::make_shared<CudaIpcConnection>(localEndpoint, remoteEndpoint, pimpl_->ipcStreams_.back());
+    } else {
+      static int index = 0;  // NOTE(review): shared by all Context instances and unsynchronized
+      index = (index + 1) % 4;
+      conn = std::make_shared<CudaIpcConnection>(localEndpoint, remoteEndpoint, pimpl_->ipcStreams_[index]);
+    }
   } else if (AllIBTransports.has(localEndpoint.transport())) {
     if (!AllIBTransports.has(remoteEndpoint.transport())) {
       throw mscclpp::Error("Local transport is IB but remote is not", ErrorCode::InvalidUsage);

diff --git a/src/include/context.hpp b/src/include/context.hpp
@@ -16,6 +16,7 @@ namespace mscclpp {
 struct Context::Impl {
   std::vector<std::shared_ptr<Connection>> connections_;
   std::unordered_map<Transport, std::unique_ptr<IbCtx>> ibContexts_;
+  std::vector<std::shared_ptr<CudaStreamWithFlags>> ipcStreams_;
   std::shared_ptr<CudaStreamWithFlags> ipcStream_;
   CUmemGenericAllocationHandle mcHandle_;
 

diff --git a/test/allgather_test_cpp.cu b/test/allgather_test_cpp.cu
@@ -28,6 +28,12 @@ static int nranksPerNode = 8;
     }                                                                                   \
   } while (false)
 
+#if defined(__HIP_PLATFORM_AMD__)
+#define WARP_SIZE 64  // threads per channel group on AMD builds; presumably the wavefront width -- TODO confirm
+#else
+#define WARP_SIZE 32  // threads per channel group otherwise; matches the literal 32 this macro replaces
+#endif
+
 // Measure current time in second.
 static double getTime(void) {
   struct timespec tspec;
@@ -47,14 +53,14 @@ __device__ void allgather0(DeviceHandle<mscclpp::PortChannel> portChan, int rank
 
   // this thread's role is a sender role
   // put your data asynchronously
-  if ((threadIdx.x % 32) == 0) portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int));
+  if ((threadIdx.x % WARP_SIZE) == 0) portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int));
   // make sure everyone is put their data before some thread randomly blocks everyone else in signal
   __syncthreads();
   // push with flag and sync to make sure the data is received
-  if ((threadIdx.x % 32) == 0) portChan.flush();
+  if ((threadIdx.x % WARP_SIZE) == 0) portChan.flush();
 
   // this thread's role is a receiver role. wait on the semaphore to make sure the data is ready
-  if ((threadIdx.x % 32) == 0) portChan.wait();
+  if ((threadIdx.x % WARP_SIZE) == 0) portChan.wait();
 }
 
 __device__ void localAllGather(DeviceHandle<mscclpp::PortChannel> portChan, int rank, int nranksPerNode, int remoteRank,
@@ -68,17 +74,17 @@ __device__ void localAllGather(DeviceHandle<mscclpp::PortChannel> portChan, int
   for (int i = 1; i < nranksPerNode; i++) {
     if ((remoteRank % nranksPerNode) == ((rank + i) % nranksPerNode)) {
       // put your data to GPU (rank+i) % nranksPerNode and signal in one call
-      if ((threadIdx.x % 32) == 0) portChan.putWithSignal(offset, size);
+      if ((threadIdx.x % WARP_SIZE) == 0) portChan.putWithSignal(offset, size);
     }
     // wait for the data from GPU (rank-i) % nranksPerNode to arrive
     if ((remoteRank % nranksPerNode) == ((rank - i + nranksPerNode) % nranksPerNode)) {
-      if ((threadIdx.x % 32) == 0) portChan.wait();
+      if ((threadIdx.x % WARP_SIZE) == 0) portChan.wait();
     }
-#if defined(__HIP_PLATFORM_AMD__)
+#if defined(__HIP_PLATFORM_AMD__) && (__HIP_PLATFORM_AMD__ == 1)
     // NOTE: we actually need a group barrier here for better performance, but __syncthreads() is still correct.
     __syncthreads();
 #else
-    asm volatile("bar.sync %0, %1;" ::"r"(11), "r"((nranksPerNode - 1) * 32) : "memory");
+    asm volatile("bar.sync %0, %1;" ::"r"(11), "r"((nranksPerNode - 1) * WARP_SIZE) : "memory");
 #endif
   }
 }
@@ -88,7 +94,7 @@ __device__ void allgather1(DeviceHandle<mscclpp::PortChannel> portChan, int rank
   localAllGather(portChan, rank, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int),
                  nelemsPerGPU * sizeof(int));
   if (remoteRank / nranksPerNode == rank / nranksPerNode)
-    if ((threadIdx.x % 32) == 0) portChan.flush();
+    if ((threadIdx.x % WARP_SIZE) == 0) portChan.flush();
 }
 
 __device__ void allgather2(DeviceHandle<mscclpp::PortChannel> portChan, int rank, int world_size, int nranksPerNode,
@@ -114,10 +120,10 @@ __device__ void allgather2(DeviceHandle<mscclpp::PortChannel> portChan, int rank
   // cross-node exchange
   if (remoteRank % nranksPerNode == rank % nranksPerNode) {
     // opposite side
-    if ((threadIdx.x % 32) == 0)
+    if ((threadIdx.x % WARP_SIZE) == 0)
       portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int),
                              (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int));
-    if ((threadIdx.x % 32) == 0) portChan.wait();
+    if ((threadIdx.x % WARP_SIZE) == 0) portChan.wait();
   }
 
   __syncthreads();
@@ -133,10 +139,10 @@ __device__ void allgather2(DeviceHandle<mscclpp::PortChannel> portChan, int rank
   // cross-node exchange
   if (remoteRank % nranksPerNode == rank % nranksPerNode) {
     // opposite side
-    if ((threadIdx.x % 32) == 0)
+    if ((threadIdx.x % WARP_SIZE) == 0)
       portChan.putWithSignal((rank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int),
                              nelemsPerGPU / pipelineSize * sizeof(int));
-    if ((threadIdx.x % 32) == 0) portChan.wait();
+    if ((threadIdx.x % WARP_SIZE) == 0) portChan.wait();
   }
 
   __syncthreads();
@@ -150,13 +156,13 @@ __device__ void allgather2(DeviceHandle<mscclpp::PortChannel> portChan, int rank
   }
 
   if (remoteRank / nranksPerNode == rank / nranksPerNode || remoteRank % nranksPerNode == rank % nranksPerNode) {
-    if ((threadIdx.x % 32) == 0) portChan.flush();
+    if ((threadIdx.x % WARP_SIZE) == 0) portChan.flush();
   }
 }
 
 __global__ void kernel(int rank, int world_size, int nranksPerNode, size_t nelemsPerGPU, int kernel) {
-  // find the mapping between remoteRank and portChans
-  int warpId = threadIdx.x / 32;
+  // find the mapping between remoteRank and portChans
+  int warpId = threadIdx.x / WARP_SIZE;
   int remoteRank = (warpId < rank) ? warpId : warpId + 1;
   // Each warp is responsible for one of the remote ranks
   DeviceHandle<mscclpp::PortChannel> portChan = constPortChans[warpId];
@@ -410,7 +416,7 @@ int main(int argc, const char* argv[]) {
     cudaStream_t stream;
     CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
     CUDACHECK(cudaDeviceSynchronize());
-    kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size, nranksPerNode, nelemsPerGPU, kernelNum);
+    kernel<<<1, WARP_SIZE * (world_size - 1), 0, stream>>>(rank, world_size, nranksPerNode, nelemsPerGPU, kernelNum);
     CUDACHECK(cudaDeviceSynchronize());
     CUDACHECK(cudaMemcpy(data_h, data_d, dataSize, cudaMemcpyDeviceToHost));
 
@@ -432,7 +438,7 @@ int main(int argc, const char* argv[]) {
     CUDACHECK(cudaStreamSynchronize(stream));
     bootstrap->allGather(tmp, sizeof(int));
     for (int i = 0; i < iterwithoutcudagraph; ++i) {
-      kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size, nranksPerNode, nelemsPerGPU, kernelNum);
+      kernel<<<1, WARP_SIZE * (world_size - 1), 0, stream>>>(rank, world_size, nranksPerNode, nelemsPerGPU, kernelNum);
     }
     CUDACHECK(cudaStreamSynchronize(stream));
     bootstrap->allGather(tmp, sizeof(int));
@@ -444,7 +450,7 @@ int main(int argc, const char* argv[]) {
     cudaGraphExec_t instance;
     CUDACHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
     for (int i = 0; i < cudagraphiter; ++i) {
-      kernel<<<1, 32 * (world_size - 1), 0, stream>>>(rank, world_size, nranksPerNode, nelemsPerGPU, kernelNum);
+      kernel<<<1, WARP_SIZE * (world_size - 1), 0, stream>>>(rank, world_size, nranksPerNode, nelemsPerGPU, kernelNum);
     }
     CUDACHECK(cudaStreamEndCapture(stream, &graph));
     CUDACHECK(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));