Skip to content

Commit

Permalink
merged
Browse files Browse the repository at this point in the history
  • Loading branch information
Saeed Maleki committed Nov 13, 2023
2 parents 1289984 + 2f6c8a9 commit 1a04933
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 8 deletions.
7 changes: 4 additions & 3 deletions python/benchmark/allreduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ __forceinline__ __device__ void vectorSum(TYPE* dst, TYPE* src, size_t nElem) {
#endif

extern "C" __global__ void __launch_bounds__(1024, 1)
allreduce1(mscclpp::SmChannelDeviceHandle* smChans, TYPE* buff, int rank, int nranks, int nelems) {
allreduce1(mscclpp::SmChannelDeviceHandle* smChans, TYPE* buff, int rank, int nranks, size_t nelems) {
const size_t chunkSize = nelems / nranks;
if (nranks == 1) return;
const int nPeer = nranks - 1;
Expand Down Expand Up @@ -664,7 +664,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1) __global__
allreduce4(mscclpp::SmChannelDeviceHandle* smChans,
mscclpp::SimpleProxyChannelDeviceHandle* reduceScatterProxyChans,
mscclpp::SimpleProxyChannelDeviceHandle* allGatherProxyChans, TYPE* buff, TYPE* scratch, int rank,
int nRanksPerNode, int worldSize, int nelems, int pipelineDepth) {
int nRanksPerNode, int worldSize, size_t nelems, int pipelineDepth) {
nelems = nelems / (sizeof(int) / sizeof(TYPE));
reduceScatterSm(smChans, reduceScatterProxyChans, buff, scratch, rank, nRanksPerNode, worldSize, nelems, pipelineDepth);
deviceSyncer.sync(gridDim.x);
Expand All @@ -674,7 +674,8 @@ extern "C" __global__ void __launch_bounds__(1024, 1) __global__
// allreduce 5 for 2-nodes
extern "C" __global__ void __launch_bounds__(1024, 1)
allreduce5(mscclpp::SmChannelDeviceHandle* smChans, mscclpp::SimpleProxyChannelDeviceHandle* proxyChans, TYPE* buff,
TYPE* scratch, TYPE* putBuff, TYPE* resultBuff, int rank, int nRanksPerNode, int worldSize, int nelems) {
TYPE* scratch, TYPE* putBuff, TYPE* resultBuff, int rank, int nRanksPerNode, int worldSize,
size_t nelems) {
nelems = nelems / (sizeof(int) / sizeof(TYPE));
// This version of allreduce only works for single nodes
const int nPeersInNode = nRanksPerNode - 1;
Expand Down
12 changes: 7 additions & 5 deletions python/benchmark/mscclpp_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def __init__(
self.memory,
self.group.my_rank,
self.group.nranks,
self.memory.size,
ctypes.c_size_t(self.memory.size),
)
self.nthreads = nthreads
self.nblocks = nblocks
Expand Down Expand Up @@ -109,7 +109,7 @@ def __init__(self, group: mscclpp_comm.CommGroup, memory: cp.ndarray, memory_out
self.memory_out,
self.group.my_rank,
self.group.nranks,
self.memory.size,
ctypes.c_size_t(self.memory.size),
)

def __call__(self, stream_ptr):
Expand Down Expand Up @@ -155,7 +155,7 @@ def __init__(self, group: mscclpp_comm.CommGroup, memory: cp.ndarray, proxy_serv
self.scratch,
self.group.my_rank,
self.group.nranks,
self.memory.size,
ctypes.c_size_t(self.memory.size),
)

def __call__(self, stream_ptr):
Expand Down Expand Up @@ -241,7 +241,8 @@ def set_params(self, nblocks, block_size, pipeline_depth):
self.group.my_rank,
self.nranks_per_node,
self.group.nranks,
self.memory.size,
bytes(4), # 4 bytes of padding after the three 4-byte ints so the size_t that follows starts on an 8-byte boundary
ctypes.c_size_t(self.memory.size),
self.pipeline_depth,
)

Expand Down Expand Up @@ -322,7 +323,8 @@ def __init__(
self.group.my_rank,
nranks_per_node,
self.group.nranks,
self.memory.size,
bytes(4), # 4 bytes of padding after the three 4-byte ints so the size_t that follows starts on an 8-byte boundary
ctypes.c_size_t(self.memory.size),
)

def __call__(self, stream_ptr):
Expand Down
4 changes: 4 additions & 0 deletions python/mscclpp/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,13 +139,17 @@ def pack(*args):
for arg in list(args):
if isinstance(arg, int):
res += struct.pack("i", arg)
elif isinstance(arg, ctypes.c_size_t):
res += struct.pack("N", arg.value)
elif isinstance(arg, np.ndarray):
res += struct.pack("P", arg.ctypes.data)
elif isinstance(arg, cp.ndarray):
res += struct.pack("P", arg.data.ptr)
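# use int to represent bool, which avoids the CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES error
# NOTE(review): bool is a subclass of int, so the isinstance(arg, int) check above already matches bools - confirm whether this branch is still needed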
elif isinstance(arg, bool):
res += struct.pack("i", arg)
elif isinstance(arg, bytes):
res += struct.pack(f"{len(arg)}s", arg)
else:
raise RuntimeError(f"Unsupported type: {type(arg)}")
return res

0 comments on commit 1a04933

Please sign in to comment.