Enhance the nccl error message handling (#434)
Add a WARN or INFO log before returning an NCCL error code.
Change NCCL_DEBUG to MSCCLPP_DEBUG in the debug messages.
seagater authored Jan 3, 2025
1 parent 3d6bfed commit ba0d0d6
Showing 1 changed file with 106 additions and 22 deletions.
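The pattern is the same at every early-return path in nccl.cu: log the reason with the WARN macro (or, for allocation and CUDA failures, INFO) from the newly included debug.h, then return the NCCL result code as before. Below is a minimal sketch of that before/after shape; ncclExampleCheck and its arguments are hypothetical, and only the macros, result codes, and the NCCL_API attribute come from the file being changed.

#include "debug.h"  // WARN()/INFO() logging macros used by the added messages
#include "nccl.h"   // ncclResult_t and the nccl* result codes

#define NCCL_API extern "C" __attribute__((visibility("default")))

// Hypothetical entry point, for illustration only.
NCCL_API ncclResult_t ncclExampleCheck(void* buff, size_t count) {
  // Before this commit the check returned silently:
  //   if (buff == nullptr || count == 0) return ncclInvalidArgument;
  // Now the reason is logged first, so the failure shows up in the debug output.
  if (buff == nullptr || count == 0) {
    WARN("buff is nullptr or count is 0");
    return ncclInvalidArgument;
  }
  return ncclSuccess;
}

Running with the environment variable MSCCLPP_DEBUG set to WARN (or INFO for the allocation paths in ncclMemAlloc) surfaces these messages, which is also what the reworded ncclGetErrorString strings now direct users to instead of NCCL_DEBUG.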
apps/nccl/src/nccl.cu
@@ -16,6 +16,7 @@
#include "allgather.hpp"
#include "allreduce.hpp"
#include "broadcast.hpp"
#include "debug.h"
#include "nccl.h"

#define NCCL_API extern "C" __attribute__((visibility("default")))
@@ -182,11 +183,18 @@ static std::shared_ptr<mscclpp::DeviceHandle<mscclpp::SmChannel>> setupSmChannel
static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t, ncclComm_t comm, cudaStream_t stream) {
// FallBack for single node
if (comm->comm->bootstrap()->getNranks() != comm->comm->bootstrap()->getNranksPerNode()) return ncclInvalidUsage;
if (comm->comm->bootstrap()->getNranks() != comm->comm->bootstrap()->getNranksPerNode()) {
WARN("ncclAllReduceFallback is currently unavailable for multi-node");
return ncclInvalidUsage;
}

// Checking if the parameters are valids
if (sendbuff == nullptr || recvbuff == nullptr || count == 0 || ncclTypeSize(datatype) == 0 || comm == nullptr)
if (sendbuff == nullptr || recvbuff == nullptr || count == 0 || ncclTypeSize(datatype) == 0 || comm == nullptr) {
WARN(
"One or more of the following conditions is met: sendbuff or recvbuff pointer is nullptr, count is 0, "
"datatype is invalid, or comm is nullptr.");
return ncclInvalidArgument;
}

// Declarating variables
size_t sendBytes, recvBytes;
@@ -262,6 +270,7 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff,
comm->comm->bootstrap()->getNranks(), count, stream));
break;
default:
WARN("datatype is invalid");
return ncclInvalidArgument;
}
return ncclSuccess;
@@ -270,11 +279,19 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff,
static ncclResult_t ncclAllGatherFallback(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
// FallBack for single node
if (comm->comm->bootstrap()->getNranks() != comm->comm->bootstrap()->getNranksPerNode()) return ncclInvalidUsage;
if (comm->comm->bootstrap()->getNranks() != comm->comm->bootstrap()->getNranksPerNode()) {
WARN("ncclAllGatherFallback is currently unavailable for multi-node");
return ncclInvalidUsage;
}

// Checking if the parameters are valids
size_t bytes = sendcount * ncclTypeSize(datatype);
if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument;
if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) {
WARN(
"One or more of the following conditions is met: sendbuff or recvbuff pointer is nullptr, bytes is 0, "
"or comm is nullptr.");
return ncclInvalidArgument;
}

// Declarating variables
size_t recvBytes;
@@ -347,13 +364,19 @@ static void ncclCommInitRankFallbackSingleNode(ncclComm* commPtr, std::shared_pt
}

NCCL_API ncclResult_t ncclGetVersion(int* version) {
if (version == nullptr) return ncclInvalidArgument;
if (version == nullptr) {
WARN("version is nullptr");
return ncclInvalidArgument;
}
*version = MSCCLPP_VERSION;
return ncclSuccess;
}

NCCL_API ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId) {
if (uniqueId == nullptr) return ncclInvalidArgument;
if (uniqueId == nullptr) {
WARN("uniqueId is nullptr");
return ncclInvalidArgument;
}
if (MSCCLPP_UNIQUE_ID_BYTES != NCCL_UNIQUE_ID_BYTES) return ncclInternalError;
mscclpp::UniqueId id = mscclpp::TcpBootstrap::createUniqueId();
memcpy(uniqueId, &id, sizeof(ncclUniqueId));
@@ -367,8 +390,14 @@ NCCL_API ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclU
}

NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank) {
if (comm == nullptr) return ncclInvalidArgument;
if (nranks < 0 || rank < 0 || rank >= nranks) return ncclInvalidArgument;
if (comm == nullptr) {
WARN("comm is nullptr");
return ncclInvalidArgument;
}
if (nranks < 0 || rank < 0 || rank >= nranks) {
WARN("nranks is %d, rank is %d", nranks, rank);
return ncclInvalidArgument;
}
std::shared_ptr<mscclpp::TcpBootstrap> bootstrap = std::make_shared<mscclpp::TcpBootstrap>(rank, nranks);
mscclpp::UniqueId id;
memcpy(id.data(), &commId, sizeof(ncclUniqueId));
@@ -386,6 +415,7 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI
if (getenv("MSCCLPP_EXECUTION_PLAN_DIR")) {
std::string collectiveDir = getenv("MSCCLPP_EXECUTION_PLAN_DIR");
if (!std::filesystem::is_directory(collectiveDir)) {
WARN("The value of the environment variable %s is not a directory", collectiveDir.c_str());
return ncclInvalidArgument;
}
for (const auto& entry : std::filesystem::directory_iterator(collectiveDir)) {
@@ -402,6 +432,7 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI

NCCL_API ncclResult_t ncclCommInitAll(ncclComm_t*, int, const int*) {
// TODO: implement this function
WARN("ncclCommInitAll is currently unavailable");
return ncclInternalError;
}

@@ -411,7 +442,10 @@ NCCL_API ncclResult_t ncclCommFinalize(ncclComm_t comm) {
}

NCCL_API ncclResult_t ncclCommDestroy(ncclComm_t comm) {
if (comm == nullptr) return ncclInvalidArgument;
if (comm == nullptr) {
WARN("comm is nullptr");
return ncclInvalidArgument;
}
delete comm;
return ncclSuccess;
}
@@ -423,6 +457,7 @@ NCCL_API ncclResult_t ncclCommAbort(ncclComm_t) {

NCCL_API ncclResult_t ncclCommSplit(ncclComm_t, int, int, ncclComm_t*, ncclConfig_t*) {
// TODO: implement this function
WARN("ncclCommSplit is currently unavailable");
return ncclInternalError;
}

@@ -431,15 +466,15 @@ NCCL_API const char* ncclGetErrorString(ncclResult_t result) {
case ncclSuccess:
return "no error";
case ncclUnhandledCudaError:
return "unhandled cuda error (run with NCCL_DEBUG=INFO for details)";
return "unhandled cuda error (run with MSCCLPP_DEBUG=INFO for details)";
case ncclSystemError:
return "unhandled system error (run with NCCL_DEBUG=INFO for details)";
return "unhandled system error (run with MSCCLPP_DEBUG=INFO for details)";
case ncclInternalError:
return "internal error - please report this issue to the NCCL developers";
return "internal error (run with MSCCLPP_DEBUG=WARN for details)";
case ncclInvalidArgument:
return "invalid argument (run with NCCL_DEBUG=WARN for details)";
return "invalid argument (run with MSCCLPP_DEBUG=WARN for details)";
case ncclInvalidUsage:
return "invalid usage (run with NCCL_DEBUG=WARN for details)";
return "invalid usage (run with MSCCLPP_DEBUG=WARN for details)";
case ncclRemoteError:
return "remote process exited or there was a network error";
case ncclInProgress:
@@ -455,42 +490,57 @@ NCCL_API const char* ncclGetLastError(ncclComm_t) {
}

NCCL_API ncclResult_t ncclCommGetAsyncError(ncclComm_t, ncclResult_t* asyncError) {
if (asyncError == nullptr) return ncclInvalidArgument;
if (asyncError == nullptr) {
WARN("asyncError is nullptr");
return ncclInvalidArgument;
}
*asyncError = ncclSuccess;
return ncclSuccess;
}

NCCL_API ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
if (comm == nullptr || count == nullptr) return ncclInvalidArgument;
if (comm == nullptr || count == nullptr) {
WARN("comm is nullptr or count is nullptr");
return ncclInvalidArgument;
}
*count = comm->comm->bootstrap()->getNranks();
return ncclSuccess;
}

NCCL_API ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device) {
if (comm == nullptr || device == nullptr) return ncclInvalidArgument;
if (comm == nullptr || device == nullptr) {
WARN("comm is nullptr or device is nullptr");
return ncclInvalidArgument;
}
*device = comm->comm->bootstrap()->getRank();
return ncclSuccess;
}

NCCL_API ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
if (comm == nullptr || rank == nullptr) return ncclInvalidArgument;
if (comm == nullptr || rank == nullptr) {
WARN("comm is nullptr or rank is nullptr");
return ncclInvalidArgument;
}
*rank = comm->comm->bootstrap()->getRank();
return ncclSuccess;
}

NCCL_API ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t*, void*, ncclDataType_t, ncclScalarResidence_t, ncclComm_t) {
// TODO: implement this function
WARN("ncclRedOpCreatePreMulSum is currently unavailable");
return ncclInternalError;
}

NCCL_API ncclResult_t ncclRedOpDestroy(ncclRedOp_t, ncclComm_t) {
// TODO: implement this function
WARN("ncclRedOpDestroy is currently unavailable");
return ncclInternalError;
}

NCCL_API ncclResult_t ncclReduce(const void*, void*, size_t, ncclDataType_t, ncclRedOp_t, int, ncclComm_t,
cudaStream_t) {
// TODO: implement this function
WARN("ncclReduce is currently unavailable");
return ncclInternalError;
}

@@ -502,7 +552,12 @@ NCCL_API ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatyp
NCCL_API ncclResult_t ncclBroadcastFallback(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream) {
size_t bytes = sendcount * ncclTypeSize(datatype);
if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument;
if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) {
WARN(
"One or more of the following conditions is met: sendbuff or recvbuff pointer is nullptr, bytes is 0, "
"or comm is nullptr.");
return ncclInvalidArgument;
}

// Declarating variables
size_t recvBytes;
@@ -546,7 +601,12 @@ NCCL_API ncclResult_t ncclBroadcastFallback(const void* sendbuff, void* recvbuff
NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
int root, ncclComm_t comm, cudaStream_t stream) {
size_t bytes = count * ncclTypeSize(datatype);
if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument;
if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) {
WARN(
"One or more of the following conditions is met: sendbuff or recvbuff pointer is nullptr, bytes is 0, "
"or comm is nullptr.");
return ncclInvalidArgument;
}

int rank = comm->comm->bootstrap()->getRank();
int nRank = comm->comm->bootstrap()->getNranks();
@@ -584,6 +644,7 @@ NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t
stream);
break;
default:
WARN("datatype is invalid");
return ncclInvalidArgument;
}

@@ -593,8 +654,12 @@ NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t
NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t reductionOperation, ncclComm_t comm, cudaStream_t stream) {
// Checking if the parameters are valids
if (sendbuff == nullptr || recvbuff == nullptr || count == 0 || ncclTypeSize(datatype) == 0 || comm == nullptr)
if (sendbuff == nullptr || recvbuff == nullptr || count == 0 || ncclTypeSize(datatype) == 0 || comm == nullptr) {
WARN(
"One or more of the following conditions is met: sendbuff or recvbuff pointer is nullptr, count is 0, "
"datatype is invalid, or comm is nullptr.");
return ncclInvalidArgument;
}

// Declarating variables
size_t bytes = count * ncclTypeSize(datatype);
@@ -632,6 +697,7 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t
stream, mscclpp::PacketType::LL8);
break;
default:
WARN("datatype is invalid");
return ncclInvalidArgument;
}

@@ -641,13 +707,19 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t
NCCL_API ncclResult_t ncclReduceScatter(const void*, void*, size_t, ncclDataType_t, ncclRedOp_t, ncclComm_t,
cudaStream_t) {
// TODO: implement this function
WARN("ncclReduceScatter is currently unavailable");
return ncclInternalError;
}

NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype,
ncclComm_t comm, cudaStream_t stream) {
size_t bytes = sendcount * ncclTypeSize(datatype);
if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument;
if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) {
WARN(
"One or more of the following conditions is met: sendbuff or recvbuff pointer is nullptr, bytes is 0, "
"or comm is nullptr.");
return ncclInvalidArgument;
}

int rank = comm->comm->bootstrap()->getRank();
int nRank = comm->comm->bootstrap()->getNranks();
@@ -684,6 +756,7 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t
*plan, stream);
break;
default:
WARN("datatype is invalid");
return ncclInvalidArgument;
}

@@ -692,16 +765,19 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t

NCCL_API ncclResult_t ncclSend(const void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t) {
// TODO: implement this function
WARN("ncclSend is currently unavailable");
return ncclInternalError;
}

NCCL_API ncclResult_t ncclRecv(void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t) {
// TODO: implement this function
WARN("ncclRecv is currently unavailable");
return ncclInternalError;
}

NCCL_API ncclResult_t ncclAllToAll(const void*, void*, size_t, ncclDataType_t, ncclComm_t, cudaStream_t) {
// TODO: implement this function
WARN("ncclAllToAll is currently unavailable");
return ncclInternalError;
}

@@ -728,6 +804,7 @@ NCCL_API ncclResult_t ncclCommDeregister(const ncclComm_t, void*) {
ncclResult_t ncclMemAlloc(void** ptr, size_t size) {
// Allocate memory using mscclpp::allocSharedPhysicalCuda
if (ptr == nullptr || size == 0) {
WARN("ptr is nullptr or size is 0");
return ncclInvalidArgument;
}
std::shared_ptr<char> sharedPtr;
@@ -738,19 +815,25 @@ ncclResult_t ncclMemAlloc(void** ptr, size_t size) {
sharedPtr = mscclpp::allocExtSharedCuda<char>(size);
}
if (sharedPtr == nullptr) {
INFO(MSCCLPP_ALLOC, "Failed to allocate memory");
return ncclSystemError;
}
} catch (const mscclpp::Error& e) {
if (e.getErrorCode() == mscclpp::ErrorCode::InvalidUsage) {
WARN("Invalid usage: %s", e.what());
return ncclInvalidUsage;
} else {
WARN("Internal error: %s", e.what());
return ncclInternalError;
}
} catch (const mscclpp::CudaError& e) {
INFO(MSCCLPP_ALLOC, "Cuda error: %s", e.what());
return ncclUnhandledCudaError;
} catch (const mscclpp::CuError& e) {
INFO(MSCCLPP_ALLOC, "Cu error: %s", e.what());
return ncclUnhandledCudaError;
} catch (const mscclpp::BaseError& e) {
WARN("Base error: %s", e.what());
return ncclInternalError;
}
ptrMap[sharedPtr.get()] = sharedPtr;
Expand All @@ -768,5 +851,6 @@ ncclResult_t ncclMemFree(void* ptr) {
}

// Pointer not found
WARN("Pointer not found");
return ncclInvalidUsage;
}
