diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml index b093cd9e..9a1a96f6 100644 --- a/.azure-pipelines/ut.yml +++ b/.azure-pipelines/ut.yml @@ -218,7 +218,7 @@ jobs: set -e; \ cd /root/mscclpp; \ mkdir -p build && cd build; \ - cmake -DCMAKE_BUILD_TYPE=Release -DNPKIT_FLAGS=\"-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT\" ..; \ + cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_NPKIT_FLAGS=\"-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT\" ..; \ make -j"' kill $CHILD_PID workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp index 479c971a..41342413 100644 --- a/apps/nccl/src/allreduce.hpp +++ b/apps/nccl/src/allreduce.hpp @@ -247,7 +247,7 @@ __global__ void __launch_bounds__(1024, 1) , NpKitEventCollectContext* npKitEventCollectContexts, uint64_t* cpuTimestamp) { #else - ) { + ) { #endif // This version of allreduce only works for single nodes if (worldSize != nRanksPerNode) return; @@ -265,7 +265,8 @@ __global__ void __launch_bounds__(1024, 1) #endif #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) #if defined(MSCCLPP_DEVICE_HIP) - NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, NPKIT_LOAD_CPU_TIMESTAMP_PER_BLOCK(cpuTimestamp, blockIdx.x), + NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, + NPKIT_LOAD_CPU_TIMESTAMP_PER_BLOCK(cpuTimestamp, blockIdx.x), #else NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, #endif @@ -516,9 +517,9 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle< } #if defined(ENABLE_NPKIT) size_t NpkitSharedMemSize = NPKIT_SHM_NUM_EVENTS * sizeof(NpKitEvent); - allreduce7<<>>(buff, scratch, resultBuff, smChannels, channelInOffset, - channelScratchOffset, rank, nRanksPerNode, worldSize, nelems, - flag++, NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); + allreduce7<<>>( + buff, scratch, resultBuff, smChannels, channelInOffset, channelScratchOffset, rank, nRanksPerNode, worldSize, + nelems, flag++, NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); #else allreduce7<<>>(buff, scratch, resultBuff, smChannels, channelInOffset, channelScratchOffset, rank, nRanksPerNode, worldSize, nelems,