diff --git a/csrc/aio/common/deepspeed_aio_common.cpp b/csrc/aio/common/deepspeed_aio_common.cpp index 81c315e9a558..59da19318250 100644 --- a/csrc/aio/common/deepspeed_aio_common.cpp +++ b/csrc/aio/common/deepspeed_aio_common.cpp @@ -300,8 +300,9 @@ int regular_read(const char* filename, std::vector& buffer) } while (r > 0); if (read_bytes != num_bytes) { - std::cerr << "read error " << " read_bytes (read) = " << read_bytes - << " num_bytes (fstat) = " << num_bytes << std::endl; + std::cerr << "read error " + << " read_bytes (read) = " << read_bytes << " num_bytes (fstat) = " << num_bytes + << std::endl; } assert(read_bytes == num_bytes); close(fd); diff --git a/csrc/aio/py_lib/deepspeed_py_aio.cpp b/csrc/aio/py_lib/deepspeed_py_aio.cpp index 1ff0397043fa..28fdc0bd9d74 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio.cpp @@ -71,8 +71,9 @@ int deepspeed_py_aio_write(const torch::Tensor& buffer, const std::chrono::duration fn_time = std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 - << " call = " << fn_time.count() * 1e6 << std::endl; + std::cout << "Elapsed time(usec): " + << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 + << std::endl; return 0; } @@ -117,7 +118,8 @@ int deepspeed_py_aio_read(torch::Tensor& buffer, const std::chrono::duration fn_time = std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 - << " call = " << fn_time.count() * 1e6 << std::endl; + std::cout << "Elapsed time(usec): " + << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 + << std::endl; return 0; } diff --git a/csrc/aio/py_lib/deepspeed_py_copy.cpp b/csrc/aio/py_lib/deepspeed_py_copy.cpp index f5480e9d9d83..561c46f7c287 100644 --- a/csrc/aio/py_lib/deepspeed_py_copy.cpp +++ b/csrc/aio/py_lib/deepspeed_py_copy.cpp @@ -10,7 +10,7 @@ Functionality for swapping tensors to/from (NVMe) storage devices. #include "deepspeed_py_copy.h" #include -#define ROUND_DOWN(size, step) ((size) & ~((step) - 1)) +#define ROUND_DOWN(size, step) ((size) & ~((step)-1)) #if defined(__AVX512__) or defined(__AVX256__) union AVX_Data { diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.cpp b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp index 64d7c2e0541e..f9ec88694a52 100644 --- a/csrc/aio/py_lib/deepspeed_py_io_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp @@ -95,8 +95,9 @@ int deepspeed_io_handle_t::read(torch::Tensor& buffer, if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); } const std::chrono::duration fn_time = std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 - << " call = " << fn_time.count() * 1e6 << std::endl; + std::cout << "Elapsed time(usec): " + << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 + << std::endl; return 0; } @@ -131,8 +132,9 @@ int deepspeed_io_handle_t::write(const torch::Tensor& buffer, const std::chrono::duration fn_time = std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 - << " call = " << fn_time.count() * 1e6 << std::endl; + std::cout << "Elapsed time(usec): " + << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 + << std::endl; return 0; } diff --git a/csrc/deepspeed4science/evoformer_attn/gemm/mma_from_smem.h b/csrc/deepspeed4science/evoformer_attn/gemm/mma_from_smem.h index 40d3265c7a63..8b8beab1af69 100644 --- a/csrc/deepspeed4science/evoformer_attn/gemm/mma_from_smem.h +++ b/csrc/deepspeed4science/evoformer_attn/gemm/mma_from_smem.h @@ -1207,7 +1207,8 @@ template -struct DefaultWarpIteratorAFromSharedMemory {}; +struct DefaultWarpIteratorAFromSharedMemory { +}; // TensorOp - Ampere half template diff --git a/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h b/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h index c102234a4dfb..2a4300c5cac1 100644 --- a/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h +++ b/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h @@ -125,10 +125,11 @@ struct CheckArch { std::cerr << #PTR " is not correctly aligned\n"; \ return false; \ } -#define EVOFORMER_CHECK(COND, ERR) \ - if (!(COND)) { \ - std::cerr << "[Evoformer Attention]" << "'" #COND "' failed: " << ERR << "\n"; \ - return false; \ +#define EVOFORMER_CHECK(COND, ERR) \ + if (!(COND)) { \ + std::cerr << "[Evoformer Attention]" \ + << "'" #COND "' failed: " << ERR << "\n"; \ + return false; \ } #endif diff --git a/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_iterator_atomic.h b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_iterator_atomic.h index 8d4173f1a6a2..2550da2fa6ae 100644 --- a/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_iterator_atomic.h +++ b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_iterator_atomic.h @@ -12,7 +12,8 @@ namespace epilogue { namespace threadblock { template -struct atomic_store {}; +struct atomic_store { +}; template struct atomic_store -struct is_device_copyable> : std::true_type {}; +struct is_device_copyable> : std::true_type { +}; } // namespace sycl template diff --git a/csrc/xpu/common/custom_cuda_kernel.dp.cpp b/csrc/xpu/common/custom_cuda_kernel.dp.cpp index cfd004ef1357..702b567d528d 100644 --- a/csrc/xpu/common/custom_cuda_kernel.dp.cpp +++ b/csrc/xpu/common/custom_cuda_kernel.dp.cpp @@ -21,7 +21,8 @@ inline void has_capability_or_fail(const sycl::device& dev, break; default: #define __SYCL_ASPECT(ASPECT, ID) \ - case sycl::aspect::ASPECT: return #ASPECT; + case sycl::aspect::ASPECT: \ + return #ASPECT; #define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID) #define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE) auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string { diff --git a/csrc/xpu/includes/simd.h b/csrc/xpu/includes/simd.h index 097e2d8585cc..f77568be7835 100644 --- a/csrc/xpu/includes/simd.h +++ b/csrc/xpu/includes/simd.h @@ -13,7 +13,7 @@ #define TILE (128 * 1024 * 1024) #if defined(__AVX512__) or defined(__AVX256__) -#define ROUND_DOWN(size, step) ((size) & ~((step) - 1)) +#define ROUND_DOWN(size, step) ((size) & ~((step)-1)) #if defined(__AVX512__) #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d) diff --git a/csrc/xpu/includes/type_shim.h b/csrc/xpu/includes/type_shim.h index 1897afd1fea2..fa41757c895b 100644 --- a/csrc/xpu/includes/type_shim.h +++ b/csrc/xpu/includes/type_shim.h @@ -82,11 +82,11 @@ } template -__inline__ __attribute__((always_inline)) T -reduce_block_into_lanes(T* x, - T val, - int lanes = 1, - bool share_result = false) // lanes is intended to be <= 32. +__inline__ __attribute__((always_inline)) T reduce_block_into_lanes( + T* x, + T val, + int lanes = 1, + bool share_result = false) // lanes is intended to be <= 32. { auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); int tid = item_ct1.get_local_id(2) + item_ct1.get_local_id(1) * item_ct1.get_local_range(2); diff --git a/deepspeed/moe/sharded_moe.py b/deepspeed/moe/sharded_moe.py index 340bc82de508..7b68be6c0c3d 100644 --- a/deepspeed/moe/sharded_moe.py +++ b/deepspeed/moe/sharded_moe.py @@ -292,7 +292,8 @@ def top2gating(logits: Tensor, min_capacity: int, drop_tokens: bool = True, ep_group: Union[torch.distributed.ProcessGroup, None] = None, - top2_2nd_expert_sampling: bool = True) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + top2_2nd_expert_sampling: bool = True, + use_tutel: bool = False) -> Tuple[Tensor, Tensor, Tensor, Tensor]: """Implements Top2Gating on logits.""" # everything is in fp32 in this function gates = F.softmax(logits, dim=1) @@ -313,8 +314,12 @@ def top2gating(logits: Tensor, mask2 = F.one_hot(indices2_s, num_classes=num_experts) # Compute locations in capacity buffer - locations1 = torch.cumsum(mask1, dim=0) - 1 - locations2 = torch.cumsum(mask2, dim=0) - 1 + if not use_tutel: + locations1 = torch.cumsum(mask1, dim=0) - 1 + locations2 = torch.cumsum(mask2, dim=0) - 1 + else: + locations1 = tutel_moe.fast_cumsum_sub_one(mask1) + locations2 = tutel_moe.fast_cumsum_sub_one(mask2) # Update 2nd's location by accounting for locations of 1st locations2 += torch.sum(mask1, dim=0, keepdim=True) @@ -358,6 +363,19 @@ def top2gating(logits: Tensor, gates1_s /= denom_s gates2_s /= denom_s + if use_tutel: + # return critical information for tutel + return l_aux, capacity, num_experts, [ + indices1_s, + indices2_s, + ], [ + locations1_s, + locations2_s, + ], [ + gates1_s, + gates2_s, + ], exp_counts + # Calculate combine_weights and dispatch_mask gates1 = einsum("s,se->se", gates1_s, mask1_float) gates2 = einsum("s,se->se", gates2_s, mask2_float) @@ -517,7 +535,8 @@ def forward(self, elif self.k == 2: gate_output = top2gating(logits, self.capacity_factor if self.training else self.eval_capacity_factor, - self.min_capacity, self.drop_tokens, self.ep_group, self.top2_2nd_expert_sampling) + self.min_capacity, self.drop_tokens, self.ep_group, self.top2_2nd_expert_sampling, + use_tutel) else: gate_output = topkgating(logits, self.k, self.capacity_factor if self.training else self.eval_capacity_factor, @@ -568,7 +587,7 @@ def __init__(self, self.timers = SynchronizedWallClockTimer() self.wall_clock_breakdown = False - self.use_tutel = use_tutel and TUTEL_INSTALLED and gate.k == 1 + self.use_tutel = use_tutel and TUTEL_INSTALLED and (gate.k == 1 or gate.k == 2) if self.use_tutel: logger.info('Using Tutel optimizations.')