diff --git a/csrc/aio/common/deepspeed_aio_common.cpp b/csrc/aio/common/deepspeed_aio_common.cpp
index 81c315e9a558..59da19318250 100644
--- a/csrc/aio/common/deepspeed_aio_common.cpp
+++ b/csrc/aio/common/deepspeed_aio_common.cpp
@@ -300,8 +300,9 @@ int regular_read(const char* filename, std::vector<char>& buffer)
     } while (r > 0);
 
     if (read_bytes != num_bytes) {
-        std::cerr << "read error " << " read_bytes (read) = " << read_bytes
-                  << " num_bytes (fstat) = " << num_bytes << std::endl;
+        std::cerr << "read error "
+                  << " read_bytes (read) = " << read_bytes << " num_bytes (fstat) = " << num_bytes
+                  << std::endl;
     }
     assert(read_bytes == num_bytes);
     close(fd);
diff --git a/csrc/aio/py_lib/deepspeed_py_aio.cpp b/csrc/aio/py_lib/deepspeed_py_aio.cpp
index 1ff0397043fa..28fdc0bd9d74 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_aio.cpp
@@ -71,8 +71,9 @@ int deepspeed_py_aio_write(const torch::Tensor& buffer,
 
     const std::chrono::duration<double> fn_time =
         std::chrono::high_resolution_clock::now() - start_time;
-    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
-              << " call = " << fn_time.count() * 1e6 << std::endl;
+    std::cout << "Elapsed time(usec): "
+              << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
+              << std::endl;
     return 0;
 }
 
@@ -117,7 +118,8 @@ int deepspeed_py_aio_read(torch::Tensor& buffer,
 
     const std::chrono::duration<double> fn_time =
         std::chrono::high_resolution_clock::now() - start_time;
-    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
-              << " call = " << fn_time.count() * 1e6 << std::endl;
+    std::cout << "Elapsed time(usec): "
+              << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
+              << std::endl;
     return 0;
 }
diff --git a/csrc/aio/py_lib/deepspeed_py_copy.cpp b/csrc/aio/py_lib/deepspeed_py_copy.cpp
index f5480e9d9d83..561c46f7c287 100644
--- a/csrc/aio/py_lib/deepspeed_py_copy.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_copy.cpp
@@ -10,7 +10,7 @@ Functionality for swapping tensors to/from (NVMe) storage devices.
 #include "deepspeed_py_copy.h"
 #include <omp.h>
 
-#define ROUND_DOWN(size, step) ((size) & ~((step) - 1))
+#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
 
 #if defined(__AVX512__) or defined(__AVX256__)
 union AVX_Data {
diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.cpp b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp
index 64d7c2e0541e..f9ec88694a52 100644
--- a/csrc/aio/py_lib/deepspeed_py_io_handle.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp
@@ -95,8 +95,9 @@ int deepspeed_io_handle_t::read(torch::Tensor& buffer,
     if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); }
     const std::chrono::duration<double> fn_time =
         std::chrono::high_resolution_clock::now() - start_time;
-    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
-              << " call = " << fn_time.count() * 1e6 << std::endl;
+    std::cout << "Elapsed time(usec): "
+              << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
+              << std::endl;
     return 0;
 }
 
@@ -131,8 +132,9 @@ int deepspeed_io_handle_t::write(const torch::Tensor& buffer,
 
     const std::chrono::duration<double> fn_time =
         std::chrono::high_resolution_clock::now() - start_time;
-    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
-              << " call = " << fn_time.count() * 1e6 << std::endl;
+    std::cout << "Elapsed time(usec): "
+              << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
+              << std::endl;
     return 0;
 }
 
diff --git a/csrc/deepspeed4science/evoformer_attn/gemm/mma_from_smem.h b/csrc/deepspeed4science/evoformer_attn/gemm/mma_from_smem.h
index 40d3265c7a63..8b8beab1af69 100644
--- a/csrc/deepspeed4science/evoformer_attn/gemm/mma_from_smem.h
+++ b/csrc/deepspeed4science/evoformer_attn/gemm/mma_from_smem.h
@@ -1207,7 +1207,8 @@ template <typename WarpShape,
           typename RegularWarpIterator,
           typename Policy,
           typename Enable = void>
-struct DefaultWarpIteratorAFromSharedMemory {};
+struct DefaultWarpIteratorAFromSharedMemory {
+};
 
 // TensorOp - Ampere half
 template <typename RegularWarpIterator, typename Policy>
diff --git a/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h b/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h
index c102234a4dfb..2a4300c5cac1 100644
--- a/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h
+++ b/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h
@@ -125,10 +125,11 @@ struct CheckArch {
         std::cerr << #PTR " is not correctly aligned\n"; \
         return false;                                    \
     }
-#define EVOFORMER_CHECK(COND, ERR)                                                     \
-    if (!(COND)) {                                                                     \
-        std::cerr << "[Evoformer Attention]" << "'" #COND "' failed: " << ERR << "\n"; \
-        return false;                                                                  \
+#define EVOFORMER_CHECK(COND, ERR)                          \
+    if (!(COND)) {                                          \
+        std::cerr << "[Evoformer Attention]"                \
+                  << "'" #COND "' failed: " << ERR << "\n"; \
+        return false;                                       \
     }
 #endif
 
diff --git a/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_iterator_atomic.h b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_iterator_atomic.h
index 8d4173f1a6a2..2550da2fa6ae 100644
--- a/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_iterator_atomic.h
+++ b/csrc/deepspeed4science/evoformer_attn/iterators/predicated_tile_iterator_atomic.h
@@ -12,7 +12,8 @@ namespace epilogue {
 namespace threadblock {
 
 template <class AccessType, class Enable = void>
-struct atomic_store {};
+struct atomic_store {
+};
 
 template <class AccessType>
 struct atomic_store<AccessType,
diff --git a/csrc/includes/simd.h b/csrc/includes/simd.h
index a205026ec7c1..f5bfb45dd2e2 100644
--- a/csrc/includes/simd.h
+++ b/csrc/includes/simd.h
@@ -27,7 +27,7 @@ inline void writeAs(void* dst, const T& val)
     std::memcpy(dst, &val, sizeof(T));
 }
 
-#define ROUND_DOWN(size, step) ((size) & ~((step) - 1))
+#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
 
 #if defined(__AVX512__)
 #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
diff --git a/csrc/xpu/adam/multi_tensor_apply.dp.hpp b/csrc/xpu/adam/multi_tensor_apply.dp.hpp
index 14a130e2a23e..1c0234b4315c 100644
--- a/csrc/xpu/adam/multi_tensor_apply.dp.hpp
+++ b/csrc/xpu/adam/multi_tensor_apply.dp.hpp
@@ -112,7 +112,8 @@ class multi_tensor_apply_kernel {
 // to make sure multi_tensor_apply_kernel can be used in sycl::buffer
 namespace sycl {
 template <typename T, typename U, typename... ArgTypes>
-struct is_device_copyable<multi_tensor_apply_kernel<T, U, ArgTypes...>> : std::true_type {};
+struct is_device_copyable<multi_tensor_apply_kernel<T, U, ArgTypes...>> : std::true_type {
+};
 }  // namespace sycl
 
 template <int depth, typename T, typename... ArgTypes>
diff --git a/csrc/xpu/common/custom_cuda_kernel.dp.cpp b/csrc/xpu/common/custom_cuda_kernel.dp.cpp
index cfd004ef1357..702b567d528d 100644
--- a/csrc/xpu/common/custom_cuda_kernel.dp.cpp
+++ b/csrc/xpu/common/custom_cuda_kernel.dp.cpp
@@ -21,7 +21,8 @@ inline void has_capability_or_fail(const sycl::device& dev,
                 break;
             default:
 #define __SYCL_ASPECT(ASPECT, ID) \
-    case sycl::aspect::ASPECT: return #ASPECT;
+    case sycl::aspect::ASPECT:    \
+        return #ASPECT;
 #define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID)
 #define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE)
                 auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string {
diff --git a/csrc/xpu/includes/simd.h b/csrc/xpu/includes/simd.h
index 097e2d8585cc..f77568be7835 100644
--- a/csrc/xpu/includes/simd.h
+++ b/csrc/xpu/includes/simd.h
@@ -13,7 +13,7 @@
 #define TILE (128 * 1024 * 1024)
 #if defined(__AVX512__) or defined(__AVX256__)
 
-#define ROUND_DOWN(size, step) ((size) & ~((step) - 1))
+#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
 
 #if defined(__AVX512__)
 #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
diff --git a/csrc/xpu/includes/type_shim.h b/csrc/xpu/includes/type_shim.h
index 1897afd1fea2..fa41757c895b 100644
--- a/csrc/xpu/includes/type_shim.h
+++ b/csrc/xpu/includes/type_shim.h
@@ -82,11 +82,11 @@
     }
 
 template <typename T>
-__inline__ __attribute__((always_inline)) T
-reduce_block_into_lanes(T* x,
-                        T val,
-                        int lanes = 1,
-                        bool share_result = false)  // lanes is intended to be <= 32.
+__inline__ __attribute__((always_inline)) T reduce_block_into_lanes(
+    T* x,
+    T val,
+    int lanes = 1,
+    bool share_result = false)  // lanes is intended to be <= 32.
 {
     auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>();
     int tid = item_ct1.get_local_id(2) + item_ct1.get_local_id(1) * item_ct1.get_local_range(2);
diff --git a/deepspeed/moe/sharded_moe.py b/deepspeed/moe/sharded_moe.py
index 340bc82de508..7b68be6c0c3d 100644
--- a/deepspeed/moe/sharded_moe.py
+++ b/deepspeed/moe/sharded_moe.py
@@ -292,7 +292,8 @@ def top2gating(logits: Tensor,
                min_capacity: int,
                drop_tokens: bool = True,
                ep_group: Union[torch.distributed.ProcessGroup, None] = None,
-               top2_2nd_expert_sampling: bool = True) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
+               top2_2nd_expert_sampling: bool = True,
+               use_tutel: bool = False) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
     """Implements Top2Gating on logits."""
     # everything is in fp32 in this function
     gates = F.softmax(logits, dim=1)
@@ -313,8 +314,12 @@ def top2gating(logits: Tensor,
     mask2 = F.one_hot(indices2_s, num_classes=num_experts)
 
     # Compute locations in capacity buffer
-    locations1 = torch.cumsum(mask1, dim=0) - 1
-    locations2 = torch.cumsum(mask2, dim=0) - 1
+    if not use_tutel:
+        locations1 = torch.cumsum(mask1, dim=0) - 1
+        locations2 = torch.cumsum(mask2, dim=0) - 1
+    else:
+        locations1 = tutel_moe.fast_cumsum_sub_one(mask1)
+        locations2 = tutel_moe.fast_cumsum_sub_one(mask2)
     # Update 2nd's location by accounting for locations of 1st
     locations2 += torch.sum(mask1, dim=0, keepdim=True)
 
@@ -358,6 +363,19 @@ def top2gating(logits: Tensor,
     gates1_s /= denom_s
     gates2_s /= denom_s
 
+    if use_tutel:
+        # return critical information for tutel
+        return l_aux, capacity, num_experts, [
+            indices1_s,
+            indices2_s,
+        ], [
+            locations1_s,
+            locations2_s,
+        ], [
+            gates1_s,
+            gates2_s,
+        ], exp_counts
+
     # Calculate combine_weights and dispatch_mask
     gates1 = einsum("s,se->se", gates1_s, mask1_float)
     gates2 = einsum("s,se->se", gates2_s, mask2_float)
@@ -517,7 +535,8 @@ def forward(self,
 
         elif self.k == 2:
             gate_output = top2gating(logits, self.capacity_factor if self.training else self.eval_capacity_factor,
-                                     self.min_capacity, self.drop_tokens, self.ep_group, self.top2_2nd_expert_sampling)
+                                     self.min_capacity, self.drop_tokens, self.ep_group, self.top2_2nd_expert_sampling,
+                                     use_tutel)
         else:
             gate_output = topkgating(logits, self.k,
                                      self.capacity_factor if self.training else self.eval_capacity_factor,
@@ -568,7 +587,7 @@ def __init__(self,
         self.timers = SynchronizedWallClockTimer()
         self.wall_clock_breakdown = False
 
-        self.use_tutel = use_tutel and TUTEL_INSTALLED and gate.k == 1
+        self.use_tutel = use_tutel and TUTEL_INSTALLED and (gate.k == 1 or gate.k == 2)
 
         if self.use_tutel:
             logger.info('Using Tutel optimizations.')