
Commit 4512e87

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent b8e0b80 commit 4512e87

3 files changed: +73, -42 lines changed


transformer_engine/jax/cpp_extensions/router.py

Lines changed: 39 additions & 20 deletions
@@ -19,20 +19,30 @@
     "map_score_function",
 ]
 
+
 def map_score_function(score_function: str) -> int:
     score_function_map = {"sigmoid": 0, "softmax": 1}
-    assert score_function in score_function_map, \
-        f"score_function must be 'sigmoid' or 'softmax', got {score_function}"
+    assert (
+        score_function in score_function_map
+    ), f"score_function must be 'sigmoid' or 'softmax', got {score_function}"
     return score_function_map[score_function]
 
+
 class FusedTopkWithScoreFunctionFwdPrimitive(BasePrimitive):
     """
     Fused TopK with Score Function Forward Primitive
     """
 
     name = "te_fused_topk_with_score_function_forward_ffi"
     multiple_results = True  # Returns (probs, routing_map, intermediate_output)
-    impl_static_args = (2, 3, 4, 5, 6, 7,)  # topk, use_pre_softmax, num_groups, group_topk, scaling_factor, score_function,
+    impl_static_args = (
+        2,
+        3,
+        4,
+        5,
+        6,
+        7,
+    )  # topk, use_pre_softmax, num_groups, group_topk, scaling_factor, score_function,
     inner_primitive = None
     outer_primitive = None
 
@@ -52,7 +62,7 @@ def abstract(
         te_fused_topk_with_score_function_forward abstract
         """
         dtype = dtypes.canonicalize_dtype(logits_aval.dtype)
-        assert len(logits_aval.shape) == 3 # (batch, seqlen, num_experts)
+        assert len(logits_aval.shape) == 3  # (batch, seqlen, num_experts)
 
         probs_aval = logits_aval.update(shape=logits_aval.shape, dtype=dtype)
         routing_map_aval = logits_aval.update(shape=logits_aval.shape, dtype=jnp.bool_)
@@ -78,14 +88,14 @@ def lowering(
         """
         logits_type = ir.RankedTensorType(logits.type)
         logits_shape = logits_type.shape
-        assert len(logits_shape) == 3 # (batch, seqlen, num_experts)
+        assert len(logits_shape) == 3  # (batch, seqlen, num_experts)
         (batch, seqlen, num_experts) = logits_shape
 
         return ffi.ffi_lowering(FusedTopkWithScoreFunctionFwdPrimitive.name)(
             ctx,
             logits,
             expert_bias,
-            num_tokens=batch*seqlen,
+            num_tokens=batch * seqlen,
             num_experts=num_experts,
             topk=topk,
             use_pre_softmax=use_pre_softmax,
@@ -107,17 +117,20 @@ def impl(
         score_function,
     ):
         assert FusedTopkWithScoreFunctionFwdPrimitive.inner_primitive is not None
-        (probs, routing_map, intermediate_output) = FusedTopkWithScoreFunctionFwdPrimitive.inner_primitive.bind(
-            logits,
-            expert_bias,
-            topk=topk,
-            use_pre_softmax=use_pre_softmax,
-            num_groups=num_groups,
-            group_topk=group_topk,
-            scaling_factor=scaling_factor,
-            score_function=score_function,
+        (probs, routing_map, intermediate_output) = (
+            FusedTopkWithScoreFunctionFwdPrimitive.inner_primitive.bind(
+                logits,
+                expert_bias,
+                topk=topk,
+                use_pre_softmax=use_pre_softmax,
+                num_groups=num_groups,
+                group_topk=group_topk,
+                scaling_factor=scaling_factor,
+                score_function=score_function,
+            )
         )
         return probs, routing_map, intermediate_output
+
     @staticmethod
     def batcher(
         batched_args,
@@ -129,7 +142,9 @@ def batcher(
         scaling_factor,
         score_function,
     ):
-        raise NotImplementedError("Batcher not implemented for FusedTopkWithScoreFunctionFwdPrimitive")
+        raise NotImplementedError(
+            "Batcher not implemented for FusedTopkWithScoreFunctionFwdPrimitive"
+        )
 
     @staticmethod
     def infer_sharding_from_operands(
@@ -169,7 +184,8 @@ def partition(
        del result_infos
        out_shardings = (arg_infos[0].sharding, arg_infos[0].sharding, arg_infos[0].sharding)
        arg_shardings = (arg_infos[0].sharding, arg_infos[1].sharding)
-        impl = partial(FusedTopkWithScoreFunctionFwdPrimitive.impl,
+        impl = partial(
+            FusedTopkWithScoreFunctionFwdPrimitive.impl,
            topk=topk,
            use_pre_softmax=use_pre_softmax,
            num_groups=num_groups,
@@ -261,15 +277,15 @@ def lowering(
         """
         intermediate_output_type = ir.RankedTensorType(intermediate_output.type)
         intermediate_output_shape = intermediate_output_type.shape
-        assert len(intermediate_output_shape) == 3 # (batch, seqlen, num_experts)
+        assert len(intermediate_output_shape) == 3  # (batch, seqlen, num_experts)
         (batch, seqlen, num_experts) = intermediate_output_shape
 
         return ffi.ffi_lowering(FusedTopkWithScoreFunctionBwdPrimitive.name)(
             ctx,
             routing_map,
             intermediate_output,
             grad_probs,
-            num_tokens=batch*seqlen,
+            num_tokens=batch * seqlen,
             num_experts=num_experts,
             topk=topk,
             use_pre_softmax=use_pre_softmax,
@@ -307,7 +323,9 @@ def batcher(
         scaling_factor,
         score_function,
     ):
-        raise NotImplementedError("Batcher not implemented for FusedTopkWithScoreFunctionBwdPrimitive")
+        raise NotImplementedError(
+            "Batcher not implemented for FusedTopkWithScoreFunctionBwdPrimitive"
+        )
 
     @staticmethod
     def infer_sharding_from_operands(
@@ -411,6 +429,7 @@ def fused_topk_with_score_function_fwd(
         score_function=score_function,
     )
 
+
 def fused_topk_with_score_function_bwd(
     routing_map: jnp.ndarray,
     intermediate_output: jnp.ndarray,
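
A quick usage sketch for the reformatted map_score_function helper above. This is illustrative only: it assumes the module is importable as transformer_engine.jax.cpp_extensions.router (the file path shown in this diff); the mapping and the assertion message are taken directly from the function body.

from transformer_engine.jax.cpp_extensions.router import map_score_function

# Mapping comes straight from the function body: {"sigmoid": 0, "softmax": 1}.
assert map_score_function("sigmoid") == 0
assert map_score_function("softmax") == 1

# Any other name trips the reformatted assert with the message shown above.
try:
    map_score_function("relu")
except AssertionError as err:
    print(err)  # score_function must be 'sigmoid' or 'softmax', got relu

The reformatting only changes how the assert is wrapped; the accepted values and the returned integer codes are unchanged.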

transformer_engine/jax/csrc/extensions/router.cpp

Lines changed: 17 additions & 19 deletions
@@ -19,14 +19,13 @@ constexpr int kScoreFunctionSigmoid = 0;
 constexpr int kScoreFunctionSoftmax = 1;
 
 Error_Type FusedTopkWithScoreFunctionForwardFFI(
-    cudaStream_t stream, Buffer_Type logits_buf, Buffer_Type expert_bias_buf,
-    Result_Type probs_buf, Result_Type routing_map_buf, Result_Type intermediate_output_buf,
-    int64_t num_tokens, int64_t num_experts, int64_t topk, bool use_pre_softmax,
-    int64_t num_groups, int64_t group_topk, double scaling_factor, int64_t score_function) {
-
+    cudaStream_t stream, Buffer_Type logits_buf, Buffer_Type expert_bias_buf, Result_Type probs_buf,
+    Result_Type routing_map_buf, Result_Type intermediate_output_buf, int64_t num_tokens,
+    int64_t num_experts, int64_t topk, bool use_pre_softmax, int64_t num_groups, int64_t group_topk,
+    double scaling_factor, int64_t score_function) {
   auto logits_dtype = convert_ffi_datatype_to_te_dtype(logits_buf.element_type());
-  auto logits_shape = std::vector<size_t>{static_cast<size_t>(num_tokens),
-                                          static_cast<size_t>(num_experts)};
+  auto logits_shape =
+      std::vector<size_t>{static_cast<size_t>(num_tokens), static_cast<size_t>(num_experts)};
 
   auto *logits = logits_buf.untyped_data();
   auto logits_tensor = TensorWrapper(logits, logits_shape, logits_dtype);
@@ -47,28 +46,28 @@ Error_Type FusedTopkWithScoreFunctionForwardFFI(
   auto routing_map_tensor = TensorWrapper(routing_map, logits_shape, DType::kByte);
 
   auto *intermediate_output = intermediate_output_buf->untyped_data();
-  auto intermediate_output_tensor =
-      TensorWrapper(intermediate_output, logits_shape, logits_dtype);
+  auto intermediate_output_tensor = TensorWrapper(intermediate_output, logits_shape, logits_dtype);
 
   nvte_fused_topk_with_score_function_forward(
       logits_tensor.data(), static_cast<int>(num_tokens), static_cast<int>(num_experts),
       static_cast<int>(topk), static_cast<int>(use_pre_softmax), static_cast<int>(num_groups),
       static_cast<int>(group_topk), static_cast<float>(scaling_factor),
-      static_cast<int>(score_function),
-      expert_bias_tensor.data(), probs_tensor.data(),
+      static_cast<int>(score_function), expert_bias_tensor.data(), probs_tensor.data(),
       routing_map_tensor.data(), intermediate_output_tensor.data(), stream);
 
   return ffi_with_cuda_error_check();
 }
 
-Error_Type FusedTopkWithScoreFunctionBackwardFFI(
-    cudaStream_t stream, Buffer_Type routing_map_buf, Buffer_Type intermediate_output_buf,
-    Buffer_Type grad_probs_buf, Result_Type grad_logits_buf, int64_t num_tokens,
-    int64_t num_experts, int64_t topk, bool use_pre_softmax, double scaling_factor,
-    int64_t score_function) {
+Error_Type FusedTopkWithScoreFunctionBackwardFFI(cudaStream_t stream, Buffer_Type routing_map_buf,
+                                                 Buffer_Type intermediate_output_buf,
+                                                 Buffer_Type grad_probs_buf,
+                                                 Result_Type grad_logits_buf, int64_t num_tokens,
+                                                 int64_t num_experts, int64_t topk,
+                                                 bool use_pre_softmax, double scaling_factor,
+                                                 int64_t score_function) {
   auto grad_probs_dtype = convert_ffi_datatype_to_te_dtype(grad_probs_buf.element_type());
-  auto tensor_shape = std::vector<size_t>{static_cast<size_t>(num_tokens),
-                                          static_cast<size_t>(num_experts)};
+  auto tensor_shape =
+      std::vector<size_t>{static_cast<size_t>(num_tokens), static_cast<size_t>(num_experts)};
 
   auto *routing_map = routing_map_buf.untyped_data();
   auto routing_map_tensor = TensorWrapper(routing_map, tensor_shape, DType::kByte);
@@ -129,4 +128,3 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(FusedTopkWithScoreFunctionBackwardHandler,
 
 }  // namespace jax
 }  // namespace transformer_engine
-
transformer_engine/jax/router.py

Lines changed: 17 additions & 3 deletions
@@ -57,13 +57,27 @@ def _fused_topk_with_score_function(
     score_function,
 ):
     outputs, _ = _fused_topk_fwd_rule(
-        logits, expert_bias, topk, use_pre_softmax, num_groups, group_topk, scaling_factor, score_function
+        logits,
+        expert_bias,
+        topk,
+        use_pre_softmax,
+        num_groups,
+        group_topk,
+        scaling_factor,
+        score_function,
     )
     return outputs
 
 
 def _fused_topk_fwd_rule(
-    logits, expert_bias, topk, use_pre_softmax, num_groups, group_topk, scaling_factor, score_function
+    logits,
+    expert_bias,
+    topk,
+    use_pre_softmax,
+    num_groups,
+    group_topk,
+    scaling_factor,
+    score_function,
 ):
     probs, routing_map, intermediate_output = tex.fused_topk_with_score_function_fwd(
         logits,
@@ -84,7 +98,7 @@ def _fused_topk_bwd_rule(
     del num_groups, group_topk
     routing_map, intermediate_output = ctx
     grad_probs, _, _ = grads
-
+
     grad_logits = tex.fused_topk_with_score_function_bwd(
         routing_map,
         intermediate_output,
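
The _fused_topk_with_score_function / _fused_topk_fwd_rule / _fused_topk_bwd_rule trio reformatted above follows the usual jax.custom_vjp shape: the forward rule returns the outputs plus residuals (routing_map, intermediate_output), and the backward rule unpacks those residuals and the incoming grads. The sketch below is a minimal, self-contained analogue of that pattern, not the actual TransformerEngine wiring; the toy scaled_softmax function, its shapes, and the nondiff_argnums choice are assumptions for illustration only.

from functools import partial

import jax
import jax.numpy as jnp


@partial(jax.custom_vjp, nondiff_argnums=(1,))
def scaled_softmax(logits, scaling_factor):
    return jax.nn.softmax(logits, axis=-1) * scaling_factor


def scaled_softmax_fwd(logits, scaling_factor):
    probs = jax.nn.softmax(logits, axis=-1)
    return probs * scaling_factor, probs  # second element plays the role of ctx


def scaled_softmax_bwd(scaling_factor, probs, grad_out):
    # Softmax VJP with the static scaling folded in: p * (g - sum(g * p)).
    g = grad_out * scaling_factor
    grad_logits = probs * (g - jnp.sum(g * probs, axis=-1, keepdims=True))
    return (grad_logits,)


scaled_softmax.defvjp(scaled_softmax_fwd, scaled_softmax_bwd)

# (batch, seqlen, num_experts) logits, matching the shape asserted by the primitives above.
logits = jnp.ones((2, 4, 8))
probs, vjp_fn = jax.vjp(lambda x: scaled_softmax(x, 2.0), logits)
(grad_logits,) = vjp_fn(jnp.ones_like(probs))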
