
Commit 5dc9128

drisspg authored and pytorchmergebot committed
FP8 rowwise scaling (pytorch#125204)
# Summary

This pull request introduces an fp8 row-scaling kernel as an optional implementation for `scaled_mm`. The kernel selection is based on the scaling tensors of the inputs. For inputs `x` and `y` of shape `[M, K]` and `[K, N]` respectively, the following conditions must be met:

- `x`'s scale should be a 1-dimensional tensor of length `M`.
- `y`'s scale should be a 1-dimensional tensor of length `N`.

It's important to note that this kernel is not called "rowwise, columnwise" scaling because, although the scales for `y` are semantically along its columns, this implementation only supports the TN format. This means the scaling is along the faster-moving dimension, i.e. the "row".

The following two PRs were required to enable local builds:

- [PR pytorch#126185](pytorch#126185)
- [PR pytorch#125523](pytorch#125523)

### Todo

We still do not build our Python wheels with this architecture. @ptrblck @malfet, should we replace `sm_90` with `sm_90a`?

The NVRTC TMA shadowing feels wrong, but I am not sure of the right way to spoof the symbol for this compilation unit: https://github.com/pytorch/pytorch/pull/125204/files#r1586986954

#### ifdef

I tried to use `#if !defined(USE_ROCM) && defined(CUDA_VERSION) && CUDA_VERSION >= 12000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 900` to gate the building of the kernel, but I was having a hell of a time with it, so I am not really sure of the right way to do this.

Kernel Credit: @jwfromm

Pull Request resolved: pytorch#125204
Approved by: https://github.com/lw, https://github.com/malfet
1 parent 4f9fcd7 commit 5dc9128
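For context, here is a minimal usage sketch of the rowwise path described above. This is illustrative only: it assumes an sm_90a (H100) build with this kernel compiled in, it goes through the private `torch._scaled_mm` entry point, the shapes are made up, and keyword names and the `(out, amax)` return tuple reflect the op's signature at the time of this commit and may differ in other versions.

```python
import torch

# Hypothetical sizes; mat1 is [M, K] row-major, mat2 is [K, N] column-major (TN layout).
M, K, N = 64, 128, 256
x = torch.randn(M, K, device="cuda").to(torch.float8_e4m3fn)
w = torch.randn(N, K, device="cuda").to(torch.float8_e4m3fn)

# Rowwise scales: fp32, 1-dimensional, length M for x and length N for w, both contiguous.
scale_x = torch.rand(M, device="cuda", dtype=torch.float32)
scale_w = torch.rand(N, device="cuda", dtype=torch.float32)

# w.t() is a column-major [K, N] view; bf16 output is required for the rowwise kernel.
out, amax = torch._scaled_mm(
    x,
    w.t(),
    scale_a=scale_x,
    scale_b=scale_w,
    out_dtype=torch.bfloat16,
    use_fast_accum=True,
)
```

If the scales are instead single-element tensors, the same call falls through to the existing per-tensor cuBLASLt path.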

File tree

8 files changed: +855, -25 lines


aten/src/ATen/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -473,6 +473,7 @@ endif()
 
 if(USE_CUDA AND NOT USE_ROCM)
   list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/include)
+  list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/tools/util/include)
   if($ENV{ATEN_STATIC_CUDA})
     list(APPEND ATen_CUDA_DEPENDENCY_LIBS
       ${CUDA_LIBRARIES}

aten/src/ATen/cuda/detail/LazyNVRTC.cpp

Lines changed: 37 additions & 0 deletions
@@ -170,6 +170,43 @@ CUDA_STUB3(cuLinkComplete, CUlinkState, void **, size_t *);
 CUDA_STUB3(cuFuncSetAttribute, CUfunction, CUfunction_attribute, int);
 CUDA_STUB3(cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction);
 
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 12000
+CUresult CUDAAPI
+cuTensorMapEncodeTiled(
+    CUtensorMap* tensorMap,
+    CUtensorMapDataType tensorDataType,
+    cuuint32_t tensorRank,
+    void* globalAddress,
+    const cuuint64_t* globalDim,
+    const cuuint64_t* globalStrides,
+    const cuuint32_t* boxDim,
+    const cuuint32_t* elementStrides,
+    CUtensorMapInterleave interleave,
+    CUtensorMapSwizzle swizzle,
+    CUtensorMapL2promotion l2Promotion,
+    CUtensorMapFloatOOBfill oobFill) {
+  auto fn = reinterpret_cast<decltype(&cuTensorMapEncodeTiled)>(
+      getCUDALibrary().sym(__func__));
+  if (!fn)
+    throw std::runtime_error("Can't get cuTensorMapEncodeTiled");
+  lazyNVRTC.cuTensorMapEncodeTiled = fn;
+  return fn(
+      tensorMap,
+      tensorDataType,
+      tensorRank,
+      globalAddress,
+      globalDim,
+      globalStrides,
+      boxDim,
+      elementStrides,
+      interleave,
+      swizzle,
+      l2Promotion,
+      oobFill);
+}
+
+#endif
+
 // Irregularly shaped functions
 CUresult CUDAAPI cuLaunchKernel(CUfunction f,
                                 unsigned int gridDimX,

aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h

Lines changed: 12 additions & 3 deletions
@@ -59,16 +59,25 @@ namespace at { namespace cuda {
   _(cuLinkAddData) \
   _(cuLinkComplete) \
   _(cuFuncSetAttribute) \
-  _(cuFuncGetAttribute)
+  _(cuFuncGetAttribute) \
+
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 12000
+#define AT_FORALL_NVRTC_EXTENDED(_) \
+  AT_FORALL_NVRTC_BASE(_) \
+  _(cuTensorMapEncodeTiled)
+#else
+#define AT_FORALL_NVRTC_EXTENDED(_) \
+  AT_FORALL_NVRTC_BASE(_)
+#endif
 
 #if defined(CUDA_VERSION) && CUDA_VERSION >= 11010
 #define AT_FORALL_NVRTC(_) \
-  AT_FORALL_NVRTC_BASE(_) \
+  AT_FORALL_NVRTC_EXTENDED(_) \
   _(nvrtcGetCUBINSize) \
   _(nvrtcGetCUBIN)
 #else
 #define AT_FORALL_NVRTC(_) \
-  AT_FORALL_NVRTC_BASE(_)
+  AT_FORALL_NVRTC_EXTENDED(_)
 #endif
 
 #else

aten/src/ATen/native/cuda/Blas.cpp

Lines changed: 103 additions & 10 deletions
@@ -1,3 +1,7 @@
+#include <cstdint>
+#include <c10/util/Exception.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/ScalarType.h>
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/core/Tensor.h>
 #include <ATen/core/NamedTensor.h>
@@ -10,6 +14,7 @@
 #include <ATen/cuda/tunable/TunableGemm.h>
 #include <ATen/native/Resize.h>
 #include <c10/util/MaybeOwned.h>
+#include <ATen/native/cuda/RowwiseScaledMM.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Functions.h>
@@ -819,24 +824,97 @@ static bool _scaled_mm_allowed_device() {
 #endif
 }
 
+namespace {
+
+enum class ScalingType {
+  TensorWise,
+  RowWise,
+  Error
+};
+
+// Validates the scale tensors passed to scaled_mm
+// and returns the type of scaling, i.e. which kernel to use
+ScalingType get_scaling_type(
+    const c10::optional<at::Tensor>& scale_a,
+    const c10::optional<at::Tensor>& scale_b,
+    int64_t dim_m,
+    int64_t dim_n) {
+  TORCH_CHECK(
+      scale_a.has_value() == scale_b.has_value(),
+      "Both scale_a and scale_b must be present or absent.");
+
+  if (scale_a.has_value()) {
+    // Both per-tensor and row-wise scaling expect fp32 tensors
+    TORCH_CHECK(
+        scale_a->scalar_type() == kFloat && scale_b->scalar_type() == kFloat,
+        "Both scale_a and scale_b must be float (fp32) tensors.");
+
+    // Check the singular scale case for per-tensor scaling
+    if (scale_a->numel() == 1 && scale_b->numel() == 1) {
+      return ScalingType::TensorWise;
+    } else if (scale_a->dim() == 1 && scale_a->size(0) == dim_m) {
+      // Check the per-row scaling case
+#if !defined(USE_ROCM) && !defined(_MSC_VER) || \
+    (defined(USE_ROCM) && ROCM_VERSION >= 60000)
+      TORCH_CHECK(
+          scale_a->dim() == 1 && scale_b->dim() == 1,
+          "Both scale_a and scale_b must be 1-dimensional tensors");
+      TORCH_CHECK(
+          scale_b->size(0) == dim_n,
+          "For row-wise scaling, scale_b must have size ",
+          dim_n,
+          " but got ",
+          scale_b->size(0),
+          ".");
+      TORCH_CHECK(
+          scale_a->is_contiguous() && scale_b->is_contiguous(),
+          "Both scale_a and scale_b must be contiguous.");
+      return ScalingType::RowWise;
+#else
+      TORCH_CHECK(false, "Per-row scaling is not supported for this platform!");
+      return ScalingType::Error;
+#endif // !defined(USE_ROCM) && !defined(_MSC_VER) || (defined(USE_ROCM) &&
+       // ROCM_VERSION >= 60000)
+    } else {
+      TORCH_CHECK(
+          false,
+          "For row-wise scaling, scale_a must be size ",
+          dim_m,
+          " but got ",
+          scale_a->numel(),
+          " and scale_b must be size ",
+          dim_n,
+          " but got ",
+          scale_b->numel(),
+          ".");
+      // Unreachable
+      return ScalingType::RowWise;
+    }
+  }
+  return ScalingType::Error;
+}
+
+} // namespace
+
 // Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax
 // Scales are only applicable when matrices are of Float8 type and assumed to be equal to 1.0 by default.
 // If output matrix type is 16 or 32-bit type, neither scale_result is applied nor amax is computed.
 // Known limitations:
 //  - Only works if mat1 is row-major and mat2 is column-major
 //  - Only works if matrices sizes are divisible by 32
-//
+//  - If 1-dimensional tensors are used then scale_a should be size = mat1.size(0)
+//    and scale_b should have size equal to mat2.size(1)
 // Arguments:
 //  - `mat1`: the first operand of the matrix multiply, can be type `torch.float8_e4m3fn` or `torch.float8_e5m2`
 //  - `mat2`: the second operand of the matrix multiply, can be type `torch.float8_e4m3fn` or `torch.float8_e5m2`
 //  - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16`
 //  - `out_dtype`: the output dtype, can either be a float8 or a higher precision floating point type
-//  - `scale_a`: a scalar tensor with the inverse scale of `mat1`, only needed if `mat1` is a float8 type
-//  - `scale_b`: a scalar tensor with the inverse scale of `mat2`, only needed if `mat2` is a float8 type
-//  - `scale_result`: a scalar tensor with the scale of the output, only set if the output is a float8 type
+//  - `scale_a`: a scalar or 1-dimensional tensor with the inverse scale of `mat1`, only needed if `mat1` is a float8 type
+//  - `scale_b`: a scalar or 1-dimensional tensor with the inverse scale of `mat2`, only needed if `mat2` is a float8 type
+//  - `scale_result`: a scalar tensor with the scale of the output, only utilized if the output is a float8 type
 //  - `use_fast_accum`: if true, enables fast float8 accumulation
 //  - `out`: a reference to the output tensor
-//  - `amax`: a reference to the amax tensor of the output, only needed if the output is a float8 type and will be updated inplace
+//  - `amax`: a reference to the amax tensor of the output, only mutated if the output is a float8 type and will be updated inplace
 
 std::tuple<Tensor&, Tensor&>
 _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
@@ -855,10 +933,11 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
   TORCH_CHECK(
       mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
       mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
-  TORCH_CHECK(!scale_a || (scale_a->numel() == 1 && scale_a->scalar_type() == kFloat),
-       "scale_a must be float scalar");
-  TORCH_CHECK(!scale_b || (scale_b->numel() == 1 && scale_b->scalar_type() == kFloat),
-       "scale_b must be a float scalar");
+
+  // Check what type of scaling we are doing based on inputs
+  ScalingType scaling_choice = get_scaling_type(scale_a, scale_b, mat1.size(0), mat2.size(1));
+  TORCH_INTERNAL_ASSERT(scaling_choice != ScalingType::Error, "Scaling type not supported");
+
   TORCH_CHECK(!scale_result || (scale_result->numel() == 1 && scale_result->scalar_type() == kFloat),
        "scale_result must be a float scalar");
   TORCH_CHECK(!bias || bias->numel() == mat2.sizes()[1], "Bias must be size ", mat2.sizes()[1],
@@ -901,12 +980,26 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
         {scale_result_, "scale_result", 7}};
     checkAllSameGPU(__func__, targs);
   }
-
+  // Validation checks have passed, so let's resize the output to the actual size
   IntArrayRef mat1_sizes = mat1.sizes();
   IntArrayRef mat2_sizes = mat2.sizes();
   at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]});
   at::native::resize_output(amax, {});
 
+  // We are doing row-wise scaling
+  if (scaling_choice == ScalingType::RowWise) {
+    TORCH_CHECK(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling.");
+    at::cuda::detail::f8f8bf16_rowwise(
+        mat1,
+        mat2,
+        scale_a.value(),
+        scale_b.value(),
+        bias,
+        use_fast_accum,
+        out);
+    return {out, amax};
+  }
+
   cublasCommonArgs args(mat1, mat2, out);
   const auto out_dtype_ = args.result->scalar_type();
   TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt");
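To summarize the dispatch that `get_scaling_type` implements in the diff above, here is a rough Python restatement, illustrative only: the platform `#ifdef` gating and exact error messages are omitted, and the enum names are chosen for readability rather than taken from the PR.

```python
from enum import Enum

import torch


class ScalingType(Enum):
    TENSOR_WISE = "tensorwise"
    ROW_WISE = "rowwise"


def get_scaling_type(scale_a: torch.Tensor, scale_b: torch.Tensor,
                     dim_m: int, dim_n: int) -> ScalingType:
    # Both per-tensor and row-wise scaling expect fp32 scale tensors.
    assert scale_a.dtype == torch.float32 and scale_b.dtype == torch.float32

    # A single scalar per operand selects the existing per-tensor cuBLASLt path.
    if scale_a.numel() == 1 and scale_b.numel() == 1:
        return ScalingType.TENSOR_WISE

    # Otherwise scale_a must be a contiguous 1-D tensor of length M and
    # scale_b a contiguous 1-D tensor of length N, selecting the rowwise kernel.
    if (scale_a.dim() == 1 and scale_a.size(0) == dim_m
            and scale_b.dim() == 1 and scale_b.size(0) == dim_n
            and scale_a.is_contiguous() and scale_b.is_contiguous()):
        return ScalingType.ROW_WISE

    raise ValueError("Invalid scale shapes for scaled_mm")
```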
