Commit bf084c3

vkuzo authored and pruthvistony committed
add the torch.float8_e8m0fnu dtype to PyTorch (pytorch#147466)
Summary:

Continuing the work from pytorch#146427, this adds the `torch.float8_e8m0fnu` dtype to PyTorch, as detailed in pytorch#146414. Please see that issue for a detailed definition of the format.

Example of basic functionality:

```python
import torch

# round trip
x0 = torch.randn(4, 4, dtype=torch.float32)
x1 = x0.to(torch.float8_e8m0fnu)  # RNE rounding
x2 = x1.to(torch.float32)  # 2 ** exponent

# creation with empty
x0 = torch.empty(4, 4, dtype=torch.float8_e8m0fnu)

# printing
print(x0)
```

Done in this PR:
* numerical correctness
* op coverage (except for `torch._scaled_mm`): create tensor, cast to/from float32
* printing a tensor works

For future PRs:
* performance optimizations for casting
* `torch._scaled_mm`
* PT2
* various cleanups (detailed in comments with issue numbers)

Test Plan:

```
pytest test/quantization/core/experimental/test_float8.py -s
```

Pull Request resolved: pytorch#147466
Approved by: https://github.com/drisspg
1 parent ed8c660 commit bf084c3
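
Since e8m0fnu carries only an exponent (8 exponent bits, no mantissa, no sign), every representable value is a power of two, so the float32 → float8_e8m0fnu → float32 round trip in the example above snaps each input to a power of two via RNE rounding of the exponent. Below is a minimal sketch of that expected behavior, assuming a build that includes this commit; the bit-level check is illustrative and not part of this PR's test plan.

```python
import torch

# round-trip a few positive float32 values through the exponent-only format
x = torch.tensor([0.1, 0.75, 1.0, 3.0, 100.0], dtype=torch.float32)
x_rt = x.to(torch.float8_e8m0fnu).to(torch.float32)
print(x_rt)

# every surviving value should be a power of two: for a normal float32 that
# means all 23 mantissa bits are zero after the round trip
mantissa_bits = x_rt.view(torch.int32) & 0x007FFFFF
print(bool((mantissa_bits == 0).all()))  # expected: True
```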

25 files changed: +535, -44 lines

aten/src/ATen/DLConvertor.cpp (+2)

@@ -63,10 +63,12 @@ DLDataType getDLDataType(const Tensor& t) {
     case ScalarType::BFloat16:
       dtype.code = DLDataTypeCode::kDLBfloat;
       break;
+    // TODO(#146647): use macro here instead of spelling out each shell dtype
     case ScalarType::Float8_e5m2:
     case ScalarType::Float8_e5m2fnuz:
     case ScalarType::Float8_e4m3fn:
     case ScalarType::Float8_e4m3fnuz:
+    case ScalarType::Float8_e8m0fnu:
       TORCH_CHECK(false, "float8 types are not supported by dlpack");
       break;
     case ScalarType::QInt8:

aten/src/ATen/Dispatch_v2.h (+1, -1)

@@ -87,7 +87,7 @@
 
 #define AT_FLOAT8_TYPES \
   c10::kFloat8_e5m2, c10::kFloat8_e5m2fnuz, c10::kFloat8_e4m3fn, \
-      c10::kFloat8_e4m3fnuz
+      c10::kFloat8_e4m3fnuz, c10::kFloat8_e8m0fnu
 
 #define AT_INTEGRAL_TYPES \
   c10::kByte, c10::kChar, c10::kInt, c10::kLong, c10::kShort

aten/src/ATen/native/Copy.cpp (+2, -2)

@@ -59,8 +59,8 @@ bool copy_transpose_valid(const Tensor& self, const Tensor& src) {
 #if !defined(C10_MOBILE)
 #define _AT_DISPATCH_CP_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_V2( \
-      TYPE, NAME, AT_WRAP(__VA_ARGS__), kComplexHalf, kHalf, kBool, kBFloat16, kFloat8_e5m2, \
-      kFloat8_e4m3fn, kFloat8_e5m2fnuz, kFloat8_e4m3fnuz, AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
+      TYPE, NAME, AT_WRAP(__VA_ARGS__), kComplexHalf, kHalf, kBool, kBFloat16, \
+      AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
 #else
 #define _AT_DISPATCH_CP_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( \

aten/src/ATen/native/TensorCompare.cpp (+2, -1)

@@ -403,7 +403,8 @@ Tensor isinf(const Tensor &self) {
 
 Tensor isfinite(const Tensor& self) {
   // Note: Integral tensor values are always finite
-  if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/true)) {
+  if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/true) ||
+      self.scalar_type() == kFloat8_e8m0fnu) {
     return at::ones_like(self, at::kBool, at::MemoryFormat::Preserve);
   }
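
As a consequence of the isfinite() change above, e8m0fnu tensors report every element as finite, the same way integral tensors do. A hedged sketch of the user-visible behavior, assuming a build that includes this commit:

```python
import torch

# isfinite() short-circuits to an all-True bool tensor for float8_e8m0fnu,
# mirroring the integral-dtype fast path in the C++ change above
t = torch.rand(3, 3).to(torch.float8_e8m0fnu)
mask = torch.isfinite(t)
print(mask.dtype)        # torch.bool
print(bool(mask.all()))  # expected: True
```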

aten/src/ATen/native/cpu/CopyKernel.cpp (+4, -4)

@@ -204,12 +204,12 @@ static void reduced_float_copy_kernel(TensorIteratorBase &iter, bool requires_ne
 #define _AT_DISPATCH_ALL_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_V2(TYPE, NAME, AT_WRAP(__VA_ARGS__), \
       kComplexHalf, kHalf, kBool, \
-      kBFloat16, kFloat8_e5m2, kFloat8_e4m3fn, \
-      kFloat8_e5m2fnuz, kFloat8_e4m3fnuz, AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
+      kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), \
+      AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
 #define _AT_DISPATCH_ALL_TYPES_NO_CF(TYPE, NAME, ...) \
   AT_DISPATCH_V2(TYPE, NAME, AT_WRAP(__VA_ARGS__), \
-      kBool, kHalf, kBFloat16, kFloat8_e5m2, kFloat8_e4m3fn, \
-      kFloat8_e5m2fnuz, kFloat8_e4m3fnuz, AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
+      kBool, kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), \
+      AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES))
 #else
 #define _AT_DISPATCH_ALL_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( \

aten/src/ATen/native/cpu/FillKernel.cpp (+3)

@@ -51,6 +51,9 @@ void fill_kernel(TensorIterator& iter, const Scalar& value_scalar) {
     fill_non_native_type<at::Float8_e4m3fnuz>(iter, value_scalar);
   } else if (iter.dtype() == ScalarType::Float8_e5m2fnuz) {
     fill_non_native_type<at::Float8_e5m2fnuz>(iter, value_scalar);
+  } else if (iter.dtype() == ScalarType::Float8_e8m0fnu) {
+    // TODO(#146647): use macro here instead of spelling out each float8 dtype
+    fill_non_native_type<at::Float8_e8m0fnu>(iter, value_scalar);
   } else {
     AT_DISPATCH_V2(
         iter.dtype(), "fill_cpu", AT_WRAP([&]() {

aten/src/ATen/native/cpu/IndexKernel.cpp (+7, -1)

@@ -184,7 +184,13 @@ void index_put_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef
         }
       }),
       AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
-      AT_EXPAND(AT_FLOAT8_TYPES),
+      // AT_EXPAND(AT_FLOAT8_TYPES),
+      // TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
+      // should not be supported here, then reenable AT_FLOAT8_DTYPES
+      kFloat8_e4m3fn,
+      kFloat8_e5m2,
+      kFloat8_e4m3fnuz,
+      kFloat8_e5m2fnuz,
       kComplexHalf,
       kHalf,
       kBool,

aten/src/ATen/native/cuda/Copy.cu (+23, -1)

@@ -144,6 +144,28 @@ void float8_copy_kernel_cuda(TensorIteratorBase &iter) {
         gpu_kernel(iter, [] GPU_LAMBDA(Float8_e5m2fnuz x) { return x; });
         break;
     }
+  } else if (dtype == kFloat8_e8m0fnu) {
+    // TODO(#146647): clean this up, too much copy-pasta
+    switch (other_dtype) {
+      case kFloat:
+        gpu_kernel_nocast(iter, [] GPU_LAMBDA(float value) {
+          return Float8_e8m0fnu(value);
+        });
+        break;
+      case kHalf:
+        gpu_kernel_nocast(iter, [] GPU_LAMBDA(Half value) {
+          return Float8_e8m0fnu(value);
+        });
+        break;
+      case kBFloat16:
+        gpu_kernel_nocast(iter, [] GPU_LAMBDA(BFloat16 value) {
+          return Float8_e8m0fnu(value);
+        });
+        break;
+      default:
+        gpu_kernel(iter, [] GPU_LAMBDA(Float8_e8m0fnu x) { return x; });
+        break;
+    }
   } else {
     TORCH_CHECK(false, "This supposed ot be called only for Float8 types");
   }

@@ -157,7 +179,7 @@ void direct_copy_kernel_cuda(TensorIteratorBase &iter) {
     AT_DISPATCH_QINT_TYPES(dtype, "copy_", [&] {
       gpu_kernel(iter, [] GPU_LAMBDA(scalar_t x) { return x; });
     });
-  } else if (dtype == kFloat8_e5m2 || dtype == kFloat8_e4m3fn || dtype == kFloat8_e5m2fnuz || dtype == kFloat8_e4m3fnuz) {
+  } else if (isFloat8Type(dtype)) {
     float8_copy_kernel_cuda(iter);
   } else if (iter.dtype(1) == kFloat && (dtype == kBFloat16 || dtype == kHalf)) {
     if (dtype == kBFloat16) {
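
The branch added above gives float32, float16, and bfloat16 sources a dedicated no-cast path into Float8_e8m0fnu, with a generic fallback for everything else. A sketch of the casts this enables, assuming a CUDA device and a build that includes this commit:

```python
import torch

# exercise the new CUDA cast paths into float8_e8m0fnu
if torch.cuda.is_available():
    for src_dtype in (torch.float32, torch.float16, torch.bfloat16):
        x = torch.rand(8, device="cuda", dtype=src_dtype)
        y = x.to(torch.float8_e8m0fnu)  # handled by the gpu_kernel_nocast branches above
        print(src_dtype, y.to(torch.float32))  # cast back to float32 for readable output
```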

aten/src/ATen/native/cuda/Indexing.cu (+35, -5)

@@ -712,7 +712,13 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
       C10_CUDA_KERNEL_LAUNCH_CHECK();
     }),
     AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
-    AT_EXPAND(AT_FLOAT8_TYPES),
+    // AT_EXPAND(AT_FLOAT8_TYPES),
+    // TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
+    // should not be supported here, then reenable AT_FLOAT8_DTYPES
+    kFloat8_e4m3fn,
+    kFloat8_e5m2,
+    kFloat8_e4m3fnuz,
+    kFloat8_e5m2fnuz,
     kComplexHalf,
     kHalf,
     kBool,

@@ -738,7 +744,13 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
       C10_CUDA_KERNEL_LAUNCH_CHECK();
     }),
     AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
-    AT_EXPAND(AT_FLOAT8_TYPES),
+    // AT_EXPAND(AT_FLOAT8_TYPES),
+    // TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
+    // should not be supported here, then reenable AT_FLOAT8_DTYPES
+    kFloat8_e4m3fn,
+    kFloat8_e5m2,
+    kFloat8_e4m3fnuz,
+    kFloat8_e5m2fnuz,
     kComplexHalf,
     kHalf,
     kBool,

@@ -762,7 +774,13 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
      C10_CUDA_KERNEL_LAUNCH_CHECK();
    }),
    AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
-    AT_EXPAND(AT_FLOAT8_TYPES),
+    // AT_EXPAND(AT_FLOAT8_TYPES),
+    // TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
+    // should not be supported here, then reenable AT_FLOAT8_DTYPES
+    kFloat8_e4m3fn,
+    kFloat8_e5m2,
+    kFloat8_e4m3fnuz,
+    kFloat8_e5m2fnuz,
     kComplexHalf,
     kHalf,
     kBool,

@@ -784,7 +802,13 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
      C10_CUDA_KERNEL_LAUNCH_CHECK();
    }),
    AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
-    AT_EXPAND(AT_FLOAT8_TYPES),
+    // AT_EXPAND(AT_FLOAT8_TYPES),
+    // TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
+    // should not be supported here, then reenable AT_FLOAT8_DTYPES
+    kFloat8_e4m3fn,
+    kFloat8_e5m2,
+    kFloat8_e4m3fnuz,
+    kFloat8_e5m2fnuz,
     kComplexHalf,
     kHalf,
     kBool,

@@ -809,7 +833,13 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
      C10_CUDA_KERNEL_LAUNCH_CHECK();
    }),
    AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
-    AT_EXPAND(AT_FLOAT8_TYPES),
+    // AT_EXPAND(AT_FLOAT8_TYPES),
+    // TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True
+    // should not be supported here, then reenable AT_FLOAT8_DTYPES
+    kFloat8_e4m3fn,
+    kFloat8_e5m2,
+    kFloat8_e4m3fnuz,
+    kFloat8_e5m2fnuz,
     kComplexHalf,
     kHalf,
     kBool,

aten/src/ATen/native/cuda/jit_utils.h (+4)

@@ -225,6 +225,10 @@ template <> inline std::string typeName<at::Float8_e5m2fnuz>() {
 template <> inline std::string typeName<at::Float8_e4m3fnuz>() {
   return "at::Float8_e4m3fnuz";
 }
+template <> inline std::string typeName<at::Float8_e8m0fnu>() {
+  // TODO(#146647): Can the code here be made generic for any scalartype?
+  return "at::Float8_e8m0fnu";
+}
 
 #define TYPE_NAME_CASE(ctype, scalartype) \
   case ScalarType::scalartype: return typeName<ctype>();

c10/core/Scalar.h (+2, -9)

@@ -49,16 +49,9 @@ class C10_API Scalar {
 #define DEFINE_IMPLICIT_CTOR(type, name) \
   Scalar(type vv) : Scalar(vv, true) {}
 
-  AT_FORALL_SCALAR_TYPES_AND7(
-      Half,
-      BFloat16,
-      Float8_e5m2,
-      Float8_e4m3fn,
-      Float8_e5m2fnuz,
-      Float8_e4m3fnuz,
-      ComplexHalf,
-      DEFINE_IMPLICIT_CTOR)
+  AT_FORALL_SCALAR_TYPES_AND3(Half, BFloat16, ComplexHalf, DEFINE_IMPLICIT_CTOR)
   AT_FORALL_COMPLEX_TYPES(DEFINE_IMPLICIT_CTOR)
+  AT_FORALL_FLOAT8_TYPES(DEFINE_IMPLICIT_CTOR)
 
   // Helper constructors to allow Scalar creation from long and long long types
   // As std::is_same_v<long, long long> is false(except Android), one needs to

c10/core/ScalarType.cpp (+3)

@@ -222,6 +222,9 @@ std::pair<std::string, std::string> getDtypeNames(c10::ScalarType scalarType) {
       return std::make_pair("float8_e5m2fnuz", "");
     case c10::ScalarType::Float8_e4m3fnuz:
       return std::make_pair("float8_e4m3fnuz", "");
+    case c10::ScalarType::Float8_e8m0fnu:
+      // TODO(#146647): macroify all of this
+      return std::make_pair("float8_e8m0fnu", "");
     default:
       throw std::runtime_error("Unimplemented scalar type");
   }

c10/core/ScalarType.h (+19, -3)

@@ -7,6 +7,7 @@
 #include <c10/util/Float8_e4m3fnuz.h>
 #include <c10/util/Float8_e5m2.h>
 #include <c10/util/Float8_e5m2fnuz.h>
+#include <c10/util/Float8_e8m0fnu.h>
 #include <c10/util/Half.h>
 #include <c10/util/bits.h>
 #include <c10/util/complex.h>

@@ -102,7 +103,8 @@ struct dummy_int1_7_t {};
   _(c10::dummy_int1_7_t<4>, Int4) /* 40 */ \
   _(c10::dummy_int1_7_t<5>, Int5) /* 41 */ \
   _(c10::dummy_int1_7_t<6>, Int6) /* 42 */ \
-  _(c10::dummy_int1_7_t<7>, Int7) /* 43 */
+  _(c10::dummy_int1_7_t<7>, Int7) /* 43 */ \
+  _(c10::Float8_e8m0fnu, Float8_e8m0fnu) /* 44 */
 
 // If you want to support ComplexHalf for real, add ComplexHalf
 // into this macro (and change the name). But beware: convert()

@@ -146,7 +148,8 @@ struct dummy_int1_7_t {};
   _(at::Float8_e5m2, Float8_e5m2) \
   _(at::Float8_e4m3fn, Float8_e4m3fn) \
   _(at::Float8_e5m2fnuz, Float8_e5m2fnuz) \
-  _(at::Float8_e4m3fnuz, Float8_e4m3fnuz)
+  _(at::Float8_e4m3fnuz, Float8_e4m3fnuz) \
+  _(at::Float8_e8m0fnu, Float8_e8m0fnu)
 
 enum class ScalarType : int8_t {
 #define DEFINE_ST_ENUM_VAL_(_1, n) n,

@@ -317,6 +320,13 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_CppTypeToScalarType)
   _(c10::quint4x2, QUInt4x2) \
   _(c10::quint2x4, QUInt2x4)
 
+#define AT_FORALL_FLOAT8_TYPES(_) \
+  _(at::Float8_e5m2, Float8_e5m2) \
+  _(at::Float8_e4m3fn, Float8_e4m3fn) \
+  _(at::Float8_e5m2fnuz, Float8_e5m2fnuz) \
+  _(at::Float8_e4m3fnuz, Float8_e4m3fnuz) \
+  _(at::Float8_e8m0fnu, Float8_e8m0fnu)
+
 #define AT_FORALL_COMPLEX_TYPES(_) \
   _(c10::complex<float>, ComplexFloat) \
   _(c10::complex<double>, ComplexDouble)

@@ -372,7 +382,8 @@ inline bool isIntegralType(ScalarType t) {
 
 inline bool isFloat8Type(ScalarType t) {
   return t == ScalarType::Float8_e5m2 || t == ScalarType::Float8_e5m2fnuz ||
-      t == ScalarType::Float8_e4m3fn || t == ScalarType::Float8_e4m3fnuz;
+      t == ScalarType::Float8_e4m3fn || t == ScalarType::Float8_e4m3fnuz ||
+      t == ScalarType::Float8_e8m0fnu;
 }
 
 inline bool isReducedFloatingType(ScalarType t) {

@@ -446,6 +457,10 @@ inline bool isSignedType(ScalarType t) {
   return std::numeric_limits< \
       ::c10::impl::ScalarTypeToCPPTypeT<ScalarType::name>>::is_signed;
 
+  // TODO(#146647): If we expect to have numeric_limits for everything,
+  // let's just have a big macro for the whole thing.
+  // If we're hardcoding it, let's just use the macro and a "true"/"false"
+  // below?
   switch (t) {
     case ScalarType::QInt8:
     case ScalarType::QUInt8:

@@ -467,6 +482,7 @@ inline bool isSignedType(ScalarType t) {
     CASE_ISSIGNED(Float8_e5m2fnuz);
     CASE_ISSIGNED(Float8_e4m3fn);
     CASE_ISSIGNED(Float8_e4m3fnuz);
+    CASE_ISSIGNED(Float8_e8m0fnu);
     CASE_ISSIGNED(Byte);
     CASE_ISSIGNED(Char);
     CASE_ISSIGNED(Short);
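
Taken together, the ScalarType.h changes register the new enum value, fold it into the float8 helper macros (AT_FORALL_FLOAT8_TYPES, isFloat8Type), and route isSignedType through numeric_limits. At the Python level this surfaces as ordinary dtype introspection; the sketch below reflects my expectation for an unsigned, exponent-only format and is not asserted by this diff:

```python
import torch

dt = torch.float8_e8m0fnu
print(dt)                    # torch.float8_e8m0fnu
print(dt.itemsize)           # 1 (one byte per element)
print(dt.is_floating_point)  # expected: True
print(dt.is_signed)          # expected: False -- the "u" marks the format as unsigned
```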
