
Commit 20ad852

petrex authored and jeffdaily committed
[ROCm] OCP FP8 Support for new GPUs (pytorch#146632)
TL;DR: Follow-up to, and built on top of, pytorch#144476. Adds OCP FP8 support for gfx950; see pytorch/ao#1677.

This pull request improves compatibility and support for new GPU architectures and data types, particularly for ROCm. The key updates add support for new ROCm versions and GPU architectures, update data type handling, and remove outdated checks.

* [`aten/src/ATen/Context.cpp`](diffhunk://#diff-33de472d304acbe57d693c8567370c638068bedc1aa0ce8e9dc115dad05a7810L323-R326): Added support for the new GPU architectures `gfx1200`, `gfx1201`, and `gfx950`, gated on ROCm version checks.
* [`aten/src/ATen/native/cuda/Blas.cpp`](diffhunk://#diff-e8a569efee1e650172f120a0fdcda024fe3e4703a4ee3336425c8f685af6b3abL196-R199): Updated the supported-architecture lists in multiple functions to include `gfx1200`, `gfx1201`, and `gfx950`, gated on ROCm version checks. [[1]](diffhunk://#diff-e8a569efee1e650172f120a0fdcda024fe3e4703a4ee3336425c8f685af6b3abL196-R199) [[2]](diffhunk://#diff-e8a569efee1e650172f120a0fdcda024fe3e4703a4ee3336425c8f685af6b3abL865-R876)
* [`aten/src/ATen/cuda/CUDADataType.h`](diffhunk://#diff-9188bb13b1a49f459141f5f9b875593d1c5ce2beb5ad711fdbaf5bc7089ec015L81-L98): Extended the data type conversion to cover the new float8 types in both CUDA and ROCm builds.
* [`aten/src/ATen/cuda/tunable/GemmHipblaslt.h`](diffhunk://#diff-bfa1a3b5d4bef1892bf50338775f3b0fd8cd31fc1868148f3968b98aefb68e3fL29-R80): Updated the `HipDataTypeFor` template to handle the new float8 types, with hard-coded enum values for ROCm versions prior to 6.3.
* [`cmake/public/LoadHIP.cmake`](diffhunk://#diff-b98e27b9a5f196a6965a99ee5a7bb15b3fc633d6375b767635b1b04ccb2fd3d5L169-L197): Removed the check for `HIP_NEW_TYPE_ENUMS`, which is no longer necessary with the updated ROCm versions. [[1]](diffhunk://#diff-b98e27b9a5f196a6965a99ee5a7bb15b3fc633d6375b767635b1b04ccb2fd3d5L169-L197) [[2]](diffhunk://#diff-b98e27b9a5f196a6965a99ee5a7bb15b3fc633d6375b767635b1b04ccb2fd3d5L211-R182)

These changes ensure better compatibility and performance on newer hardware and software environments, particularly for users leveraging ROCm and CUDA for deep learning and scientific computing.

Pull Request resolved: pytorch#146632
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <[email protected]>
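For context only, the sketch below shows roughly how the OCP FP8 path touched by this commit would be exercised from Python on a ROCm GPU that supports it (for example gfx942, or gfx950/gfx120x on new enough ROCm). It is a minimal, hedged illustration and not part of the commit: `torch._scaled_mm` is a private API whose signature has shifted across releases, and the shapes, scales, and dtypes here are arbitrary placeholders.

```python
# Minimal sketch (not part of this commit): run a scaled GEMM with OCP FP8
# inputs on a ROCm device that supports it. Shapes and scales are arbitrary.
import torch

if torch.cuda.is_available() and torch.version.hip:
    device = "cuda"
    x = torch.randn(32, 64, device=device).to(torch.float8_e4m3fn)   # OCP FP8 activations
    w = torch.randn(16, 64, device=device).to(torch.float8_e4m3fn)   # OCP FP8 weights
    scale_x = torch.tensor(1.0, device=device)                       # per-tensor scales
    scale_w = torch.tensor(1.0, device=device)
    # _scaled_mm expects the second operand in column-major layout, hence the transpose.
    out = torch._scaled_mm(x, w.t(), scale_a=scale_x, scale_b=scale_w,
                           out_dtype=torch.bfloat16)
    print(out.shape)  # torch.Size([32, 16])
```

Whether this runs depends on the device architecture and ROCm version checks added in `Blas.cpp` (`_scaled_mm_allowed_device`).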
1 parent bf084c3 commit 20ad852

File tree

12 files changed: +114 −53 lines changed


aten/src/ATen/Context.cpp

+5-2
@@ -318,9 +318,12 @@ at::BlasBackend Context::blasPreferredBackend() {
   if (blas_preferred_backend == at::BlasBackend::Cublaslt) {
     static const bool hipblaslt_unsupported = []() {
       static const std::vector<std::string> archs = {
-          "gfx90a", "gfx940", "gfx941", "gfx942",
+          "gfx90a", "gfx942"
 #if ROCM_VERSION >= 60300
-          "gfx1100", "gfx1101"
+          , "gfx1100", "gfx1101", "gfx1200", "gfx1201"
+#endif
+#if ROCM_VERSION >= 60500
+          , "gfx950"
 #endif
       };
       for (auto index: c10::irange(getNumGPUs())) {

aten/src/ATen/cuda/CUDADataType.h

+1-8
@@ -78,24 +78,17 @@ inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type)
       return CUDA_R_64I;
     case c10::ScalarType::BFloat16:
       return CUDA_R_16BF;
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080
+#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11080) || (defined(USE_ROCM) && ROCM_VERSION >= 60300)
     case c10::ScalarType::Float8_e4m3fn:
       return CUDA_R_8F_E4M3;
     case c10::ScalarType::Float8_e5m2:
       return CUDA_R_8F_E5M2;
 #endif
 #if defined(USE_ROCM)
-#if defined(HIP_NEW_TYPE_ENUMS)
     case c10::ScalarType::Float8_e4m3fnuz:
       return HIP_R_8F_E4M3_FNUZ;
     case c10::ScalarType::Float8_e5m2fnuz:
       return HIP_R_8F_E5M2_FNUZ;
-#else
-    case c10::ScalarType::Float8_e4m3fnuz:
-      return static_cast<hipDataType>(1000);
-    case c10::ScalarType::Float8_e5m2fnuz:
-      return static_cast<hipDataType>(1001);
-#endif
 #endif
     default:
       TORCH_INTERNAL_ASSERT(false, "Cannot convert ScalarType ", scalar_type, " to cudaDataType.")

aten/src/ATen/cuda/tunable/GemmHipblaslt.h

+34-7
@@ -26,38 +26,65 @@
 namespace at::cuda::tunable {
 
 template <typename T>
-constexpr hipblasDatatype_t HipDataTypeFor();
+constexpr hipDataType HipDataTypeFor();
 
 template <>
-constexpr hipblasDatatype_t HipDataTypeFor<float>() {
+constexpr hipDataType HipDataTypeFor<float>() {
   return HIP_R_32F;
 }
 
 template <>
-constexpr hipblasDatatype_t HipDataTypeFor<Half>() {
+constexpr hipDataType HipDataTypeFor<Half>() {
   return HIP_R_16F;
 }
 
 template <>
-constexpr hipblasDatatype_t HipDataTypeFor<BFloat16>() {
+constexpr hipDataType HipDataTypeFor<BFloat16>() {
   return HIP_R_16BF;
 }
 
 template <>
-constexpr hipblasDatatype_t HipDataTypeFor<double>() {
+constexpr hipDataType HipDataTypeFor<double>() {
   return HIP_R_64F;
 }
 
 template <>
-constexpr hipblasDatatype_t HipDataTypeFor<c10::Float8_e4m3fnuz>() {
+constexpr hipDataType HipDataTypeFor<c10::Float8_e4m3fnuz>() {
   return HIP_R_8F_E4M3_FNUZ;
 }
 
 template <>
-constexpr hipblasDatatype_t HipDataTypeFor<c10::Float8_e5m2fnuz>() {
+constexpr hipDataType HipDataTypeFor<c10::Float8_e5m2fnuz>() {
   return HIP_R_8F_E5M2_FNUZ;
 }
 
+// This code is instantiated regardless of ROCm version.
+// Prior to ROCm 6.3, we hard-code the known enum values.
+template <>
+constexpr hipDataType HipDataTypeFor<c10::Float8_e4m3fn>() {
+#if ROCM_VERSION >= 60300
+  return HIP_R_8F_E4M3;
+#else
+  return static_cast<hipDataType>(28);
+#endif
+}
+
+template <>
+constexpr hipDataType HipDataTypeFor<c10::Float8_e5m2>() {
+#if ROCM_VERSION >= 60300
+  return HIP_R_8F_E5M2;
+#else
+  return static_cast<hipDataType>(29);
+#endif
+}
+
+// This type is not intended for matrix types but rather a scale factor.
+// Return a dummy value to satisfy linker.
+template <>
+constexpr hipDataType HipDataTypeFor<c10::Float8_e8m0fnu>() {
+  return static_cast<hipDataType>(500);
+}
+
 template <typename T>
 int GetBatchFromParams(const GemmParams<T>* params) {
   return 1;

aten/src/ATen/cuda/tunable/TunableGemm.h

+6
@@ -21,6 +21,7 @@
 #include <c10/util/Float8_e4m3fnuz.h>
 #include <c10/util/Float8_e5m2.h>
 #include <c10/util/Float8_e5m2fnuz.h>
+#include <c10/util/Float8_e8m0fnu.h>
 #include <c10/util/StringUtil.h>
 #include <fmt/printf.h>
 
@@ -181,6 +182,11 @@ inline const char* TypeName(Float8_e5m2fnuz v) {
   return "Float8_e5m2fnuz";
 }
 
+template <>
+inline const char* TypeName(Float8_e8m0fnu v) {
+  return "Float8_e8m0fnu";
+}
+
 template <>
 inline const char* TypeName(c10::complex<double> v) {
   return "c10::complex<double>";

aten/src/ATen/native/cuda/Blas.cpp

+43-4
@@ -191,9 +191,12 @@ static bool isSupportedHipLtROCmArch(int index) {
   hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index);
   std::string device_arch = prop->gcnArchName;
   static const std::vector<std::string> archs = {
-      "gfx90a", "gfx940", "gfx941", "gfx942",
+      "gfx90a", "gfx942"
 #if ROCM_VERSION >= 60300
-      "gfx1100", "gfx1101"
+      , "gfx1100", "gfx1101", "gfx1200", "gfx1201"
+#endif
+#if ROCM_VERSION >= 60500
+      , "gfx950"
 #endif
   };
   for (std::string arch : archs) {
@@ -862,7 +865,15 @@ static bool _scaled_mm_allowed_device() {
   auto dprops = at::cuda::getCurrentDeviceProperties();
 #ifdef USE_ROCM
   std::string device_arch = dprops->gcnArchName;
-  static const std::vector<std::string> archs = {"gfx940", "gfx941", "gfx942"};
+  static const std::vector<std::string> archs = {
+      "gfx942"
+#if ROCM_VERSION >= 60300
+      ,"gfx1200", "gfx1201"
+#endif
+#if ROCM_VERSION >= 60500
+      ,"gfx950"
+#endif
+  };
   for (std::string arch : archs) {
     size_t substring = device_arch.find(arch);
     if (substring != std::string::npos) {
@@ -1144,6 +1155,34 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
           BLASOP_A, BLASOP_B> scaledgemm{}; \
       scaledgemm(&params); \
     } \
+  } \
+  else if (mat1.scalar_type() == ScalarType::Float8_e4m3fn) { \
+    if (mat2.scalar_type() == ScalarType::Float8_e4m3fn) { \
+      static at::cuda::tunable::ScaledGemmTunableOp< \
+          at::Float8_e4m3fn, at::Float8_e4m3fn, scalar_t, \
+          BLASOP_A, BLASOP_B> scaledgemm{}; \
+      scaledgemm(&params); \
+    } \
+    else if (mat2.scalar_type() == ScalarType::Float8_e5m2) { \
+      static at::cuda::tunable::ScaledGemmTunableOp< \
+          at::Float8_e4m3fn, at::Float8_e5m2, scalar_t, \
+          BLASOP_A, BLASOP_B> scaledgemm{}; \
+      scaledgemm(&params); \
+    } \
+  } \
+  else if (mat1.scalar_type() == ScalarType::Float8_e5m2) { \
+    if (mat2.scalar_type() == ScalarType::Float8_e4m3fn) { \
+      static at::cuda::tunable::ScaledGemmTunableOp< \
+          at::Float8_e5m2, at::Float8_e4m3fn, scalar_t, \
+          BLASOP_A, BLASOP_B> scaledgemm{}; \
+      scaledgemm(&params); \
+    } \
+    else if (mat2.scalar_type() == ScalarType::Float8_e5m2) { \
+      static at::cuda::tunable::ScaledGemmTunableOp< \
+          at::Float8_e5m2, at::Float8_e5m2, scalar_t, \
+          BLASOP_A, BLASOP_B> scaledgemm{}; \
+      scaledgemm(&params); \
+    } \
+  } \
   }
   AT_DISPATCH_V2(out_dtype_, "_tunable_scaled_gemm", AT_WRAP([&] {
     bool transa_ = ((args.transa != 'n') && (args.transa != 'N'));
@@ -1186,7 +1225,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
       TORCH_CHECK(false, "unreachable");
     }
   }),
-  kHalf, kBFloat16, kFloat8_e4m3fnuz, kFloat8_e5m2fnuz, AT_EXPAND(AT_FLOATING_TYPES));
+  kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_FLOATING_TYPES));
 #undef TUNABLE_DISPATCH
 }
 else

cmake/Dependencies.cmake

+2-2
@@ -1026,8 +1026,8 @@ if(USE_ROCM)
   list(APPEND HIP_HIPCC_FLAGS --offload-compress)
   list(APPEND HIP_CXX_FLAGS -D_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI})
   list(APPEND HIP_CXX_FLAGS -DHIP_ENABLE_WARP_SYNC_BUILTINS)
-  if(HIP_NEW_TYPE_ENUMS)
-    list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS)
+  if(WIN32)
+    add_definitions(-DROCM_ON_WINDOWS)
   endif()
   add_definitions(-DROCM_VERSION=${ROCM_VERSION_DEV_INT})
   add_definitions(-DTORCH_HIP_VERSION=${TORCH_HIP_VERSION})

cmake/public/LoadHIP.cmake

-27
@@ -175,34 +175,7 @@ if(HIP_FOUND)
   # roctx is part of roctracer
   find_library(ROCM_ROCTX_LIB roctx64 HINTS ${ROCM_PATH}/lib)
 
-  # check whether HIP declares new types
   set(PROJECT_RANDOM_BINARY_DIR "${PROJECT_BINARY_DIR}")
-  set(file "${PROJECT_BINARY_DIR}/hip_new_types.cc")
-  file(WRITE ${file} ""
-    "#include <hip/library_types.h>\n"
-    "int main() {\n"
-    "    hipDataType baz = HIP_R_8F_E4M3_FNUZ;\n"
-    "    return 0;\n"
-    "}\n"
-    )
-
-  try_compile(hip_compile_result ${PROJECT_RANDOM_BINARY_DIR} ${file}
-    CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${ROCM_INCLUDE_DIRS}"
-    COMPILE_DEFINITIONS -D__HIP_PLATFORM_AMD__ -D__HIP_PLATFORM_HCC__
-    OUTPUT_VARIABLE hip_compile_output)
-
-  if(hip_compile_result)
-    set(HIP_NEW_TYPE_ENUMS ON)
-    #message("HIP is using new type enums: ${hip_compile_output}")
-    message("HIP is using new type enums")
-  else()
-    set(HIP_NEW_TYPE_ENUMS OFF)
-    #message("HIP is NOT using new type enums: ${hip_compile_output}")
-    message("HIP is NOT using new type enums")
-  endif()
-else() # Win32
-  # With HIP-SDK 6.2, HIP declares new enum types on Windows
-  set(HIP_NEW_TYPE_ENUMS ON)
 endif()
 
 if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "5.7.0")

test/test_linalg.py

+7-1
@@ -48,7 +48,13 @@
 def blaslt_supported_device():
     if torch.cuda.is_available():
         if torch.version.hip:
-            for arch in ['gfx90a', 'gfx94']:
+            ROCM_VERSION = tuple(int(v) for v in torch.version.hip.split('.')[:2])
+            archs = ['gfx90a', 'gfx94']
+            if ROCM_VERSION >= (6, 3):
+                archs.extend(['gfx110', 'gfx120'])
+            if ROCM_VERSION >= (6, 5):
+                archs.append('gfx95')
+            for arch in archs:
                 if arch in torch.cuda.get_device_properties(0).gcnArchName:
                     return True
         else:

test/test_matmul_cuda.py

+1-1
@@ -214,7 +214,7 @@ def _expand_to_batch(t: torch.Tensor):
 
 f8_msg = "FP8 is only supported on H100+ and sm_89 and MI300+ devices"
 
-if torch.version.hip:
+if torch.version.hip and 'gfx94' in torch.cuda.get_device_properties(0).gcnArchName:
     e4m3_type = torch.float8_e4m3fnuz
     e5m2_type = torch.float8_e5m2fnuz
     E4M3_MAX_POS = torch.finfo(torch.float8_e4m3fnuz).max

torch/_utils_internal.py

+4
@@ -225,12 +225,16 @@ def max_clock_rate():
        return 1700
    elif "gfx908" in gcn_arch:
        return 1502
+   elif "gfx12" in gcn_arch:
+       return 1700
    elif "gfx11" in gcn_arch:
        return 1700
    elif "gfx103" in gcn_arch:
        return 1967
    elif "gfx101" in gcn_arch:
        return 1144
+   elif "gfx95" in gcn_arch:
+       return 1700  # TODO: placeholder, get actual value
    else:
        return 1100
 
torch/testing/_internal/common_cuda.py

+9-1
@@ -82,7 +82,15 @@ def evaluate_platform_supports_cudnn_attention():
 def evaluate_platform_supports_fp8():
     if torch.cuda.is_available():
         if torch.version.hip:
-            return 'gfx94' in torch.cuda.get_device_properties(0).gcnArchName
+            ROCM_VERSION = tuple(int(v) for v in torch.version.hip.split('.')[:2])
+            archs = ['gfx94']
+            if ROCM_VERSION >= (6, 3):
+                archs.extend(['gfx120'])
+            if ROCM_VERSION >= (6, 5):
+                archs.append('gfx95')
+            for arch in archs:
+                if arch in torch.cuda.get_device_properties(0).gcnArchName:
+                    return True
         else:
             return SM90OrLater or torch.cuda.get_device_capability() == (8, 9)
     return False

torch/utils/hipify/cuda_to_hip_mappings.py

+2
@@ -3863,6 +3863,8 @@
     ("CUDA_C_64I", ("HIP_C_64I", CONV_TYPE, API_RUNTIME)),
     ("CUDA_R_64U", ("HIP_R_64U", CONV_TYPE, API_RUNTIME)),
     ("CUDA_C_64U", ("HIP_C_64U", CONV_TYPE, API_RUNTIME)),
+    ("CUDA_R_8F_E4M3", ("HIP_R_8F_E4M3", CONV_TYPE, API_RUNTIME)),
+    ("CUDA_R_8F_E5M2", ("HIP_R_8F_E5M2", CONV_TYPE, API_RUNTIME)),
     (
         "MAJOR_VERSION",
         ("hipLibraryMajorVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED),
