WIP NVfp4

drisspg · drisspg · commit 1c007a4eac65 · 2025-06-18T14:29:57.000-07:00
stack-info: PR: #2408, branch: drisspg/stack/78
diff --git a/test/prototype/mx_formats/test_mx_linear.py b/test/prototype/mx_formats/test_mx_linear.py
@@ -441,3 +441,42 @@ def test_inference_subclass(elem_dtype, bias: bool, compile: bool):
     assert sqnr >= SQNR_THRESHOLD, (
         f"Got a sqnr of {sqnr} for {elem_dtype} and bias={bias}"
     )
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(
+    not TORCH_VERSION_AT_LEAST_2_8, reason="torch.compile requires PyTorch 2.8+"
+)
+@pytest.mark.skipif(
+    not is_sm_at_least_100(), reason="CUDA capability >= 10.0 required for float4 gemm"
+)
+@pytest.mark.parametrize("bias", [True, False])
+@pytest.mark.parametrize("compile", [True, False])
+@torch.no_grad()
+@skip_if_rocm("ROCm float4 gemm require gfx950")
+def test_inference_subclass_nvfp4(bias: bool, compile: bool):
+    """
+    Compare an NVFP4-quantized nn.Linear (float4_e2m1fn_x2 data, float8_e4m3fn
+    scales, block_size=16) against its bfloat16 original using SQNR.
+    """
+    min_sqnr = 15.0  # Float4 threshold
+    nvfp4_config = MXFPInferenceConfig(
+        activation_dtype=torch.float4_e2m1fn_x2,
+        weight_dtype=torch.float4_e2m1fn_x2,
+        scale_dtype=torch.float8_e4m3fn,
+        block_size=16,
+        gemm_kernel_choice=MXGemmKernelChoice.CUBLAS,
+    )
+
+    # Quantize a deep copy so the original layer stays as the reference.
+    ref_linear = nn.Linear(32, 128, bias=bias, dtype=torch.bfloat16, device="cuda")
+    quant_linear = copy.deepcopy(ref_linear)
+    quantize_(quant_linear, config=nvfp4_config)
+    forward = torch.compile(quant_linear, fullgraph=True) if compile else quant_linear
+
+    inputs = torch.randn(128, 32, device="cuda", dtype=torch.bfloat16)
+    expected, actual = ref_linear(inputs), forward(inputs)
+    sqnr = compute_error(expected, actual)
+    assert sqnr >= min_sqnr, (
+        f"Got a sqnr of {sqnr} for NVFP4 recipe with bias={bias}"
+    )
diff --git a/torchao/prototype/mx_formats/config.py b/torchao/prototype/mx_formats/config.py
@@ -57,10 +57,10 @@ def _validate_gemm_kernel_choice(gemm_kernel_choice, block_size, elem_dtype):
             f"elem_dtype must be one of {valid_dtypes} to use the CUTLASS MX gemm kernels, got {elem_dtype}"
         )
     elif gemm_kernel_choice == MXGemmKernelChoice.CUBLAS:
-        assert block_size == 32, (
-            f"block_size must be 32 to use the cuBLAS MX gemm kernels, got {block_size}"
+        assert block_size in [16, 32], (
+            f"block_size must be in [16, 32] to use the cuBLAS MX gemm kernels, got {block_size}"
         )
-        valid_dtypes = [torch.float8_e4m3fn]
+        valid_dtypes = [torch.float8_e4m3fn, torch.float4_e2m1fn_x2]
         assert elem_dtype in valid_dtypes, (
             f"elem_dtype must be one of {valid_dtypes} to use the CUTLASS MX gemm kernels, got {elem_dtype}"
         )
diff --git a/torchao/prototype/mx_formats/mx_linear.py b/torchao/prototype/mx_formats/mx_linear.py
@@ -104,6 +104,7 @@ def backward(ctx, grad_output_hp: torch.Tensor):
                 w_elem_dtype,
                 block_size,
                 weight_hp.dtype,
+                None,  # scale_dtype
                 False,
                 gemm_kernel_choice,
                 False,
@@ -133,6 +134,7 @@ def backward(ctx, grad_output_hp: torch.Tensor):
                 grad_elem_dtype,
                 block_size,
                 grad_output_hp_r.dtype,
+                None,  # scale_dtype
                 False,
                 gemm_kernel_choice,
                 False,
@@ -155,6 +157,7 @@ def backward(ctx, grad_output_hp: torch.Tensor):
                 in_elem_dtype,
                 block_size,
                 input_hp_r.dtype,
+                None,  # scale_dtype
                 False,
                 gemm_kernel_choice,
                 False,
diff --git a/torchao/prototype/mx_formats/mx_ops.py b/torchao/prototype/mx_formats/mx_ops.py
@@ -93,8 +93,8 @@ def _addmm_mx_dispatch(
         M, K, N = a.shape[0], a.shape[1], b.shape[1]
         assert a._data.is_contiguous()
         assert b._data.t().is_contiguous()
-        assert a._block_size == 32, f"Invalid block size {a._block_size}"
-        assert b._block_size == 32, f"Invalid block size {b._block_size}"
+        assert a._block_size in [16, 32], f"Invalid block size {a._block_size}"
+        assert b._block_size in [16, 32], f"Invalid block size {b._block_size}"
 
         a_scale = a._scale_e8m0.view(M, K // a._block_size)
         b_scale = b._scale_e8m0.view(N, K // b._block_size)
@@ -176,6 +176,7 @@ def mx_t(func, types, args, kwargs):
         old._elem_dtype,
         old._block_size,
         old._orig_dtype,
+        old._scale_dtype,
         old._use_fp4_custom_triton_dequant_kernel,
         old._gemm_kernel_choice,
         old._pack_fp6,
@@ -220,6 +221,7 @@ def mx_view_op(func, types, args, kwargs):
         args[0]._elem_dtype,
         args[0]._block_size,
         args[0]._orig_dtype,
+        args[0]._scale_dtype,
         args[0]._use_fp4_custom_triton_dequant_kernel,
         args[0]._gemm_kernel_choice,
         args[0]._pack_fp6,
@@ -281,6 +283,7 @@ def mx_slice(func, types, args, kwargs):
             x._elem_dtype,
             x._block_size,
             x._orig_dtype,
+            x._scale_dtype,
             x._use_fp4_custom_triton_dequant_kernel,
             x._gemm_kernel_choice,
             x._pack_fp6,
diff --git a/torchao/prototype/mx_formats/mx_subclass.py b/torchao/prototype/mx_formats/mx_subclass.py
@@ -6,7 +6,7 @@
 
 import types
 from dataclasses import dataclass
-from typing import Optional
+from typing import Literal, Optional, Union
 
 import torch
 
@@ -27,6 +27,30 @@
 from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, is_sm_at_least_100
 
 
+def _validate_scale_dtype(
+    block_size: int,
+    weight_dtype: torch.dtype,
+    activation_dtype: torch.dtype,
+    scale_dtype: torch.dtype,
+):
+    """Assert scale_dtype is consistent with the element dtypes and block_size."""
+    supported = [torch.float8_e8m0fnu, torch.float8_e4m3fn]
+    assert scale_dtype in supported, (
+        f"Unsupported scale_dtype {scale_dtype}, must be float8_e8m0fnu or float8_e4m3fn"
+    )
+    if scale_dtype == torch.float8_e8m0fnu:
+        # e8m0 scales: weights and activations only need valid element dtypes.
+        _validate_elem_dtype(weight_dtype)
+        _validate_elem_dtype(activation_dtype)
+        return
+    # e4m3 scales (NVFP4): fp4 weights and activations with block_size 16 only.
+    fp4 = torch.float4_e2m1fn_x2
+    assert weight_dtype == fp4 and activation_dtype == fp4, (
+        f"scale_dtype {scale_dtype} is only supported with weight_dtype {fp4} and activation_dtype {fp4}, got weight_dtype {weight_dtype} and activation_dtype {activation_dtype}"
+    )
+    assert block_size == 16, f"For NVFP4, block_size must be 16, got {block_size}"
+
+
 # Note: This API is extra prototype and will change in the future
 @dataclass
 class MXFPInferenceConfig(AOBaseConfig):
@@ -61,12 +85,16 @@ class MXFPInferenceConfig(AOBaseConfig):
     - MXTensor in torchao.prototype.mx_formats.mx_tensor
     """
 
-    block_size: int = 32
+    block_size: Union[Literal[32], Literal[16]] = 32
 
-    # Dtypes for Input and Weights
+    # Dtypes for Input and Weights, supports Fp8 and Fp4 formats
     activation_dtype: torch.dtype = torch.float8_e4m3fn
     weight_dtype: torch.dtype = torch.float8_e4m3fn
 
+    # Supports float8_e4m3fn, float8_e8m0fnu
+    # e8m0 for MX and e4m3 for NVFP4 on CUDA-compatible devices
+    scale_dtype: torch.dtype = torch.float8_e8m0fnu
+
     # Which kernel to run for mm
     gemm_kernel_choice: MXGemmKernelChoice = MXGemmKernelChoice.CUBLAS
 
@@ -82,6 +110,9 @@ def __post_init__(self):
         _validate_gemm_kernel_choice(
             self.gemm_kernel_choice, self.block_size, self.weight_dtype
         )
+        _validate_scale_dtype(
+            self.block_size, self.weight_dtype, self.activation_dtype, self.scale_dtype
+        )
 
 
 def _linear_extra_repr(self):
@@ -92,6 +123,7 @@ def _input_activation_quant_func_mxfp(
     x: torch.Tensor,
     activation_dtype: torch.dtype,
     block_size: int,
+    scale_dtype: Optional[torch.dtype] = None,
     scale: Optional[torch.Tensor] = None,
 ):
     """ """
@@ -102,6 +134,7 @@ def _input_activation_quant_func_mxfp(
         x,
         activation_dtype,
         block_size=block_size,
+        scale_dtype=scale_dtype,
         gemm_kernel_choice=None,  # Get from weight
         pack_fp6=False,  # TODO
     )
@@ -131,6 +164,7 @@ def _mx_inference_linear_transform(
         weight,
         weight_dtype,
         block_size=config.block_size,
+        scale_dtype=config.scale_dtype,
         gemm_kernel_choice=config.gemm_kernel_choice,
         pack_fp6=False,  # TODO
     )
@@ -139,6 +173,7 @@ def _mx_inference_linear_transform(
     input_quant_kwargs = {
         "block_size": config.block_size,
         "activation_dtype": activation_dtype,
+        "scale_dtype": config.scale_dtype,
         "scale": None,
     }
 
diff --git a/torchao/prototype/mx_formats/mx_tensor.py b/torchao/prototype/mx_formats/mx_tensor.py
@@ -18,7 +18,7 @@
 """
 
 from enum import Enum, auto
-from typing import Callable, Dict, Union
+from typing import Callable, Dict, Optional, Union
 
 import torch
 
@@ -146,6 +146,7 @@ def to_mx(
     data_hp: torch.Tensor,
     elem_dtype: Union[torch.dtype, str],
     block_size: int,
+    scale_dtype: Optional[torch.dtype] = None,
     scaling_mode: ScaleCalculationMode = ScaleCalculationMode.FLOOR,
     pack_fp6: bool = False,
 ):
@@ -473,6 +474,7 @@ def __new__(
         elem_dtype,
         block_size,
         orig_dtype,
+        scale_dtype,
         use_fp4_custom_triton_dequant_kernel,
         gemm_kernel_choice,
         pack_fp6,
@@ -544,6 +546,7 @@ def __new__(
         self._elem_dtype = elem_dtype
         self._block_size = block_size
         self._orig_dtype = orig_dtype
+        self._scale_dtype = scale_dtype
         self._use_fp4_custom_triton_dequant_kernel = (
             use_fp4_custom_triton_dequant_kernel
         )
@@ -589,20 +592,22 @@ def to_mx(
         data_hp: torch.Tensor,
         elem_dtype: Union[torch.dtype, str],
         block_size: int = BLOCK_SIZE_DEFAULT,
+        scale_dtype: Optional[torch.dtype] = None,
         scaling_mode: ScaleCalculationMode = ScaleCalculationMode.FLOOR,
         use_fp4_custom_triton_dequant_kernel: bool = False,
         gemm_kernel_choice: MXGemmKernelChoice = MXGemmKernelChoice.EMULATED,
         pack_fp6: bool = False,
     ):
         scale_e8m0_biased, data_lp = to_mx(
-            data_hp, elem_dtype, block_size, scaling_mode, pack_fp6
+            data_hp, elem_dtype, block_size, scale_dtype, scaling_mode, pack_fp6
         )
         return MXTensor(
             scale_e8m0_biased,
             data_lp,
             elem_dtype,
             block_size,
             data_hp.dtype,
+            scale_dtype,
             use_fp4_custom_triton_dequant_kernel,
             gemm_kernel_choice,
             pack_fp6,
@@ -613,6 +618,7 @@ def __tensor_flatten__(self):
             "_elem_dtype": self._elem_dtype,
             "_block_size": self._block_size,
             "_orig_dtype": self._orig_dtype,
+            "_scale_dtype": self._scale_dtype,
             "_use_fp4_custom_triton_dequant_kernel": self._use_fp4_custom_triton_dequant_kernel,
             "_gemm_kernel_choice": self._gemm_kernel_choice,
             "_pack_fp6": self._pack_fp6,
@@ -632,6 +638,7 @@ def __tensor_unflatten__(
             metadata["_elem_dtype"],
             metadata["_block_size"],
             metadata["_orig_dtype"],
+            metadata["_scale_dtype"],
             metadata["_use_fp4_custom_triton_dequant_kernel"],
             metadata["_gemm_kernel_choice"],
             metadata["_pack_fp6"],
@@ -664,6 +671,7 @@ def _same_metadata(cls, self: "MXTensor", src: "MXTensor") -> bool:
             and self._elem_dtype == src._elem_dtype
             and self._block_size == src._block_size
             and self._orig_dtype == src._orig_dtype
+            and self._scale_dtype == src._scale_dtype
             and self._use_fp4_custom_triton_dequant_kernel
             == src._use_fp4_custom_triton_dequant_kernel
             and self._gemm_kernel_choice == src._gemm_kernel_choice

Original file line number	Diff line number	Diff line change
`@@ -57,10 +57,10 @@ def _validate_gemm_kernel_choice(gemm_kernel_choice, block_size, elem_dtype):`
`57`	`57`	`f"elem_dtype must be one of {valid_dtypes} to use the CUTLASS MX gemm kernels, got {elem_dtype}"`
`58`	`58`	`)`
`59`	`59`	`elif gemm_kernel_choice == MXGemmKernelChoice.CUBLAS:`
`60`		`- assert block_size == 32, (`
`61`		`- f"block_size must be 32 to use the cuBLAS MX gemm kernels, got {block_size}"`
	`60`	`+ assert block_size in [16, 32], (`
	`61`	`+ f"block_size must be in [16, 32] to use the cuBLAS MX gemm kernels, got {block_size}"`
`62`	`62`	`)`
`63`		`- valid_dtypes = [torch.float8_e4m3fn]`
	`63`	`+ valid_dtypes = [torch.float8_e4m3fn, torch.float4_e2m1fn_x2]`
`64`	`64`	`assert elem_dtype in valid_dtypes, (`
`65`	`65`	`f"elem_dtype must be one of {valid_dtypes} to use the CUTLASS MX gemm kernels, got {elem_dtype}"`
`66`	`66`	`)`