
Commit c58c5b0

WIP NVfp4
1 parent 101c039 commit c58c5b0

File tree

2 files changed: +73 -3 lines changed


test/prototype/mx_formats/test_mx_linear.py

Lines changed: 39 additions & 0 deletions
@@ -441,3 +441,42 @@ def test_inference_subclass(elem_dtype, bias: bool, compile: bool):
     assert sqnr >= SQNR_THRESHOLD, (
         f"Got a sqnr of {sqnr} for {elem_dtype} and bias={bias}"
     )
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(
+    not TORCH_VERSION_AT_LEAST_2_8, reason="torch.compile requires PyTorch 2.8+"
+)
+@pytest.mark.skipif(
+    not is_sm_at_least_100(), reason="CUDA capability >= 10.0 required for float4 gemm"
+)
+@pytest.mark.parametrize("bias", [True, False])
+@pytest.mark.parametrize("compile", [True, False])
+@torch.no_grad()
+@skip_if_rocm("ROCm float4 gemm requires gfx950")
+def test_inference_subclass_nvfp4(bias: bool, compile: bool):
+    """
+    Test the NVFP4 recipe with scale_dtype=float8_e4m3fn and block_size=16.
+    """
+    m = nn.Linear(32, 128, bias=bias, dtype=torch.bfloat16, device="cuda")
+    m_mx = copy.deepcopy(m)
+
+    config = MXFPInferenceConfig(
+        activation_dtype=torch.float4_e2m1fn_x2,
+        weight_dtype=torch.float4_e2m1fn_x2,
+        scale_dtype=torch.float8_e4m3fn,  # NVFP4 scale dtype
+        block_size=16,  # NVFP4 block size
+        gemm_kernel_choice=MXGemmKernelChoice.CUTLASS,
+    )
+    quantize_(m_mx, config=config)
+    if compile:
+        m_mx = torch.compile(m_mx, fullgraph=True)
+
+    x = torch.randn(128, 32, device="cuda", dtype=torch.bfloat16)
+    y_ref = m(x)
+    y_mx = m_mx(x)
+    sqnr = compute_error(y_ref, y_mx)
+    SQNR_THRESHOLD = 15.0  # Float4 threshold
+    assert sqnr >= SQNR_THRESHOLD, (
+        f"Got a sqnr of {sqnr} for NVFP4 recipe with bias={bias}"
+    )
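For intuition about what this recipe changes relative to the MX path, here is a minimal sketch (not part of this commit) of how an NVFP4-style per-block scale could be derived: 16-element blocks with one float8_e4m3fn scale per block, versus the 32-element blocks with power-of-two float8_e8m0fnu scales used by the MX formats. The constant F4_E2M1_MAX, the clamp, and the block layout below are assumptions for illustration only.

import torch

F4_E2M1_MAX = 6.0      # largest magnitude representable in float4 e2m1
NVFP4_BLOCK_SIZE = 16  # NVFP4 blocks are 16 elements (MXFP uses 32)

x = torch.randn(4, 64, dtype=torch.bfloat16)
blocks = x.reshape(-1, NVFP4_BLOCK_SIZE).to(torch.float32)

# One scale per 16-element block, stored as float8_e4m3fn rather than the
# power-of-two float8_e8m0fnu scale used by the MX formats.
scale = (blocks.abs().amax(dim=1, keepdim=True) / F4_E2M1_MAX).clamp(min=1e-6)
scale_e4m3 = scale.to(torch.float8_e4m3fn)

# Scaled values that would then be cast to float4 e2m1 element data.
scaled = blocks / scale_e4m3.to(torch.float32)
print(scale_e4m3.shape, scaled.shape)  # torch.Size([16, 1]) torch.Size([16, 16])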

torchao/prototype/mx_formats/mx_subclass.py

Lines changed: 34 additions & 3 deletions
@@ -6,7 +6,7 @@
 
 import types
 from dataclasses import dataclass
-from typing import Optional
+from typing import Literal, Optional, Union
 
 import torch
 
@@ -27,6 +27,30 @@
 from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, is_sm_at_least_100
 
 
+def _validate_scale_dtype(
+    block_size: int,
+    weight_dtype: torch.dtype,
+    activation_dtype: torch.dtype,
+    scale_dtype: torch.dtype,
+):
+    """Validate that the scale dtype is one of the supported float8 types."""
+    assert scale_dtype in [
+        torch.float8_e8m0fnu,
+        torch.float8_e4m3fn,
+    ], f"Unsupported scale_dtype {scale_dtype}, must be float8_e8m0fnu or float8_e4m3fn"
+    if scale_dtype == torch.float8_e8m0fnu:
+        _validate_elem_dtype(weight_dtype)
+        _validate_elem_dtype(activation_dtype)
+        return
+
+    assert (
+        weight_dtype == activation_dtype and weight_dtype == torch.float4_e2m1fn_x2
+    ), (
+        f"scale_dtype {scale_dtype} is only supported with float4_e2m1fn_x2 weights and activations, got weight_dtype {weight_dtype} and activation_dtype {activation_dtype}"
+    )
+    assert block_size == 16, f"For NVFP4, block_size must be 16, got {block_size}"
+
+
 # Note: This API is extra prototype and will change in the future
 @dataclass
 class MXFPInferenceConfig(AOBaseConfig):
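A minimal sketch (not part of the commit) of how the new validator treats the two recipes; it assumes the dtypes and the _validate_elem_dtype helper above are in scope, and torch.float4_e2m1fn_x2 requires a recent PyTorch build.

import torch

# MX recipe: e8m0 scales only require the element dtypes to be valid MX dtypes.
_validate_scale_dtype(32, torch.float8_e4m3fn, torch.float8_e4m3fn, torch.float8_e8m0fnu)

# NVFP4 recipe: e4m3 scales require float4 elements and a block size of 16.
_validate_scale_dtype(
    16, torch.float4_e2m1fn_x2, torch.float4_e2m1fn_x2, torch.float8_e4m3fn
)

# Mixing e4m3 scales with fp8 elements (or block_size != 16) raises AssertionError.
# _validate_scale_dtype(32, torch.float8_e4m3fn, torch.float8_e4m3fn, torch.float8_e4m3fn)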
@@ -61,12 +85,16 @@ class MXFPInferenceConfig(AOBaseConfig):
     - MXTensor in torchao.prototype.mx_formats.mx_tensor
     """
 
-    block_size: int = 32
+    block_size: Union[Literal[32], Literal[16]] = 32
 
-    # Dtypes for Input and Weights
+    # Dtypes for Input and Weights, supports Fp8 and Fp4 formats
     activation_dtype: torch.dtype = torch.float8_e4m3fn
     weight_dtype: torch.dtype = torch.float8_e4m3fn
 
+    # Supports float8_e4m3fn, float8_e8m0fnu
+    # e8m0 for MX and e4m3 for NVFP4 on CUDA-compatible devices
+    scale_dtype: torch.dtype = torch.float8_e8m0fnu
+
     # Which kernel to run for mm
     gemm_kernel_choice: MXGemmKernelChoice = MXGemmKernelChoice.CUBLAS
 
@@ -82,6 +110,9 @@ def __post_init__(self):
         _validate_gemm_kernel_choice(
             self.gemm_kernel_choice, self.block_size, self.weight_dtype
         )
+        _validate_scale_dtype(
+            self.block_size, self.weight_dtype, self.activation_dtype, self.scale_dtype
+        )
 
 
 def _linear_extra_repr(self):
