|
20 | 20 |     _validate_gemm_kernel_choice,
|
21 | 21 | )
|
22 | 22 | from torchao.prototype.mx_formats.mx_tensor import MXTensor
|
| 23 | +from torchao.prototype.mx_formats.nvfp4_tensor import NVFP4Tensor
23 | 24 | from torchao.quantization.quant_api import to_linear_activation_quantized
|
24 | 25 | from torchao.quantization.transform_module import (
|
25 | 26 |     register_quantize_module_handler,
|
@@ -63,7 +64,7 @@ class MXFPInferenceConfig(AOBaseConfig):
|
63 | 64 |
|
64 | 65 |     block_size: int = 32
|
65 | 66 |
|
66 | | -    # Dtypes for Input and Weights
| 67 | +    # Dtypes for Input and Weights; supports FP8 and FP4 formats
67 | 68 |     activation_dtype: torch.dtype = torch.float8_e4m3fn

68 | 69 |     weight_dtype: torch.dtype = torch.float8_e4m3fn
|
69 | 70 |
|
@@ -151,7 +152,106 @@ def _mx_inference_linear_transform(
|
151 | 152 |     return module
|
152 | 153 |
|
153 | 154 |
|
| 155 | +@dataclass
| 156 | +class NVFP4InferenceConfig(AOBaseConfig):
| 157 | +    """
| 158 | +    NVIDIA FP4 (NVFP4) Inference Quantization Configuration
| 159 | +
| 160 | +    This is a specialized configuration for NVIDIA's FP4 format with UE4M3 scales.
| 161 | +    It provides defaults optimized for NVFP4:
| 162 | +    - Data: float4_e2m1fn_x2
| 163 | +    - Scales: float8_e4m3fn (UE4M3)
| 164 | +    - Block size: 16 (required for NVFP4)
| 165 | +    - CUBLAS kernel (optimized for VEC16_UE4M3)
| 166 | +    """
| 167 | +
| 168 | +    block_size: int = 16  # NVFP4 requires block size 16
| 169 | +
| 170 | +    # NVFP4 uses FP4 data
| 171 | +    activation_dtype: torch.dtype = torch.float4_e2m1fn_x2
| 172 | +    weight_dtype: torch.dtype = torch.float4_e2m1fn_x2
| 173 | +
| 174 | +    # NVFP4 uses E4M3 scales
| 175 | +    scale_dtype: torch.dtype = torch.float8_e4m3fn
| 176 | +
| 177 | +    # CUBLAS is preferred for NVFP4 with VEC16_UE4M3 support
| 178 | +    gemm_kernel_choice: MXGemmKernelChoice = MXGemmKernelChoice.CUBLAS
| 179 | +
| 180 | +    # If True, apply torchao's recommended inductor config settings
| 181 | +    set_inductor_config: bool = False
| 182 | +
| 183 | +    def __post_init__(self):
| 184 | +        # Validate NVFP4 constraints
| 185 | +        assert self.activation_dtype == torch.float4_e2m1fn_x2, (
| 186 | +            f"NVFP4 requires activation_dtype=float4_e2m1fn_x2, got {self.activation_dtype}"
| 187 | +        )
| 188 | +        assert self.weight_dtype == torch.float4_e2m1fn_x2, (
| 189 | +            f"NVFP4 requires weight_dtype=float4_e2m1fn_x2, got {self.weight_dtype}"
| 190 | +        )
| 191 | +        assert self.scale_dtype == torch.float8_e4m3fn, (
| 192 | +            f"NVFP4 requires scale_dtype=float8_e4m3fn, got {self.scale_dtype}"
| 193 | +        )
| 194 | +        assert self.block_size == 16, (
| 195 | +            f"NVFP4 requires block_size=16, got {self.block_size}"
| 196 | +        )
| 197 | +
| 198 | +
| 199 | +def _input_activation_quant_func_nvfp4(
| 200 | +    x: torch.Tensor,
| 201 | +    block_size: int = 16,
| 202 | +    scale: Optional[torch.Tensor] = None,
| 203 | +):
| 204 | +    """NVFP4-specific activation quantization function"""
| 205 | +    # TODO: scale for static quant
| 206 | +    activation = NVFP4Tensor.to_nvfp4(
| 207 | +        x,
| 208 | +        block_size=block_size,
| 209 | +    )
| 210 | +    return activation
| 211 | +
| 212 | +
| 213 | +@register_quantize_module_handler(NVFP4InferenceConfig)
| 214 | +def _nvfp4_inference_linear_transform(
| 215 | +    module: torch.nn.Module, config: NVFP4InferenceConfig
| 216 | +):
| 217 | +    """Quantization handler for NVFP4InferenceConfig"""
| 218 | +    assert is_sm_at_least_100(), "NVFP4 is only supported on sm100+ machines"
| 219 | +    if config.set_inductor_config:
| 220 | +        torchao.quantization.utils.recommended_inductor_config_setter()
| 221 | +
| 222 | +    weight = module.weight
| 223 | +    assert weight.dtype == torch.bfloat16, (
| 224 | +        f"Only bf16 weights are supported for now, got {weight.dtype}"
| 225 | +    )
| 226 | +
| 227 | +    # Convert the weight to an NVFP4 tensor
| 228 | +    quantized_weight = NVFP4Tensor.to_nvfp4(
| 229 | +        weight,
| 230 | +        block_size=config.block_size,
| 231 | +    )
| 232 | +
| 233 | +    input_quant_func = _input_activation_quant_func_nvfp4
| 234 | +    input_quant_kwargs = {
| 235 | +        "block_size": config.block_size,
| 236 | +        "scale": None,
| 237 | +    }
| 238 | +
| 239 | +    quantized_weight = to_linear_activation_quantized(
| 240 | +        quantized_weight, input_quant_func, quant_kwargs=input_quant_kwargs
| 241 | +    )
| 242 | +
| 243 | +    module.weight = torch.nn.Parameter(quantized_weight, requires_grad=False)
| 244 | +    module.extra_repr = types.MethodType(_linear_extra_repr, module)
| 245 | +    return module
| 246 | +
| 247 | +
154 | 248 | if TORCH_VERSION_AT_LEAST_2_5:
|
155 | 249 |     torch.serialization.add_safe_globals(
|
156 | | -        [MXTensor, MXGemmKernelChoice, _input_activation_quant_func_mxfp]
| 250 | +        [
| 251 | +            MXTensor,
| 252 | +            NVFP4Tensor,
| 253 | +            MXGemmKernelChoice,
| 254 | +            _input_activation_quant_func_mxfp,
| 255 | +            _input_activation_quant_func_nvfp4,
| 256 | +        ]
157 | 257 |     )
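
Taken together, the diff registers `NVFP4InferenceConfig` with torchao's `quantize_` API via `@register_quantize_module_handler`. A minimal usage sketch follows; the model shape is hypothetical, and it assumes `NVFP4InferenceConfig` is importable from this module, an sm100+ GPU, and bf16 weights (the last two are asserted by the handler above):

```python
import torch
from torchao.quantization import quantize_

# Hypothetical toy module; NVFP4 requires an sm100+ GPU and bf16 weights.
model = torch.nn.Linear(128, 256, bias=False).to(torch.bfloat16).cuda()

# quantize_ dispatches to _nvfp4_inference_linear_transform through the
# @register_quantize_module_handler registration in the diff above.
quantize_(model, NVFP4InferenceConfig())

x = torch.randn(32, 128, dtype=torch.bfloat16, device="cuda")
y = model(x)  # activations are quantized to NVFP4 on the fly
```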
|
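For intuition on the defaults the config enforces (E2M1 data, E4M3 scales, blocks of 16), here is an illustrative sketch of NVFP4-style per-block scaling; this is an assumption-laden toy, not the library's packing code, which lives inside `NVFP4Tensor.to_nvfp4`:

```python
import torch

FP4_E2M1_MAX = 6.0  # largest magnitude representable in FP4 E2M1

def nvfp4_block_scales(x: torch.Tensor, block_size: int = 16) -> torch.Tensor:
    """Illustrative per-block scales so each block of 16 values fits in [-6, 6]."""
    blocks = x.reshape(-1, block_size)  # toy layout; requires numel % 16 == 0
    amax = blocks.abs().amax(dim=1)     # one amax per block of 16 elements
    # NVFP4 stores per-block scales in float8_e4m3fn (UE4M3)
    return (amax / FP4_E2M1_MAX).to(torch.float8_e4m3fn)
```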