
Commit 034f892

WIP NVfp4
stack-info: PR: #2408, branch: drisspg/stack/78
1 parent 101c039 commit 034f892

File tree

6 files changed: +597 -54 lines changed


test/prototype/mx_formats/test_mx_linear.py

Lines changed: 37 additions & 1 deletion
@@ -25,7 +25,10 @@
     MXInferenceLinear,
     MXLinear,
 )
-from torchao.prototype.mx_formats.mx_subclass import MXFPInferenceConfig
+from torchao.prototype.mx_formats.mx_subclass import (
+    MXFPInferenceConfig,
+    NVFP4InferenceConfig,
+)
 from torchao.quantization import quantize_
 from torchao.quantization.utils import compute_error
 from torchao.testing.utils import skip_if_rocm
@@ -441,3 +444,36 @@ def test_inference_subclass(elem_dtype, bias: bool, compile: bool):
     assert sqnr >= SQNR_THRESHOLD, (
         f"Got a sqnr of {sqnr} for {elem_dtype} and bias={bias}"
     )
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(
+    not TORCH_VERSION_AT_LEAST_2_8, reason="torch.compile requires PyTorch 2.8+"
+)
+@pytest.mark.skipif(
+    not is_sm_at_least_100(), reason="CUDA capability >= 10.0 required for float4 gemm"
+)
+@pytest.mark.parametrize("bias", [True, False])
+@pytest.mark.parametrize("compile", [True, False])
+@torch.no_grad()
+@skip_if_rocm("ROCm float4 gemm require gfx950")
+def test_inference_subclass_nvfp4(bias: bool, compile: bool):
+    """
+    Test NVFP4 recipe with scale_dtype=float8_e4m3fn and block_size=16
+    """
+    m = nn.Linear(32, 128, bias=bias, dtype=torch.bfloat16, device="cuda")
+    m_mx = copy.deepcopy(m)
+
+    config = NVFP4InferenceConfig()
+    quantize_(m_mx, config=config)
+    if compile:
+        m_mx = torch.compile(m_mx, fullgraph=True)
+
+    x = torch.randn(128, 32, device="cuda", dtype=torch.bfloat16)
+    y_ref = m(x)
+    y_mx = m_mx(x)
+    sqnr = compute_error(y_ref, y_mx)
+    SQNR_THRESHOLD = 15.0  # Float4 threshold
+    assert sqnr >= SQNR_THRESHOLD, (
+        f"Got a sqnr of {sqnr} for NVFP4 recipe with bias={bias}"
+    )
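
A minimal end-to-end sketch of the flow this new test exercises, pulled out of pytest for readability. It assumes the same prototype APIs imported above (NVFP4InferenceConfig, quantize_, compute_error) and a CUDA device with compute capability >= 10.0; the shapes and the final print are illustrative only:

    import copy
    import torch
    import torch.nn as nn
    from torchao.prototype.mx_formats.mx_subclass import NVFP4InferenceConfig
    from torchao.quantization import quantize_
    from torchao.quantization.utils import compute_error

    # bfloat16 reference linear layer and a copy to quantize
    m_ref = nn.Linear(32, 128, bias=True, dtype=torch.bfloat16, device="cuda")
    m_nvfp4 = copy.deepcopy(m_ref)

    # swap the weight for the NVFP4 subclass (FP4 data, E4M3 scales, block_size=16)
    quantize_(m_nvfp4, config=NVFP4InferenceConfig())

    x = torch.randn(128, 32, device="cuda", dtype=torch.bfloat16)
    sqnr = compute_error(m_ref(x), m_nvfp4(x))  # SQNR; the test requires >= 15.0
    print(f"NVFP4 sqnr: {sqnr}")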

torchao/prototype/mx_formats/config.py

Lines changed: 3 additions & 3 deletions
@@ -57,10 +57,10 @@ def _validate_gemm_kernel_choice(gemm_kernel_choice, block_size, elem_dtype):
             f"elem_dtype must be one of {valid_dtypes} to use the CUTLASS MX gemm kernels, got {elem_dtype}"
         )
     elif gemm_kernel_choice == MXGemmKernelChoice.CUBLAS:
-        assert block_size == 32, (
-            f"block_size must be 32 to use the cuBLAS MX gemm kernels, got {block_size}"
+        assert block_size in [16, 32], (
+            f"block_size must be in [16, 32] to use the cuBLAS MX gemm kernels, got {block_size}"
         )
-        valid_dtypes = [torch.float8_e4m3fn]
+        valid_dtypes = [torch.float8_e4m3fn, torch.float4_e2m1fn_x2]
         assert elem_dtype in valid_dtypes, (
             f"elem_dtype must be one of {valid_dtypes} to use the CUTLASS MX gemm kernels, got {elem_dtype}"
         )
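
For reference, a short sketch of what the relaxed check now accepts on the cuBLAS path. It assumes _validate_gemm_kernel_choice and MXGemmKernelChoice can be imported from torchao.prototype.mx_formats.config (the hunk above lives in that module) and a PyTorch build that exposes torch.float4_e2m1fn_x2; the standalone calls are illustrative, not how torchao invokes the validator:

    import torch
    from torchao.prototype.mx_formats.config import (
        MXGemmKernelChoice,
        _validate_gemm_kernel_choice,
    )

    # Previously the only cuBLAS combination: e4m3 elements with block_size=32
    _validate_gemm_kernel_choice(MXGemmKernelChoice.CUBLAS, 32, torch.float8_e4m3fn)

    # Newly accepted for NVFP4: packed fp4 elements with block_size=16
    _validate_gemm_kernel_choice(MXGemmKernelChoice.CUBLAS, 16, torch.float4_e2m1fn_x2)

    # Still rejected: any block size outside [16, 32] raises an AssertionError
    # _validate_gemm_kernel_choice(MXGemmKernelChoice.CUBLAS, 64, torch.float8_e4m3fn)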

torchao/prototype/mx_formats/mx_linear.py

Lines changed: 3 additions & 0 deletions
@@ -104,6 +104,7 @@ def backward(ctx, grad_output_hp: torch.Tensor):
             w_elem_dtype,
             block_size,
             weight_hp.dtype,
+            None,  # scale_dtype
             False,
             gemm_kernel_choice,
             False,
@@ -133,6 +134,7 @@ def backward(ctx, grad_output_hp: torch.Tensor):
             grad_elem_dtype,
             block_size,
             grad_output_hp_r.dtype,
+            None,  # scale_dtype
             False,
             gemm_kernel_choice,
             False,
@@ -155,6 +157,7 @@ def backward(ctx, grad_output_hp: torch.Tensor):
             in_elem_dtype,
             block_size,
             input_hp_r.dtype,
+            None,  # scale_dtype
             False,
             gemm_kernel_choice,
             False,

torchao/prototype/mx_formats/mx_ops.py

Lines changed: 139 additions & 46 deletions
@@ -33,6 +33,7 @@
 )
 from torchao.prototype.mx_formats.mx_tensor import (  # noqa: E501
     MXTensor,
+    NVFP4Tensor,
     tensor_size_hp_to_fp4x2,
     tensor_size_hpx3_to_fp6x4,
 )
@@ -93,8 +94,8 @@ def _addmm_mx_dispatch(
     M, K, N = a.shape[0], a.shape[1], b.shape[1]
     assert a._data.is_contiguous()
     assert b._data.t().is_contiguous()
-    assert a._block_size == 32, f"Invalid block size {a._block_size}"
-    assert b._block_size == 32, f"Invalid block size {b._block_size}"
+    assert a._block_size in [16, 32], f"Invalid block size {a._block_size}"
+    assert b._block_size in [16, 32], f"Invalid block size {b._block_size}"

     a_scale = a._scale_e8m0.view(M, K // a._block_size)
     b_scale = b._scale_e8m0.view(N, K // b._block_size)
@@ -144,42 +145,97 @@ def _addmm_mx_dispatch(
     return res


+def _addmm_nvfp4_dispatch(
+    a: NVFP4Tensor, b: NVFP4Tensor, aten_op, bias: Optional[torch.Tensor] = None
+) -> torch.Tensor:
+    """
+    Core implementation for NVFP4Tensor operations
+    Uses E4M3 scales and always uses CUBLAS for FP4 operations
+    """
+    # NVFP4 operations with E4M3 scales
+    M, K, N = a.shape[0], a.shape[1], b.shape[1]
+    assert a._data.is_contiguous()
+    assert b._data.t().is_contiguous()
+    assert a._block_size == 16, f"NVFP4 requires block_size=16, got {a._block_size}"
+    assert b._block_size == 16, f"NVFP4 requires block_size=16, got {b._block_size}"
+
+    # NVFP4 uses E4M3 scales, not E8M0
+    a_scale = a._scale_e4m3.view(M, K // a._block_size)
+    b_scale = b._scale_e4m3.view(N, K // b._block_size)
+    a_scale_block = to_blocked(a_scale)
+    b_scale_block = to_blocked(b_scale)
+
+    # NVFP4 always uses CUBLAS with VEC16_UE4M3 scale mode
+    res = torch._scaled_mm(
+        a._data,
+        b._data,
+        a_scale_block.view(torch.float8_e4m3fn),
+        b_scale_block.view(torch.float8_e4m3fn),
+        bias=bias,
+        out_dtype=torch.bfloat16,
+    )
+
+    return res
+
+
 @implements([aten.mm.default, aten.matmul.default])
 def mx_mm(func, types, args, kwargs):
     a = args[0]
     b = args[1]
-    assert isinstance(a, MXTensor) and isinstance(b, MXTensor)

-    return _addmm_mx_dispatch(a, b, func)
+    # Handle both MXTensor and NVFP4Tensor
+    if isinstance(a, MXTensor) and isinstance(b, MXTensor):
+        return _addmm_mx_dispatch(a, b, func)
+    elif isinstance(a, NVFP4Tensor) and isinstance(b, NVFP4Tensor):
+        return _addmm_nvfp4_dispatch(a, b, func)
+    else:
+        raise ValueError(f"Unsupported tensor types: {type(a)}, {type(b)}")


 @implements([aten.addmm.default])
 def mx_addmm(func, types, args, kwargs):
-    assert (
-        isinstance(args[0], torch.Tensor)
-        and isinstance(args[1], MXTensor)
-        and isinstance(args[2], MXTensor)
-    )
     bias = args[0]
     a = args[1]
     b = args[2]
-    return _addmm_mx_dispatch(a, b, func, bias=bias)
+
+    assert isinstance(bias, torch.Tensor), (
+        f"Bias must be torch.Tensor, got {type(bias)}"
+    )
+
+    # Handle both MXTensor and NVFP4Tensor
+    if isinstance(a, MXTensor) and isinstance(b, MXTensor):
+        return _addmm_mx_dispatch(a, b, func, bias=bias)
+    elif isinstance(a, NVFP4Tensor) and isinstance(b, NVFP4Tensor):
+        return _addmm_nvfp4_dispatch(a, b, func, bias=bias)
+    else:
+        raise ValueError(f"Unsupported tensor types: {type(a)}, {type(b)}")


 @implements([aten.t.default])
 def mx_t(func, types, args, kwargs):
     # For now, only transpose(input, 0, 1) is supported.
     old = args[0]
-    new = MXTensor(
-        old._scale_e8m0,
-        old._data.t(),
-        old._elem_dtype,
-        old._block_size,
-        old._orig_dtype,
-        old._use_fp4_custom_triton_dequant_kernel,
-        old._gemm_kernel_choice,
-        old._pack_fp6,
-    )
+
+    if isinstance(old, MXTensor):
+        new = MXTensor(
+            old._scale_e8m0,
+            old._data.t(),
+            old._elem_dtype,
+            old._block_size,
+            old._orig_dtype,
+            old._use_fp4_custom_triton_dequant_kernel,
+            old._gemm_kernel_choice,
+            old._pack_fp6,
+        )
+    elif isinstance(old, NVFP4Tensor):
+        new = NVFP4Tensor(
+            old._scale_e4m3,
+            old._data.t(),
+            old._block_size,
+            old._orig_dtype,
+        )
+    else:
+        raise ValueError(f"Unsupported tensor type: {type(old)}")
     return new


@@ -205,25 +261,43 @@ def unwrap(x):

 @implements([aten.view.default])
 def mx_view_op(func, types, args, kwargs):
-    data = args[0]._data
+    tensor = args[0]
+    data = tensor._data
     new_size = args[1]
-    if args[0]._elem_dtype == torch.float4_e2m1fn_x2:
-        # special case fp4 as we pack two elements per byte
+
+    if isinstance(tensor, MXTensor):
+        if tensor._elem_dtype == torch.float4_e2m1fn_x2:
+            # special case fp4 as we pack two elements per byte
+            new_size = tensor_size_hp_to_fp4x2(new_size, data.is_contiguous())
+        elif (
+            tensor._elem_dtype in [DTYPE_FP6_E3M2, DTYPE_FP6_E2M3] and tensor._pack_fp6
+        ):
+            # special case fp6 as we pack 4 elements in 3 bytes
+            new_size = tensor_size_hpx3_to_fp6x4(new_size, data.is_contiguous())
+
+        new_data = func(data, new_size, *args[2:], **kwargs)
+        return MXTensor(
+            tensor._scale_e8m0,
+            new_data,
+            tensor._elem_dtype,
+            tensor._block_size,
+            tensor._orig_dtype,
+            tensor._use_fp4_custom_triton_dequant_kernel,
+            tensor._gemm_kernel_choice,
+            tensor._pack_fp6,
+        )
+    elif isinstance(tensor, NVFP4Tensor):
+        # NVFP4 is always fp4 packed
         new_size = tensor_size_hp_to_fp4x2(new_size, data.is_contiguous())
-    elif args[0]._elem_dtype in [DTYPE_FP6_E3M2, DTYPE_FP6_E2M3] and args[0]._pack_fp6:
-        # special case fp6 as we pack 4 elements in 3 bytes
-        new_size = tensor_size_hpx3_to_fp6x4(new_size, data.is_contiguous())
-    new_data = func(data, new_size, *args[2:], **kwargs)
-    return MXTensor(
-        args[0]._scale_e8m0,
-        new_data,
-        args[0]._elem_dtype,
-        args[0]._block_size,
-        args[0]._orig_dtype,
-        args[0]._use_fp4_custom_triton_dequant_kernel,
-        args[0]._gemm_kernel_choice,
-        args[0]._pack_fp6,
-    )
+        new_data = func(data, new_size, *args[2:], **kwargs)
+        return NVFP4Tensor(
+            tensor._scale_e4m3,
+            new_data,
+            tensor._block_size,
+            tensor._orig_dtype,
+        )
+    else:
+        raise ValueError(f"Unsupported tensor type: {type(tensor)}")


 @implements([aten.slice.Tensor])
@@ -235,8 +309,15 @@ def mx_slice(func, types, args, kwargs):

     M, K = x.shape[0], x.shape[1]

-    # TODO why doesn't scale have shape?
-    scale_shaped = x._scale_e8m0.view(M, K // x._block_size)
+    # Handle different scale tensors for different tensor types
+    if isinstance(x, MXTensor):
+        scale_tensor = x._scale_e8m0
+    elif isinstance(x, NVFP4Tensor):
+        scale_tensor = x._scale_e4m3
+    else:
+        raise ValueError(f"Unsupported tensor type: {type(x)}")
+
+    scale_shaped = scale_tensor.view(M, K // x._block_size)

     if dim == 0:
         # Slicing along the first dimension (rows) TODO assuming that dim 1 is reduciton dim for now
@@ -267,15 +348,14 @@ def mx_slice(func, types, args, kwargs):
             scale_shaped, 1, start_block, end_block, step
         ).flatten()
     else:
+        tensor_name = "MXTensor/NVFP4Tensor"
         raise ValueError(
-            f"MXTensor only supports slicing along dimensions 0 and 1, got dim={dim}"
+            f"{tensor_name} only supports slicing along dimensions 0 and 1, got dim={dim}"
         )

-    return return_and_correct_aliasing(
-        func,
-        args,
-        kwargs,
-        MXTensor(
+    # Create appropriate tensor type
+    if isinstance(x, MXTensor):
+        result_tensor = MXTensor(
             sliced_scale,
             sliced_data,
             x._elem_dtype,
@@ -284,7 +364,20 @@ def mx_slice(func, types, args, kwargs):
             x._use_fp4_custom_triton_dequant_kernel,
             x._gemm_kernel_choice,
             x._pack_fp6,
-        ),
+        )
+    else:  # NVFP4Tensor
+        result_tensor = NVFP4Tensor(
+            sliced_scale,
+            sliced_data,
+            x._block_size,
+            x._orig_dtype,
+        )
+
+    return return_and_correct_aliasing(
+        func,
+        args,
+        kwargs,
+        result_tensor,
     )

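For context on why the NVFP4 path above pins block_size to 16 and views its scales as float8_e4m3fn: in the NVFP4 layout each group of 16 FP4 (E2M1) elements shares one E4M3 scale, whereas the MX path uses E8M0 scales over groups of 32. Below is a rough, self-contained sketch of the per-block dequantization semantics; the helper and the E2M1 value table are illustrative and not part of torchao, and they ignore how the scales are chosen at quantization time:

    import torch

    # The eight magnitudes representable by FP4 E2M1 (sign is a separate bit)
    E2M1_MAGNITUDES = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])

    def dequant_nvfp4_block(sign, mag_idx, scale_e4m3):
        """Decode one 16-element NVFP4 block back to float32.

        sign:       (16,) tensor of +1.0 / -1.0
        mag_idx:    (16,) long tensor indexing into E2M1_MAGNITUDES
        scale_e4m3: scalar torch.float8_e4m3fn scale shared by the whole block
        """
        elems = sign * E2M1_MAGNITUDES[mag_idx]       # decode the FP4 elements
        return elems * scale_e4m3.to(torch.float32)   # apply the shared E4M3 block scale

    # Example block: random signs and codes, block scale of 0.25
    sign = torch.randint(0, 2, (16,)).float() * 2 - 1
    mag_idx = torch.randint(0, 8, (16,))
    scale = torch.tensor(0.25).to(torch.float8_e4m3fn)
    print(dequant_nvfp4_block(sign, mag_idx, scale))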