
Commit 526e48b

[WC][OpenVINO] FP8_E4M3

1 parent 62b808b commit 526e48b

12 files changed: +120 -25 lines changed

docs/Algorithms.md

Lines changed: 1 addition & 0 deletions

@@ -13,6 +13,7 @@
 - NF4 compression mode
 - Arbitrary look-up table (CODEBOOK) or predefined lookup table based on NF4 (CB4_F8E4M3)
 - MX-compliant types - MXFP4 and MXFP8_E4M3
+- FP8 type - FP8_E4M3
 - Mixed precision weights compression
 - Grouped weights compression

docs/usage/post_training_compression/weights_compression/Usage.md

Lines changed: 1 addition & 0 deletions

@@ -47,6 +47,7 @@ NNCF can automatically distribute precision assignments based on quantization se
 | CB4_F8E4M3 | E4M3 | FP16 | Per-channel / Group-wise | A fixed lookup table with 16 E4M3 values based on NF4 values |
 | MXFP4 | E2M1 | E8M0 | Group-wise (32) | [MX-compliant FP4](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) |
 | MXFP8_E4M3 | E4M3 | E8M0 | Group-wise (32) | [MX-compliant FP8](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) |
+| FP8_E4M3 | E4M3 | FP16 | Per-channel / Group-wise | [FP8](https://arxiv.org/pdf/2209.05433) |

 **Note**: Granularity refers to the scope of elements sharing quantization parameters. "Per-channel" applies different parameters for each output channel, while "Group-wise" divides weights into groups (e.g., group_size=128) that share the same parameters.

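For reference, a minimal data-free usage sketch of the new mode; the model path and group size below are illustrative assumptions, not part of this commit:

```python
import openvino as ov
import nncf

# "model.xml" is a placeholder path to any OpenVINO IR with weighted layers.
model = ov.Core().read_model("model.xml")

# FP8_E4M3: weights are stored as E4M3 values, each group sharing one FP16 scale.
compressed = nncf.compress_weights(
    model,
    mode=nncf.CompressWeightsMode.FP8_E4M3,
    group_size=128,  # illustrative; -1 would request per-channel scales instead
)
ov.save_model(compressed, "model_fp8_e4m3.xml")
```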
src/nncf/parameters.py

Lines changed: 2 additions & 0 deletions

@@ -92,6 +92,7 @@ class CompressWeightsMode(StrEnum):
     :param INT8: Mode is deprecated and will be removed in future releases. Please use `INT8_ASYM` instead.
     :param MXFP4: MX-compliant FP4 format with E2M1 values sharing group-level E8M0 scale. The size of group is 32.
     :param MXFP8_E4M3: MX-compliant FP8 format with E4M3 values sharing group-level E8M0 scale. The size of group is 32.
+    :param FP8_E4M3: An FP8 format with E4M3 values sharing a group-level FP16 scale.
     :param CODEBOOK: Codebook (LUT) quantization format.
     :param CB4_F8E4M3: Codebook (LUT) format with 16 fixed fp8 values in E4M3 format.
     """
@@ -105,6 +106,7 @@
     INT8 = "int8"  # Deprecated mode
     MXFP4 = "mxfp4"
     MXFP8_E4M3 = "mxfp8_e4m3"
+    FP8_E4M3 = "fp8_e4m3"
     CODEBOOK = "codebook"

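Since `CompressWeightsMode` is a `StrEnum`, the new member round-trips through its string value. A small sketch, assuming the standard str-valued enum behaviour:

```python
from nncf import CompressWeightsMode

# Lookup by value returns the member, and members compare equal to their strings.
mode = CompressWeightsMode("fp8_e4m3")
assert mode is CompressWeightsMode.FP8_E4M3
assert mode == "fp8_e4m3"
print(mode.value)  # fp8_e4m3
```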
src/nncf/quantization/algorithms/weight_compression/algorithm.py

Lines changed: 2 additions & 0 deletions

@@ -65,6 +65,7 @@
     CompressWeightsMode.NF4,
     CompressWeightsMode.MXFP4,
     CompressWeightsMode.MXFP8_E4M3,
+    CompressWeightsMode.FP8_E4M3,
 ]
 SUPPORTED_DATA_TYPES = [
     TensorDataType.float16,
@@ -300,6 +301,7 @@ def __init__(
            NF4 is the same as INT4_SYM mode, but primary precision is NF4 data type without zero point.
            MXFP4 is MX-compliant FP4 with E2M1 values sharing group-level E8M0 scale. The size of group is 32.
            MXFP8_E4M3 is MX-compliant FP8 with E4M3 values sharing group-level E8M0 scale. The size of group is 32.
+           FP8_E4M3 is an FP8 format with E4M3 values sharing a group-level FP16 scale.
        :param ratio: the ratio between primary and backup precisions (e.g. 0.9 means 90% of layers quantized to NF4
            and the rest to backup_mode).
        :param group_size: number of weights (e.g. 128) in the channel dimension

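The `ratio` parameter documented above also applies to the new mode. A hedged sketch of mixed-precision compression; the model path and numbers are illustrative:

```python
import openvino as ov
import nncf

model = ov.Core().read_model("model.xml")  # placeholder path

# Roughly 90% of the weighted layers get FP8_E4M3 as the primary precision; the
# remaining, most sensitive layers stay in the backup precision (INT8_ASYM by default).
compressed = nncf.compress_weights(
    model,
    mode=nncf.CompressWeightsMode.FP8_E4M3,
    ratio=0.9,
    group_size=128,
)
```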
src/nncf/quantization/algorithms/weight_compression/config.py

Lines changed: 1 addition & 0 deletions

@@ -61,6 +61,7 @@ def is_integer(self):
            CompressWeightsMode.NF4,
            CompressWeightsMode.MXFP4,
            CompressWeightsMode.MXFP8_E4M3,
+           CompressWeightsMode.FP8_E4M3,
            CompressWeightsMode.CODEBOOK,
            CompressWeightsMode.CB4_F8E4M3,
        ]

src/nncf/quantization/algorithms/weight_compression/openvino_backend.py

Lines changed: 2 additions & 0 deletions

@@ -231,6 +231,8 @@ def _create_compression_subgraph(
        elif compression_config.mode == CompressWeightsMode.MXFP8_E4M3:
            compression_dtype = ov.Type.f8e4m3
            scale_dtype = ov.Type.f8e8m0
+       elif compression_config.mode == CompressWeightsMode.FP8_E4M3:
+           compression_dtype = ov.Type.f8e4m3
        elif compression_config.mode == CompressWeightsMode.INT4_SYM:
            compression_dtype = ov.Type.i4
        elif compression_config.mode == CompressWeightsMode.INT4_ASYM:

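To make the branch above concrete: for FP8_E4M3 only the weight constant switches to `f8e4m3`, while the scale keeps the default FP16 element type (unlike MXFP8_E4M3, whose scale becomes `f8e8m0`). A sketch of that mapping; the dictionary is illustrative and not the backend's actual data structure:

```python
import openvino as ov
from nncf import CompressWeightsMode

# (weight dtype, scale dtype) per mode, mirroring the elif chain above.
# The FP16 scale for FP8_E4M3 corresponds to the backend's default scale dtype.
DTYPE_MAP = {
    CompressWeightsMode.MXFP8_E4M3: (ov.Type.f8e4m3, ov.Type.f8e8m0),
    CompressWeightsMode.FP8_E4M3: (ov.Type.f8e4m3, ov.Type.f16),
}

compression_dtype, scale_dtype = DTYPE_MAP[CompressWeightsMode.FP8_E4M3]
print(compression_dtype, scale_dtype)
```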
src/nncf/quantization/algorithms/weight_compression/torch_backend.py

Lines changed: 1 addition & 0 deletions

@@ -440,6 +440,7 @@ def transform_model(
                CompressWeightsMode.NF4,
                CompressWeightsMode.MXFP4,
                CompressWeightsMode.MXFP8_E4M3,
+               CompressWeightsMode.FP8_E4M3,
            ]:
                msg = f"{compression_config.mode.value} is not supported."
                raise nncf.ParameterNotSupportedError(msg)

src/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py

Lines changed: 1 addition & 0 deletions

@@ -180,6 +180,7 @@ def transform_model(
                CompressWeightsMode.NF4,
                CompressWeightsMode.MXFP4,
                CompressWeightsMode.MXFP8_E4M3,
+               CompressWeightsMode.FP8_E4M3,
            ]:
                msg = f"{compression_config.mode.value} is not supported."
                raise nncf.ParameterNotSupportedError(msg)

src/nncf/quantization/algorithms/weight_compression/weight_lowering.py

Lines changed: 8 additions & 7 deletions

@@ -81,7 +81,7 @@ def calculate_float_quantization_params(
     weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig
 ) -> Tensor:
     """
-    Calculates the scale for nf4 or mxfp4/mxfp8_e4m3 quantization.
+    Calculates the scale for nf4 or mxfp4/mxfp8_e4m3/fp8_e4m3 quantization.

     :param weight: Weight array to compress.
     :param reduction_axes: Axes along which to reduce (collect) different statistics (e.g., min, max).
@@ -97,6 +97,7 @@ def calculate_float_quantization_params(
     FP_MAX_VALS = {
         CompressWeightsMode.MXFP4: 6.0,
         CompressWeightsMode.MXFP8_E4M3: 448.0,
+        CompressWeightsMode.FP8_E4M3: 448.0,
     }
     if config.mode in [CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3] + list(FP_MAX_VALS.keys()):
         if config.mode in FP_MAX_VALS:
@@ -146,17 +147,17 @@ def do_float_quantization(
 ) -> tuple[Tensor, Tensor, Tensor]:
     """
     Computes quantization scale if not provided,
-    and performs corresponding (nf4, MXFP4 and MXFP8_E4M3) weight quantization.
+    and performs corresponding (nf4, MXFP4, MXFP8_E4M3, FP8_E4M3) weight quantization.
     For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval.
-    For MXFP4, MXFP8_E4M3 and CODEBOOK currently returns normalized weight without quantization.
-    TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved
+    For MXFP4, MXFP8_E4M3, FP8_E4M3 and CODEBOOK currently returns normalized weight without quantization.
+    TODO(nikita-savelyevv): add support for MXFP4, MXFP8_E4M3 and FP8_E4M3 once ticket 164851 is resolved

     :param weight: Weight array to compress.
     :param config: Weight compression configuration.
     :param reduction_axes: Axes, along which to reduce (collect) different statistics.
     :param precomputed_scale: Optional precomputed scale.
-    :return: Returns quantized (for MXFP4 and MXFP8_E4M3 normalized) weight tensor and corresponding scale tensor and
-        optional indexes for codebook.
+    :return: Returns quantized (for MXFP4, MXFP8_E4M3 and FP8_E4M3 normalized) weight tensor and
+        corresponding scale tensor and optional indexes for codebook.
     """
     assert not config.is_integer

@@ -192,7 +193,7 @@ def do_float_quantization(
         )
         return compressed_weight, scale, indexes
     else:
-        # TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved
+        # TODO(nikita-savelyevv): add support for MXFP4, MXFP8_E4M3, FP8_E4M3 once ticket 164851 is resolved
         compressed_weight = norm_weight
         return compressed_weight, scale, None

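A minimal NumPy sketch of what the FP8_E4M3 entry in `FP_MAX_VALS` implies: the per-group scale maps the largest magnitude in each group onto the E4M3 maximum of 448, and the weight is returned in normalized form (as the TODO above notes, the final E4M3 cast happens elsewhere). The function name and shapes here are illustrative, not NNCF internals:

```python
import numpy as np

FP8_E4M3_MAX = 448.0  # matches FP_MAX_VALS[CompressWeightsMode.FP8_E4M3]

def fp8_e4m3_normalize(weight: np.ndarray, group_size: int = 32):
    """Illustrative only: per-group FP16 scale and the normalized weight."""
    rows, cols = weight.shape
    grouped = weight.reshape(rows, cols // group_size, group_size)
    # Scale chosen so the largest |value| in each group lands near 448.
    scale = (np.abs(grouped).max(axis=-1, keepdims=True) / FP8_E4M3_MAX).astype(np.float16)
    norm_weight = grouped / scale  # roughly within [-448, 448], ready for an E4M3 cast
    return norm_weight, scale

w = np.random.randn(8, 64).astype(np.float32)
norm_w, s = fp8_e4m3_normalize(w)
print(norm_w.shape, s.shape)  # (8, 2, 32) (8, 2, 1)
```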
src/nncf/quantization/quantize_model.py

Lines changed: 34 additions & 17 deletions

@@ -460,6 +460,7 @@ def compress_weights(
        MXFP4 is MX-compliant FP4 format with E2M1 values sharing group-level E8M0 scale. The size of group is 32.
        MXFP8_E4M3 - is MX-compliant FP8 format with E4M3 values sharing a group-level E8M0 scale.
            The size of group is 32.
+       FP8_E4M3 - is an FP8 format with E4M3 values sharing a group-level FP16 scale.
    :type mode: nncf.CompressWeightsMode
    :param ratio: the ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to NF4
        and the rest to INT8_ASYM).
@@ -517,14 +518,18 @@
        from nncf.torch.nncf_network import NNCFNetwork
        from nncf.torch.quantization.quantize_model import compress_weights_impl as pt_compression_weights_impl

-       if mode in [
+       not_supported_modes = [
            CompressWeightsMode.NF4,
            CompressWeightsMode.MXFP4,
            CompressWeightsMode.MXFP8_E4M3,
+           CompressWeightsMode.FP8_E4M3,
            CompressWeightsMode.CODEBOOK,
            CompressWeightsMode.CB4_F8E4M3,
-       ]:
-           msg = "Torch backend does not support NF4, MXFP4, MXFP8_E4M3 and CODEBOOK modes for weight compression."
+       ]
+       if mode in not_supported_modes:
+           msg = (
+               f"Torch backend does not support {[m.value for m in not_supported_modes]} modes for weight compression."
+           )
            raise nncf.ParameterNotSupportedError(msg)

        options = {"gptq": gptq, "lora_correction": lora_correction}
@@ -567,14 +572,18 @@
            compress_weights_impl as fx_compression_weights_impl,
        )

-       if mode in [
+       not_supported_modes = [
            CompressWeightsMode.NF4,
            CompressWeightsMode.MXFP4,
            CompressWeightsMode.MXFP8_E4M3,
+           CompressWeightsMode.FP8_E4M3,
            CompressWeightsMode.CODEBOOK,
            CompressWeightsMode.CB4_F8E4M3,
-       ]:
-           msg = "Torch backend does not support NF4, MXFP4, MXFP8_E4M3 and CODEBOOK modes for weight compression."
+       ]
+       if mode in not_supported_modes:
+           msg = (
+               f"Torch backend does not support {[m.value for m in not_supported_modes]} modes for weight compression."
+           )
            raise nncf.ParameterNotSupportedError(msg)

        options = {
@@ -610,14 +619,18 @@
            msg = "Scale estimation, GPTQ or Lora Correction algorithm is defined, but dataset is None."
            raise nncf.ParameterNotSupportedError(msg)

-       if any((awq, scale_estimation, gptq, lora_correction)) and mode in [
-           CompressWeightsMode.MXFP4,
-           CompressWeightsMode.MXFP8_E4M3,
-       ]:
-           msg = (
-               "AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode in [MXFP4, MXFP8_E4M3]."
-           )
-           raise nncf.ParameterNotSupportedError(msg)
+       if any((awq, scale_estimation, gptq, lora_correction)):
+           not_supported_modes = [
+               CompressWeightsMode.MXFP4,
+               CompressWeightsMode.MXFP8_E4M3,
+               CompressWeightsMode.FP8_E4M3,
+           ]
+           if mode in not_supported_modes:
+               msg = (
+                   "AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined,"
+                   f" but mode in {[m.value for m in not_supported_modes]}."
+               )
+               raise nncf.ParameterNotSupportedError(msg)

        if gptq and lora_correction:
            msg = "Simultaneous use of Lora correction and GPTQ algorithms is not supported. Select one of them."
@@ -632,14 +645,18 @@
    elif backend == BackendType.ONNX:
        from nncf.onnx.quantization.quantize_model import compress_weights_impl as onnx_compress_weights_impl

-       if mode in [
+       not_supported_modes = [
            CompressWeightsMode.NF4,
            CompressWeightsMode.MXFP4,
            CompressWeightsMode.MXFP8_E4M3,
+           CompressWeightsMode.FP8_E4M3,
            CompressWeightsMode.CODEBOOK,
            CompressWeightsMode.CB4_F8E4M3,
-       ]:
-           msg = "ONNX backend does not support NF4, MXFP4, MXFP8_E4M3 and CODEBOOK modes for weight compression."
+       ]
+       if mode in not_supported_modes:
+           msg = (
+               f"ONNX backend does not support {[m.value for m in not_supported_modes]} modes for weight compression."
+           )
            raise nncf.ParameterNotSupportedError(msg)

        options = {

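The net effect of the reworked checks: the Torch, Torch FX and ONNX backends reject FP8_E4M3 outright, and on the OpenVINO path the data-aware algorithms are rejected up front when combined with it. A sketch of that error path (the model path is a placeholder):

```python
import openvino as ov
import nncf
from nncf import CompressWeightsMode

model = ov.Core().read_model("model.xml")  # placeholder path

try:
    # AWQ (like scale estimation, GPTQ and LoRA correction) is not supported for this mode.
    nncf.compress_weights(model, mode=CompressWeightsMode.FP8_E4M3, awq=True)
except nncf.ParameterNotSupportedError as err:
    print(err)  # "... but mode in ['mxfp4', 'mxfp8_e4m3', 'fp8_e4m3']."
```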