
Commit 526e48b

[WC][OpenVINO] FP8_E4M3

1 parent 62b808b commit 526e48b

12 files changed: +120 -25 lines changed

docs/Algorithms.md

Lines changed: 1 addition & 0 deletions

@@ -13,6 +13,7 @@
 - NF4 compression mode
 - Arbitrary look-up table (CODEBOOK) or predefined lookup table based on NF4 (CB4_F8E4M3)
 - MX-compliant types - MXFP4 and MXFP8_E4M3
+- FP8 type - FP8_E4M3
 - Mixed precision weights compression
 - Grouped weights compression

docs/usage/post_training_compression/weights_compression/Usage.md

Lines changed: 1 addition & 0 deletions

@@ -47,6 +47,7 @@ NNCF can automatically distribute precision assignments based on quantization se
 | CB4_F8E4M3 | E4M3 | FP16 | Per-channel / Group-wise | A fixed lookup table with 16 E4M3 values based on NF4 values |
 | MXFP4 | E2M1 | E8M0 | Group-wise (32) | [MX-compliant FP4](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) |
 | MXFP8_E4M3 | E4M3 | E8M0 | Group-wise (32) | [MX-compliant FP8](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) |
+| FP8_E4M3 | E4M3 | FP16 | Per-channel / Group-wise | [FP8](https://arxiv.org/pdf/2209.05433) |

 **Note**: Granularity refers to the scope of elements sharing quantization parameters. "Per-channel" applies different parameters for each output channel, while "Group-wise" divides weights into groups (e.g., group_size=128) that share the same parameters.

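For reference, a minimal data-free usage sketch of the new mode; the model path and group size below are illustrative assumptions, not part of this commit:

```python
import openvino as ov
import nncf

# "model.xml" is a placeholder path to any OpenVINO IR with weighted layers.
model = ov.Core().read_model("model.xml")

# FP8_E4M3: weights are stored as E4M3 values, each group sharing one FP16 scale.
compressed = nncf.compress_weights(
    model,
    mode=nncf.CompressWeightsMode.FP8_E4M3,
    group_size=128,  # illustrative; -1 would request per-channel scales instead
)
ov.save_model(compressed, "model_fp8_e4m3.xml")
```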
src/nncf/parameters.py

Lines changed: 2 additions & 0 deletions

@@ -92,6 +92,7 @@ class CompressWeightsMode(StrEnum):
     :param INT8: Mode is deprecated and will be removed in future releases. Please use `INT8_ASYM` instead.
     :param MXFP4: MX-compliant FP4 format with E2M1 values sharing group-level E8M0 scale. The size of group is 32.
     :param MXFP8_E4M3: MX-compliant FP8 format with E4M3 values sharing group-level E8M0 scale. The size of group is 32.
+    :param FP8_E4M3: An FP8 format with E4M3 values sharing a group-level FP16 scale.
     :param CODEBOOK: Codebook (LUT) quantization format.
     :param CB4_F8E4M3: Codebook (LUT) format with 16 fixed fp8 values in E4M3 format.
     """
@@ -105,6 +106,7 @@
     INT8 = "int8"  # Deprecated mode
     MXFP4 = "mxfp4"
     MXFP8_E4M3 = "mxfp8_e4m3"
+    FP8_E4M3 = "fp8_e4m3"
     CODEBOOK = "codebook"

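Since `CompressWeightsMode` is a `StrEnum`, the new member round-trips through its string value. A small sketch, assuming the standard str-valued enum behaviour:

```python
from nncf import CompressWeightsMode

# Lookup by value returns the member, and members compare equal to their strings.
mode = CompressWeightsMode("fp8_e4m3")
assert mode is CompressWeightsMode.FP8_E4M3
assert mode == "fp8_e4m3"
print(mode.value)  # fp8_e4m3
```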
src/nncf/quantization/algorithms/weight_compression/algorithm.py

Lines changed: 2 additions & 0 deletions

@@ -65,6 +65,7 @@
     CompressWeightsMode.NF4,
     CompressWeightsMode.MXFP4,
     CompressWeightsMode.MXFP8_E4M3,
+    CompressWeightsMode.FP8_E4M3,
 ]
 SUPPORTED_DATA_TYPES = [
     TensorDataType.float16,
@@ -300,6 +301,7 @@ def __init__(
            NF4 is the same as INT4_SYM mode, but primary precision is NF4 data type without zero point.
            MXFP4 is MX-compliant FP4 with E2M1 values sharing group-level E8M0 scale. The size of group is 32.
            MXFP8_E4M3 is MX-compliant FP8 with E4M3 values sharing group-level E8M0 scale. The size of group is 32.
+           FP8_E4M3 is an FP8 format with E4M3 values sharing a group-level FP16 scale.
        :param ratio: the ratio between primary and backup precisions (e.g. 0.9 means 90% of layers quantized to NF4
            and the rest to backup_mode).
        :param group_size: number of weights (e.g. 128) in the channel dimension

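The `ratio` parameter documented above also applies to the new mode. A hedged sketch of mixed-precision compression; the model path and numbers are illustrative:

```python
import openvino as ov
import nncf

model = ov.Core().read_model("model.xml")  # placeholder path

# Roughly 90% of the weighted layers get FP8_E4M3 as the primary precision; the
# remaining, most sensitive layers stay in the backup precision (INT8_ASYM by default).
compressed = nncf.compress_weights(
    model,
    mode=nncf.CompressWeightsMode.FP8_E4M3,
    ratio=0.9,
    group_size=128,
)
```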
src/nncf/quantization/algorithms/weight_compression/config.py

Lines changed: 1 addition & 0 deletions

@@ -61,6 +61,7 @@ def is_integer(self):
            CompressWeightsMode.NF4,
            CompressWeightsMode.MXFP4,
            CompressWeightsMode.MXFP8_E4M3,
+           CompressWeightsMode.FP8_E4M3,
            CompressWeightsMode.CODEBOOK,
            CompressWeightsMode.CB4_F8E4M3,
        ]

src/nncf/quantization/algorithms/weight_compression/openvino_backend.py

Lines changed: 2 additions & 0 deletions

@@ -231,6 +231,8 @@ def _create_compression_subgraph(
        elif compression_config.mode == CompressWeightsMode.MXFP8_E4M3:
            compression_dtype = ov.Type.f8e4m3
            scale_dtype = ov.Type.f8e8m0
+       elif compression_config.mode == CompressWeightsMode.FP8_E4M3:
+           compression_dtype = ov.Type.f8e4m3
        elif compression_config.mode == CompressWeightsMode.INT4_SYM:
            compression_dtype = ov.Type.i4
        elif compression_config.mode == CompressWeightsMode.INT4_ASYM:

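To make the branch above concrete: for FP8_E4M3 only the weight constant switches to `f8e4m3`, while the scale keeps the default FP16 element type (unlike MXFP8_E4M3, whose scale becomes `f8e8m0`). A sketch of that mapping; the dictionary is illustrative and not the backend's actual data structure:

```python
import openvino as ov
from nncf import CompressWeightsMode

# (weight dtype, scale dtype) per mode, mirroring the elif chain above.
# The FP16 scale for FP8_E4M3 corresponds to the backend's default scale dtype.
DTYPE_MAP = {
    CompressWeightsMode.MXFP8_E4M3: (ov.Type.f8e4m3, ov.Type.f8e8m0),
    CompressWeightsMode.FP8_E4M3: (ov.Type.f8e4m3, ov.Type.f16),
}

compression_dtype, scale_dtype = DTYPE_MAP[CompressWeightsMode.FP8_E4M3]
print(compression_dtype, scale_dtype)
```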
src/nncf/quantization/algorithms/weight_compression/torch_backend.py

Lines changed: 1 addition & 0 deletions

@@ -440,6 +440,7 @@ def transform_model(
                CompressWeightsMode.NF4,
                CompressWeightsMode.MXFP4,
                CompressWeightsMode.MXFP8_E4M3,
+               CompressWeightsMode.FP8_E4M3,
            ]:
                msg = f"{compression_config.mode.value} is not supported."
                raise nncf.ParameterNotSupportedError(msg)

src/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py

Lines changed: 1 addition & 0 deletions

@@ -180,6 +180,7 @@ def transform_model(
                CompressWeightsMode.NF4,
                CompressWeightsMode.MXFP4,
                CompressWeightsMode.MXFP8_E4M3,
+               CompressWeightsMode.FP8_E4M3,
            ]:
                msg = f"{compression_config.mode.value} is not supported."
                raise nncf.ParameterNotSupportedError(msg)

src/nncf/quantization/algorithms/weight_compression/weight_lowering.py

Lines changed: 8 additions & 7 deletions

@@ -81,7 +81,7 @@ def calculate_float_quantization_params(
     weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig
 ) -> Tensor:
     """
-    Calculates the scale for nf4 or mxfp4/mxfp8_e4m3 quantization.
+    Calculates the scale for nf4 or mxfp4/mxfp8_e4m3/fp8_e4m3 quantization.

     :param weight: Weight array to compress.
     :param reduction_axes: Axes along which to reduce (collect) different statistics (e.g., min, max).
@@ -97,6 +97,7 @@ def calculate_float_quantization_params(
     FP_MAX_VALS = {
         CompressWeightsMode.MXFP4: 6.0,
         CompressWeightsMode.MXFP8_E4M3: 448.0,
+        CompressWeightsMode.FP8_E4M3: 448.0,
     }
     if config.mode in [CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3] + list(FP_MAX_VALS.keys()):
         if config.mode in FP_MAX_VALS:
@@ -146,17 +147,17 @@ def do_float_quantization(
 ) -> tuple[Tensor, Tensor, Tensor]:
     """
     Computes quantization scale if not provided,
-    and performs corresponding (nf4, MXFP4 and MXFP8_E4M3) weight quantization.
+    and performs corresponding (nf4, MXFP4, MXFP8_E4M3, FP8_E4M3) weight quantization.
     For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval.
-    For MXFP4, MXFP8_E4M3 and CODEBOOK currently returns normalized weight without quantization.
-    TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved
+    For MXFP4, MXFP8_E4M3, FP8_E4M3 and CODEBOOK currently returns normalized weight without quantization.
+    TODO(nikita-savelyevv): add support for MXFP4, MXFP8_E4M3 and FP8_E4M3 once ticket 164851 is resolved

     :param weight: Weight array to compress.
     :param config: Weight compression configuration.
     :param reduction_axes: Axes, along which to reduce (collect) different statistics.
     :param precomputed_scale: Optional precomputed scale.
-    :return: Returns quantized (for MXFP4 and MXFP8_E4M3 normalized) weight tensor and corresponding scale tensor and
-        optional indexes for codebook.
+    :return: Returns quantized (for MXFP4, MXFP8_E4M3 and FP8_E4M3 normalized) weight tensor and
+        corresponding scale tensor and optional indexes for codebook.
     """
     assert not config.is_integer

@@ -192,7 +193,7 @@ def do_float_quantization(
         )
         return compressed_weight, scale, indexes
     else:
-        # TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved
+        # TODO(nikita-savelyevv): add support for MXFP4, MXFP8_E4M3, FP8_E4M3 once ticket 164851 is resolved
         compressed_weight = norm_weight
         return compressed_weight, scale, None

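A minimal NumPy sketch of what the FP8_E4M3 entry in `FP_MAX_VALS` implies: the per-group scale maps the largest magnitude in each group onto the E4M3 maximum of 448, and the weight is returned in normalized form (as the TODO above notes, the final E4M3 cast happens elsewhere). The function name and shapes here are illustrative, not NNCF internals:

```python
import numpy as np

FP8_E4M3_MAX = 448.0  # matches FP_MAX_VALS[CompressWeightsMode.FP8_E4M3]

def fp8_e4m3_normalize(weight: np.ndarray, group_size: int = 32):
    """Illustrative only: per-group FP16 scale and the normalized weight."""
    rows, cols = weight.shape
    grouped = weight.reshape(rows, cols // group_size, group_size)
    # Scale chosen so the largest |value| in each group lands near 448.
    scale = (np.abs(grouped).max(axis=-1, keepdims=True) / FP8_E4M3_MAX).astype(np.float16)
    norm_weight = grouped / scale  # roughly within [-448, 448], ready for an E4M3 cast
    return norm_weight, scale

w = np.random.randn(8, 64).astype(np.float32)
norm_w, s = fp8_e4m3_normalize(w)
print(norm_w.shape, s.shape)  # (8, 2, 32) (8, 2, 1)
```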
src/nncf/quantization/quantize_model.py

Lines changed: 34 additions & 17 deletions

@@ -460,6 +460,7 @@ def compress_weights(
        MXFP4 is MX-compliant FP4 format with E2M1 values sharing group-level E8M0 scale. The size of group is 32.
        MXFP8_E4M3 - is MX-compliant FP8 format with E4M3 values sharing a group-level E8M0 scale.
            The size of group is 32.
+       FP8_E4M3 - is an FP8 format with E4M3 values sharing a group-level FP16 scale.
    :type mode: nncf.CompressWeightsMode
    :param ratio: the ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to NF4
        and the rest to INT8_ASYM).
@@ -517,14 +518,18 @@
        from nncf.torch.nncf_network import NNCFNetwork
        from nncf.torch.quantization.quantize_model import compress_weights_impl as pt_compression_weights_impl

-       if mode in [
+       not_supported_modes = [
            CompressWeightsMode.NF4,
            CompressWeightsMode.MXFP4,
            CompressWeightsMode.MXFP8_E4M3,
+           CompressWeightsMode.FP8_E4M3,
            CompressWeightsMode.CODEBOOK,
            CompressWeightsMode.CB4_F8E4M3,
-       ]:
-           msg = "Torch backend does not support NF4, MXFP4, MXFP8_E4M3 and CODEBOOK modes for weight compression."
+       ]
+       if mode in not_supported_modes:
+           msg = (
+               f"Torch backend does not support {[m.value for m in not_supported_modes]} modes for weight compression."
+           )
            raise nncf.ParameterNotSupportedError(msg)

        options = {"gptq": gptq, "lora_correction": lora_correction}
@@ -567,14 +572,18 @@
            compress_weights_impl as fx_compression_weights_impl,
        )

-       if mode in [
+       not_supported_modes = [
            CompressWeightsMode.NF4,
            CompressWeightsMode.MXFP4,
            CompressWeightsMode.MXFP8_E4M3,
+           CompressWeightsMode.FP8_E4M3,
            CompressWeightsMode.CODEBOOK,
            CompressWeightsMode.CB4_F8E4M3,
-       ]:
-           msg = "Torch backend does not support NF4, MXFP4, MXFP8_E4M3 and CODEBOOK modes for weight compression."
+       ]
+       if mode in not_supported_modes:
+           msg = (
+               f"Torch backend does not support {[m.value for m in not_supported_modes]} modes for weight compression."
+           )
            raise nncf.ParameterNotSupportedError(msg)

        options = {
@@ -610,14 +619,18 @@
            msg = "Scale estimation, GPTQ or Lora Correction algorithm is defined, but dataset is None."
            raise nncf.ParameterNotSupportedError(msg)

-       if any((awq, scale_estimation, gptq, lora_correction)) and mode in [
-           CompressWeightsMode.MXFP4,
-           CompressWeightsMode.MXFP8_E4M3,
-       ]:
-           msg = (
-               "AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode in [MXFP4, MXFP8_E4M3]."
-           )
-           raise nncf.ParameterNotSupportedError(msg)
+       if any((awq, scale_estimation, gptq, lora_correction)):
+           not_supported_modes = [
+               CompressWeightsMode.MXFP4,
+               CompressWeightsMode.MXFP8_E4M3,
+               CompressWeightsMode.FP8_E4M3,
+           ]
+           if mode in not_supported_modes:
+               msg = (
+                   "AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined,"
+                   f" but mode in {[m.value for m in not_supported_modes]}."
+               )
+               raise nncf.ParameterNotSupportedError(msg)

        if gptq and lora_correction:
            msg = "Simultaneous use of Lora correction and GPTQ algorithms is not supported. Select one of them."
@@ -632,14 +645,18 @@
    elif backend == BackendType.ONNX:
        from nncf.onnx.quantization.quantize_model import compress_weights_impl as onnx_compress_weights_impl

-       if mode in [
+       not_supported_modes = [
            CompressWeightsMode.NF4,
            CompressWeightsMode.MXFP4,
            CompressWeightsMode.MXFP8_E4M3,
+           CompressWeightsMode.FP8_E4M3,
            CompressWeightsMode.CODEBOOK,
            CompressWeightsMode.CB4_F8E4M3,
-       ]:
-           msg = "ONNX backend does not support NF4, MXFP4, MXFP8_E4M3 and CODEBOOK modes for weight compression."
+       ]
+       if mode in not_supported_modes:
+           msg = (
+               f"ONNX backend does not support {[m.value for m in not_supported_modes]} modes for weight compression."
+           )
            raise nncf.ParameterNotSupportedError(msg)

        options = {

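The net effect of the reworked checks: the Torch, Torch FX and ONNX backends reject FP8_E4M3 outright, and on the OpenVINO path the data-aware algorithms are rejected up front when combined with it. A sketch of that error path (the model path is a placeholder):

```python
import openvino as ov
import nncf
from nncf import CompressWeightsMode

model = ov.Core().read_model("model.xml")  # placeholder path

try:
    # AWQ (like scale estimation, GPTQ and LoRA correction) is not supported for this mode.
    nncf.compress_weights(model, mode=CompressWeightsMode.FP8_E4M3, awq=True)
except nncf.ParameterNotSupportedError as err:
    print(err)  # "... but mode in ['mxfp4', 'mxfp8_e4m3', 'fp8_e4m3']."
```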