
Commit 1cc1b42

Add int4 weight-only embedding QAT
Based on changes in D62664322 by Tijmen Blankevoort.

TODO:
- add convert path
- add tests
1 parent fbe97a0 commit 1cc1b42

3 files changed: +176 additions, -23 deletions


torchao/quantization/prototype/qat/_module_swap_api.py

Lines changed: 13 additions & 23 deletions
@@ -28,23 +28,23 @@
     _choose_qparams_per_token_asymmetric,
     _fake_quantize_per_channel_group,
     _fake_quantize_per_token,
+    _get_qmin_qmax,
 )


-# TODO: deprecate this flow in favor of the tensor subclass flow under qat/api.py
-# This is currently needed for DDP and FSDP1, which are not compatible with the
-# subclass flow.
+# TODO: make module swap the main flow again, and remove the quantize_ flow
+# TODO: rename this file to linear.py
+
+# =========================================================
+# | Linear int8 dynamic activations + int4 weight QAT |
+# =========================================================


 class Int8DynActInt4WeightQATQuantizerModuleSwap(Int8DynActInt4WeightQATQuantizer):
     """
     Quantizer for performing QAT on a model, where linear layers have int8
     dynamic per token fake quantized activations and int4 fake quantized
     grouped per channel weights.
-
-    Note: This quantizer is implemented using module swaps and may be
-    deprecated in the future. Please use `Int8DynActInt4WeightQATQuantizer`
-    instead if possible.
     """

     def prepare(

@@ -92,7 +92,7 @@ def _convert_qat_linear_8da4w(module: torch.nn.Module):

         # Load weights and qparams into quantized linear
         n_bit = 4
-        (qmin, qmax) = child._get_qmin_qmax(n_bit)
+        (qmin, qmax) = _get_qmin_qmax(n_bit)
         (s, zp) = get_group_qparams_symmetric(child.weight, n_bit, child.groupsize)
         from torchao._executorch_ops import _quantized_decomposed_quantize_per_channel_group_wrapper
         q_weight = _quantized_decomposed_quantize_per_channel_group_wrapper(

@@ -156,7 +156,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             (act_scales, act_zp) = _choose_qparams_per_token_asymmetric(
                 x, self.scales_precision, self.zero_points_precision,
             )
-            (act_qmin, act_qmax) = self._get_qmin_qmax(8)
+            (act_qmin, act_qmax) = _get_qmin_qmax(8)
             x_fq = _fake_quantize_per_token(
                 x, act_scales, act_zp, act_qmin, act_qmax,
             )

@@ -170,7 +170,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             )
             # TODO: pass zp dtype to `get_group_qparams_symmetric` instead
             weight_zp = weight_zp.to(self.zero_points_precision)
-            (weight_qmin, weight_qmax) = self._get_qmin_qmax(4)
+            (weight_qmin, weight_qmax) = _get_qmin_qmax(4)
             w_fq = _fake_quantize_per_channel_group(
                 self.weight,
                 weight_scales,

@@ -183,12 +183,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             w_fq = self.weight
         return F.linear(x_fq, w_fq)

-    # TODO: move this to common util
-    def _get_qmin_qmax(self, n_bit: int):
-        qmin = -(2 ** (n_bit - 1))
-        qmax = 2 ** (n_bit - 1) - 1
-        return (qmin, qmax)
-

 def enable_8da4w_fake_quant_module_swap(mod: torch.nn.Module):
     """

@@ -206,19 +200,15 @@ def disable_8da4w_fake_quant_module_swap(mod: torch.nn.Module):
     mod.disable_fake_quant()


-# ==================
-# | int4wo QAT |
-# ==================
+# ===================================
+# | Linear int4 weight-only QAT |
+# ===================================


 class Int4WeightOnlyQATQuantizerModuleSwap(Int4WeightOnlyQATQuantizer):
     """
     Quantizer for performing QAT on a model, where linear layers have
     int4 fake quantized grouped per channel weights.
-
-    Note: This quantizer is implemented using module swaps and may be
-    deprecated in the future. Please use `Int4WeightOnlyQATQuantizer`
-    instead if possible.
     """

     def prepare(
Lines changed: 158 additions & 0 deletions
@@ -0,0 +1,158 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Any
+
+import torch
+import torch.nn.functional as F
+
+from torchao.quantization.unified import TwoStepQuantizer
+from torchao.quantization.utils import get_group_qparams_symmetric
+from torchao.quantization.quant_api import (
+    _replace_with_custom_fn_if_matches_filter,
+)
+from .utils import (
+    _fake_quantize_per_channel_group,
+    _get_qmin_qmax,
+)
+
+
+# ======================================
+# | Embedding int4 weight-only QAT |
+# ======================================
+
+class Int4WeightOnlyEmbeddingQATQuantizer(TwoStepQuantizer):
+    """
+    Quantizer for performing QAT on a model, where embedding layers have
+    int4 fake quantized grouped per channel weights.
+    """
+
+    def __init__(
+        self,
+        group_size: int = 256,
+        scale_precision: torch.dtype = torch.float32,
+        zero_point_precision: torch.dtype = torch.int32,
+    ) -> None:
+        super().__init__()
+        self.group_size: int = group_size
+        self.scale_precision: torch.dtype = scale_precision
+        self.zero_point_precision: torch.dtype = zero_point_precision
+
+    def prepare(
+        self,
+        model: torch.nn.Module,
+        *args: Any,
+        **kwargs: Any
+    ) -> torch.nn.Module:
+        """
+        Swap `nn.Embedding` modules with `Int4WeightOnlyQATEmbedding`.
+        """
+        def filter_fn(child: torch.nn.Module, cur_fqn: str) -> bool:
+            return isinstance(child, torch.nn.Embedding)
+
+        def replacement_fn(child: torch.nn.Module) -> torch.nn.Module:
+            new_embedding = Int4WeightOnlyQATEmbedding(
+                group_size=self.group_size,
+
+                # other nn.Embedding args
+                num_embeddings=child.num_embeddings,
+                embedding_dim=child.embedding_dim,
+                padding_idx=child.padding_idx,
+                max_norm=child.max_norm,
+                norm_type=child.norm_type,
+                scale_grad_by_freq=child.scale_grad_by_freq,
+                sparse=child.sparse,
+                device=child.weight.device,
+            )
+            # In distributed training, the model may be instantiated
+            # on the meta device, in which case there is no need to
+            # copy the weights, and doing so will result in an error
+            if child.weight.device != torch.device("meta"):
+                new_embedding.weight = child.weight
+            return new_embedding
+
+        _replace_with_custom_fn_if_matches_filter(model, replacement_fn, filter_fn)
+        return model
+
+    def convert(
+        self,
+        model: torch.nn.Module,
+        *args: Any,
+        **kwargs: Any
+    ) -> torch.nn.Module:
+        """
+        Swap `Int4WeightOnlyQATEmbedding` with `Int4WeightOnlyEmbedding`.
+        """
+        # TODO: implement this
+        print("Warning: int4 weight-only embedding convert flow not implemented yet")
+        return model
+
+
+class Int4WeightOnlyQATEmbedding(torch.nn.Embedding):
+    """
+    This module implements an embedding layer with int4 fake quantized
+    grouped per channel weights.
+
+    Args:
+        group_size: the number of elements in each quantized group for weights
+        scale_precision: precision of per group scales
+        zero_point_precision: precision of per group zero points
+    """
+
+    def __init__(
+        self,
+        group_size: int = 32,
+        scale_precision: torch.dtype = torch.float32,
+        zero_point_precision: torch.dtype = torch.int32,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.bit_width = 4
+        self.group_size = group_size
+        self.scale_precision = scale_precision
+        self.zero_point_precision = zero_point_precision
+        self._fake_quant_enabled = True
+
+    def forward(self, x):
+        weight = self.weight
+
+        if self._fake_quant_enabled:
+            (weight_scales, weight_zp) = get_group_qparams_symmetric(
+                self.weight, self.bit_width, self.group_size, self.scale_precision,
+            )
+            # TODO: pass zp dtype to `get_group_qparams_symmetric` instead
+            weight_zp = weight_zp.to(self.zero_point_precision)
+            (weight_qmin, weight_qmax) = _get_qmin_qmax(self.bit_width)
+            w_fq = _fake_quantize_per_channel_group(
+                self.weight,
+                weight_scales,
+                weight_zp,
+                weight_qmin,
+                weight_qmax,
+                self.group_size,
+            )
+        else:
+            w_fq = self.weight
+
+        return F.embedding(
+            x, w_fq, self.padding_idx, self.max_norm,
+            self.norm_type, self.scale_grad_by_freq, self.sparse,
+        )
+
+    def enable_fake_quant(self, enabled: bool = True):
+        self._fake_quant_enabled = enabled
+
+    def disable_fake_quant(self):
+        self.enable_fake_quant(False)
+
+
+class Int4WeightOnlyEmbedding(torch.nn.Embedding):
+    """
+    This module implements an embedding layer with int4 quantized
+    grouped per channel weights.
+    """
+    pass
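
For orientation, here is a minimal usage sketch of the new quantizer on a toy model. The toy model, its dimensions, and the import path are assumptions for illustration only (the new file's path is not shown in this excerpt), and the convert step is still a no-op per the TODO above.

import torch
import torch.nn as nn

# Assumed import path for the new file added in this commit; adjust to
# wherever the module actually lives in the repo.
from torchao.quantization.prototype.qat.embedding import (
    Int4WeightOnlyEmbeddingQATQuantizer,
)

# Hypothetical toy model used only to demonstrate the prepare flow.
class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.tok_emb = nn.Embedding(1024, 64)
        self.out = nn.Linear(64, 1024)

    def forward(self, idx):
        return self.out(self.tok_emb(idx))

model = ToyModel()
quantizer = Int4WeightOnlyEmbeddingQATQuantizer(group_size=32)

# prepare() swaps every nn.Embedding for Int4WeightOnlyQATEmbedding, which
# fake-quantizes the embedding weights (int4, grouped per channel) on each
# forward pass while keeping them in floating point for training.
model = quantizer.prepare(model)

# ... fine-tune with fake quantization enabled ...
logits = model(torch.randint(0, 1024, (2, 16)))

# convert() is still a TODO in this commit: it prints a warning and returns
# the model unchanged.
model = quantizer.convert(model)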

torchao/quantization/prototype/qat/utils.py

Lines changed: 5 additions & 0 deletions
@@ -259,3 +259,8 @@ def insert_subclass(lin):
         return lin

     return insert_subclass
+
+def _get_qmin_qmax(n_bit: int):
+    qmin = -(2 ** (n_bit - 1))
+    qmax = 2 ** (n_bit - 1) - 1
+    return (qmin, qmax)
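
The shared helper added here just computes the signed two's-complement range for a given bit width. A quick sanity-check sketch (not part of the commit):

from torchao.quantization.prototype.qat.utils import _get_qmin_qmax

# 4-bit weights and 8-bit activations, the two widths used in the QAT code above.
assert _get_qmin_qmax(4) == (-8, 7)
assert _get_qmin_qmax(8) == (-128, 127)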
