
Commit 5704b30

Add an option to not quantize the embedding layer when doing quantization.
This helps achieve better quality for small models (e.g. Gemma 2B).
1 parent 3afdc43 commit 5704b30
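
The new option is an absl boolean flag, so a flag-driven entry point can turn it off from the command line. A hedged example follows; the script name is a placeholder, only the flag name comes from this commit:

    python run_your_entrypoint.py --internal_quantize_embedding_layer=false

absl also accepts the boolean shorthand --nointernal_quantize_embedding_layer.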

File tree

2 files changed: +10, -2 lines

jetstream_pt/quantize_model.py

Lines changed: 9 additions & 1 deletion
@@ -1,4 +1,5 @@
 import torch
+from absl import flags
 from .layers import (
     create_quantized_from_nn_linear,
     create_quantized_from_nn_embedding,
@@ -7,6 +8,13 @@
 )
 
 
+_QUANTIZE_EMBEDDING = flags.DEFINE_bool(
+    "internal_quantize_embedding_layer",
+    True,
+    "Whether to quantize embedding layer or not. Defaults to true",
+)
+
+
 def quantize_model(float_model, config):
   """Apply quantization to linear layers."""
 
@@ -17,7 +25,7 @@ def quantize_nn_mod(float_model):
         new_mod = mod.get_quantized_version()
       elif isinstance(mod, torch.nn.Linear):
         new_mod = create_quantized_from_nn_linear(mod, config)
-      elif isinstance(mod, torch.nn.Embedding):
+      elif isinstance(mod, torch.nn.Embedding) and _QUANTIZE_EMBEDDING.value:
         new_mod = create_quantized_from_nn_embedding(mod, config)
 
       if new_mod:
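
A minimal, self-contained sketch of the gating logic this hunk adds: Linear modules are always swapped, while Embedding modules are swapped only when the boolean flag is on. The flag name, helper, and toy model below are illustrative stand-ins, not the real jetstream_pt constructors.

    import torch
    from absl import flags

    # Demo flag mirroring internal_quantize_embedding_layer (renamed to avoid clashes).
    _QUANTIZE_EMBEDDING_DEMO = flags.DEFINE_bool(
        "quantize_embedding_demo", True, "Whether to quantize embedding layers in the demo."
    )


    def report_quantization_targets(model):
      """Walks the module tree and reports which layers would be swapped."""
      for name, mod in model.named_modules():
        if isinstance(mod, torch.nn.Linear):
          print(f"{name}: Linear -> would be replaced by a quantized version")
        elif isinstance(mod, torch.nn.Embedding) and _QUANTIZE_EMBEDDING_DEMO.value:
          print(f"{name}: Embedding -> would be replaced by a quantized version")


    if __name__ == "__main__":
      # Parsing "--noquantize_embedding_demo" leaves the embedding in float,
      # which is the behavior the commit enables for small models.
      flags.FLAGS(["demo", "--noquantize_embedding_demo"])
      toy = torch.nn.Sequential(torch.nn.Embedding(16, 8), torch.nn.Linear(8, 8))
      report_quantization_targets(toy)

Because the flag is read at call time via _QUANTIZE_EMBEDDING.value, the same quantize_model code path serves both configurations without any change to its signature.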

jetstream_pt/third_party/gemma/model.py

Lines changed: 1 addition & 1 deletion
@@ -437,7 +437,7 @@ def forward(
     hidden_states = self.norm(hidden_states)
 
     embedder_weight = self.embedder.weight
-    if self.env.quant_config.enable_weight_quantization:
+    if hasattr(self.embedder, "weight_scaler"):
       embedder_weight = embedder_weight * self.embedder.weight_scaler
     logits = torch.matmul(hidden_states, embedder_weight.t())
     return logits
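
A hedged sketch of why the guard changes: with the new flag, the embedder may stay a plain nn.Embedding even when weight quantization is enabled overall, so the presence of weight_scaler on the module, rather than the global quant config, decides whether to rescale the tied weights. FakeQuantizedEmbedder below is an illustrative stand-in, not the real quantized embedding class.

    import torch


    class FakeQuantizedEmbedder(torch.nn.Module):
      """Stand-in for a quantized embedding that stores per-row weight scalers."""

      def __init__(self, num_embeddings, dim):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.randint(-8, 8, (num_embeddings, dim)).float())
        self.weight_scaler = torch.nn.Parameter(torch.rand(num_embeddings, 1))


    def tied_logits(embedder, hidden_states):
      """Computes output logits from the (possibly quantized) tied embedding."""
      embedder_weight = embedder.weight
      if hasattr(embedder, "weight_scaler"):  # True only for the quantized embedder.
        embedder_weight = embedder_weight * embedder.weight_scaler
      return torch.matmul(hidden_states, embedder_weight.t())


    hidden = torch.randn(2, 8)
    print(tied_logits(torch.nn.Embedding(16, 8), hidden).shape)     # torch.Size([2, 16])
    print(tied_logits(FakeQuantizedEmbedder(16, 8), hidden).shape)  # torch.Size([2, 16])

Checking the module rather than the config keeps the logits path correct whichever way the new flag is set.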
