Commit 5d3367c

Add advanced usage section to quantization README (google-ai-edge#112)
* Add advanced usage section to quantization README BUG=b/356164136
* address comments
* address comments
1 parent d5d1dd6 commit 5d3367c

File tree

4 files changed (+44 −3 lines changed)


ai_edge_torch/generative/quantize/README.md

Lines changed: 25 additions & 3 deletions
````diff
@@ -18,7 +18,29 @@ Once converted, you will get a quantized `.tflite` model which will be ready for
 
 In the current release, the following schemes are supported:
 
-* Dynamic range quantization with FP32 activations and INT8 weights for linear ops
-* FP16 quantization with FP16 weights and FP32 activations and computation for all ops
+* Dynamic range quantization: FP32 activations, INT8 weights, and integer computation
+* Weight-only quantization: FP32 activations, INT8 weights, and floating point computation
+* FP16 quantization: FP16 weights, FP32 activations and floating point computation for all ops
+
+These correspond to the available recipes in `quant_recipes.py`.
+
+## Advanced usage
+
+In addition to configuring quantization using pre-configured recipes in `quant_recipes.py`, users can also customize their recipes according to their specific needs using the `LayerQuantRecipe` and `GenerativeQuantRecipe` API.
+
+`LayerQuantRecipe` specifies at a Generative API layer (`ai_edge_torch/generative/layers`) level how ops within should be quantized. `GenerativeQuantRecipe` specifies at a model level how each component of a Generative API model should be quantized. With these configuration classes, selective quantization can be configured as follows:
+
+```
+def custom_selective_quantization_recipe() -> quant_config.QuantConfig:
+  return quant_config.QuantConfig(
+      generative_recipe=quant_recipe.GenerativeQuantRecipe(
+          default=create_layer_quant_fp16(),
+          embedding=create_layer_quant_int8_dynamic(),
+          attention=create_layer_quant_int8_weight_only(),
+          feedforward=create_layer_quant_int8_dynamic(),
+      )
+  )
+```
+
+For example, this recipe specifies that the embedding table, attention, and feedforward layers should be quantized to INT8. Specifically, for attention layers the computation should be in FP32. All other ops should be quantized to the default scheme which is specified as FP16.
 
-These correspond to the available recipes in `quant_recipes.py`
````
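The selective-recipe structure added by this README hunk can be sketched with plain dataclasses. This is a minimal, self-contained illustration: `LayerQuantRecipe`, `GenerativeQuantRecipe`, and the `create_layer_quant_*` helpers below are simplified stand-ins mirroring the names in the diff, not the real `ai_edge_torch` classes.

```python
from dataclasses import dataclass
from typing import Optional

# Simplified stand-in for ai_edge_torch's LayerQuantRecipe (illustrative only).
@dataclass
class LayerQuantRecipe:
    activation_dtype: str
    weight_dtype: str
    mode: str

def create_layer_quant_fp16() -> LayerQuantRecipe:
    return LayerQuantRecipe("FP32", "FP16", "WEIGHT_ONLY")

def create_layer_quant_int8_dynamic() -> LayerQuantRecipe:
    return LayerQuantRecipe("FP32", "INT8", "DYNAMIC_RANGE")

def create_layer_quant_int8_weight_only() -> LayerQuantRecipe:
    return LayerQuantRecipe("FP32", "INT8", "WEIGHT_ONLY")

# Model-level recipe: per-component overrides fall back to `default`.
@dataclass
class GenerativeQuantRecipe:
    default: LayerQuantRecipe
    embedding: Optional[LayerQuantRecipe] = None
    attention: Optional[LayerQuantRecipe] = None
    feedforward: Optional[LayerQuantRecipe] = None

# The same selective configuration as the README example.
recipe = GenerativeQuantRecipe(
    default=create_layer_quant_fp16(),
    embedding=create_layer_quant_int8_dynamic(),
    attention=create_layer_quant_int8_weight_only(),
    feedforward=create_layer_quant_int8_dynamic(),
)
```

The per-component fields default to `None`, so a recipe that sets only `default` (as `full_fp16_recipe` does) quantizes every layer the same way.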

ai_edge_torch/generative/quantize/quant_recipe_utils.py

Lines changed: 10 additions & 0 deletions
```diff
@@ -41,6 +41,16 @@ def create_layer_quant_int8_dynamic() -> quant_recipe.LayerQuantRecipe:
   )
 
 
+def create_layer_quant_int8_weight_only() -> quant_recipe.LayerQuantRecipe:
+  return quant_recipe.LayerQuantRecipe(
+      activation_dtype=quant_attrs.Dtype.FP32,
+      weight_dtype=quant_attrs.Dtype.INT8,
+      mode=quant_attrs.Mode.WEIGHT_ONLY,
+      algorithm=quant_attrs.Algorithm.MIN_MAX,
+      granularity=quant_attrs.Granularity.CHANNELWISE,
+  )
+
+
 def create_layer_quant_fp16() -> quant_recipe.LayerQuantRecipe:
   return quant_recipe.LayerQuantRecipe(
       activation_dtype=quant_attrs.Dtype.FP32,
```

ai_edge_torch/generative/quantize/quant_recipes.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -40,6 +40,14 @@ def full_int8_dynamic_recipe() -> quant_config.QuantConfig:
   )
 
 
+def full_int8_weight_only_recipe() -> quant_config.QuantConfig:
+  return quant_config.QuantConfig(
+      generative_recipe=quant_recipe.GenerativeQuantRecipe(
+          default=quant_recipe_utils.create_layer_quant_int8_weight_only(),
+      )
+  )
+
+
 def full_fp16_recipe() -> quant_config.QuantConfig:
   return quant_config.QuantConfig(
       generative_recipe=quant_recipe.GenerativeQuantRecipe(
```
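The new weight-only recipe and the existing dynamic-range recipe store INT8 weights either way; they differ in where dequantization happens at inference time. The sketch below illustrates that difference for a single matrix-vector product (an illustration of the two compute paths in general, not the TFLite kernels): weight-only dequantizes weights back to float and computes in floating point, while dynamic range also quantizes the activations on the fly and accumulates in integer.

```python
def dot(a, b):
    return sum(x * y for x, y in zip(a, b))

def weight_only_matvec(q_w, w_scale, x):
    # Float compute: dequantize INT8 weights, multiply by FP32 activations.
    return [dot([q * w_scale for q in row], x) for row in q_w]

def dynamic_range_matvec(q_w, w_scale, x):
    # Integer compute: quantize activations at runtime, accumulate the
    # INT8 x INT8 products, rescale once at the end.
    x_scale = (max(abs(v) for v in x) or 1.0) / 127.0
    q_x = [round(v / x_scale) for v in x]
    return [dot(row, q_x) * w_scale * x_scale for row in q_w]

# One output channel of INT8 weights (true values 0.5, -1.0, 0.25).
q_w = [[64, -127, 32]]
w_scale = 1.0 / 127.0
x = [1.0, 0.5, -0.25]

wo = weight_only_matvec(q_w, w_scale, x)[0]
dr = dynamic_range_matvec(q_w, w_scale, x)[0]
# Both approximate the exact FP32 result of -0.0625.
```

Dynamic range adds activation quantization error but lets the inner loop run in integer arithmetic, which is why the README lists its computation as integer and weight-only's as floating point.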

ai_edge_torch/generative/test/test_quantize.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -111,6 +111,7 @@ def _feedforward_int8_dynamic_recipe() -> quant_config.QuantConfig:
     [
         (quant_recipes.full_fp16_recipe()),
         (quant_recipes.full_int8_dynamic_recipe()),
+        (quant_recipes.full_int8_weight_only_recipe()),
         (_attention_int8_dynamic_recipe()),
         (_feedforward_int8_dynamic_recipe()),
     ]
```
