Commit 5a77b59

Keep quantization enabled during calibration (#1299)
## Purpose ##

* Revert the behavior regression introduced by #1114
* When calibrating a model with the `QuantizationModifier`, quantization should remain enabled during calibration

## Changes ##

* Remove "disabling quantization" from the calibration forward pass
* Add "disabling quantization" to the sequential pipelines so that quantization continues to be disabled during calibration for GPTQ and SGPT
* When [calibration pipelines become shared between modifiers](#1279), the decision of whether to disable quantization during calibration will have to move into the calibration pipelines themselves. Some work remains to demonstrate that GPTQ and SGPT do not suffer an accuracy regression from enabling activation quantization during calibration (in theory, the change should increase accuracy)

---------

Signed-off-by: Kyle Sayers <[email protected]>
1 parent 30d45c5 commit 5a77b59
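For orientation, the behavioral difference can be sketched as follows. This is a simplified illustration of the calibration loop, not the actual modifier or pipeline code; the two function names are hypothetical, and `model` and `dataloader` stand in for whatever the caller provides.

```python
from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context


def calibrate_with_quantization_enabled(model, dataloader):
    # QuantizationModifier-style calibration after this change: quantization
    # stays enabled, so observers calibrate against the same quantized
    # activations the compressed model will actually produce.
    with calibration_forward_context(model):
        for batch in dataloader:
            model(**batch)


def calibrate_with_quantization_disabled(model, dataloader):
    # GPTQ / SGPT sequential pipelines: quantization is still disabled during
    # calibration by composing DisableQuantization explicitly, preserving the
    # previous behavior for those modifiers.
    with calibration_forward_context(model), DisableQuantization(model):
        for batch in dataloader:
            model(**batch)
```

Previously, `calibration_forward_context` disabled quantization unconditionally, which meant the `QuantizationModifier` calibrated its observers against unquantized activations.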

File tree: 4 files changed (+8 -9 lines)


Diff for: src/llmcompressor/pipelines/layer_sequential/pipeline.py (+2 -2)

@@ -12,7 +12,7 @@
     maybe_inject_pos_embeddings,
     to_next_layer_kwargs,
 )
-from llmcompressor.utils.helpers import calibration_forward_context
+from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context

 if TYPE_CHECKING:
     from llmcompressor.modifiers import Modifier
@@ -51,7 +51,7 @@ def run_pipeline(
     # find layers
     layers = match_modules(model, sequential_targets)

-    with calibration_forward_context(model):
+    with calibration_forward_context(model), DisableQuantization(model):
         # prepare intermediates cache
         intermediates: IntermediatesCache = capture_first_layer_intermediates(
             model, layers[0], dataloader

Diff for: src/llmcompressor/pipelines/sequential/pipeline.py (+2 -2)

@@ -8,7 +8,7 @@
 from llmcompressor.modifiers.utils.hooks import HooksMixin
 from llmcompressor.pipelines.cache import IntermediatesCache
 from llmcompressor.pipelines.sequential.helpers import trace_subgraphs
-from llmcompressor.utils.helpers import calibration_forward_context
+from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context

 if TYPE_CHECKING:
     from llmcompressor.modifiers import Modifier
@@ -50,7 +50,7 @@ def run_pipeline(
     sample_input = next(iter(dataloader))
     subgraphs = trace_subgraphs(model, sample_input, sequential_targets, ignore)

-    with calibration_forward_context(model):
+    with calibration_forward_context(model), DisableQuantization(model):
         # prepare intermediates cache
         model_device = get_execution_device(model)
         intermediates = IntermediatesCache.from_dataloader(dataloader, model_device)

Diff for: src/llmcompressor/utils/helpers.py (+1 -3)

@@ -1013,7 +1013,7 @@ def __exit__(self, _exc_type, _exc_val, _exc_tb):
 @contextlib.contextmanager
 def DisableQuantization(module: torch.nn.Module):
     """
-    Disable quantization from QuantizationModifier
+    Disable quantization during forward passes after applying a quantization config
     """
     try:
         module.apply(disable_quantization)
@@ -1040,13 +1040,11 @@ def calibration_forward_context(model: PreTrainedModel):

     - Remove gradient calculations
     - Disable the KV cache
-    - Disable quantization during forward pass
     - Disable train mode and enable eval mode
     """
     with (
         torch.no_grad(),
         DisableKVCache(model),
-        DisableQuantization(model),
         eval_context(model),
     ):
         yield
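Only the opening of `DisableQuantization` appears in the hunk above. Filling in the unchanged lines, its overall shape is roughly the following sketch; the `finally` clause re-enabling quantization and the import path for `disable_quantization`/`enable_quantization` are assumptions, not shown in this commit.

```python
import contextlib

import torch
# import path assumed; these helpers are applied via module.apply() in helpers.py
from compressed_tensors.quantization import disable_quantization, enable_quantization


@contextlib.contextmanager
def DisableQuantization(module: torch.nn.Module):
    """
    Disable quantization during forward passes after applying a quantization config
    """
    try:
        module.apply(disable_quantization)  # shown in the hunk above
        yield
    finally:
        module.apply(enable_quantization)  # assumed: restore quantization on exit
```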

Diff for: tests/llmcompressor/utils/test_helpers.py (+3 -2)

@@ -134,11 +134,12 @@ def test_calibration_forward_context():
     model = torch.nn.Linear(1, 1)
     model.config = SimpleNamespace()
     model.config.use_cache = True
+    model.train()

     with calibration_forward_context(model):
         assert not torch.is_grad_enabled()
-        assert not model.quantization_enabled
         assert not model.config.use_cache
+        assert not model.training
     assert torch.is_grad_enabled()
-    assert model.quantization_enabled
     assert model.config.use_cache
+    assert model.training
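Since `calibration_forward_context` no longer toggles quantization, the test now seeds the model in train mode and checks the eval-mode switch instead. A standalone check of `DisableQuantization` might look like the sketch below; it is hypothetical, not part of this commit, and assumes `disable_quantization`/`enable_quantization` toggle the `quantization_enabled` flag seen in the removed assertions.

```python
import torch

from llmcompressor.utils.helpers import DisableQuantization


def test_disable_quantization():
    # hypothetical companion test, not part of this commit
    model = torch.nn.Linear(1, 1)

    with DisableQuantization(model):
        # assumes disable_quantization sets this flag, as the removed
        # assertions in test_calibration_forward_context implied
        assert not model.quantization_enabled
    assert model.quantization_enabled
```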
