SQ and QM: Remove torch.cuda.empty_cache, use calibration_forward_context #1114

Merged · 6 commits · Feb 8, 2025
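
This PR drops the per-batch `torch.cuda.empty_cache()` workaround and the hand-rolled eval/train bookkeeping in SmoothQuant (SQ) and QuantizationModifier (QM), wrapping calibration in the shared `calibration_forward_context` helper instead. Below is a minimal sketch of what such a context manager can look like, reconstructed only from the behavior this diff replaces; the actual helper lives in `llmcompressor.utils.helpers` and may do more (e.g., disabling the HF KV cache during calibration):

```python
import contextlib

import torch
from torch.nn import Module


@contextlib.contextmanager
def calibration_forward_context(model: Module):
    """Sketch: run calibration forward passes in eval mode with grads disabled."""
    was_training = model.training
    model.eval()  # replaces the manual module.eval() / module.train() bookkeeping
    try:
        with torch.no_grad():  # calibration never needs gradients (or their memory)
            yield
    finally:
        if was_training:  # restore whatever mode the caller left the model in
            model.train()
```
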
20 changes: 8 additions & 12 deletions src/llmcompressor/modifiers/quantization/quantization/base.py
@@ -32,6 +32,7 @@
     run_calibration_forward,
 )
 from llmcompressor.observers.helpers import get_observer_token_count
+from llmcompressor.utils.helpers import calibration_forward_context
 
 __all__ = ["QuantizationModifier"]
 
@@ -309,18 +310,13 @@ def _calibrate(self, module: Module):
f"{len(self.calibration_dataloader_)} samples..."
)

module_training = module.training
module.eval()

run_calibration_forward(
module,
self.calibration_dataloader_,
self.num_calibration_steps,
self.calibration_function_,
)

if module_training:
module.train()
with calibration_forward_context(module):
run_calibration_forward(
module,
self.calibration_dataloader_,
self.num_calibration_steps,
self.calibration_function_,
)

def _check_token_distribution(
self, model: Module, threshold: Optional[float] = None
Expand Down
17 changes: 8 additions & 9 deletions src/llmcompressor/modifiers/smoothquant/base.py
@@ -14,6 +14,7 @@
 )
 from llmcompressor.modifiers.utils.pytorch_helpers import run_calibration_forward
 from llmcompressor.utils.fsdp.helpers import get_fsdp_parent
+from llmcompressor.utils.helpers import calibration_forward_context
 from llmcompressor.utils.pytorch.module import (
     get_layers,
     get_matching_layer,
@@ -250,12 +251,13 @@ def _calibrate(self, model: Module, calibration_dataloader: List):
" CompressionSession to run the SmoothQuant modifier"
)

run_calibration_forward(
model,
calibration_dataloader,
self.num_calibration_steps,
self.calibration_function,
)
with calibration_forward_context(model):
run_calibration_forward(
model,
calibration_dataloader,
self.num_calibration_steps,
self.calibration_function,
)

# remove the hooks now that we are done calibrating
self.remove_hooks()
@@ -313,9 +315,6 @@ def smooth(module):
             smooth(layer)
             smooth(smooth_layer)
 
-        # clear out allocated smoothing scales
-        torch.cuda.empty_cache()
-
     def _calculate_smoothing_scales(
         self, balance_layers: List[Module], activation_scales: torch.Tensor
     ) -> List[float]:
4 changes: 0 additions & 4 deletions src/llmcompressor/modifiers/utils/pytorch_helpers.py
@@ -81,10 +81,6 @@ def run_calibration_forward(
         with torch.no_grad():
             forward_fn(batch, module=model)
 
-        # TODO: not ideal, figure out where we aren't freeing memory instead
-        # currently without this we run OOM on the 2nd forward pass
-        torch.cuda.empty_cache()
-
 
 def is_moe_model(model: Module) -> bool:
     """
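
Why the per-batch `empty_cache()` could go: the deleted TODO itself admits the flush was papering over memory that was not being freed. Worth noting that `torch.cuda.empty_cache()` never frees live tensors; it only returns unused blocks from PyTorch's caching allocator to the driver, which the next forward pass must then slowly re-request. With calibration running under `calibration_forward_context`, no autograd graph is retained between batches in the first place. The distinction shows up in two standard PyTorch counters:

```python
import torch

# Live tensor bytes vs. bytes the caching allocator is holding on to.
# empty_cache() can only shrink the second number, never the first.
print(f"allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
print(f"reserved:  {torch.cuda.memory_reserved() / 1e9:.2f} GB")
```
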
@@ -672,7 +672,7 @@ def test_correct_compressor_inferred(
     if is_24:
         weights = _make_24_sparse(weights)
     else:
-        weights[0, :] = torch.ones(4, )  # guarantee not 24 sparse
+        weights[0, :] = torch.ones((4,))  # guarantee not 24 sparse
 
     quantization_config = _quantization_config_from_string(quant_style, quant_type)
     quantization_args = quantization_config.config_groups["group_0"].weights
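
Side note on the test tweak: `torch.ones(4, )` and `torch.ones((4,))` construct the same 1-D tensor of four ones; passing the size as an explicit tuple just avoids the stray-comma call style. A quick check:

```python
import torch

assert torch.equal(torch.ones(4, ), torch.ones((4,)))  # identical shape (4,)
```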