Commit 806da33
[Test Patch] Remove redundant code for "Fix/update test_run_compressed" (#1072)

SUMMARY: Removed breakpoints and addressed review comments for #970.

TEST PLAN: Ran pytest for the two test files from #970.

ORIGINAL PR DESCRIPTION:
Contingent on the merge of huggingface/transformers#34719 (now merged, but not yet released).

SUMMARY: Update the run_compressed tests from decompression tests to run_compressed tests: verify that models loaded with run_compressed=True and run_compressed=False generate the same output. Add decompression tests that copy attributes from the model at the source directory path into the target model.

TEST PLAN: Ran the tests using transformers main. Both tests/llmcompressor/transformers/compression/test_decompress.py and tests/llmcompressor/transformers/compression/test_run_compressed.py must pass.
1 parent 0755398 commit 806da33
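For reviewers, a minimal sketch of the run_compressed comparison described above. This is illustrative, not the committed test: the model stub is an example, and a CUDA GPU plus a transformers build containing huggingface/transformers#34719 are assumed.

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.utils.quantization_config import CompressedTensorsConfig

stub = "nm-testing/tinyllama-w4a16-compressed"  # example compressed checkpoint
tokenizer = AutoTokenizer.from_pretrained(stub)

# run_compressed=True: weights stay compressed and the compressed linear modules are executed
compressed = AutoModelForCausalLM.from_pretrained(
    stub,
    torch_dtype="auto",
    device_map="auto",
    quantization_config=CompressedTensorsConfig(run_compressed=True),
)

# run_compressed=False: weights are decompressed at load time and dense modules are executed
decompressed = AutoModelForCausalLM.from_pretrained(
    stub,
    torch_dtype="auto",
    device_map="auto",
    quantization_config=CompressedTensorsConfig(run_compressed=False),
)

# both models are assumed to land on the same device here;
# the two load paths should produce identical greedy generations
inputs = tokenizer("What is the capital of France?", return_tensors="pt").to(compressed.device)
out_compressed = tokenizer.batch_decode(compressed.generate(**inputs, max_length=50))
out_decompressed = tokenizer.batch_decode(decompressed.generate(**inputs, max_length=50))
assert out_compressed == out_decompressed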

File tree

11 files changed: +256 -53 lines changed
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+cadence: "commit"
+test_type: "regression"
+compressed_model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
+skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+cadence: "commit"
+test_type: "regression"
+compressed_model_stub: "nm-testing/tinyllama-w4a16-compressed"
+skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+cadence: "commit"
+test_type: "regression"
+compressed_model_stub: "nm-testing/tinyllama-w8a16-dense"
+skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+cadence: "commit"
+test_type: "regression"
+compressed_model_stub: "nm-testing/tinyllama-w8a8-compressed"
+skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
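For context, these decompression_configs stubs are consumed by the parameterized decompression test shown further down. A rough sketch of the mechanics; parse_params is the repo's helper in tests/testing_utils and is assumed to return one dict of YAML fields per config file in the directory:

import unittest

from parameterized import parameterized_class
from tests.testing_utils import parse_params, requires_gpu

CONFIG_DIR = "tests/llmcompressor/transformers/compression/decompression_configs"


@requires_gpu
@parameterized_class(parse_params(CONFIG_DIR))  # one generated test class per YAML file
class TestDecompression(unittest.TestCase):
    # each YAML key is injected as a class attribute by parameterized_class
    compressed_model_stub = None  # e.g. "nm-testing/tinyllama-w8a8-compressed"
    skeleton_model_stub = None  # e.g. "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"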
Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
-empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-compressed
+uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-uncompressed
Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-model_stub: "nm-testing/tinyllama-w4a16-compressed"
-empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-compressed
+uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-uncompressed
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+cadence: "commit"
+test_type: "regression"
+compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-compressed
+uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-uncompressed

tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml

Lines changed: 0 additions & 4 deletions
This file was deleted.
Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-model_stub: "nm-testing/tinyllama-w8a8-compressed"
-empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-compressed
+uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-uncompressed
Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
+import copy
+import shutil
+import tempfile
+import unittest
+
+from compressed_tensors import QUANTIZATION_CONFIG_NAME
+from compressed_tensors.compressors import ModelCompressor
+from compressed_tensors.quantization import QuantizationStatus
+from parameterized import parameterized_class
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+from transformers.utils.quantization_config import CompressedTensorsConfig
+
+from tests.testing_utils import parse_params, requires_gpu
+
+CONFIG_DIR = "tests/llmcompressor/transformers/compression/decompression_configs"
+
+
+@requires_gpu
+@parameterized_class(parse_params(CONFIG_DIR))
+class TestDecompression(unittest.TestCase):
+    """
+    Check that HFQuantizer decompression is working as expected.
+    Manually decompress a compressed model and compare the generations
+
+    Decompression:
+    Given a skeleton model and path to the optimized model,
+    write the optimized model's safetensors to the skeleton model and decompress
+    Ex. write weight_scale to the skeleton model and then convert from fp4 to fp16
+
+    """
+
+    compressed_model_stub = None
+    skeleton_model_stub = None
+
+    SAMPLE_INPUTS = [
+        "I love 4-bit quantization because",
+        "What is the capital of France?",
+        "def fibonacci(n):",
+    ]
+
+    @classmethod
+    def setUpClass(self):
+        self.test_dir = tempfile.mkdtemp()
+        self.tokenizer = AutoTokenizer.from_pretrained(self.compressed_model_stub)
+
+        # Decompress using HFQuantizer from AutoModelForCausalLM
+        self.decompressed_model_hf_quantizer = AutoModelForCausalLM.from_pretrained(
+            self.compressed_model_stub,
+            torch_dtype="auto",
+            device_map="auto",
+            quantization_config=CompressedTensorsConfig(run_compressed=False),
+        )
+
+        # Manually decompress this model
+        self.dense_model = AutoModelForCausalLM.from_pretrained(
+            self.skeleton_model_stub,
+            torch_dtype=self.decompressed_model_hf_quantizer.dtype,
+            device_map=self.decompressed_model_hf_quantizer.device,
+        )
+
+        # decompression from HFQuantizer should populate weight_scale
+        assert hasattr(
+            self.decompressed_model_hf_quantizer.model.layers[0].self_attn.q_proj,
+            "weight_scale",
+        )
+
+        # dense model should not have weight_scale populated
+        assert not hasattr(
+            self.dense_model.model.layers[0].self_attn.q_proj, "weight_scale"
+        )
+
+        config = AutoConfig.from_pretrained(self.compressed_model_stub)
+
+        compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
+        self.compressor = ModelCompressor.from_compression_config(compression_config)
+        self.compressor.quantization_config.quantization_status = (
+            QuantizationStatus.FROZEN
+        )
+
+        # use the model_path to load the decompressed weights into dense_model
+        dense_model = copy.deepcopy(self.dense_model)
+
+        # overwrite the weights of the dense model
+        self.compressor.decompress(
+            model_path=self.compressed_model_stub,
+            model=self.dense_model,
+        )
+
+        # self.dense_model should be decompressed
+        assert dense_model is not self.dense_model
+
+        self.decompressed_model_manual = self.dense_model
+
+        assert hasattr(
+            self.decompressed_model_manual.model.layers[0].self_attn.q_proj,
+            "weight_scale",
+        )
+
+    def test_hf_quantizer_decompress_match_manual_decompress(self):
+        manual_device = self.decompressed_model_manual.device
+        decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.device
+
+        self.decompressed_model_manual = self.decompressed_model_manual.to(
+            manual_device
+        )
+        self.decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.to(
+            decompressed_model_hf_quantizer
+        )
+
+        for input in self.SAMPLE_INPUTS:
+            inputs = self.tokenizer(input, return_tensors="pt", padding=True).to(
+                self.decompressed_model_manual.device
+            )
+            inputs = inputs.to(self.decompressed_model_manual.device)
+
+            decompressed_model_manual_output = self.tokenizer.batch_decode(
+                self.decompressed_model_manual.generate(**inputs, max_length=50)
+            )
+
+            decompressed_model_hf_quantizer_out = self.tokenizer.batch_decode(
+                self.decompressed_model_hf_quantizer.generate(**inputs, max_length=50)
+            )
+
+            assert (
+                decompressed_model_hf_quantizer_out == decompressed_model_manual_output
+            )
+
+    @classmethod
+    def tearDownClass(self):
+        shutil.rmtree(self.test_dir)
+        del self.dense_model
+        del self.decompressed_model_hf_quantizer
+        del self.decompressed_model_manual
Lines changed: 97 additions & 43 deletions
@@ -1,79 +1,133 @@
+import copy
 import shutil
 import tempfile
 import unittest
 
-import torch
 from compressed_tensors import QUANTIZATION_CONFIG_NAME
 from compressed_tensors.compressors import ModelCompressor
 from compressed_tensors.quantization import QuantizationStatus
 from parameterized import parameterized_class
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+from transformers.utils.quantization_config import CompressedTensorsConfig
 
 from tests.testing_utils import parse_params, requires_gpu
 
-CONFIG_DIR = "tests/llmcompressor/transformers/compression/run_compressed_configs"
+CONFIG_DIR = "tests/llmcompressor/transformers/compression/decompression_configs"
 
 
 @requires_gpu
 @parameterized_class(parse_params(CONFIG_DIR))
-class TestQuantizationMatches(unittest.TestCase):
-    model_stub = None
-    empty_model = None
+class TestDecompression(unittest.TestCase):
+    """
+    Check that HFQuantizer decompression is working as expected.
+    Manually decompress a compressed model and compare the generations
+
+    Decompression:
+    Given a skeleton model and path to the optimized model,
+    write the optimized model's safetensors to the skeleton model and decompress
+    Ex. write weight_scale to the skeleton model and then convert from fp4 to fp16
+
+    """
+
+    compressed_model_stub = None
+    skeleton_model_stub = None
+
+    SAMPLE_INPUTS = [
+        "I love 4-bit quantization because",
+        "What is the capital of France?",
+        "def fibonacci(n):",
+    ]
 
     @classmethod
-    def setUpClass(cls):
-        cls.test_dir = tempfile.mkdtemp()
+    def setUpClass(self):
+        self.test_dir = tempfile.mkdtemp()
+        self.tokenizer = AutoTokenizer.from_pretrained(self.compressed_model_stub)
 
-        # TODO: Give option on HFQuantizer to run run_compressed True/False
-        # currently hardcoded to True
-        cls.compressed_model = AutoModelForCausalLM.from_pretrained(
-            cls.model_stub,
+        # Decompress using HFQuantizer from AutoModelForCausalLM
+        self.decompressed_model_hf_quantizer = AutoModelForCausalLM.from_pretrained(
+            self.compressed_model_stub,
             torch_dtype="auto",
             device_map="auto",
-            # run_compressed=True, # TODO: Give option on HFQuantizer
+            quantization_config=CompressedTensorsConfig(run_compressed=False),
        )
-        # TODO: Use ModelCompressor until decompression is supported through
-        # HFQuant/run_compressed can be turned off.
-        cls.uncompressed_model = AutoModelForCausalLM.from_pretrained(
-            cls.empty_model,
-            torch_dtype=cls.compressed_model.dtype,
-            device_map=cls.compressed_model.device,
+
+        # Manually decompress this model
+        self.dense_model = AutoModelForCausalLM.from_pretrained(
+            self.skeleton_model_stub,
+            torch_dtype=self.decompressed_model_hf_quantizer.dtype,
+            device_map=self.decompressed_model_hf_quantizer.device,
+        )
+
+        # decompression from HFQuantizer should populate weight_scale
+        assert hasattr(
+            self.decompressed_model_hf_quantizer.model.layers[0].self_attn.q_proj,
+            "weight_scale",
+        )
+
+        # dense model should not have weight_scale populated
+        assert not hasattr(
+            self.dense_model.model.layers[0].self_attn.q_proj, "weight_scale"
         )
-        config = AutoConfig.from_pretrained(cls.model_stub)
+
+        config = AutoConfig.from_pretrained(self.compressed_model_stub)
+
         compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
-        cls.compressor = ModelCompressor.from_compression_config(compression_config)
-        cls.compressor.quantization_config.quantization_status = (
+        self.compressor = ModelCompressor.from_compression_config(compression_config)
+        self.compressor.quantization_config.quantization_status = (
            QuantizationStatus.FROZEN
        )
-        cls.compressor.decompress(
-            model_path=cls.model_stub, model=cls.uncompressed_model
+
+        # use the model_path to load the decompressed weights into dense_model
+        dense_model = copy.deepcopy(self.dense_model)
+
+        # overwrite the weights of the dense model
+        self.compressor.decompress(
+            model_path=self.compressed_model_stub,
+            model=self.dense_model,
        )
 
-        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_stub)
+        # self.dense_model should be decompressed
+        assert dense_model is not self.dense_model
 
-    def test_compressed_matches_uncompressed(self):
-        SAMPLE_INPUT = [
-            "I love 4-bit quantization because",
-            "What is the capital of France?",
-            "def fibonacci(n):",
-        ]
+        self.decompressed_model_manual = self.dense_model
 
-        inputs = self.tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(
-            self.compressed_model.device
+        assert hasattr(
+            self.decompressed_model_manual.model.layers[0].self_attn.q_proj,
+            "weight_scale",
        )
-        compressed_output = self.tokenizer.batch_decode(
-            self.compressed_model.generate(**inputs, max_length=50)
+
+    def test_hf_quantizer_decompress_match_manual_decompress(self):
+        manual_device = self.decompressed_model_manual.device
+        decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.device
+
+        self.decompressed_model_manual = self.decompressed_model_manual.to(
+            manual_device
        )
-        uncompressed_output = self.tokenizer.batch_decode(
-            self.uncompressed_model.generate(**inputs, max_length=50)
+        self.decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.to(
+            decompressed_model_hf_quantizer
        )
 
-        for idx in range(len(SAMPLE_INPUT)):
-            assert compressed_output[idx] == uncompressed_output[idx]
+        for input in self.SAMPLE_INPUTS:
+            inputs = self.tokenizer(input, return_tensors="pt", padding=True).to(
+                self.decompressed_model_manual.device
+            )
+            inputs = inputs.to(self.decompressed_model_manual.device)
+
+            decompressed_model_manual_output = self.tokenizer.batch_decode(
+                self.decompressed_model_manual.generate(**inputs, max_length=50)
+            )
+
+            decompressed_model_hf_quantizer_out = self.tokenizer.batch_decode(
+                self.decompressed_model_hf_quantizer.generate(**inputs, max_length=50)
+            )
+
+            assert (
+                decompressed_model_hf_quantizer_out == decompressed_model_manual_output
+            )
 
     @classmethod
-    def tearDownClass(cls):
-        shutil.rmtree(cls.test_dir)
-        del cls.compressed_model
-        del cls.uncompressed_model
-        torch.cuda.empty_cache()
+    def tearDownClass(self):
+        shutil.rmtree(self.test_dir)
+        del self.dense_model
+        del self.decompressed_model_hf_quantizer
+        del self.decompressed_model_manual
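To reproduce the test plan locally (assuming a CUDA GPU and transformers installed from main, as the PR description requires), run pytest on the two files named in the commit message:

pytest tests/llmcompressor/transformers/compression/test_decompress.py
pytest tests/llmcompressor/transformers/compression/test_run_compressed.py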
