@@ -197,13 +197,17 @@ def _verify_embedding_mode(self) -> None:
197
197
def _parse_quant_hf_config(self):
    """Return the quantization section of the HF config, or None.

    Looks up ``quantization_config`` on ``self.hf_config`` first. When that
    attribute is absent or set to None, falls back to ``compression_config``.

    Returns:
        Whatever object is stored under the first of those attributes that
        is not None, or None when neither attribute yields a value.
    """
    quant_cfg = getattr(self.hf_config, "quantization_config", None)
    if quant_cfg is None:
        # compressed-tensors uses a "compression_config" key
        quant_cfg = getattr(self.hf_config, "compression_config", None)
    return quant_cfg
203
203
204
204
def _verify_quantization (self ) -> None :
205
205
supported_quantization = [* QUANTIZATION_METHODS ]
206
206
rocm_supported_quantization = ["gptq" , "squeezellm" ]
207
+ optimized_quantization_methods = [
208
+ "fp8" , "marlin" , "gptq_marlin_24" , "gptq_marlin" , "awq_marlin" ,
209
+ "fbgemm_fp8" , "compressed_tensors" , "compressed-tensors"
210
+ ]
207
211
if self .quantization is not None :
208
212
self .quantization = self .quantization .lower ()
209
213
@@ -242,9 +246,7 @@ def _verify_quantization(self) -> None:
242
246
raise ValueError (
243
247
f"{ self .quantization } quantization is currently not "
244
248
f"supported in ROCm." )
245
- if (self .quantization
246
- not in ("fp8" , "marlin" , "gptq_marlin_24" , "gptq_marlin" ,
247
- "awq_marlin" , "fbgemm_fp8" , "compressed_tensors" )):
249
+ if self .quantization not in optimized_quantization_methods :
248
250
logger .warning (
249
251
"%s quantization is not fully "
250
252
"optimized yet. The speed can be slower than "
0 commit comments