@@ -10,9 +10,11 @@
 from ..utils.logger import logger
 from ..utils.memory_tracker import memory_tracker
 import time
+from tqdm.auto import tqdm
 
 try:
     import ctransformers
+    from ctransformers import AutoModelForCausalLM as CTAutoModel
     CT_AVAILABLE = True
 except ImportError:
     CT_AVAILABLE = False
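Note: `CT_AVAILABLE` is the usual optional-dependency guard. As a minimal sketch of how such a flag is typically consumed (the call site below is an assumption, not part of this diff):

```python
# Hypothetical guard at the top of a conversion entry point (not in this diff):
# fail fast with an actionable message when ctransformers is missing.
def convert_to_gguf(self, output_path: str):
    if not CT_AVAILABLE:
        raise ImportError(
            "ctransformers is required for GGUF conversion: pip install ctransformers"
        )
    ...
```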
@@ -149,14 +151,18 @@ def quantize(
                   for i in range(0, total_layers, self.chunk_size)]
 
         start_time = time.perf_counter()
-        for chunk_idx, chunk in enumerate(chunks):
-            logger.log_info(f"\nProcessing chunk {chunk_idx + 1}/{len(chunks)}")
+
+        # Create progress bars for chunks and layers
+        chunk_pbar = tqdm(chunks, desc="Processing chunks", position=0)
+        layer_pbar = tqdm(total=total_layers, desc="Quantizing layers", position=1, leave=True)
+
+        for chunk_idx, chunk in enumerate(chunk_pbar):
+            chunk_pbar.set_description(f"Processing chunk {chunk_idx + 1}/{len(chunks)}")
 
             for idx, (name, module) in enumerate(chunk, 1):
                 try:
                     current_layer = idx + chunk_idx * self.chunk_size
-                    logger.log_info(f"\nQuantizing layer {current_layer}/{total_layers}: {name}")
-                    logger.log_info(f"Layer shape: {list(module.weight.shape)}")
+                    layer_pbar.set_description(f"Layer {current_layer}/{total_layers}: {name}")
 
                     # Move layer to target device if needed
                     if module.weight.device != device:
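The `position=0` / `position=1` arguments pin the chunk and layer bars to separate terminal lines so they do not overwrite each other. A self-contained sketch of the same nested-bar pattern (stand-in data, illustrative names only):

```python
from tqdm.auto import tqdm
import time

chunks = [range(4)] * 3  # stand-in for the real layer chunks

chunk_pbar = tqdm(chunks, desc="Processing chunks", position=0)
layer_pbar = tqdm(total=12, desc="Quantizing layers", position=1, leave=True)
for chunk in chunk_pbar:
    for _ in chunk:
        time.sleep(0.01)  # stand-in for quantizing one layer
        layer_pbar.update(1)
layer_pbar.close()
chunk_pbar.close()
```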
@@ -174,11 +180,11 @@ def quantize(
                     else:
                         setattr(self.model, name, quantized_layer)
 
-                    # Log progress
+                    # Update progress
+                    layer_pbar.update(1)
                     elapsed_time = time.perf_counter() - start_time
-                    progress = current_layer / total_layers
-                    eta = elapsed_time / progress - elapsed_time if progress > 0 else 0
-                    logger.log_info(f"Progress: {progress * 100:.1f}% | ETA: {eta:.1f}s")
+                    eta = elapsed_time / (current_layer / total_layers) - elapsed_time if current_layer > 0 else 0
+                    layer_pbar.set_postfix({"ETA": f"{eta:.1f}s"})
 
                     self._clear_memory()
 
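The rewritten ETA is the usual linear extrapolation: with progress p = current_layer / total_layers, eta = elapsed / p - elapsed, i.e. elapsed * (1 - p) / p. For example, 25 of 100 layers in 30 s extrapolates to 120 s total, so the ETA is 90 s. A minimal check of the arithmetic:

```python
def eta_seconds(elapsed: float, done: int, total: int) -> float:
    # Linear extrapolation: remaining = elapsed * (1 - p) / p with p = done / total.
    return elapsed / (done / total) - elapsed if done > 0 else 0.0

assert eta_seconds(30.0, 25, 100) == 90.0
```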
@@ -191,6 +197,10 @@ def quantize(
                 torch.cuda.empty_cache()
             gc.collect()
 
+        # Close progress bars
+        layer_pbar.close()
+        chunk_pbar.close()
+
         # Log final statistics
         total_time = time.perf_counter() - start_time
         logger.log_info("\n" + "=" * 60)
@@ -322,28 +332,31 @@ def convert_to_gguf(self, output_path: str):
             self.model.to('cpu')
             memory_tracker.log_memory("model_moved_to_cpu")
 
-            # Prepare GGUF conversion config
-            config = {
-                "quantization": {
-                    "bits": self.bits,
-                    "type": self.quant_type,
-                    "group_size": self.group_size if self.group_size > 0 else None,
-                },
-                "metadata": {
-                    "description": "Model quantized using QuantLLM GGUF quantizer",
-                    "format_version": "legacy" if self.legacy_format else "latest",
-                    "has_act_desc": self.desc_act,
-                    "has_tensor_desc": self.desc_ten
-                }
-            }
+            # Save model in HF format first
+            temp_dir = f"{output_path}_temp_hf"
+            self.model.save_pretrained(temp_dir)
 
             # Convert using ctransformers
-            ctransformers.convert(
-                self.model,
-                output_path,
-                config=config,
-                legacy=self.legacy_format
-            )
+            try:
+                # Use ctransformers to load and save in GGUF format
+                ct_model = CTAutoModel.from_pretrained(
+                    temp_dir,
+                    model_type="llama",  # Default to llama; can be parameterized later
+                    model_file=None,
+                    config={
+                        "max_new_tokens": 2048,
+                        "context_length": 2048,
+                        "gpu_layers": 0  # CPU conversion
+                    }
+                )
+                ct_model.save_pretrained(output_path)
+
+                import shutil
+                shutil.rmtree(temp_dir, ignore_errors=True)
+
+            except Exception as e:
+                logger.log_error(f"ctransformers conversion failed: {str(e)}")
+                raise
 
             memory_tracker.log_memory("gguf_conversion_complete")
             logger.log_info("GGUF conversion completed successfully")
@@ -362,3 +375,4 @@ def _clear_memory(self):
             torch.cuda.synchronize()
         memory_tracker.clear_memory()
 
+