
Commit 8de0c54

Merge pull request #14 from codewithdark-git/feature/add_GGUF
Add the GGUF for Quantization
2 parents 2624cd1 + e307ca7 commit 8de0c54

3 files changed: +44 -31 lines changed

docs/requirements.txt

Lines changed: 2 additions & 1 deletion

@@ -5,4 +5,5 @@ sphinx-autodoc-typehints>=1.18.3
 myst-parser>=0.18.1
 # Only include minimal package dependencies for docs building
 torch>=2.0.0
-transformers>=4.30.0
+transformers>=4.30.0
+tqdm>=4.65.0

quantllm/api/high_level.py

Lines changed: 0 additions & 2 deletions

@@ -139,8 +139,6 @@ def quantize_from_pretrained(
             raise
         finally:
             memory_tracker.clear_memory()
-
-
 
     @staticmethod
     def save_quantized_model(

quantllm/quant/gguf.py

Lines changed: 42 additions & 28 deletions

@@ -10,9 +10,11 @@
 from ..utils.logger import logger
 from ..utils.memory_tracker import memory_tracker
 import time
+from tqdm.auto import tqdm
 
 try:
     import ctransformers
+    from ctransformers import AutoModelForCausalLM as CTAutoModel
     CT_AVAILABLE = True
 except ImportError:
     CT_AVAILABLE = False
@@ -149,14 +151,18 @@ def quantize(
                   for i in range(0, total_layers, self.chunk_size)]
 
         start_time = time.perf_counter()
-        for chunk_idx, chunk in enumerate(chunks):
-            logger.log_info(f"\nProcessing chunk {chunk_idx + 1}/{len(chunks)}")
+
+        # Create progress bar for chunks
+        chunk_pbar = tqdm(chunks, desc="Processing chunks", position=0)
+        layer_pbar = tqdm(total=total_layers, desc="Quantizing layers", position=1, leave=True)
+
+        for chunk_idx, chunk in enumerate(chunk_pbar):
+            chunk_pbar.set_description(f"Processing chunk {chunk_idx + 1}/{len(chunks)}")
 
             for idx, (name, module) in enumerate(chunk, 1):
                 try:
                     current_layer = idx + chunk_idx * self.chunk_size
-                    logger.log_info(f"\nQuantizing layer {current_layer}/{total_layers}: {name}")
-                    logger.log_info(f"Layer shape: {list(module.weight.shape)}")
+                    layer_pbar.set_description(f"Layer {current_layer}/{total_layers}: {name}")
 
                     # Move layer to target device if needed
                     if module.weight.device != device:
@@ -174,11 +180,11 @@ def quantize(
                     else:
                         setattr(self.model, name, quantized_layer)
 
-                    # Log progress
+                    # Update progress
+                    layer_pbar.update(1)
                     elapsed_time = time.perf_counter() - start_time
-                    progress = current_layer / total_layers
-                    eta = elapsed_time / progress - elapsed_time if progress > 0 else 0
-                    logger.log_info(f"Progress: {progress*100:.1f}% | ETA: {eta:.1f}s")
+                    eta = elapsed_time / (current_layer / total_layers) - elapsed_time if current_layer > 0 else 0
+                    layer_pbar.set_postfix({"ETA": f"{eta:.1f}s"})
 
                     self._clear_memory()
 
@@ -191,6 +197,10 @@ def quantize(
            torch.cuda.empty_cache()
            gc.collect()
 
+        # Close progress bars
+        layer_pbar.close()
+        chunk_pbar.close()
+
         # Log final statistics
         total_time = time.perf_counter() - start_time
         logger.log_info("\n" + "="*60)
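As a side note on the pattern introduced above: the two tqdm bars are nested via their position arguments, the outer bar iterates over chunks while the inner one tracks layers, and the ETA postfix is derived from elapsed time and the fraction of layers completed. The following is a minimal, self-contained sketch of that pattern; the layer count, chunk size, and the sleep stand-in for the quantization work are invented for illustration and are not part of the commit.

# Standalone sketch of the nested-tqdm pattern used in quantize() above.
# The layer count, chunk size, and sleep() stand-in are invented for illustration.
import time
from tqdm.auto import tqdm

total_layers = 8
chunk_size = 3
layers = list(range(total_layers))
chunks = [layers[i:i + chunk_size] for i in range(0, total_layers, chunk_size)]

chunk_pbar = tqdm(chunks, desc="Processing chunks", position=0)
layer_pbar = tqdm(total=total_layers, desc="Quantizing layers", position=1, leave=True)

start_time = time.perf_counter()
for chunk_idx, chunk in enumerate(chunk_pbar):
    chunk_pbar.set_description(f"Processing chunk {chunk_idx + 1}/{len(chunks)}")
    for idx, _layer in enumerate(chunk, 1):
        current_layer = idx + chunk_idx * chunk_size
        layer_pbar.set_description(f"Layer {current_layer}/{total_layers}")
        time.sleep(0.1)  # stand-in for the real per-layer quantization work
        layer_pbar.update(1)
        elapsed = time.perf_counter() - start_time
        # Same ETA idea as the diff: projected total time minus elapsed time
        eta = elapsed / (current_layer / total_layers) - elapsed if current_layer > 0 else 0
        layer_pbar.set_postfix({"ETA": f"{eta:.1f}s"})

layer_pbar.close()
chunk_pbar.close()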
@@ -322,28 +332,31 @@ def convert_to_gguf(self, output_path: str):
             self.model.to('cpu')
             memory_tracker.log_memory("model_moved_to_cpu")
 
-            # Prepare GGUF conversion config
-            config = {
-                "quantization": {
-                    "bits": self.bits,
-                    "type": self.quant_type,
-                    "group_size": self.group_size if self.group_size > 0 else None,
-                },
-                "metadata": {
-                    "description": "Model quantized using QuantLLM GGUF quantizer",
-                    "format_version": "legacy" if self.legacy_format else "latest",
-                    "has_act_desc": self.desc_act,
-                    "has_tensor_desc": self.desc_ten
-                }
-            }
+            # Save model in HF format first
+            temp_dir = f"{output_path}_temp_hf"
+            self.model.save_pretrained(temp_dir)
 
             # Convert using ctransformers
-            ctransformers.convert(
-                self.model,
-                output_path,
-                config=config,
-                legacy=self.legacy_format
-            )
+            try:
+                # Use ctransformers to load and save in GGUF format
+                ct_model = CTAutoModel.from_pretrained(
+                    temp_dir,
+                    model_type="llama",  # Default to llama, can be parameterized later
+                    model_file=None,
+                    config={
+                        "max_new_tokens": 2048,
+                        "context_length": 2048,
+                        "gpu_layers": 0  # CPU conversion
+                    }
+                )
+                ct_model.save_pretrained(output_path)
+
+                import shutil
+                shutil.rmtree(temp_dir, ignore_errors=True)
+
+            except Exception as e:
+                logger.log_error(f"CTTransformers conversion failed: {str(e)}")
+                raise
 
             memory_tracker.log_memory("gguf_conversion_complete")
             logger.log_info("GGUF conversion completed successfully")
@@ -362,3 +375,4 @@ def _clear_memory(self):
             torch.cuda.synchronize()
         memory_tracker.clear_memory()
 
+
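For reference, the conversion path above goes through ctransformers' AutoModelForCausalLM. Below is a rough usage sketch of loading an already-converted GGUF file with that class and generating on CPU; the file path, prompt, and generation settings are placeholders and are not taken from this commit.

# Rough usage sketch, not taken from the commit: loading an already converted
# GGUF file with ctransformers and generating on CPU. The path is a placeholder.
from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    "path/to/model-q4_0.gguf",  # placeholder path to a quantized model file
    model_type="llama",         # architecture hint, as in the diff above
    gpu_layers=0,               # keep every layer on the CPU
    context_length=2048,
)

print(llm("Explain GGUF quantization in one sentence.", max_new_tokens=64))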

0 commit comments