Commit: K quant patch / instructions
city96 committed Aug 21, 2024
1 parent 1ebce77 commit 8d1fbbc
Showing 4 changed files with 254 additions and 62 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
*.bin
*.gguf
*.safetensors
tools/llama.cpp/

# Byte-compiled / optimized / DLL files
__pycache__/
42 changes: 36 additions & 6 deletions tools/README.md
@@ -5,15 +5,45 @@ git clone https://github.com/ggerganov/llama.cpp
pip install llama.cpp/gguf-py
```
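Since `convert.py` specifically expects the llama.cpp copy of gguf-py (a separately installed release may be older and miss some types), it can be worth confirming which module the import resolves to. A minimal check, not part of the original instructions:
```
# Confirm the gguf module comes from the llama.cpp checkout installed above,
# not an older release that may be missing newer quantization types.
import gguf

print(gguf.__file__)  # should point into .../llama.cpp/gguf-py/
print(hasattr(gguf, "GGMLQuantizationType"), hasattr(gguf, "LlamaFileType"))
```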

To convert your initial source model to FP16 (or BF16), run the following command:
```
python convert.py --src E:\models\unet\flux1-dev.safetensors
```
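If you want to sanity-check the converted file before patching and building anything, gguf-py's `GGUFReader` can read it back. A minimal sketch, assuming the `-BF16` output name produced for a BF16 source:
```
# Quick inspection of the converted GGUF file using gguf-py's GGUFReader.
from gguf import GGUFReader

reader = GGUFReader(r"E:\models\unet\flux1-dev-BF16.gguf")  # example output path
print(len(reader.tensors), "tensors")
for t in reader.tensors[:5]:
    print(t.name, t.tensor_type.name, list(t.shape))
```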


To quantize the model, first apply the provided patch to the llama.cpp repo you've just cloned.
```
cd llama.cpp
git checkout tags/b3600
git apply ..\lcpp.patch
```


Then compile the llama-quantize binary. This example uses cmake; on Linux you can just use make.
```
mkdir build
cd build
cmake ..
cmake --build . --config Debug -j10 --target llama-quantize
cd ..
cd ..
```
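If you want to confirm the build worked before moving on, running the binary with no arguments should just print its usage text. A small sketch using the Debug output path from the cmake example above (adjust the path for Linux/make builds):
```
# Optional sanity check: a freshly built llama-quantize prints its usage text
# when run without arguments. Path matches the cmake Debug layout used above.
import subprocess

subprocess.run([r"llama.cpp\build\bin\Debug\llama-quantize.exe"], check=False)
```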

Working quant types: Q4_0, Q5_0, Q8_0, F16

Now you can use the newly built binary to quantize your model to the desired format:
```
llama.cpp\build\bin\Debug\llama-quantize.exe E:\models\unet\flux1-dev-BF16.gguf E:\models\unet\flux1-dev-Q4_K_S.gguf Q4_K_S
```
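To see which quantization types the individual tensors actually ended up with, the output file can be summarized with gguf-py again. A sketch using the same example paths as above:
```
# Count how many tensors landed in each quantization type after llama-quantize.
from collections import Counter
from gguf import GGUFReader

reader = GGUFReader(r"E:\models\unet\flux1-dev-Q4_K_S.gguf")
counts = Counter(t.tensor_type.name for t in reader.tensors)
for qtype, n in sorted(counts.items()):
    print(f"{qtype}: {n} tensors")
```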


You can extract the patch again with `git diff src\llama.cpp > lcpp.patch` if you wish to change something and contribute back.


> [!WARNING]
> Do not use the diffusers UNET for flux, it won't work; use the default/reference checkpoint format, where q/k/v are merged into a single qkv key. You can convert a diffusers UNET by loading it in ComfyUI and saving it with the built-in "ModelSave" node. (A quick way to check which layout a file uses is sketched below.)
> [!IMPORTANT]
> The model format is very much WIP. I don't recommend uploading the model files created with this method anywhere until proper metadata is added, although the key/quantization format is unlikely to change.
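If you are unsure which layout a flux file uses, one way is to look for the merged qkv projection before converting. A rough sketch; the key names here are assumptions based on the usual reference vs. diffusers layouts:
```
# Sketch: distinguish the reference flux layout (merged qkv keys) from the
# diffusers layout (separate to_q/to_k/to_v keys). Key names are assumptions.
from safetensors import safe_open

with safe_open(r"E:\models\unet\flux1-dev.safetensors", framework="pt") as f:
    keys = list(f.keys())

merged_qkv = any(k.endswith("img_attn.qkv.weight") for k in keys)  # reference format
split_qkv = any(".attn.to_q.weight" in k for k in keys)            # diffusers format
print("reference layout" if merged_qkv and not split_qkv else "diffusers-style layout")
```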

> [!WARNING]
> Do not quantize SDXL / SD1 / other Conv2D-heavy models. There's little to no benefit with these models. If you do, make sure to **extract the UNET model first**.
> This should be obvious, but also don't use the resulting llama-quantize binary with LLMs.
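Similarly, if you are not sure whether an SDXL / SD1 file is already an extracted UNET, you can look for leftover VAE or text-encoder weights. The prefixes below are the usual ones found in full checkpoints and are an assumption here:
```
# Sketch: detect whether a checkpoint still carries VAE / text-encoder weights,
# in which case the UNET should be extracted before converting anything.
from safetensors import safe_open

with safe_open("sdxl_checkpoint.safetensors", framework="pt") as f:  # example path
    keys = list(f.keys())

has_vae = any(k.startswith("first_stage_model.") for k in keys)
has_clip = any(k.startswith(("cond_stage_model.", "conditioner.")) for k in keys)
if has_vae or has_clip:
    print("Full checkpoint detected - extract the UNET first.")
```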
100 changes: 44 additions & 56 deletions tools/convert.py
@@ -1,5 +1,6 @@
# (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0)
import os
import sys
import torch
import numpy as np
import gguf # This needs to be the llama.cpp one specifically!
@@ -9,27 +10,13 @@
from safetensors.torch import load_file

def parse_args():
parser = argparse.ArgumentParser(description="Generate GGUF files from single SD ckpt")
parser = argparse.ArgumentParser(description="Generate F16 GGUF files from single UNET")
parser.add_argument("--src", required=True, help="Source model ckpt file.")
parser.add_argument("--dst", help="Output unet gguf file.")
parser.add_argument("--qtype", default="F16", help="Quant type [default: f16]")
args = parser.parse_args()

if not os.path.isfile(args.src):
parser.error("No input provided!")

if args.dst is None:
args.dst = os.path.splitext(args.src)[0] + f"_{args.qtype}.gguf"
args.dst = os.path.basename(args.dst)

if os.path.isfile(args.dst):
input("Output exists enter to continue or ctrl+c to abort!")

try:
args.ftype = getattr(gguf.LlamaFileType, f"MOSTLY_{args.qtype}")
args.qtype = getattr(gguf.GGMLQuantizationType, args.qtype)
except AttributeError:
parser.error(f"Unknown quant/file type {args.qtype}")

return args

@@ -39,10 +26,21 @@ def load_state_dict(path):
state_dict = state_dict.get("model", state_dict)
else:
state_dict = load_file(path)
return state_dict

# only keep unet with no prefix!
sd = {}
has_prefix = any(["model.diffusion_model." in x for x in state_dict.keys()])
for k, v in state_dict.items():
if has_prefix and "model.diffusion_model." not in k:
continue
if has_prefix:
k = k.replace("model.diffusion_model.", "")
sd[k] = v

return sd

def load_model(args):
state_dict = load_state_dict(args.src)
def load_model(path):
state_dict = load_state_dict(path)

# from ComfyUI model detection
if "transformer_blocks.0.attn.norm_added_k.weight" in state_dict:
@@ -64,34 +62,31 @@ def load_model(args):
writer = gguf.GGUFWriter(path=None, arch=arch)
return (writer, state_dict)

def handle_metadata(args, writer, state_dict):
# TODO: actual metadata
writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
writer.add_file_type(args.ftype)

def handle_tensors(args, writer, state_dict):
# TODO list:
# - do something about this being awful and hacky

max_name_len = max([len(s) for s in state_dict.keys()]) + 4
for key, data in tqdm(state_dict.items()):
if data.dtype == torch.bfloat16:
data = data.to(torch.float32)
data = data.numpy()

old_dtype = data.dtype

if data.dtype == torch.bfloat16:
data = data.to(torch.float32).numpy()
else:
data = data.numpy()

n_dims = len(data.shape)
data_qtype = args.qtype
data_shape = data.shape
data_qtype = getattr(
gguf.GGMLQuantizationType,
"BF16" if old_dtype == torch.bfloat16 else "F16"
)

# get number of parameters (AKA elements) in this tensor
n_params = 1
for dim_size in data_shape:
n_params *= dim_size

fallback = gguf.GGMLQuantizationType.F16

# keys to keep as max precision
blacklist = [
"time_embedding.",
@@ -105,7 +100,7 @@ def handle_tensors(args, writer, state_dict):
]

if any([x in key for x in blacklist]) and ".weight" in key:
data_qtype = fallback
data_qtype = gguf.GGMLQuantizationType.F32

if n_dims == 1:
# one-dimensional tensors should be kept in F32
@@ -118,20 +113,12 @@

elif n_dims == 4:
if min(data.shape[:2]) == 4: # output tensor
data_qtype = fallback
data_qtype = gguf.GGMLQuantizationType.F16
elif data_shape[-1] == 3: # 3x3 kernel
data_qtype = fallback
data_qtype = gguf.GGMLQuantizationType.F16
elif data_shape[-1] == 1: # 1x1 kernel
#data = np.squeeze(data) # don't do this
data_qtype = fallback

# TODO: find keys to keep in higher precision(s) / qtypes
# if "time_emb_proj.weight" in key:
# data_qtype = gguf.GGMLQuantizationType.F16
# if ".to_v.weight" in key or ".to_out" in key:
# data_qtype = gguf.GGMLQuantizationType.F16
# if "ff.net" in key:
# data_qtype = gguf.GGMLQuantizationType.F16
data_qtype = gguf.GGMLQuantizationType.F16

try:
data = gguf.quants.quantize(data, data_qtype)
@@ -144,31 +131,32 @@ def handle_tensors(args, writer, state_dict):
data_qtype = gguf.GGMLQuantizationType.F16
data = gguf.quants.quantize(data, data_qtype)

assert len(key) < 64, f"Invalid key length! Cannot store in gguf file. {key}"
new_name = key # do we need to rename?

shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
tqdm.write(f"{f'%-{max_name_len}s' % f'{new_name}'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")

writer.add_tensor(new_name, data, raw_dtype=data_qtype)

warning = """
######################################################
The quantized file format needs more work.
Consider **not** uploading the resulting files for now
######################################################
"""

if __name__ == "__main__":
args = parse_args()
writer, state_dict = load_model(args)
path = args.src
writer, state_dict = load_model(path)

writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
if next(iter(state_dict.values())).dtype == torch.bfloat16:
out_path = f"{os.path.splitext(path)[0]}-BF16.gguf"
writer.add_file_type(gguf.LlamaFileType.MOSTLY_BF16)
else:
out_path = f"{os.path.splitext(path)[0]}-F16.gguf"
writer.add_file_type(gguf.LlamaFileType.MOSTLY_F16)

handle_metadata(args, writer, state_dict)
handle_tensors(args, writer, state_dict)
out_path = args.dst or out_path
if os.path.isfile(out_path):
input("Output exists enter to continue or ctrl+c to abort!")

writer.write_header_to_file(path=(args.dst or "test.gguf"))
handle_tensors(path, writer, state_dict)
writer.write_header_to_file(path=out_path)
writer.write_kv_data_to_file()
writer.write_tensors_to_file(progress=True)
writer.close()

print(warning)
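For reference, the prefix handling added to `load_state_dict` above behaves roughly like this. A standalone illustration with made-up keys:
```
# Illustration of the new prefix filter: when keys carry the ComfyUI
# "model.diffusion_model." prefix, only those keys are kept (prefix stripped);
# anything outside that prefix (VAE, CLIP, etc.) is dropped.
state_dict = {
    "model.diffusion_model.img_in.weight": 0,       # UNET key, kept
    "first_stage_model.decoder.conv_in.weight": 0,  # VAE key, dropped
}
has_prefix = any("model.diffusion_model." in k for k in state_dict)
sd = {
    k.replace("model.diffusion_model.", ""): v
    for k, v in state_dict.items()
    if not has_prefix or "model.diffusion_model." in k
}
print(list(sd))  # ['img_in.weight']
```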
tools/lcpp.patch: diff not loaded

2 comments on commit 8d1fbbc

@AbstractEyes

Yeah, this was being used to export FLUX models into various GGUF variations. Support the quantization options please.
https://civitai.com/articles/6730/flux-gguf
Your changes broke it.

@city96 (Owner, Author) commented on 8d1fbbc Aug 22, 2024

@AbstractEyes This version does support both the normal and the K quants; it just involves applying the provided patch and using the resulting binary. The updated description here has all the basic steps, and llama.cpp has an excellent guide on how to build it here.

There's no way to support K quants with just the Python code, and duplicating all that logic would be hard to maintain and would allow people like the person who wrote that article (which reeks of ChatGPT) to easily bypass the key checks and create invalid (i.e. including the VAE too, or using the diffusers UNET) and meaningless quantizations of models like SDXL, which most likely barely benefit from it (which he already attempted to do, lol).

At the end of the day people can do whatever they want; GGUF is a storage format. But this repo will be reserved for the approach that follows the original spec as closely as it can with an image model. I'm happy to help out with any issues/problems anyone runs into. I've mostly been focusing on features instead of documentation for now, though I'll get around to making the readme more detailed eventually.

You could also ask the author of that article to update his guide on CivitAI, as apparently he has gotten K quants to work to at least some extent, based on his most recent comment.
