diff --git a/.gitignore b/.gitignore
index a485d15..691fb41 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 *.bin
 *.gguf
 *.safetensors
+tools/llama.cpp/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/tools/README.md b/tools/README.md
index ead935a..d330a17 100644
--- a/tools/README.md
+++ b/tools/README.md
@@ -5,15 +5,45 @@
 git clone https://github.com/ggerganov/llama.cpp
 pip install llama.cpp/gguf-py
 ```
-To quantize:
+
+To convert your initial source model to FP16 (or BF16), run the following command:
+```
+python convert.py --src E:\models\unet\flux1-dev.safetensors
+```
+
+
+To quantize the model, first apply the provided patch to the llama.cpp repo you've just cloned.
+```
+cd llama.cpp
+git checkout tags/b3600
+git apply ..\lcpp.patch
+```
+
+
+Then compile the llama-quantize binary. This example uses cmake; on Linux you can just use make.
 ```
-python convert.py --src ~/ComfyUI/models/unet/flux1-dev.safetensors --dst ~/ComfyUI/models/unet/flux1-dev-Q4_0.gguf --qtype Q4_0
+mkdir build
+cd build
+cmake ..
+cmake --build . --config Debug -j10 --target llama-quantize
+cd ..
+cd ..
 ```
-Working quant types: Q4_0, Q5_0, Q8_0, F16
+
+Now you can use the newly built binary to quantize your model to the desired format:
+```
+llama.cpp\build\bin\Debug\llama-quantize.exe E:\models\unet\flux1-dev-BF16.gguf E:\models\unet\flux1-dev-Q4_K_S.gguf Q4_K_S
+```
+
+
+You can extract the patch again with `git diff src\llama.cpp > lcpp.patch` if you wish to change something and contribute back.
+
 > [!WARNING]
-> Do not use the diffusers UNET for flux, it won't work, use the default checkpoint that comes with the model or convert it.
+> Do not use the diffusers UNET for flux; it won't work. Use the default/reference checkpoint format (this is due to q/k/v being merged into one qkv key). You can convert it by loading it in ComfyUI and saving it with the built-in "ModelSave" node.
 
-> [!IMPORTANT]
-> The model format is very much WIP. I don't recommend uploading the model files created with this method anywhere until proper metadata is added, although the key/quantization format is unlikely to change.
+
+> [!WARNING]
+> Do not quantize SDXL / SD1 / other Conv2D-heavy models. There's little to no benefit with these models. If you do, make sure to **extract the UNET model first**.
+> This should be obvious, but also don't use the resulting llama-quantize binary with LLMs.
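The README warning above can be spot-checked before converting anything: a reference flux checkpoint carries merged qkv projection weights, while a diffusers export keeps q/k/v separate. The sketch below is only illustrative; the key substrings it looks for are assumptions about the two layouts, not names used by convert.py.

```
# Hypothetical helper: guess whether a flux checkpoint uses the reference
# (merged qkv) layout or the diffusers (split q/k/v) layout.
# The key substrings below are assumptions, not taken from this repo.
import sys
from safetensors import safe_open

def guess_layout(path):
    with safe_open(path, framework="pt", device="cpu") as f:
        keys = list(f.keys())
    if any(".img_attn.qkv." in k for k in keys):
        return "reference layout (merged qkv) - fine for convert.py"
    if any(".attn.to_q." in k for k in keys):
        return "diffusers layout (split q/k/v) - re-save it through ComfyUI first"
    return "unknown layout"

if __name__ == "__main__":
    print(guess_layout(sys.argv[1]))
```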
diff --git a/tools/convert.py b/tools/convert.py
index 7264c92..b6b5828 100644
--- a/tools/convert.py
+++ b/tools/convert.py
@@ -1,5 +1,6 @@
 # (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0)
 import os
+import sys
 import torch
 import numpy as np
 import gguf # This needs to be the llama.cpp one specifically!
@@ -9,27 +10,13 @@ from safetensors.torch import load_file
 
 def parse_args():
-    parser = argparse.ArgumentParser(description="Generate GGUF files from single SD ckpt")
+    parser = argparse.ArgumentParser(description="Generate F16 GGUF files from single UNET")
     parser.add_argument("--src", required=True, help="Source model ckpt file.")
     parser.add_argument("--dst", help="Output unet gguf file.")
-    parser.add_argument("--qtype", default="F16", help="Quant type [default: f16]")
     args = parser.parse_args()
 
     if not os.path.isfile(args.src):
         parser.error("No input provided!")
-
-    if args.dst is None:
-        args.dst = os.path.splitext(args.src)[0] + f"_{args.qtype}.gguf"
-        args.dst = os.path.basename(args.dst)
-
-    if os.path.isfile(args.dst):
-        input("Output exists enter to continue or ctrl+c to abort!")
-
-    try:
-        args.ftype = getattr(gguf.LlamaFileType, f"MOSTLY_{args.qtype}")
-        args.qtype = getattr(gguf.GGMLQuantizationType, args.qtype)
-    except AttributeError:
-        parser.error(f"Unknown quant/file type {args.qtype}")
 
     return args
@@ -39,10 +26,21 @@ def load_state_dict(path):
         state_dict = state_dict.get("model", state_dict)
     else:
         state_dict = load_file(path)
-    return state_dict
+
+    # only keep unet with no prefix!
+    sd = {}
+    has_prefix = any(["model.diffusion_model." in x for x in state_dict.keys()])
+    for k, v in state_dict.items():
+        if has_prefix and "model.diffusion_model." not in k:
+            continue
+        if has_prefix:
+            k = k.replace("model.diffusion_model.", "")
+        sd[k] = v
+
+    return sd
 
-def load_model(args):
-    state_dict = load_state_dict(args.src)
+def load_model(path):
+    state_dict = load_state_dict(path)
 
     # from ComfyUI model detection
     if "transformer_blocks.0.attn.norm_added_k.weight" in state_dict:
@@ -64,34 +62,31 @@
     writer = gguf.GGUFWriter(path=None, arch=arch)
     return (writer, state_dict)
 
-def handle_metadata(args, writer, state_dict):
-    # TODO: actual metadata
-    writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
-    writer.add_file_type(args.ftype)
-
 def handle_tensors(args, writer, state_dict):
     # TODO list:
     #  - do something about this being awful and hacky
     max_name_len = max([len(s) for s in state_dict.keys()]) + 4
     for key, data in tqdm(state_dict.items()):
-        if data.dtype == torch.bfloat16:
-            data = data.to(torch.float32)
-        data = data.numpy()
-
         old_dtype = data.dtype
+        if data.dtype == torch.bfloat16:
+            data = data.to(torch.float32).numpy()
+        else:
+            data = data.numpy()
+
         n_dims = len(data.shape)
-        data_qtype = args.qtype
         data_shape = data.shape
+        data_qtype = getattr(
+            gguf.GGMLQuantizationType,
+            "BF16" if old_dtype == torch.bfloat16 else "F16"
+        )
 
         # get number of parameters (AKA elements) in this tensor
        n_params = 1
         for dim_size in data_shape:
             n_params *= dim_size
 
-        fallback = gguf.GGMLQuantizationType.F16
-
         # keys to keep as max precision
         blacklist = [
             "time_embedding.",
@@ -105,7 +100,7 @@ def handle_tensors(args, writer, state_dict):
         ]
 
         if any([x in key for x in blacklist]) and ".weight" in key:
-            data_qtype = fallback
+            data_qtype = gguf.GGMLQuantizationType.F32
 
         if n_dims == 1:
             # one-dimensional tensors should be kept in F32
@@ -118,20 +113,12 @@
         elif n_dims == 4:
             if min(data.shape[:2]) == 4: # output tensor
-                data_qtype = fallback
+                data_qtype = gguf.GGMLQuantizationType.F16
             elif data_shape[-1] == 3: # 3x3 kernel
-                data_qtype = fallback
+                data_qtype = gguf.GGMLQuantizationType.F16
             elif data_shape[-1] == 1: # 1x1 kernel
                 #data = np.squeeze(data) # don't do this
-                data_qtype = fallback
-
-        # TODO: find keys to keep in higher precision(s) / qtypes
-        # if "time_emb_proj.weight" in key:
-        #     data_qtype = gguf.GGMLQuantizationType.F16
-        # if ".to_v.weight" in key or ".to_out" in key:
-        #     data_qtype = gguf.GGMLQuantizationType.F16
-        # if "ff.net" in key:
-        #     data_qtype = gguf.GGMLQuantizationType.F16
+                data_qtype = gguf.GGMLQuantizationType.F16
 
         try:
             data = gguf.quants.quantize(data, data_qtype)
@@ -144,7 +131,6 @@
             data_qtype = gguf.GGMLQuantizationType.F16
             data = gguf.quants.quantize(data, data_qtype)
 
-        assert len(key) < 64, f"Invalid key length! Cannot store in gguf file. {key}"
         new_name = key # do we need to rename?
 
         shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
@@ -152,23 +138,25 @@
         writer.add_tensor(new_name, data, raw_dtype=data_qtype)
 
-warning = """
-######################################################
- The quantized file format needs more work.
-Consider **not** uploading the resulting files for now
-######################################################
-"""
-
 if __name__ == "__main__":
     args = parse_args()
-    writer, state_dict = load_model(args)
+    path = args.src
+    writer, state_dict = load_model(path)
+
+    writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
+    if next(iter(state_dict.values())).dtype == torch.bfloat16:
+        out_path = f"{os.path.splitext(path)[0]}-BF16.gguf"
+        writer.add_file_type(gguf.LlamaFileType.MOSTLY_BF16)
+    else:
+        out_path = f"{os.path.splitext(path)[0]}-F16.gguf"
+        writer.add_file_type(gguf.LlamaFileType.MOSTLY_F16)
 
-    handle_metadata(args, writer, state_dict)
-    handle_tensors(args, writer, state_dict)
+    out_path = args.dst or out_path
+    if os.path.isfile(out_path):
+        input("Output exists enter to continue or ctrl+c to abort!")
 
-    writer.write_header_to_file(path=(args.dst or "test.gguf"))
+    handle_tensors(path, writer, state_dict)
+    writer.write_header_to_file(path=out_path)
     writer.write_kv_data_to_file()
     writer.write_tensors_to_file(progress=True)
     writer.close()
-
-    print(warning)
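With the --qtype logic removed, convert.py now always writes an F16 or BF16 file and leaves mixed-precision decisions to the per-tensor rules above. A quick way to confirm what it wrote is to read the file back with the same gguf-py package; the sketch below is minimal and the path is a placeholder:

```
# Minimal sketch: list tensor names and types from a converted file, e.g. to check
# that the blacklisted keys (time_embedding., add_embedding., final_layer., ...)
# were stored as F32. Assumes gguf-py installed from the cloned llama.cpp repo.
from gguf import GGUFReader

reader = GGUFReader("flux1-dev-BF16.gguf")  # placeholder path
for tensor in reader.tensors:
    print(f"{tensor.name:60} {tensor.tensor_type.name:5} {list(tensor.shape)}")
```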
(name.find("model.diffusion_model.") != std::string::npos) || ++ (name.find("first_stage_model.") != std::string::npos) || ++ (name.find("single_transformer_blocks.") != std::string::npos) ++ ) { ++ throw std::runtime_error("Invalid input GGUF file. This is not a supported UNET model"); ++ } ++ + // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings + // with the quantization of the output tensor +- if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) { +- if (qs.params->output_tensor_type < GGML_TYPE_COUNT) { +- new_type = qs.params->output_tensor_type; +- } else { +- int nx = tensor->ne[0]; +- if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { +- new_type = GGML_TYPE_Q8_0; +- } +- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || +- ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || +- ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { +- new_type = GGML_TYPE_Q5_K; +- } +- else if (new_type != GGML_TYPE_Q8_0) { +- new_type = GGML_TYPE_Q6_K; +- } +- } +- } else if (name == "token_embd.weight") { ++ if ( // KEEP IN FP32 ++ (name == tn(LLM_TENSOR_OUTPUT, "weight")) || ++ (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) || ++ (name.find("img_in.") != std::string::npos) || ++ (name.find("time_in.in_layer.") != std::string::npos) || ++ (name.find("vector_in.in_layer.") != std::string::npos) || ++ (name.find("guidance_in.in_layer.") != std::string::npos) || ++ (name.find("final_layer.linear.") != std::string::npos) ++ ) { ++ if (qs.params->output_tensor_type < GGML_TYPE_COUNT) { ++ new_type = qs.params->output_tensor_type; ++ } else { ++ new_type = GGML_TYPE_F32; ++ } ++ } else if ( // KEEP IN FP16 ++ (name == "token_embd.weight") || ++ (name.find("time_embedding.") != std::string::npos) || ++ (name.find("add_embedding.") != std::string::npos) || ++ (name.find("txt_in.") != std::string::npos) || ++ (name.find("time_in.") != std::string::npos) || ++ (name.find("vector_in.") != std::string::npos) || ++ (name.find("guidance_in.") != std::string::npos) || ++ (name.find("final_layer.") != std::string::npos) ++ ) { + if (qs.params->token_embedding_type < GGML_TYPE_COUNT) { + new_type = qs.params->token_embedding_type; + } else { +@@ -15891,10 +15918,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n + new_type == GGML_TYPE_Q4_0_8_8) { + new_type = GGML_TYPE_Q4_0; + } ++ else { // if (ftype == LLAMA_FTYPE_MOSTLY_Q6_K || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) { ++ new_type = GGML_TYPE_F16; ++ } + } + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { +- if (name.find("attn_v.weight") != std::string::npos) { ++ if ((name.find("attn_v.weight") != std::string::npos) || (name.find(".to_v.weight") != std::string::npos)) { + if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; + else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? 
+             ++qs.i_attention_wv;
+@@ -15916,7 +15946,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
+                 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
+             }
+         }
+-    } else if (name.find("attn_v.weight") != std::string::npos) {
++    } else if ((name.find("attn_v.weight") != std::string::npos) || (name.find(".to_v.weight") != std::string::npos)) {
+         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+             new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+         }
+@@ -15954,7 +15984,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
+             new_type = GGML_TYPE_Q8_0;
+         }
+         ++qs.i_attention_wv;
+-    } else if (name.find("attn_k.weight") != std::string::npos) {
++    } else if ((name.find("attn_k.weight") != std::string::npos) || (name.find("to_k.weight") != std::string::npos)) {
+         if (qs.model.hparams.n_expert == 8) {
+             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+             // TODO: explore better strategies
+@@ -15966,7 +15996,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
+         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+             new_type = GGML_TYPE_IQ2_S;
+         }
+-    } else if (name.find("attn_q.weight") != std::string::npos) {
++    } else if ((name.find("attn_q.weight") != std::string::npos) || (name.find("to_q.weight") != std::string::npos)) {
+         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
+             new_type = GGML_TYPE_IQ3_XXS;
+         }
+@@ -16038,7 +16068,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
+             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+         }
+     }
+-    else if (name.find("attn_qkv.weight") != std::string::npos) {
++    else if ((name.find("attn_qkv.weight") != std::string::npos) || (name.find("attn.qkv.weight") != std::string::npos)) {
+         if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+             new_type = GGML_TYPE_Q4_K;
+         }
+@@ -16107,6 +16137,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
+         }
+         LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+         ++qs.n_fallback;
++        // Force FP16 fallback - needed due to Conv2D
++        new_type = GGML_TYPE_F16;
+     }
+ 
+     return new_type;
+@@ -17432,6 +17464,8 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
+         case LLM_ARCH_T5:
+         case LLM_ARCH_T5ENCODER:
+         case LLM_ARCH_JAIS:
++        case LLM_ARCH_FLUX:
++        case LLM_ARCH_SD1:
+             return LLAMA_ROPE_TYPE_NONE;
+ 
+         // use what we call a normal RoPE, operating on pairs of consecutive head values
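The patch above biases llama-quantize toward safety for image models: several embedding and input/output projections are pinned to F32 or F16, and any tensor hitting the incompatible-shape fallback (typically Conv2D weights) is forced to F16. To see how much of a quantized file actually ended up at the requested type, a per-type summary can be produced with gguf-py; a minimal sketch with a placeholder path:

```
# Minimal sketch: group tensors in a quantized UNET gguf by their stored type,
# e.g. to see how many stayed at F32/F16 versus the requested Q4_K_S.
# Assumes gguf-py from the cloned llama.cpp repo; the path is a placeholder.
from collections import Counter
from gguf import GGUFReader

reader = GGUFReader("flux1-dev-Q4_K_S.gguf")  # placeholder path
counts, sizes = Counter(), Counter()
for t in reader.tensors:
    counts[t.tensor_type.name] += 1
    sizes[t.tensor_type.name] += int(t.n_bytes)

for qtype, n in counts.most_common():
    print(f"{qtype:8} {n:5d} tensors  {sizes[qtype] / 1024**2:9.1f} MiB")
```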