diff --git a/.gitignore b/.gitignore
index a485d15..691fb41 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 *.bin
 *.gguf
 *.safetensors
+tools/llama.cpp/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/tools/README.md b/tools/README.md
index ead935a..d330a17 100644
--- a/tools/README.md
+++ b/tools/README.md
@@ -5,15 +5,45 @@
 git clone https://github.com/ggerganov/llama.cpp
 pip install llama.cpp/gguf-py
 ```
-To quantize:
+
+To convert your initial source model to FP16 (or BF16), run the following command:
+```
+python convert.py --src E:\models\unet\flux1-dev.safetensors
+```
+
+
+To quantize the model, first apply the provided patch to the llama.cpp repo you've just cloned.
+```
+cd llama.cpp
+git checkout tags/b3600
+git apply ..\lcpp.patch
+```
+
+
+Then compile the llama-quantize binary. This example uses cmake; on Linux you can just use make.
 ```
-python convert.py --src ~/ComfyUI/models/unet/flux1-dev.safetensors --dst ~/ComfyUI/models/unet/flux1-dev-Q4_0.gguf --qtype Q4_0
+mkdir build
+cd build
+cmake ..
+cmake --build . --config Debug -j10 --target llama-quantize
+cd ..
+cd ..
 ```
-Working quant types: Q4_0, Q5_0, Q8_0, F16
+
+Now you can use the newly built binary to quantize your model to the desired format:
+```
+llama.cpp\build\bin\Debug\llama-quantize.exe E:\models\unet\flux1-dev-BF16.gguf E:\models\unet\flux1-dev-Q4_K_S.gguf Q4_K_S
+```
+
+
+You can extract the patch again with `git diff src\llama.cpp > lcpp.patch` if you wish to change something and contribute back.
+
 > [!WARNING]
-> Do not use the diffusers UNET for flux, it won't work, use the default checkpoint that comes with the model or convert it.
+> Do not use the diffusers UNET for flux; it won't work. Use the default/reference checkpoint format (this is due to q/k/v being merged into one qkv key). You can convert it by loading it in ComfyUI and saving it with the built-in "ModelSave" node.
 
-> [!IMPORTANT]
-> The model format is very much WIP. I don't recommend uploading the model files created with this method anywhere until proper metadata is added, although the key/quantization format is unlikely to change.
+
+> [!WARNING]
+> Do not quantize SDXL / SD1 / other Conv2D-heavy models. There's little to no benefit with these models. If you do, make sure to **extract the UNET model first**.
+> This should be obvious, but also don't use the resulting llama-quantize binary with LLMs.
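The README warning above can be spot-checked before converting anything: a reference flux checkpoint carries merged qkv projection weights, while a diffusers export keeps q/k/v separate. The sketch below is only illustrative; the key substrings it looks for are assumptions about the two layouts, not names used by convert.py.

```
# Hypothetical helper: guess whether a flux checkpoint uses the reference
# (merged qkv) layout or the diffusers (split q/k/v) layout.
# The key substrings below are assumptions, not taken from this repo.
import sys
from safetensors import safe_open

def guess_layout(path):
    with safe_open(path, framework="pt", device="cpu") as f:
        keys = list(f.keys())
    if any(".img_attn.qkv." in k for k in keys):
        return "reference layout (merged qkv) - fine for convert.py"
    if any(".attn.to_q." in k for k in keys):
        return "diffusers layout (split q/k/v) - re-save it through ComfyUI first"
    return "unknown layout"

if __name__ == "__main__":
    print(guess_layout(sys.argv[1]))
```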
diff --git a/tools/convert.py b/tools/convert.py
index 7264c92..b6b5828 100644
--- a/tools/convert.py
+++ b/tools/convert.py
@@ -1,5 +1,6 @@
 # (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0)
 import os
+import sys
 import torch
 import numpy as np
 import gguf # This needs to be the llama.cpp one specifically!
@@ -9,27 +10,13 @@ from safetensors.torch import load_file
 
 def parse_args():
-    parser = argparse.ArgumentParser(description="Generate GGUF files from single SD ckpt")
+    parser = argparse.ArgumentParser(description="Generate F16 GGUF files from single UNET")
     parser.add_argument("--src", required=True, help="Source model ckpt file.")
     parser.add_argument("--dst", help="Output unet gguf file.")
-    parser.add_argument("--qtype", default="F16", help="Quant type [default: f16]")
     args = parser.parse_args()
 
     if not os.path.isfile(args.src):
         parser.error("No input provided!")
-
-    if args.dst is None:
-        args.dst = os.path.splitext(args.src)[0] + f"_{args.qtype}.gguf"
-        args.dst = os.path.basename(args.dst)
-
-    if os.path.isfile(args.dst):
-        input("Output exists enter to continue or ctrl+c to abort!")
-
-    try:
-        args.ftype = getattr(gguf.LlamaFileType, f"MOSTLY_{args.qtype}")
-        args.qtype = getattr(gguf.GGMLQuantizationType, args.qtype)
-    except AttributeError:
-        parser.error(f"Unknown quant/file type {args.qtype}")
 
     return args
@@ -39,10 +26,21 @@ def load_state_dict(path):
         state_dict = state_dict.get("model", state_dict)
     else:
         state_dict = load_file(path)
-    return state_dict
+
+    # only keep unet with no prefix!
+    sd = {}
+    has_prefix = any(["model.diffusion_model." in x for x in state_dict.keys()])
+    for k, v in state_dict.items():
+        if has_prefix and "model.diffusion_model." not in k:
+            continue
+        if has_prefix:
+            k = k.replace("model.diffusion_model.", "")
+        sd[k] = v
+
+    return sd
 
-def load_model(args):
-    state_dict = load_state_dict(args.src)
+def load_model(path):
+    state_dict = load_state_dict(path)
 
     # from ComfyUI model detection
     if "transformer_blocks.0.attn.norm_added_k.weight" in state_dict:
@@ -64,34 +62,31 @@
     writer = gguf.GGUFWriter(path=None, arch=arch)
     return (writer, state_dict)
 
-def handle_metadata(args, writer, state_dict):
-    # TODO: actual metadata
-    writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
-    writer.add_file_type(args.ftype)
-
 def handle_tensors(args, writer, state_dict):
     # TODO list:
     #  - do something about this being awful and hacky
     max_name_len = max([len(s) for s in state_dict.keys()]) + 4
     for key, data in tqdm(state_dict.items()):
-        if data.dtype == torch.bfloat16:
-            data = data.to(torch.float32)
-        data = data.numpy()
-
         old_dtype = data.dtype
+        if data.dtype == torch.bfloat16:
+            data = data.to(torch.float32).numpy()
+        else:
+            data = data.numpy()
+
         n_dims = len(data.shape)
-        data_qtype = args.qtype
         data_shape = data.shape
+        data_qtype = getattr(
+            gguf.GGMLQuantizationType,
+            "BF16" if old_dtype == torch.bfloat16 else "F16"
+        )
 
         # get number of parameters (AKA elements) in this tensor
        n_params = 1
         for dim_size in data_shape:
             n_params *= dim_size
 
-        fallback = gguf.GGMLQuantizationType.F16
-
         # keys to keep as max precision
         blacklist = [
             "time_embedding.",
@@ -105,7 +100,7 @@ def handle_tensors(args, writer, state_dict):
         ]
 
         if any([x in key for x in blacklist]) and ".weight" in key:
-            data_qtype = fallback
+            data_qtype = gguf.GGMLQuantizationType.F32
 
         if n_dims == 1:
             # one-dimensional tensors should be kept in F32
@@ -118,20 +113,12 @@
         elif n_dims == 4:
             if min(data.shape[:2]) == 4: # output tensor
-                data_qtype = fallback
+                data_qtype = gguf.GGMLQuantizationType.F16
             elif data_shape[-1] == 3: # 3x3 kernel
-                data_qtype = fallback
+                data_qtype = gguf.GGMLQuantizationType.F16
             elif data_shape[-1] == 1: # 1x1 kernel
                 #data = np.squeeze(data) # don't do this
-                data_qtype = fallback
-
-        # TODO: find keys to keep in higher precision(s) / qtypes
-        # if "time_emb_proj.weight" in key:
-        #     data_qtype = gguf.GGMLQuantizationType.F16
-        # if ".to_v.weight" in key or ".to_out" in key:
-        #     data_qtype = gguf.GGMLQuantizationType.F16
-        # if "ff.net" in key:
-        #     data_qtype = gguf.GGMLQuantizationType.F16
+                data_qtype = gguf.GGMLQuantizationType.F16
 
         try:
             data = gguf.quants.quantize(data, data_qtype)
@@ -144,7 +131,6 @@
             data_qtype = gguf.GGMLQuantizationType.F16
             data = gguf.quants.quantize(data, data_qtype)
 
-        assert len(key) < 64, f"Invalid key length! Cannot store in gguf file. {key}"
         new_name = key # do we need to rename?
 
         shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
@@ -152,23 +138,25 @@
         writer.add_tensor(new_name, data, raw_dtype=data_qtype)
 
-warning = """
-######################################################
- The quantized file format needs more work.
-Consider **not** uploading the resulting files for now
-######################################################
-"""
-
 if __name__ == "__main__":
     args = parse_args()
-    writer, state_dict = load_model(args)
+    path = args.src
+    writer, state_dict = load_model(path)
+
+    writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
+    if next(iter(state_dict.values())).dtype == torch.bfloat16:
+        out_path = f"{os.path.splitext(path)[0]}-BF16.gguf"
+        writer.add_file_type(gguf.LlamaFileType.MOSTLY_BF16)
+    else:
+        out_path = f"{os.path.splitext(path)[0]}-F16.gguf"
+        writer.add_file_type(gguf.LlamaFileType.MOSTLY_F16)
 
-    handle_metadata(args, writer, state_dict)
-    handle_tensors(args, writer, state_dict)
+    out_path = args.dst or out_path
+    if os.path.isfile(out_path):
+        input("Output exists enter to continue or ctrl+c to abort!")
 
-    writer.write_header_to_file(path=(args.dst or "test.gguf"))
+    handle_tensors(path, writer, state_dict)
+    writer.write_header_to_file(path=out_path)
     writer.write_kv_data_to_file()
     writer.write_tensors_to_file(progress=True)
     writer.close()
-
-    print(warning)
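With the --qtype logic removed, convert.py now always writes an F16 or BF16 file and leaves mixed-precision decisions to the per-tensor rules above. A quick way to confirm what it wrote is to read the file back with the same gguf-py package; the sketch below is minimal and the path is a placeholder:

```
# Minimal sketch: list tensor names and types from a converted file, e.g. to check
# that the blacklisted keys (time_embedding., add_embedding., final_layer., ...)
# were stored as F32. Assumes gguf-py installed from the cloned llama.cpp repo.
from gguf import GGUFReader

reader = GGUFReader("flux1-dev-BF16.gguf")  # placeholder path
for tensor in reader.tensors:
    print(f"{tensor.name:60} {tensor.tensor_type.name:5} {list(tensor.shape)}")
```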
(name.find("model.diffusion_model.") != std::string::npos) || ++ (name.find("first_stage_model.") != std::string::npos) || ++ (name.find("single_transformer_blocks.") != std::string::npos) ++ ) { ++ throw std::runtime_error("Invalid input GGUF file. This is not a supported UNET model"); ++ } ++ + // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings + // with the quantization of the output tensor +- if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) { +- if (qs.params->output_tensor_type < GGML_TYPE_COUNT) { +- new_type = qs.params->output_tensor_type; +- } else { +- int nx = tensor->ne[0]; +- if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { +- new_type = GGML_TYPE_Q8_0; +- } +- else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || +- ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || +- ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { +- new_type = GGML_TYPE_Q5_K; +- } +- else if (new_type != GGML_TYPE_Q8_0) { +- new_type = GGML_TYPE_Q6_K; +- } +- } +- } else if (name == "token_embd.weight") { ++ if ( // KEEP IN FP32 ++ (name == tn(LLM_TENSOR_OUTPUT, "weight")) || ++ (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) || ++ (name.find("img_in.") != std::string::npos) || ++ (name.find("time_in.in_layer.") != std::string::npos) || ++ (name.find("vector_in.in_layer.") != std::string::npos) || ++ (name.find("guidance_in.in_layer.") != std::string::npos) || ++ (name.find("final_layer.linear.") != std::string::npos) ++ ) { ++ if (qs.params->output_tensor_type < GGML_TYPE_COUNT) { ++ new_type = qs.params->output_tensor_type; ++ } else { ++ new_type = GGML_TYPE_F32; ++ } ++ } else if ( // KEEP IN FP16 ++ (name == "token_embd.weight") || ++ (name.find("time_embedding.") != std::string::npos) || ++ (name.find("add_embedding.") != std::string::npos) || ++ (name.find("txt_in.") != std::string::npos) || ++ (name.find("time_in.") != std::string::npos) || ++ (name.find("vector_in.") != std::string::npos) || ++ (name.find("guidance_in.") != std::string::npos) || ++ (name.find("final_layer.") != std::string::npos) ++ ) { + if (qs.params->token_embedding_type < GGML_TYPE_COUNT) { + new_type = qs.params->token_embedding_type; + } else { +@@ -15891,10 +15918,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n + new_type == GGML_TYPE_Q4_0_8_8) { + new_type = GGML_TYPE_Q4_0; + } ++ else { // if (ftype == LLAMA_FTYPE_MOSTLY_Q6_K || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) { ++ new_type = GGML_TYPE_F16; ++ } + } + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { +- if (name.find("attn_v.weight") != std::string::npos) { ++ if ((name.find("attn_v.weight") != std::string::npos) || (name.find(".to_v.weight") != std::string::npos)) { + if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; + else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? 
+             ++qs.i_attention_wv;
+@@ -15916,7 +15946,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
+                 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
+             }
+         }
+-    } else if (name.find("attn_v.weight") != std::string::npos) {
++    } else if ((name.find("attn_v.weight") != std::string::npos) || (name.find(".to_v.weight") != std::string::npos)) {
+         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+             new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+         }
+@@ -15954,7 +15984,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
+             new_type = GGML_TYPE_Q8_0;
+         }
+         ++qs.i_attention_wv;
+-    } else if (name.find("attn_k.weight") != std::string::npos) {
++    } else if ((name.find("attn_k.weight") != std::string::npos) || (name.find("to_k.weight") != std::string::npos)) {
+         if (qs.model.hparams.n_expert == 8) {
+             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+             // TODO: explore better strategies
+@@ -15966,7 +15996,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
+         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+             new_type = GGML_TYPE_IQ2_S;
+         }
+-    } else if (name.find("attn_q.weight") != std::string::npos) {
++    } else if ((name.find("attn_q.weight") != std::string::npos) || (name.find("to_q.weight") != std::string::npos)) {
+         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
+             new_type = GGML_TYPE_IQ3_XXS;
+         }
+@@ -16038,7 +16068,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
+             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+         }
+     }
+-    else if (name.find("attn_qkv.weight") != std::string::npos) {
++    else if ((name.find("attn_qkv.weight") != std::string::npos) || (name.find("attn.qkv.weight") != std::string::npos)) {
+         if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+             new_type = GGML_TYPE_Q4_K;
+         }
+@@ -16107,6 +16137,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
+         }
+         LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+         ++qs.n_fallback;
++        // Force FP16 fallback - needed due to Conv2D
++        new_type = GGML_TYPE_F16;
+     }
+ 
+     return new_type;
+@@ -17432,6 +17464,8 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
+         case LLM_ARCH_T5:
+         case LLM_ARCH_T5ENCODER:
+         case LLM_ARCH_JAIS:
++        case LLM_ARCH_FLUX:
++        case LLM_ARCH_SD1:
+             return LLAMA_ROPE_TYPE_NONE;
+ 
+         // use what we call a normal RoPE, operating on pairs of consecutive head values
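The patch above biases llama-quantize toward safety for image models: several embedding and input/output projections are pinned to F32 or F16, and any tensor hitting the incompatible-shape fallback (typically Conv2D weights) is forced to F16. To see how much of a quantized file actually ended up at the requested type, a per-type summary can be produced with gguf-py; a minimal sketch with a placeholder path:

```
# Minimal sketch: group tensors in a quantized UNET gguf by their stored type,
# e.g. to see how many stayed at F32/F16 versus the requested Q4_K_S.
# Assumes gguf-py from the cloned llama.cpp repo; the path is a placeholder.
from collections import Counter
from gguf import GGUFReader

reader = GGUFReader("flux1-dev-Q4_K_S.gguf")  # placeholder path
counts, sizes = Counter(), Counter()
for t in reader.tensors:
    counts[t.tensor_type.name] += 1
    sizes[t.tensor_type.name] += int(t.n_bytes)

for qtype, n in counts.most_common():
    print(f"{qtype:8} {n:5d} tensors  {sizes[qtype] / 1024**2:9.1f} MiB")
```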