Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tried to quantize nvidia/Cosmos-1.0-Diffusion-7B-Text2World #209

Open
al-swaiti opened this issue Jan 27, 2025 · 3 comments
Open

tried to quantize nvidia/Cosmos-1.0-Diffusion-7B-Text2World #209

al-swaiti opened this issue Jan 27, 2025 · 3 comments

Comments

@al-swaiti
Copy link

al-swaiti commented Jan 27, 2025

my convert.py

Click to view the code
import os
import torch
import gguf
import argparse
from tqdm import tqdm

from safetensors.torch import load_file

# Define constants
# NOTE(review): not referenced anywhere in the visible script — presumably a
# tensor-name length limit that was meant to be validated; confirm.
MAX_TENSOR_NAME_LENGTH = 255
# float32/bfloat16 tensors with at most this many elements are stored as F32
# (see handle_tensors).
QUANTIZATION_THRESHOLD = 256
# Minimum element count before handle_tensors considers reshaping a tensor
# into rows of 256 values.
REARRANGE_THRESHOLD = 256

class ModelCosmos:
    """Static, class-level description of the Cosmos architecture.

    Only class attributes are defined; the class is used as a namespace.
    """

    # Architecture identifier.
    arch = "cosmos"

    # Groups of state-dict key names associated with a Cosmos checkpoint.
    # NOTE(review): nothing in this script reads keys_detect — presumably it
    # is meant for architecture detection; confirm before relying on it.
    keys_detect = [
        (
            "logvar.0.freqs",
            "logvar.0.phases",
            "logvar.1.weight",
        ),
        ("net.affline_norm.weight",),
        (
            "net.blocks.block0.blocks.0.adaLN_modulation.1.weight",
            "net.blocks.block0.blocks.0.adaLN_modulation.2.weight",
        ),
        (
            "net.blocks.block0.blocks.0.block.attn.to_k.0.weight",
            "net.blocks.block0.blocks.0.block.attn.to_k.1.weight",
        ),
        (
            "net.blocks.block0.blocks.0.block.attn.to_out.0.weight",
            "net.blocks.block0.blocks.0.block.attn.to_q.0.weight",
        ),
    ]

    # No banned key names are listed for this architecture.
    keys_banned = []

    # When true, handle_tensors may reshape tensors into rows of 256 values.
    shape_fix = True

def parse_args():
    """Parse the converter's command-line options.

    Returns:
        argparse.Namespace with ``src`` (required checkpoint path) and
        ``dst`` (optional output path; ``None`` when not given).
    """
    cli = argparse.ArgumentParser(description="Convert Cosmos model to GGUF")
    cli.add_argument(
        "--src",
        help="Source model checkpoint file",
        required=True,
    )
    cli.add_argument("--dst", help="Output GGUF file")
    namespace = cli.parse_args()
    return namespace

def load_state_dict(path):
    """Load a checkpoint and return its tensors with the model prefix removed.

    Files ending in ``.ckpt``, ``.pt``, ``.bin`` or ``.pth`` are read with
    ``torch.load`` (CPU, ``weights_only=True``); anything else is read with
    safetensors' ``load_file``.

    If any key starts with ``"model.diffusion_model."`` (checked first) or
    ``"model."``, only keys starting with that prefix are kept and the
    leading prefix is stripped from them. Otherwise all keys are returned
    unchanged.

    Args:
        path: Path to the checkpoint file.

    Returns:
        A new dict mapping (prefix-stripped) tensor names to tensors.
    """
    # str.endswith accepts a tuple, so one call covers every pickle suffix.
    if path.endswith((".ckpt", ".pt", ".bin", ".pth")):
        state_dict = torch.load(path, map_location="cpu", weights_only=True)
        # Use the nested "model" entry when the checkpoint has one.
        state_dict = state_dict.get("model", state_dict)
    else:
        state_dict = load_file(path)

    # Pick the first candidate prefix that at least one key starts with.
    prefix = next(
        (
            pfx
            for pfx in ("model.diffusion_model.", "model.")
            if any(key.startswith(pfx) for key in state_dict)
        ),
        None,
    )
    if prefix is None:
        return dict(state_dict)

    # Filter with the same startswith test used for detection and strip only
    # the leading occurrence. A substring test plus str.replace would keep
    # unrelated keys that merely contain the prefix and would also delete
    # occurrences in the middle of a name ("model.net.model.w" -> "net.w").
    return {
        key[len(prefix):]: value
        for key, value in state_dict.items()
        if key.startswith(prefix)
    }

def handle_tensors(writer, state_dict):
    """Convert every tensor in ``state_dict`` and register it with ``writer``.

    For each tensor this picks a storage type (BF16 for bfloat16 sources,
    F16 otherwise, F32 for 1-D or small float32/bfloat16 tensors), optionally
    reshapes it into rows of 256 values while recording the original shape
    as a ``comfy.gguf.orig_shape.<name>`` array, converts the data with
    ``gguf.quants.quantize`` and adds the result via ``writer.add_tensor``.
    One log line per tensor is emitted through ``tqdm.write``.

    Args:
        writer: Object providing ``add_array`` and ``add_tensor``.
        state_dict: Mapping of tensor name to torch tensor.
    """
    # Width of the name column in the log output (0 when there are no keys).
    max_name_len = max((len(name) for name in state_dict.keys()), default=0)

    # float8 dtypes may be absent from the installed torch; the placeholder
    # string then simply never matches a real dtype.
    fp8_dtypes = [
        getattr(torch, "float8_e4m3fn", "_invalid"),
        getattr(torch, "float8_e5m2", "_invalid"),
    ]
    qtypes = gguf.GGMLQuantizationType

    for key, data in tqdm(state_dict.items()):
        old_dtype = data.dtype

        # Move to numpy, upcasting the dtypes that get an explicit cast.
        if old_dtype == torch.bfloat16:
            data = data.to(torch.float32).numpy()
        elif old_dtype in fp8_dtypes:
            data = data.to(torch.float16).numpy()
        else:
            data = data.numpy()

        orig_shape = data.shape
        n_dims = len(orig_shape)
        n_params = int(data.size)

        # Default storage type follows the source dtype ...
        if old_dtype == torch.bfloat16:
            data_qtype = qtypes.BF16
        else:
            data_qtype = qtypes.F16
        # ... but 1-D and small float32/bfloat16 tensors are kept as F32.
        keep_f32 = n_dims == 1 or n_params <= QUANTIZATION_THRESHOLD
        if old_dtype in (torch.float32, torch.bfloat16) and keep_f32:
            data_qtype = qtypes.F32

        # Reshape to 256-wide rows when the element count allows it but the
        # last dimension is not already a multiple of 256.
        needs_rearrange = (
            ModelCosmos.shape_fix
            and n_dims > 1
            and n_params >= REARRANGE_THRESHOLD
            and n_params % 256 == 0
            and orig_shape[-1] % 256 != 0
        )
        if needs_rearrange:
            data = data.reshape(n_params // 256, 256)
            writer.add_array(
                f"comfy.gguf.orig_shape.{key}",
                tuple(map(int, orig_shape)),
            )

        # Convert; fall back to F16 when the chosen type cannot be applied.
        try:
            data = gguf.quants.quantize(data, data_qtype)
        except (AttributeError, gguf.QuantError):
            data_qtype = qtypes.F16
            data = gguf.quants.quantize(data, data_qtype)

        # Log name, dtype transition and the (reversed) stored shape.
        dims = ", ".join(str(n) for n in reversed(data.shape))
        shape_str = "{" + dims + "}"
        name_col = f"{key:<{max_name_len + 4}}"
        tqdm.write(
            f"{name_col} {old_dtype} --> {data_qtype.name}, shape = {shape_str}"
        )

        writer.add_tensor(key, data, raw_dtype=data_qtype)

def main():
    """Convert the checkpoint given on the command line into a GGUF file.

    Loads the state dict from ``--src``, chooses a BF16 or F16 file type from
    the dtype of the first tensor, converts all tensors with
    ``handle_tensors`` and writes header, key/value data and tensors to the
    output path (``--dst`` if given, otherwise ``<src stem>-BF16.gguf`` or
    ``<src stem>-F16.gguf``). If the output file already exists, the user is
    prompted before it is overwritten.

    Raises:
        ValueError: If no tensors were loaded from the source checkpoint.
    """
    args = parse_args()

    # Load state dict
    state_dict = load_state_dict(args.src)
    if not state_dict:
        # Without this guard the dtype probe below fails with a bare
        # StopIteration that gives no hint about the cause.
        raise ValueError(f"No tensors found in {args.src!r}; nothing to convert")

    # Create GGUF writer; the arch name comes from the model description so
    # the two cannot drift apart.
    writer = gguf.GGUFWriter(path=None, arch=ModelCosmos.arch)

    # Determine default output path and file type from the first tensor.
    src_stem = os.path.splitext(args.src)[0]
    if next(iter(state_dict.values())).dtype == torch.bfloat16:
        out_path = f"{src_stem}-BF16.gguf"
        writer.add_file_type(gguf.LlamaFileType.MOSTLY_BF16)
    else:
        out_path = f"{src_stem}-F16.gguf"
        writer.add_file_type(gguf.LlamaFileType.MOSTLY_F16)

    # Use provided destination or generated path
    out_path = args.dst or out_path

    # Warn if output exists
    if os.path.isfile(out_path):
        input("Output exists. Press Enter to continue or Ctrl+C to abort!")

    # Handle tensors and write GGUF file
    handle_tensors(writer, state_dict)
    writer.write_header_to_file(path=out_path)
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file(progress=True)
    writer.close()

# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()



</details>```
@al-swaiti
Copy link
Author

The latest patch I created:

Click to view the code
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index de3c706f..0267c1fa 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -223,7 +223,7 @@
 #define GGML_MAX_OP_PARAMS      64
 
 #ifndef GGML_MAX_NAME
-#   define GGML_MAX_NAME        64
+#   define GGML_MAX_NAME        128
 #endif
 
 #define GGML_DEFAULT_N_THREADS  4
@@ -2449,6 +2449,7 @@ extern "C" {
 
     // manage tensor info
     GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+    GGML_API void gguf_set_tensor_ndim(struct gguf_context * ctx, const char * name, int n_dim);
     GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
     GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
 
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index b16c462f..6d1568f1 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -22960,6 +22960,14 @@ void gguf_add_tensor(
     ctx->header.n_tensors++;
 }
 
+void gguf_set_tensor_ndim(struct gguf_context * ctx, const char * name, const int n_dim) {
+    const int idx = gguf_find_tensor(ctx, name);
+    if (idx < 0) {
+        GGML_ABORT("tensor not found");
+    }
+    ctx->infos[idx].n_dims = n_dim;
+}
+
 void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
     const int idx = gguf_find_tensor(ctx, name);
     if (idx < 0) {
diff --git a/src/llama.cpp b/src/llama.cpp
index 24e1f1f0..1cb5faab 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -205,6 +205,12 @@ enum llm_arch {
     LLM_ARCH_GRANITE,
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_CHAMELEON,
+    LLM_ARCH_FLUX,
+    LLM_ARCH_SD1,
+    LLM_ARCH_SDXL,
+    LLM_ARCH_COSMOS,
+    LLM_ARCH_SD3,
+    LLM_ARCH_AURA,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -258,6 +264,12 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GRANITE,         "granite"      },
     { LLM_ARCH_GRANITE_MOE,     "granitemoe"   },
     { LLM_ARCH_CHAMELEON,       "chameleon"    },
+    { LLM_ARCH_FLUX,            "flux"         },
+    { LLM_ARCH_SD1,             "sd1"          },
+    { LLM_ARCH_SDXL,            "sdxl"         },
+    { LLM_ARCH_COSMOS,          "cosmos"       },
+    { LLM_ARCH_SD3,             "sd3"          },
+    { LLM_ARCH_AURA,            "aura"         },
     { LLM_ARCH_UNKNOWN,         "(unknown)"    },
 };
 
@@ -1531,6 +1543,12 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
         },
     },
+    { LLM_ARCH_FLUX, {}},
+    { LLM_ARCH_SD1,  {}},
+    { LLM_ARCH_SDXL, {}},
+    { LLM_ARCH_COSMOS, {}},
+    { LLM_ARCH_SD3,  {}},
+    { LLM_ARCH_AURA, {}},
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -5403,6 +5421,12 @@ static void llm_load_hparams(
     // get general kv
     ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
 
+    // Disable LLM metadata for image models
+    if (model.arch == LLM_ARCH_FLUX || model.arch == LLM_ARCH_COSMOS ||model.arch == LLM_ARCH_SD1 || model.arch == LLM_ARCH_SDXL || model.arch == LLM_ARCH_SD3 || model.arch == LLM_ARCH_AURA) {
+        model.ftype = ml.ftype;
+        return;
+    }
+
     // get hparams kv
     ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
 
@@ -18016,6 +18040,125 @@ static void llama_tensor_dequantize_internal(
     workers.clear();
 }
 
+static ggml_type img_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
+    // Special function for quantizing image model tensors
+    const std::string name = ggml_get_name(tensor);
+    const llm_arch arch = qs.model.arch;
+
+    // Sanity check
+    if (
+            (name.find("model.diffusion_model.") != std::string::npos) ||
+            (name.find("first_stage_model.") != std::string::npos) ||
+            (name.find("single_transformer_blocks.") != std::string::npos) ||
+            (name.find("joint_transformer_blocks.") != std::string::npos)
+        ) {
+            throw std::runtime_error("Invalid input GGUF file. This is not a supported UNET model");
+    }
+
+    // Unsupported quant types - exclude all IQ quants for now
+    if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS  ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ2_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M  ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S  ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ1_M   || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S  ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ3_M   || ftype == LLAMA_FTYPE_MOSTLY_Q4_0_4_4 ||
+        ftype == LLAMA_FTYPE_MOSTLY_Q4_0_4_8 || ftype == LLAMA_FTYPE_MOSTLY_Q4_0_8_8) {
+        throw std::runtime_error("Invalid quantization type for image model (Not supported)");
+    }
+
+    if ( // Rules for to_v attention
+            (name.find("attn_v.weight") != std::string::npos) ||
+            (name.find(".to_v.weight") != std::string::npos) || 
+            (name.find(".attn.w1v.weight") != std::string::npos) ||
+            (name.find(".attn.w2v.weight") != std::string::npos)
+        ){
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+                new_type = GGML_TYPE_Q3_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+                new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+                new_type = GGML_TYPE_Q5_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
+                new_type = GGML_TYPE_Q6_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) {
+                new_type = GGML_TYPE_Q5_K;
+            }
+            ++qs.i_attention_wv;
+    } else if ( // Rules for fused qkv attention
+            (name.find("attn_qkv.weight") != std::string::npos) ||
+            (name.find("attn.qkv.weight") != std::string::npos)
+        ) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+                new_type = GGML_TYPE_Q4_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+                new_type = GGML_TYPE_Q5_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
+                new_type = GGML_TYPE_Q6_K;
+            }
+    } else if ( // Rules for ffn
+            (name.find("ffn_down") != std::string::npos)
+        ) {
+            // TODO: add back `layer_info` with some model specific logic + logic further down
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+                new_type = GGML_TYPE_Q4_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+                new_type = GGML_TYPE_Q5_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) {
+                new_type = GGML_TYPE_Q5_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+                new_type = GGML_TYPE_Q6_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) {
+                new_type = GGML_TYPE_Q6_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_0) {
+                new_type = GGML_TYPE_Q4_1;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_0) {
+                new_type = GGML_TYPE_Q5_1;
+            }
+            ++qs.i_ffn_down;
+    }
+
+    // Sanity check for row shape
+    bool convert_incompatible_tensor = false;
+    if (new_type == GGML_TYPE_Q2_K    || new_type == GGML_TYPE_Q3_K    || new_type == GGML_TYPE_Q4_K   ||
+        new_type == GGML_TYPE_Q5_K    || new_type == GGML_TYPE_Q6_K) {
+        int nx = tensor->ne[0];
+        int ny = tensor->ne[1];
+        if (nx % QK_K != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
+            convert_incompatible_tensor = true;
+        } else {
+            ++qs.n_k_quantized;
+        }
+    }
+    if (convert_incompatible_tensor) {
+        // TODO: Possibly reenable this in the future
+        // switch (new_type) {
+        //     case GGML_TYPE_Q2_K:
+        //     case GGML_TYPE_Q3_K:
+        //     case GGML_TYPE_Q4_K:   new_type = GGML_TYPE_Q5_0;   break;
+        //     case GGML_TYPE_Q5_K:   new_type = GGML_TYPE_Q5_1;   break;
+        //     case GGML_TYPE_Q6_K:   new_type = GGML_TYPE_Q8_0;   break;
+        //     default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
+        // }
+        new_type = GGML_TYPE_F16;
+        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+        ++qs.n_fallback;
+    }
+    return new_type;
+}
+
 static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const std::string name = ggml_get_name(tensor);
 
@@ -18547,6 +18690,63 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             ctx_outs[i_split] = gguf_init_empty();
         }
         gguf_add_tensor(ctx_outs[i_split], tensor);
+        // SD3 pos_embed needs special fix as first dim is 1, which gets truncated here
+        if (model.arch == LLM_ARCH_SD3) {
+            const std::string name = ggml_get_name(tensor);
+            if (name == "pos_embed" && tensor->ne[2] == 1) {
+                const int n_dim = 3;
+                gguf_set_tensor_ndim(ctx_outs[i_split], "pos_embed", n_dim);
+                LLAMA_LOG_INFO("\n%s: Correcting pos_embed shape for SD3: [key:%s]\n", __func__, tensor->name);
+            }
+        }
+
+        if (model.arch == LLM_ARCH_COSMOS) {
+            const std::string name = ggml_get_name(tensor);
+            
+            // Handle patch embedding tensors
+            if (name == "net.patch_embed.proj.weight") {
+                const int n_dim = tensor->ne[2] == 1 ? 3 : ggml_n_dims(tensor);
+                gguf_set_tensor_ndim(ctx_outs[i_split], "net.patch_embed.proj.weight", n_dim);
+                LLAMA_LOG_INFO("\n%s: Setting patch_embed.proj.weight dims to %d for Cosmos: [key:%s]\n", 
+                            __func__, n_dim, tensor->name);
+            }
+            else if (name == "net.patch_embed.proj.bias") {
+                const int n_dim = tensor->ne[2] == 1 ? 3 : ggml_n_dims(tensor);
+                gguf_set_tensor_ndim(ctx_outs[i_split], "net.patch_embed.proj.bias", n_dim);
+                LLAMA_LOG_INFO("\n%s: Setting patch_embed.proj.bias dims to %d for Cosmos: [key:%s]\n", 
+                            __func__, n_dim, tensor->name);
+            }
+            
+            // Handle attention tensors
+            else if (name.find("block.attn.") != std::string::npos) {
+                const int n_dim = tensor->ne[2] == 1 ? 3 : ggml_n_dims(tensor);
+                gguf_set_tensor_ndim(ctx_outs[i_split], name.c_str(), n_dim);
+                LLAMA_LOG_INFO("\n%s: Setting attention tensor dims to %d for Cosmos: [key:%s]\n", 
+                            __func__, n_dim, tensor->name);
+            }
+            
+            // Handle norm layers
+            else if (name.find(".norm") != std::string::npos) {
+                const int n_dim = tensor->ne[2] == 1 ? 3 : ggml_n_dims(tensor);
+                gguf_set_tensor_ndim(ctx_outs[i_split], name.c_str(), n_dim);
+                LLAMA_LOG_INFO("\n%s: Setting norm layer dims to %d for Cosmos: [key:%s]\n", 
+                            __func__, n_dim, tensor->name);
+            }
+        }
+        // same goes for auraflow
+        if (model.arch == LLM_ARCH_AURA) {
+            const std::string name = ggml_get_name(tensor);
+            if (name == "positional_encoding" && tensor->ne[2] == 1) {
+                const int n_dim = 3;
+                gguf_set_tensor_ndim(ctx_outs[i_split], "positional_encoding", n_dim);
+                LLAMA_LOG_INFO("\n%s: Correcting positional_encoding shape for AuraFlow: [key:%s]\n", __func__, tensor->name);
+            }
+            if (name == "register_tokens" && tensor->ne[2] == 1) {
+                const int n_dim = 3;
+                gguf_set_tensor_ndim(ctx_outs[i_split], "register_tokens", n_dim);
+                LLAMA_LOG_INFO("\n%s: Correcting register_tokens shape for AuraFlow: [key:%s]\n", __func__, tensor->name);
+            }
+        }
     }
 
     // Set split info if needed
@@ -18647,6 +18847,71 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
+        // rules for image models
+        bool image_model = false;
+        if (model.arch == LLM_ARCH_FLUX) {
+            image_model = true;
+            quantize &= name.find("txt_in.") == std::string::npos;
+            quantize &= name.find("img_in.") == std::string::npos;
+            quantize &= name.find("time_in.") == std::string::npos;
+            quantize &= name.find("vector_in.") == std::string::npos;
+            quantize &= name.find("guidance_in.") == std::string::npos;
+            quantize &= name.find("final_layer.") == std::string::npos;
+        }
+	if (model.arch == LLM_ARCH_COSMOS) {
+	    image_model = true;
+	    quantize &= name.find("logvar") == std::string::npos;
+	    quantize &= name.find("net.affline_norm") == std::string::npos;
+	    quantize &= name.find("net.blocks") == std::string::npos;
+	    quantize &= !(
+		name.find("adaLN_modulation") != std::string::npos ||
+		name.find("block.attn.to_") != std::string::npos ||
+		name.find("block.attn.proj") != std::string::npos ||
+		name.find(".block.norm1") != std::string::npos ||
+		name.find(".block.norm2") != std::string::npos ||
+		name == "net.patch_embed.proj.weight" ||
+		name == "net.patch_embed.proj.bias"
+	    );
+	}
+        if (model.arch == LLM_ARCH_SD1 || model.arch == LLM_ARCH_SDXL) {
+            image_model = true;
+            quantize &= name.find("class_embedding.") == std::string::npos;
+            quantize &= name.find("time_embedding.") == std::string::npos;
+            quantize &= name.find("add_embedding.") == std::string::npos;
+            quantize &= name.find("time_embed.") == std::string::npos;
+            quantize &= name.find("label_emb.") == std::string::npos;
+            quantize &= name.find("conv_in.") == std::string::npos;
+            quantize &= name.find("conv_out.") == std::string::npos;
+            quantize &= name != "input_blocks.0.0.weight";
+            quantize &= name != "out.2.weight";
+        }
+        if (model.arch == LLM_ARCH_SD3) {
+            image_model = true;
+            quantize &= name.find("final_layer.") == std::string::npos;
+            quantize &= name.find("time_text_embed.") == std::string::npos;
+            quantize &= name.find("context_embedder.") == std::string::npos;
+            quantize &= name.find("t_embedder.") == std::string::npos;
+            quantize &= name.find("y_embedder.") == std::string::npos;
+            quantize &= name.find("x_embedder.") == std::string::npos;
+            quantize &= name != "proj_out.weight";
+            quantize &= name != "pos_embed";
+        }
+        if (model.arch == LLM_ARCH_AURA) {
+            image_model = true;
+            quantize &= name.find("t_embedder.") == std::string::npos;
+            quantize &= name.find("init_x_linear.") == std::string::npos;
+            quantize &= name != "modF.1.weight";
+            quantize &= name != "cond_seq_linear.weight";
+            quantize &= name != "final_linear.weight";
+            quantize &= name != "final_linear.weight";
+            quantize &= name != "positional_encoding";
+            quantize &= name != "register_tokens";
+        }
+        // ignore 3D/4D tensors for image models as the code was never meant to handle these
+        if (image_model) {
+            quantize &= ggml_n_dims(tensor) == 2;
+        }
+
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
@@ -18655,6 +18920,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_type = default_type;
 
             // get more optimal quantization type based on the tensor shape, layer, etc.
+            if (image_model) {
+                new_type = img_tensor_get_type(qs, new_type, tensor, ftype);
+            } else {
             if (!params->pure && ggml_is_quantized(default_type)) {
                 new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
             }
@@ -18664,6 +18932,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
                 new_type = params->output_tensor_type;
             }
+            }
 
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
</details>```

@al-swaiti
Copy link
Author

The resulting quantized models are approximately the same size as the original model.

@al-swaiti
Copy link
Author

I quantized it, but I think the model loader does not support this model.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant