
Commit 745cbcf

llama-quant : fix the verification of attention layers for encoder-decoder models (#16023)
Signed-off-by: Jie Fu <[email protected]>
1 parent 1cbd80f commit 745cbcf


1 file changed: +3 −1 lines changed

src/llama-quant.cpp

Lines changed: 3 additions & 1 deletion
@@ -725,7 +725,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // attention layers have a non-zero number of kv heads
         int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
         if (llama_model_has_encoder(&model)) {
-            n_attn_layer *= 3;
+            // now n_attn_layer is the number of attention layers in the encoder
+            // for each decoder block, there are 2 attention layers
+            n_attn_layer += 2 * model.hparams.dec_n_layer;
         }
         GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
     }
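Reasoning behind the fix, as it can be read from the diff: the old check multiplied the counted attention-layer total by 3 (encoder self-attention, decoder self-attention, decoder cross-attention), which only matches the real tensor count when the decoder has exactly as many blocks as the encoder. The new check keeps the encoder count derived from the kv-head array and adds 2 attention layers per decoder block. Below is a minimal, self-contained sketch of that arithmetic; the layer counts (12 encoder blocks, 6 decoder blocks) are hypothetical and the standalone program is illustrative only, not code from llama.cpp.

#include <cassert>
#include <cstdint>

// Illustrative sketch of the attention-layer counting before and after the fix.
// The values below are assumptions for demonstration, not real model hparams.
int main() {
    const int32_t enc_n_layer = 12; // encoder blocks, each with one self-attention layer
    const int32_t dec_n_layer = 6;  // decoder blocks, each with self- and cross-attention

    // Old formula: every counted (encoder) layer was assumed to expand to
    // 3 attention layers in total.
    const int32_t old_count = enc_n_layer * 3;               // 36

    // New formula: encoder layers as counted, plus 2 per decoder block.
    const int32_t new_count = enc_n_layer + 2 * dec_n_layer; // 24

    // The two only agree when dec_n_layer == enc_n_layer; here they differ,
    // so the old assertion would have failed for this asymmetric model.
    assert(old_count != new_count);
    return 0;
}

With equal encoder and decoder depths the two formulas coincide (e.g. 12 * 3 == 12 + 2 * 12), which is presumably why the old assertion held for symmetric encoder-decoder models but broke once dec_n_layer differs from the encoder depth.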

0 commit comments
