llama : support qwen3 rerank and embeddings #14029
base: master
Changes from all commits
3f3b9a2
f8fd440
e0eb4b8
030dc3b
f8facb3
0777cd3
8edd2cf
c02f53d
c2f4dc7
cbb6f20
@@ -825,6 +825,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         case LLM_ARCH_QWEN3:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
                 switch (hparams.n_layer) {
                     case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
                     case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
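For context, LLM_KV_POOLING_TYPE resolves to the GGUF key "%s.pooling_type", i.e. "qwen3.pooling_type" for these models, which the conversion script writes for the embedding and rerank variants. Below is a minimal standalone sketch that inspects this key in a converted file using the public gguf API rather than llama.cpp internals; the key's presence and its storage as a u32 are assumptions about the converter output, and the enum comment follows llama_pooling_type:

```cpp
#include <cstdio>
#include "gguf.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
        return 1;
    }

    // metadata only: no need to allocate tensor data
    struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
    struct gguf_context * ctx = gguf_init_from_file(argv[1], params);
    if (ctx == nullptr) {
        fprintf(stderr, "failed to load %s\n", argv[1]);
        return 1;
    }

    const int64_t kid = gguf_find_key(ctx, "qwen3.pooling_type");
    if (kid >= 0) {
        // llama_pooling_type: 0 = none, 1 = mean, 2 = cls, 3 = last, 4 = rank
        printf("pooling_type = %u\n", gguf_get_val_u32(ctx, kid));
    } else {
        printf("no pooling_type key (plain causal-LM conversion)\n");
    }

    gguf_free(ctx);
    return 0;
}
```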
@@ -2468,6 +2470,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

+                // output rerank
+                cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+
                 // output
                 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                 output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
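The optional cls_out tensor is what distinguishes the rerank variant: when present, it projects the pooled hidden state down to hparams.n_cls_out values (a single relevance logit for a reranker). A hedged fragment of how such a head is typically applied in a ggml graph; tensor names and shapes are illustrative, not the exact graph code from this PR:

```cpp
// inp:     [n_embd, n_seqs]     pooled hidden state, one column per sequence
// cls_out: [n_embd, n_cls_out]  classification head created above
// ggml_mul_mat contracts along ne[0] (n_embd), so the result is
// [n_cls_out, n_seqs]: one relevance score per sequence when n_cls_out == 1.
ggml_tensor * scores = ggml_mul_mat(ctx0, cls_out, inp);
```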
@@ -7057,7 +7062,7 @@ struct llm_build_qwen3 : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

-            if (il == n_layer - 1) {
+            if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
                 // skip computing output for unused tokens
                 ggml_tensor * inp_out_ids = build_inp_out_ids();
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
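The extra pooling_type condition matters because the existing optimization drops the hidden-state rows of every token that doesn't produce logits, while any pooling mode other than NONE still needs those rows at the end of the graph (LAST needs each sequence's final token, MEAN needs all of them). Below is a self-contained toy example in plain ggml, independent of llama.cpp internals, of the kind of row gather that last-token pooling performs; it could not work if the rows had been discarded earlier:

```cpp
#include <cstdio>
#include "ggml.h"
#include "ggml-cpu.h" // ggml_graph_compute_with_ctx

int main() {
    struct ggml_init_params ip = { /*mem_size =*/ 16*1024*1024, /*mem_buffer =*/ nullptr, /*no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    // toy "hidden states": n_embd = 4, n_tokens = 6 (two sequences of 3 tokens)
    struct ggml_tensor * hs  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 6);
    // one index per sequence: the position of its last token
    struct ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);

    // gather the selected rows -> [n_embd, n_seqs] = [4, 2]
    struct ggml_tensor * pooled = ggml_get_rows(ctx, hs, ids);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, pooled);

    for (int i = 0; i < 4*6; ++i) ((float *) hs->data)[i] = (float) i;
    ((int32_t *) ids->data)[0] = 2; // last token of sequence 0
    ((int32_t *) ids->data)[1] = 5; // last token of sequence 1

    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

    // first element of sequence 0's pooled vector: row 2 starts at value 8
    printf("pooled[0][0] = %.1f\n", ((float *) pooled->data)[0]);

    ggml_free(ctx);
    return 0;
}
```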
@@ -13788,7 +13793,8 @@ uint64_t llama_model_size(const llama_model * model) {
 }

 const char * llama_model_chat_template(const llama_model * model, const char * name) {
-    const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE_N)
+    const auto key = name
+        ? LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE_N) + std::string(name)
         : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
     const auto & it = model->gguf_kv.find(key);
     if (it == model->gguf_kv.end()) {

Review thread on lines +13796 to +13797:

- I wonder how long this has been broken?
- Ah, it seems it has never worked; it's been broken since it was introduced in #11016.
- It's not used by any of the examples, so we don't know if it ever worked in the first place (it's probably used by downstream projects, but idk).
- I'll make a PR.
- Actually, you can use LLM_KV_TOKENIZER_CHAT_TEMPLATE with a suffix: llama.cpp/src/llama-arch.cpp, lines 1722 to 1725 in c02f53d.
- I tried doing this, but it doesn't work; maybe it's buggy somewhere else.
- I was looking in the wrong place. This is where it's broken: llama.cpp/src/llama-arch.cpp, lines 1709 to 1712 in e83ba3e.
- Fixed in #14050.
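For context on what #14050 addresses, here is a minimal sketch of what the suffixed lookup is meant to produce. It assumes the two format strings involved are "tokenizer.chat_template" (LLM_KV_TOKENIZER_CHAT_TEMPLATE) and "tokenizer.chat_template.%s" (LLM_KV_TOKENIZER_CHAT_TEMPLATE_N); the helper is illustrative, not the actual LLM_KV implementation:

```cpp
#include <cstdio>
#include <string>

// Assumed GGUF key formats (per llama-arch.cpp):
static const char * KV_CHAT_TEMPLATE   = "tokenizer.chat_template";    // default template
static const char * KV_CHAT_TEMPLATE_N = "tokenizer.chat_template.%s"; // named template

// Hypothetical helper mirroring what llama_model_chat_template needs:
// substitute the template name into the suffixed key format.
static std::string chat_template_key(const char * name) {
    if (name == nullptr) {
        return KV_CHAT_TEMPLATE;
    }
    char buf[256];
    snprintf(buf, sizeof(buf), KV_CHAT_TEMPLATE_N, name);
    return buf;
}

int main() {
    printf("%s\n", chat_template_key(nullptr).c_str());    // tokenizer.chat_template
    printf("%s\n", chat_template_key("tool_use").c_str()); // tokenizer.chat_template.tool_use
    return 0;
}
```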