
Fix eos tokens for glm4 and adapt to glm3 #3

Open · wants to merge 2 commits into base: glm4
16 changes: 11 additions & 5 deletions convert-hf-to-gguf.py
@@ -2728,6 +2728,8 @@ def set_vocab_chatglm3(self):
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
assert max(tokenizer.get_vocab().values()) < vocab_size
role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
print(vocab_size)
print(max(tokenizer.get_vocab().values()))
for token_id in range(vocab_size):
@@ -2750,7 +2752,11 @@ def set_vocab_chatglm3(self):
text = f"[PAD{token_id}]".encode("utf-8")

if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
toktype = SentencePieceTokenTypes.UNKNOWN
if piece in special_tokens:
# show special tokens in prompt
toktype = SentencePieceTokenTypes.USER_DEFINED
else:
toktype = SentencePieceTokenTypes.UNKNOWN
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
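
The two hunks above make the ChatGLM3 conversion classify the role/control tokens ([MASK], [gMASK], sop, <|user|>, and so on) as USER_DEFINED instead of UNKNOWN, so they stay visible to the tokenizer and can appear in prompts. A minimal Python sketch of that decision, standalone rather than the real convert-hf-to-gguf.py code (the enum values and the example token ids are assumptions for illustration):

from enum import IntEnum

class SentencePieceTokenTypes(IntEnum):
    # illustrative values; the real enum lives in gguf-py
    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4

ROLE_SPECIAL_TOKENS = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
SPECIAL_TOKENS = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + ROLE_SPECIAL_TOKENS

def classify_extra_token(piece: str, token_id: int, sp_vocab_size: int) -> SentencePieceTokenTypes:
    # Tokens at or beyond the SentencePiece vocab size are added tokens; keep
    # the ChatGLM control/role tokens as USER_DEFINED instead of UNKNOWN.
    if token_id < sp_vocab_size:
        return SentencePieceTokenTypes.NORMAL
    if piece in SPECIAL_TOKENS:
        return SentencePieceTokenTypes.USER_DEFINED
    return SentencePieceTokenTypes.UNKNOWN

# hypothetical ids, only to exercise both branches
print(classify_extra_token("<|user|>", 64795, 64789))    # USER_DEFINED
print(classify_extra_token("[PAD64800]", 64800, 64789))  # UNKNOWN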
@@ -2856,9 +2862,9 @@ def set_vocab(self):
special_vocab.chat_template = "ChatGLM4"
special_vocab.merges = merges
# only add special tokens when they were not already loaded from config.json
if len(special_vocab.special_token_ids) == 0:
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
# if len(special_vocab.special_token_ids) == 0:
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
# this one is usually not in config.json anyway
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)
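
The hunk above comments out the `if len(special_vocab.special_token_ids) == 0:` guard, so the GLM-4 conversion now always pins bos, eos and unk to the id of "<|endoftext|>" from the tokenizer's added vocab rather than keeping whatever config.json provided. A hedged Python sketch of that override (the dict stands in for tokenizer.get_added_vocab(); the id is illustrative):

def pick_glm4_special_ids(added_vocab: dict) -> dict:
    # force bos/eos/unk to the "<|endoftext|>" id regardless of config.json
    eot_id = added_vocab["<|endoftext|>"]
    return {"bos": eot_id, "eos": eot_id, "unk": eot_id}

print(pick_glm4_special_ids({"<|endoftext|>": 151329}))  # id here is illustrative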
@@ -2955,7 +2961,7 @@ def parse_args() -> argparse.Namespace:
help="model is executed on big endian machine",
)
parser.add_argument(
"model", type=Path,
"--model", type=Path,
help="directory containing model file",
)
parser.add_argument(
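
The parse_args change above turns the model path from a positional argument into an optional --model flag. A small argparse sketch of the difference (the help text matches the diff; the example path is made up):

import argparse
from pathlib import Path

parser = argparse.ArgumentParser(description="convert-hf-to-gguf (sketch)")
# after this change the path is passed as --model rather than positionally
parser.add_argument("--model", type=Path, help="directory containing model file")

args = parser.parse_args(["--model", "/path/to/hf-model"])
print(args.model)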
35 changes: 31 additions & 4 deletions llama.cpp
@@ -1802,9 +1802,11 @@ enum e_model {
MODEL_2_8B,
MODEL_3B,
MODEL_4B,
MODEL_6B,
MODEL_6_9B,
MODEL_7B,
MODEL_8B,
MODEL_9B,
MODEL_12B,
MODEL_13B,
MODEL_14B,
@@ -3918,9 +3920,11 @@ static const char * llama_model_type_name(e_model type) {
case MODEL_2_8B: return "2.8B";
case MODEL_3B: return "3B";
case MODEL_4B: return "4B";
case MODEL_6B: return "6B";
case MODEL_6_9B: return "6.9B";
case MODEL_7B: return "7B";
case MODEL_8B: return "8B";
case MODEL_9B: return "9B";
case MODEL_12B: return "12B";
case MODEL_13B: return "13B";
case MODEL_14B: return "14B";
@@ -4507,8 +4511,8 @@ static void llm_load_hparams(
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 28: model.type = e_model::MODEL_7B; break;
case 40: model.type = e_model::MODEL_8B; break;
case 28: model.type = e_model::MODEL_6B; break;
case 40: model.type = e_model::MODEL_9B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
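
The three llama.cpp hunks above add MODEL_6B and MODEL_9B entries and relabel the ChatGLM sizes: 28 layers is now reported as 6B (ChatGLM3-6B) and 40 layers as 9B (GLM-4-9B), instead of the earlier 7B/8B labels. A Python sketch of the layer-count mapping, mirroring only what the diff shows:

def chatglm_model_type(n_layer: int) -> str:
    # 28 layers -> "6B", 40 layers -> "9B", anything else stays unknown
    return {28: "6B", 40: "9B"}.get(n_layer, "UNKNOWN")

assert chatglm_model_type(28) == "6B"
assert chatglm_model_type(40) == "9B"
assert chatglm_model_type(32) == "UNKNOWN"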
@@ -18362,6 +18366,19 @@ llama_token_type llama_token_get_type(const struct llama_model * model, llama_to
}

bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
auto arch_name = llama_model_arch_name(model->arch);
auto vocab_type = model->vocab.type;
if (strcmp(arch_name, "chatglm") == 0) {
if (LLAMA_VOCAB_TYPE_BPE == vocab_type) { // glm4
return token != -1 && (
token == llama_token_eos(model) ||
token == llama_token_eot(model) ||
token == 151329 ||
token == 151336 ||
token == 151338
);
}
}
return token != -1 && (
token == llama_token_eos(model) ||
token == llama_token_eot(model)
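
The llama_token_is_eog change above treats three extra GLM-4 token ids (151329, 151336, 151338) as end-of-generation when the architecture is chatglm and the vocab is BPE, on top of the usual eos/eot ids. A Python sketch of the same check (ids copied from the diff; the function is a stand-in, not the llama.cpp API):

GLM4_EXTRA_EOG_IDS = {151329, 151336, 151338}  # ids copied from the diff above

def is_eog(token: int, eos_id: int, eot_id: int, is_chatglm_bpe: bool) -> bool:
    if token == -1:
        return False
    if token in (eos_id, eot_id):
        return True
    # GLM-4 (chatglm arch with a BPE vocab) gets extra stop tokens
    return is_chatglm_bpe and token in GLM4_EXTRA_EOG_IDS

print(is_eog(151336, eos_id=151329, eot_id=-1, is_chatglm_bpe=True))   # True
print(is_eog(151336, eos_id=2, eot_id=-1, is_chatglm_bpe=False))       # False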
@@ -18424,8 +18441,18 @@ int32_t llama_tokenize(
int32_t n_tokens_max,
bool add_special,
bool parse_special) {
auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_special, parse_special);

auto arch_name = llama_model_arch_name(model->arch);
auto prompt = std::move(std::string(text, text_len));
auto vocab_type = model->vocab.type;
if (strcmp(arch_name, "chatglm") == 0) {
// chatglm3
if (LLAMA_VOCAB_TYPE_SPM == vocab_type) {
prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>";
} else if (LLAMA_VOCAB_TYPE_BPE == vocab_type) { // glm4
prompt = "[gMASK]<sop><|user|>\n" + prompt + "<|assistant|>";
}
}
auto res = llama_tokenize_internal(model->vocab, prompt, add_special, parse_special);
if (n_tokens_max < (int) res.size()) {
// LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
return -((int) res.size());
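
The llama_tokenize change above wraps every prompt in the ChatGLM chat template before tokenization when the architecture is chatglm: the SPM vocab path (ChatGLM3) uses "[gMASK]sop...", while the BPE path (GLM-4) uses "[gMASK]<sop>...". A Python sketch of the wrapping, with the vocab type reduced to a plain string:

def wrap_chatglm_prompt(text: str, vocab_type: str) -> str:
    if vocab_type == "SPM":  # chatglm3
        return "[gMASK]sop<|user|>\n" + text + "<|assistant|>"
    if vocab_type == "BPE":  # glm4
        return "[gMASK]<sop><|user|>\n" + text + "<|assistant|>"
    return text  # other models are tokenized unchanged

print(wrap_chatglm_prompt("Hello", "BPE"))

Note that this places the chat template inside llama_tokenize itself, so every caller of the C API gets the wrapping for chatglm models, not just chat front-ends.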