Skip to content

Commit

Permalink
llama-vocab, llama : handle <|eom_id|> Llama-3.1 token
Browse files Browse the repository at this point in the history
  • Loading branch information
sszymczy committed Jul 30, 2024
1 parent 7c27a19 commit cc50e78
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 1 deletion.
7 changes: 6 additions & 1 deletion src/llama-vocab.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1444,7 +1444,8 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
return token != -1 && (
token == llama_token_eos_impl(vocab) ||
token == llama_token_eot_impl(vocab)
token == llama_token_eot_impl(vocab) ||
token == llama_token_eom_impl(vocab)
);
}

Expand Down Expand Up @@ -1500,6 +1501,10 @@ llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
return vocab.special_eot_id;
}

// Accessor for the vocab's EOM ("<|eom_id|>") token id.
// Yields the value stored in vocab.special_eom_id; -1 means no EOM token is set.
llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
    const llama_token eom_id = vocab.special_eom_id;
    return eom_id;
}

int32_t llama_tokenize_impl(
const struct llama_vocab & vocab,
const char * text,
Expand Down
2 changes: 2 additions & 0 deletions src/llama-vocab.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ struct llama_vocab {
id special_suffix_id = -1;
id special_middle_id = -1;
id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
id special_eom_id = -1;

// tokenizer flags
bool tokenizer_add_space_prefix = false;
Expand Down Expand Up @@ -101,6 +102,7 @@ llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
llama_token llama_token_eot_impl (const struct llama_vocab & vocab);
llama_token llama_token_eom_impl (const struct llama_vocab & vocab);

int32_t llama_tokenize_impl(
const struct llama_vocab & vocab,
Expand Down
17 changes: 17 additions & 0 deletions src/llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,7 @@ enum llm_kv {
LLM_KV_TOKENIZER_SUFFIX_ID,
LLM_KV_TOKENIZER_MIDDLE_ID,
LLM_KV_TOKENIZER_EOT_ID,
LLM_KV_TOKENIZER_EOM_ID,

LLM_KV_ADAPTER_TYPE,
LLM_KV_ADAPTER_LORA_ALPHA,
Expand Down Expand Up @@ -459,6 +460,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
{ LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
{ LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
{ LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" },

{ LLM_KV_ADAPTER_TYPE, "adapter.type" },
{ LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
Expand Down Expand Up @@ -5585,6 +5587,7 @@ static void llm_load_vocab(
{ LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
{ LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
{ LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
{ LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id },
};

for (const auto & it : special_token_types) {
Expand Down Expand Up @@ -5637,6 +5640,20 @@ static void llm_load_vocab(
}
}
}

// find EOM token: "<|eom_id|>"
//
// TODO: convert scripts should provide this token through the KV metadata LLM_KV_TOKENIZER_EOM_ID
//       for now, we apply this workaround to find the EOM token based on its text
if (vocab.special_eom_id == -1) {
    // token_to_id is keyed by token text, so a direct lookup replaces the scan over every entry
    const auto it = vocab.token_to_id.find("<|eom_id|>");
    if (it != vocab.token_to_id.end()) {
        vocab.special_eom_id = it->second;
    }
}

}

// build special tokens cache
Expand Down

0 comments on commit cc50e78

Please sign in to comment.