
Commit d3286d6

tokenize: fix double BOS token
1 parent 858f6b7 commit d3286d6

21 files changed: +78 additions, -58 deletions
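
What the fix addresses: after this change, llama_tokenize can strip a duplicated leading BOS token, which presumably arises when add_special prepends a BOS while the prompt text itself already produces one (for example, a prompt that literally starts with "<s>" tokenized with parse_special enabled). The following is a minimal standalone sketch of that dedup rule only, not part of the diff; the token ids (bos = 1, 15043, 3186) and the simplified helper are illustrative stand-ins.

// Standalone sketch of the rule added by this commit:
// if the first two tokens are both BOS, drop one of them.
#include <cstdint>
#include <cstdio>
#include <vector>

using llama_token = int32_t; // stand-in for llama.cpp's token id type

// mirrors llama_fix_double_bos() from common/common.cpp, with the BOS id passed in directly
static void fix_double_bos(std::vector<llama_token> & prompt, llama_token bos) {
    if (prompt.size() >= 2 && prompt[0] == bos && prompt[1] == bos) {
        prompt.erase(prompt.begin()); // keep a single leading BOS
    }
}

int main() {
    const llama_token bos = 1;                                    // illustrative BOS id
    std::vector<llama_token> tokens = { bos, bos, 15043, 3186 };  // illustrative prompt tokens
    fix_double_bos(tokens, bos);
    for (llama_token t : tokens) {
        printf("%d ", t);                                         // prints: 1 15043 3186
    }
    printf("\n");
    return 0;
}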

common/common.cpp

Lines changed: 15 additions & 3 deletions

@@ -2343,15 +2343,17 @@ std::vector<llama_token> llama_tokenize(
         const struct llama_context * ctx,
         const std::string & text,
         bool add_special,
-        bool parse_special) {
-    return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+        bool parse_special,
+        bool fix_double_bos) {
+    return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special, fix_double_bos);
 }
 
 std::vector<llama_token> llama_tokenize(
         const struct llama_model * model,
         const std::string & text,
         bool add_special,
-        bool parse_special) {
+        bool parse_special,
+        bool fix_double_bos) {
     // upper limit for the number of tokens
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
@@ -2363,9 +2365,19 @@ std::vector<llama_token> llama_tokenize(
     } else {
         result.resize(n_tokens);
     }
+    if (fix_double_bos) {
+        llama_fix_double_bos(model, result);
+    }
     return result;
 }
 
+void llama_fix_double_bos(const struct llama_model * model, std::vector<llama_token> & prompt) {
+    const llama_token bos = llama_token_bos(model);
+    if (prompt.size() >= 2 && prompt[0] == bos && prompt[1] == bos) {
+        prompt.erase(prompt.begin(), prompt.begin() + 1);
+    }
+}
+
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::vector<char> result(8, 0);
     const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
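
For code in the tree, opting in is a matter of passing the new fifth argument; the call sites updated below do exactly that. A hedged usage sketch, assuming a valid llama_context obtained through the usual common initialization path; the tokenize_prompt wrapper name and its comments are illustrative, not from the commit:

#include <string>
#include <vector>
#include "common.h"

// Illustrative wrapper: tokenize a prompt that may already begin with a literal "<s>".
static std::vector<llama_token> tokenize_prompt(const llama_context * ctx, const std::string & prompt) {
    // add_special = true prepends BOS (for models that add one); parse_special = true lets a
    // literal "<s>" in the text become a BOS as well; fix_double_bos = true collapses the
    // resulting leading BOS pair back to a single BOS.
    return ::llama_tokenize(ctx, prompt, /*add_special=*/true, /*parse_special=*/true, /*fix_double_bos=*/true);
}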

common/common.h

Lines changed: 7 additions & 2 deletions

@@ -238,13 +238,18 @@ std::vector<llama_token> llama_tokenize(
         const struct llama_context * ctx,
         const std::string & text,
         bool add_special,
-        bool parse_special = false);
+        bool parse_special = false,
+        bool fix_double_bos = false);
 
 std::vector<llama_token> llama_tokenize(
         const struct llama_model * model,
         const std::string & text,
         bool add_special,
-        bool parse_special = false);
+        bool parse_special = false,
+        bool fix_double_bos = false);
+
+// if the first and the second token in the prompt are both BOS, remove the first token
+void llama_fix_double_bos(const struct llama_model * model, std::vector<llama_token> & prompt);
 
 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
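
Because llama_fix_double_bos is exposed here as a standalone helper, the cleanup can also be applied as a separate step on an already-built token list instead of through the new flag. A small sketch under that assumption; the wrapper name is illustrative, not from the commit:

#include <string>
#include <vector>
#include "common.h"

// Equivalent to passing fix_double_bos = true, but done after the fact.
static std::vector<llama_token> tokenize_then_dedup(const llama_context * ctx, const std::string & text) {
    std::vector<llama_token> toks = ::llama_tokenize(ctx, text, /*add_special=*/true, /*parse_special=*/true);
    llama_fix_double_bos(llama_get_model(ctx), toks); // no-op unless toks[0] and toks[1] are both BOS
    return toks;
}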

examples/batched/batched.cpp

Lines changed: 1 addition & 1 deletion

@@ -71,7 +71,7 @@ int main(int argc, char ** argv) {
     // tokenize the prompt
 
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(model, params.prompt, true);
+    tokens_list = ::llama_tokenize(model, params.prompt, true, true, true);
 
     const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;

examples/beam-search/beam-search.cpp

Lines changed: 1 addition & 1 deletion

@@ -137,7 +137,7 @@ int main(int argc, char ** argv)
     // Tokenize the prompt :
     //---------------------------------
 
-    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true, true, true);
 
     const size_t max_context_size = llama_n_ctx( ctx );
     const size_t max_tokens_list_size = max_context_size - 4 ;

examples/embedding/embedding.cpp

Lines changed: 1 addition & 1 deletion

@@ -114,7 +114,7 @@ int main(int argc, char ** argv) {
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = ::llama_tokenize(ctx, prompt, true, false);
+        auto inp = ::llama_tokenize(ctx, prompt, true, false, true);
         if (inp.size() > n_batch) {
             fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);

examples/imatrix/imatrix.cpp

Lines changed: 1 addition & 1 deletion

@@ -378,7 +378,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
     auto tim1 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true, true, true);
 
     auto tim2 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

examples/infill/infill.cpp

Lines changed: 7 additions & 7 deletions

@@ -248,8 +248,8 @@ int main(int argc, char ** argv) {
         suff_rm_leading_spc = false;
     }
     std::vector<llama_token> embd_inp;
-    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
-    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true, false);
+    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true, false);
     const int space_token = 29871;
     if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
         inp_sfx.erase(inp_sfx.begin());
@@ -280,10 +280,10 @@ int main(int argc, char ** argv) {
     if (ctx_guidance) {
         LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
 
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true, true);
         LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
 
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
         LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
 
         original_prompt_len = original_inp.size();
@@ -630,8 +630,8 @@ int main(int argc, char ** argv) {
                 suff_rm_leading_spc = false;
             }
             // tokenize new prefix and suffix
-            std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
-            std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+            std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true, false);
+            std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true, false);
             if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
                 inp_sfx.erase(inp_sfx.begin());
             }
@@ -703,7 +703,7 @@ int main(int argc, char ** argv) {
 
             const size_t original_size = embd_inp.size();
 
-            const auto line_inp = ::llama_tokenize(ctx, buffer, false);
+            const auto line_inp = ::llama_tokenize(ctx, buffer, false, true, false);
             LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
 
             embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

examples/llava/llava-cli.cpp

Lines changed: 4 additions & 4 deletions

@@ -35,7 +35,7 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
 
 static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
     std::string str2 = str;
-    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
+    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true, add_bos);
     eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
     return true;
 }
@@ -156,14 +156,14 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         user_prompt = prompt.substr(image_pos + std::string("<image>").length());
         LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
         if (params->verbose_prompt) {
-            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
+            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
                 LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
         LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
         if (params->verbose_prompt) {
-            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
                 LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
@@ -173,7 +173,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
         user_prompt = prompt + "\nASSISTANT:";
         if (params->verbose_prompt) {
-            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
                 LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }

examples/lookahead/lookahead.cpp

Lines changed: 1 addition & 1 deletion

@@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> inp;
     std::vector<llama_token> all;
 
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
     all = inp;
 
     const int max_context_size = llama_n_ctx(ctx);

examples/lookup/lookup-create.cpp

Lines changed: 1 addition & 1 deletion

@@ -29,7 +29,7 @@ int main(int argc, char ** argv){
 
     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
     fprintf(stderr, "%s: tokenization done\n", __func__);
3535
