Commit fd7d473

fix: tokenizer bug
1 parent 9bec495 commit fd7d473

1 file changed: +9 -6 lines

model2vec/tokenizer/tokenizer.py

Lines changed: 9 additions & 6 deletions
@@ -65,7 +65,9 @@ def replace_vocabulary(
 
     # Remove old added tokens from added tokens
     tokenizer_json["added_tokens"] = [x for x in added_tokens if x["content"] in {"[UNK]", "[PAD]"}]
-    tokenizer_json = process_tokenizer(tokenizer_json, pre_tokenized_tokens, "[UNK]" if "[UNK]" in pre_tokenized_tokens else None)
+    tokenizer_json = process_tokenizer(
+        tokenizer_json, pre_tokenized_tokens, "[UNK]" if "[UNK]" in pre_tokenized_tokens else None
+    )
 
     # Remap special tokens
     tokenizer_json["added_tokens"] = _remap_added_tokens(
@@ -111,11 +113,11 @@ def clean_and_create_vocabulary(
     internal_vocab: dict[str, int] = tokenizer.get_vocab()
     internal_tokens: list[str] = [k for k, _ in sorted(internal_vocab.items(), key=lambda x: x[1])]
 
+    cleaned_vocabulary = _process_internal_tokens(tokenizer, backend_tokenizer, internal_tokens, token_remove_regex)
     # Copy the backend tokenizer to avoid modifying the original.
     backend_tokenizer = backend_tokenizer.from_str(backend_tokenizer.to_str())
     backend_tokenizer = replace_normalizer(backend_tokenizer)
 
-    cleaned_vocabulary = _process_internal_tokens(tokenizer, backend_tokenizer, internal_tokens, token_remove_regex)
     internal_tokens_set = {token.form for token in cleaned_vocabulary}
 
     normalizer: Normalizer | None = backend_tokenizer.normalizer
@@ -302,7 +304,6 @@ def turn_tokens_into_ids(
     :param tokenizer: The tokenizer to use for converting tokens to IDs
    :param unk_token: The string form of the unk token.
     :return: List of token IDs corresponding to the input tokens
-    :raises ValueError: If the tokenizer returns an unexpected number of tokens for a single token
     """
     unk_id = None if unk_token is None else tokenizer.convert_tokens_to_ids(unk_token)
     prefix, suffix = find_eos_bos(tokenizer)
@@ -330,12 +331,14 @@ def find_eos_bos(tokenizer: PreTrainedTokenizerFast) -> tuple[list[int], list[int]]:
     if len(encoding) != 3:
         a_encoded = tokenizer.encode("a", add_special_tokens=False)
         if len(a_encoded) != 1:
-            raise ValueError(f"Error while encoding, couldn't determine eos and bos tokens. The model tokenizes 'a' to '{a_encoded}'")
+            raise ValueError(
+                f"Error while encoding, couldn't determine eos and bos tokens. The model tokenizes 'a' to '{a_encoded}'"
+            )
         a_idx = encoding.index(a_encoded[0])
-        prefix, suffix = encoding[:a_idx], encoding[a_idx + 1:]
+        prefix, suffix = encoding[:a_idx], encoding[a_idx + 1 :]
     else:
         prefix, suffix = encoding[:1], encoding[2:]
-    return prefix,suffix
+    return prefix, suffix
 
 
 def _normalize_vocabulary_token(token: str, pre_tokenizer: PreTokenizer) -> str:
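
For context on the slicing change above: find_eos_bos encodes the single character "a" with and without special tokens and splits the surrounding special-token IDs into a prefix and a suffix. Below is a minimal standalone sketch of that splitting logic, assuming a BERT-style Hugging Face tokenizer; the model name and the IDs in the comments are illustrative and not part of this commit.

```python
# Illustrative sketch of the prefix/suffix split performed in find_eos_bos.
# Assumes a BERT-style fast tokenizer; the example IDs are for bert-base-uncased.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

encoding = tokenizer.encode("a")                              # e.g. [101, 1037, 102]
a_encoded = tokenizer.encode("a", add_special_tokens=False)   # e.g. [1037]

# Everything before the "a" token is the prefix (bos-like IDs),
# everything after it is the suffix (eos-like IDs).
a_idx = encoding.index(a_encoded[0])
prefix, suffix = encoding[:a_idx], encoding[a_idx + 1 :]
print(prefix, suffix)                                         # e.g. [101] [102]
```

If "a" does not encode to exactly one token, the reformatted ValueError in the hunk above is raised instead.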
