@@ -65,7 +65,9 @@ def replace_vocabulary(
 
     # Remove old added tokens from added tokens
     tokenizer_json["added_tokens"] = [x for x in added_tokens if x["content"] in {"[UNK]", "[PAD]"}]
-    tokenizer_json = process_tokenizer(tokenizer_json, pre_tokenized_tokens, "[UNK]" if "[UNK]" in pre_tokenized_tokens else None)
+    tokenizer_json = process_tokenizer(
+        tokenizer_json, pre_tokenized_tokens, "[UNK]" if "[UNK]" in pre_tokenized_tokens else None
+    )
 
     # Remap special tokens
     tokenizer_json["added_tokens"] = _remap_added_tokens(
@@ -111,11 +113,11 @@ def clean_and_create_vocabulary(
     internal_vocab: dict[str, int] = tokenizer.get_vocab()
     internal_tokens: list[str] = [k for k, _ in sorted(internal_vocab.items(), key=lambda x: x[1])]
 
+    cleaned_vocabulary = _process_internal_tokens(tokenizer, backend_tokenizer, internal_tokens, token_remove_regex)
     # Copy the backend tokenizer to avoid modifying the original.
     backend_tokenizer = backend_tokenizer.from_str(backend_tokenizer.to_str())
     backend_tokenizer = replace_normalizer(backend_tokenizer)
 
-    cleaned_vocabulary = _process_internal_tokens(tokenizer, backend_tokenizer, internal_tokens, token_remove_regex)
     internal_tokens_set = {token.form for token in cleaned_vocabulary}
 
     normalizer: Normalizer | None = backend_tokenizer.normalizer
@@ -302,7 +304,6 @@ def turn_tokens_into_ids(
     :param tokenizer: The tokenizer to use for converting tokens to IDs
     :param unk_token: The string form of the unk token.
     :return: List of token IDs corresponding to the input tokens
-    :raises ValueError: If the tokenizer returns an unexpected number of tokens for a single token
     """
     unk_id = None if unk_token is None else tokenizer.convert_tokens_to_ids(unk_token)
     prefix, suffix = find_eos_bos(tokenizer)
@@ -330,12 +331,14 @@ def find_eos_bos(tokenizer: PreTrainedTokenizerFast) -> tuple[list[int], list[int]]:
     if len(encoding) != 3:
         a_encoded = tokenizer.encode("a", add_special_tokens=False)
         if len(a_encoded) != 1:
-            raise ValueError(f"Error while encoding, couldn't determine eos and bos tokens. The model tokenizes 'a' to '{a_encoded}'")
+            raise ValueError(
+                f"Error while encoding, couldn't determine eos and bos tokens. The model tokenizes 'a' to '{a_encoded}'"
+            )
         a_idx = encoding.index(a_encoded[0])
-        prefix, suffix = encoding[:a_idx], encoding[a_idx + 1:]
+        prefix, suffix = encoding[:a_idx], encoding[a_idx + 1 :]
     else:
         prefix, suffix = encoding[:1], encoding[2:]
-    return prefix,suffix
+    return prefix, suffix
 
 
 def _normalize_vocabulary_token(token: str, pre_tokenizer: PreTokenizer) -> str:
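For reference, a minimal standalone sketch of the prefix/suffix detection that find_eos_bos performs, using the Hugging Face tokenizer API; the model name below is illustrative and not part of this change:

from transformers import AutoTokenizer

# Any fast tokenizer that wraps input with special tokens works here.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Encoding a single known token with special tokens enabled yields [prefix..., token, suffix...].
encoding = tokenizer.encode("a", add_special_tokens=True)
a_encoded = tokenizer.encode("a", add_special_tokens=False)

# Everything before the token is the prefix (e.g. [CLS]), everything after is the suffix (e.g. [SEP]).
a_idx = encoding.index(a_encoded[0])
prefix, suffix = encoding[:a_idx], encoding[a_idx + 1 :]
print(prefix, suffix)  # for bert-base-uncased: [101] [102]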