diff --git a/tests/utils/test_tokenizer_data_utils.py b/tests/utils/test_tokenizer_data_utils.py
index e24c90099..f9712d05e 100644
--- a/tests/utils/test_tokenizer_data_utils.py
+++ b/tests/utils/test_tokenizer_data_utils.py
@@ -1,13 +1,55 @@
 # Third party
-# Third Party
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 # First Party
 from tests.artifacts.testdata import MODEL_NAME
+from tuning.config import configs
 
 # Local
 # First party
-from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize
+from tuning.utils.tokenizer_data_utils import (
+    tokenizer_and_embedding_resize,
+    set_special_tokens_dict,
+)
+
+
+def test_setting_special_tokens_with_LlamaTokenizerFast():
+    # For LlamaTokenizerFast, Missing PAD Token
+    tokenizer = AutoTokenizer.from_pretrained("Maykeye/TinyLLama-v0", legacy=True)
+    model_args = configs.ModelArguments()
+    special_tokens_dict = set_special_tokens_dict(model_args, tokenizer)
+    print(tokenizer)
+    print("Special Tokens", special_tokens_dict)
+    assert special_tokens_dict != {
+        "bos_token": "",
+        "eos_token": "",
+        "unk_token": "",
+        "pad_token": "",
+    }
+
+
+def test_setting_special_tokens_with_GPT2TokenizerFast():
+    # For GPT2TokenizerFast, PAD token = EOS Token
+    tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granite-3.1-8b-base")
+    model_args = configs.ModelArguments()
+    special_tokens_dict = set_special_tokens_dict(model_args, tokenizer)
+    print(tokenizer)
+    print("Special Tokens", special_tokens_dict)
+    assert special_tokens_dict == {
+        "pad_token": "",
+    }
+
+
+def test_setting_special_tokens_with_GPTNeoXTokenizerFast():
+    # For GPTNeoXTokenizerFast, Missing PAD Token
+    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
+    model_args = configs.ModelArguments()
+    special_tokens_dict = set_special_tokens_dict(model_args, tokenizer)
+    print(tokenizer)
+    print("Special Tokens", special_tokens_dict)
+    assert special_tokens_dict == {
+        "pad_token": "",
+    }
 
 
 def test_tokenizer_and_embedding_resize_return_values():