diff --git a/tests/utils/test_tokenizer_data_utils.py b/tests/utils/test_tokenizer_data_utils.py
index e24c90099..f9712d05e 100644
--- a/tests/utils/test_tokenizer_data_utils.py
+++ b/tests/utils/test_tokenizer_data_utils.py
@@ -1,13 +1,55 @@
-# Third party
# Third Party
from transformers import AutoModelForCausalLM, AutoTokenizer
# First Party
from tests.artifacts.testdata import MODEL_NAME
+from tuning.config import configs
# Local
-# First party
-from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize
+from tuning.utils.tokenizer_data_utils import (
+ set_special_tokens_dict,
+ tokenizer_and_embedding_resize,
+)
+
+
+def test_setting_special_tokens_with_LlamaTokenizerFast():
+ # For LlamaTokenizerFast, the tokenizer is missing a PAD token
+ tokenizer = AutoTokenizer.from_pretrained("Maykeye/TinyLLama-v0", legacy=True)
+ model_args = configs.ModelArguments()
+ special_tokens_dict = set_special_tokens_dict(model_args, tokenizer)
+ print(tokenizer)
+ print("Special Tokens", special_tokens_dict)
+ assert special_tokens_dict != {
+ "bos_token": "",
+ "eos_token": "",
+ "unk_token": "",
+ "pad_token": "",
+ }
+
+
+def test_setting_special_tokens_with_GPT2TokenizerFast():
+ # For GPT2TokenizerFast, the PAD token is the same as the EOS token
+ tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granite-3.1-8b-base")
+ model_args = configs.ModelArguments()
+ special_tokens_dict = set_special_tokens_dict(model_args, tokenizer)
+ print(tokenizer)
+ print("Special Tokens", special_tokens_dict)
+ assert special_tokens_dict == {
+ "pad_token": "",
+ }
+
+
+def test_setting_special_tokens_with_GPTNeoXTokenizerFast():
+ # For GPTNeoXTokenizerFast, the tokenizer is missing a PAD token
+ tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
+ model_args = configs.ModelArguments()
+ special_tokens_dict = set_special_tokens_dict(model_args, tokenizer)
+ print(tokenizer)
+ print("Special Tokens", special_tokens_dict)
+ assert special_tokens_dict == {
+ "pad_token": "",
+ }
def test_tokenizer_and_embedding_resize_return_values():