Added unit tests for setting special tokens
Added unit tests for set_special_tokens_dict() for LlamaTokenizerFast, GPT2TokenizerFast and GPTNeoXTokenizerFast.

Signed-off-by: Luka Dojcinovic <[email protected]>
Luka-D committed Feb 18, 2025
1 parent 2df4780 commit 42a9c32
Showing 1 changed file with 44 additions and 2 deletions.
tests/utils/test_tokenizer_data_utils.py
@@ -1,13 +1,55 @@
-# Third party
+# Third Party
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 # First Party
 from tests.artifacts.testdata import MODEL_NAME
 from tuning.config import configs
 
-# Local
-from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize
+# First party
+from tuning.utils.tokenizer_data_utils import (
+    tokenizer_and_embedding_resize,
+    set_special_tokens_dict,
+)
 
 
+def test_setting_special_tokens_with_LlamaTokenizerFast():
+    # For LlamaTokenizerFast, Missing PAD Token
+    tokenizer = AutoTokenizer.from_pretrained("Maykeye/TinyLLama-v0", legacy=True)
+    model_args = configs.ModelArguments()
+    special_tokens_dict = set_special_tokens_dict(model_args, tokenizer)
+    print(tokenizer)
+    print("Special Tokens", special_tokens_dict)
+    assert special_tokens_dict != {
+        "bos_token": "<s>",
+        "eos_token": "</s>",
+        "unk_token": "<unk>",
+        "pad_token": "<PAD>",
+    }
+
+
+def test_setting_special_tokens_with_GPT2TokenizerFast():
+    # For GPT2TokenizerFast, PAD token = EOS Token
+    tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granite-3.1-8b-base")
+    model_args = configs.ModelArguments()
+    special_tokens_dict = set_special_tokens_dict(model_args, tokenizer)
+    print(tokenizer)
+    print("Special Tokens", special_tokens_dict)
+    assert special_tokens_dict == {
+        "pad_token": "<PAD>",
+    }
+
+
+def test_setting_special_tokens_with_GPTNeoXTokenizerFast():
+    # For GPTNeoXTokenizerFast, Missing PAD Token
+    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
+    model_args = configs.ModelArguments()
+    special_tokens_dict = set_special_tokens_dict(model_args, tokenizer)
+    print(tokenizer)
+    print("Special Tokens", special_tokens_dict)
+    assert special_tokens_dict == {
+        "pad_token": "<PAD>",
+    }
+
+
 def test_tokenizer_and_embedding_resize_return_values():
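For readers skimming the diff, the assertions above pin down the contract these tests expect from set_special_tokens_dict(). The sketch below restates that contract in plain Python; it is an illustration only, not the actual implementation in tuning/utils/tokenizer_data_utils.py, and how model_args is consulted is an assumption.

# A minimal sketch (assumption, not the repository's implementation) of the
# behavior the tests above encode: return only the special tokens that still
# need to be added to the tokenizer.
def set_special_tokens_dict_sketch(model_args, tokenizer) -> dict:
    special_tokens_dict = {}
    # GPTNeoXTokenizerFast ships without a PAD token, and the
    # granite-3.1-8b-base GPT2TokenizerFast reuses EOS as PAD; in both
    # cases the tests expect {"pad_token": "<PAD>"} back.
    if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
        special_tokens_dict["pad_token"] = "<PAD>"
    return special_tokens_dict

Assuming a standard pytest setup, the three new tests can be selected with pytest tests/utils/test_tokenizer_data_utils.py -k special_tokens.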
