Skip to content

Commit 37c6d99

Browse files
committed
Add tokenizer trainer script to create and save a WordLevel tokenizer
1 parent bba0237 commit 37c6d99

File tree

1 file changed

+24
-0
lines changed

1 file changed

+24
-0
lines changed

tokenizer_trainer.py

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
"""Train a WordLevel tokenizer on whitespace-split sequences and save it.

Reads training text from ``sequences.txt`` (assumed: one sequence per line,
tokens separated by whitespace — confirm against the data producer), learns a
word-level vocabulary, wraps the result in a Hugging Face
``PreTrainedTokenizerFast``, and writes it to ``./ho-sequence-tokenizer`` so
it can later be reloaded with ``PreTrainedTokenizerFast.from_pretrained``.
"""
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer
from transformers import PreTrainedTokenizerFast

# Single source of truth for the special tokens, so the model, the trainer,
# and the HF wrapper cannot drift out of sync with each other.
UNK_TOKEN = "[UNK]"
PAD_TOKEN = "[PAD]"
CLS_TOKEN = "[CLS]"
SEP_TOKEN = "[SEP]"
SPECIAL_TOKENS = [UNK_TOKEN, PAD_TOKEN, CLS_TOKEN, SEP_TOKEN]

TRAIN_FILES = ["sequences.txt"]          # training corpus, whitespace-delimited
OUTPUT_DIR = "./ho-sequence-tokenizer"   # directory read back by from_pretrained()

# Initialize the tokenizer: WordLevel maps each whole (pre-tokenized) token
# to a single id, falling back to UNK_TOKEN for out-of-vocabulary words.
tokenizer = Tokenizer(WordLevel(unk_token=UNK_TOKEN))
tokenizer.pre_tokenizer = Whitespace()

# Trainer to learn the vocabulary from the training files; special tokens
# are reserved ids even if they never appear in the corpus.
trainer = WordLevelTrainer(special_tokens=SPECIAL_TOKENS)
tokenizer.train(files=TRAIN_FILES, trainer=trainer)

# Wrap in a transformers-compatible fast tokenizer so downstream models see
# the pad/cls/sep/unk attributes (needed e.g. for batching and padding).
hf_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token=UNK_TOKEN,
    pad_token=PAD_TOKEN,
    cls_token=CLS_TOKEN,
    sep_token=SEP_TOKEN,
)

# Save the tokenizer for future use:
# PreTrainedTokenizerFast.from_pretrained(OUTPUT_DIR)
hf_tokenizer.save_pretrained(OUTPUT_DIR)

0 commit comments

Comments
 (0)