[FastTokenizer] Add unittest for fast_tokenizer (PaddlePaddle#4339)
* with_added_tokens->with_added_vocabulary

* faster->fast

* Add fast_tokenizer to test and ci requirements

* Add unittest for BertTokenizerFast

* faster->fast

* Add ernie tokenizer fast unittest

* fast_tokenizer->fast_tokenizer_python

* Add fast_tokenizer_python

* Add TinyBertFastTokenizer unittest

* Add ernie-m tokenizer unittest

* Add \n for tests/requirements.txt

* Add test_sequence_builders for ErnieMFastTokenizer

* Add full tokenizer test for fast_tokenizer

* Add use_faster args test

* Add test_fast_and_python_full_tokenizer
joey12300 authored Jan 5, 2023
1 parent d218a25 commit 6960a46
Showing 19 changed files with 344 additions and 119 deletions.
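For orientation, a minimal usage sketch of the keyword this commit standardizes (the checkpoint name is an assumption; `fast_tokenizer_python` must be installed for the fast class to be selected):

```python
from paddlenlp.transformers import AutoTokenizer

# New spelling: yields a *FastTokenizer class when fast_tokenizer_python is
# installed, otherwise falls back to the regular Python tokenizer.
tok = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

# Old spelling is still accepted for now, but logs a deprecation warning
# (see the paddlenlp/transformers/auto/tokenizer.py hunk below).
tok_legacy = AutoTokenizer.from_pretrained("bert-base-uncased", use_faster=True)

print(type(tok).__name__, type(tok_legacy).__name__)
```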
@@ -17,7 +17,7 @@ msgstr ""
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.10.1\n"

#: ../source/paddlenlp.transformers.tokenizer_utils_faster.rst:2
msgid "tokenizer\\_utils\\_faster"
#: ../source/paddlenlp.transformers.tokenizer_utils_fast.rst:2
msgid "tokenizer\\_utils\\_fast"
msgstr ""

2 changes: 1 addition & 1 deletion docs/source/paddlenlp.transformers.rst
@@ -79,5 +79,5 @@ paddlenlp.transformers
paddlenlp.transformers.sentencepiece_model_pb2
paddlenlp.transformers.tokenizer_utils
paddlenlp.transformers.tokenizer_utils_base
paddlenlp.transformers.tokenizer_utils_faster
paddlenlp.transformers.tokenizer_utils_fast
paddlenlp.transformers.utils
@@ -1,7 +1,7 @@
tokenizer\_utils\_faster
tokenizer\_utils\_fast
======================================================

.. automodule:: paddlenlp.transformers.tokenizer_utils_faster
.. automodule:: paddlenlp.transformers.tokenizer_utils_fast
:members:
:no-undoc-members:
:show-inheritance:
6 changes: 3 additions & 3 deletions paddlenlp/transformers/auto/tokenizer.py
@@ -238,9 +238,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, from_hf_hub=False, *mode
"""
# Default not to use fast tokenizer
use_fast = kwargs.pop("use_fast", False)
if "use_fast" in kwargs:
use_fast = kwargs.pop("use_fast", False)
logger.warning("The keyword argument `use_fast` is deprecated in future, please use `use_fast` instead")
if "use_faster" in kwargs:
use_fast = kwargs.pop("use_faster", False)
logger.warning("The keyword argument `use_faster` is deprecated in future, please use `use_fast` instead")

all_tokenizer_names = []
for names, tokenizer_class in cls._tokenizer_mapping.items():
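The hunk above swaps the deprecated alias handling over to `use_faster`; as a generic restatement of the pattern (function name and warning text are illustrative, not PaddleNLP API):

```python
import warnings

def pop_renamed_kwarg(kwargs, new_name, old_name, default=False):
    """Read `new_name` from kwargs, honoring the deprecated `old_name` alias."""
    value = kwargs.pop(new_name, default)
    if old_name in kwargs:
        value = kwargs.pop(old_name, default)
        warnings.warn(
            f"`{old_name}` is deprecated, please use `{new_name}` instead",
            FutureWarning,
        )
    return value

# e.g. inside from_pretrained: use_fast = pop_renamed_kwarg(kwargs, "use_fast", "use_faster")
```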
5 changes: 3 additions & 2 deletions paddlenlp/transformers/bert/fast_tokenizer.py
@@ -14,10 +14,11 @@
# limitations under the License.

import json
from typing import List, Optional, Tuple
from typing import Optional, Tuple

from fast_tokenizer import normalizers
from ..tokenizer_utils_faster import PretrainedFastTokenizer

from ..tokenizer_utils_fast import PretrainedFastTokenizer
from .tokenizer import BertTokenizer

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
34 changes: 20 additions & 14 deletions paddlenlp/transformers/convert_slow_tokenizer.py
@@ -15,10 +15,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List, Tuple

from fast_tokenizer import Tokenizer, normalizers, pretokenizers, postprocessors, decoders
from fast_tokenizer.models import WordPiece, FastWordPiece, BPE, Unigram
from fast_tokenizer import (
Tokenizer,
decoders,
normalizers,
postprocessors,
pretokenizers,
)
from fast_tokenizer.models import BPE, FastWordPiece, Unigram


# Extract the vocab and merge file from sentencepiece file
@@ -127,12 +131,12 @@ def unk_id(self, proto):
return proto.trainer_spec.unk_id

def tokenizer(self, proto):
model_type = proto.trainer_spec.model_type
self.model_type = proto.trainer_spec.model_type
vocab = self.vocab(proto)
unk_id = self.unk_id(proto)
if model_type == 1:
if self.model_type == 1:
tokenizer = Tokenizer(Unigram(vocab, unk_id))
elif model_type == 2:
elif self.model_type == 2:
# Special case for ernie-m
if hasattr(self.original_tokenizer, "sentencepiece_model_file"):
orginal_vocab_file = self.original_tokenizer.sentencepiece_model_file
@@ -200,13 +204,15 @@ def converted(self) -> Tokenizer:
class ErnieMConverter(SpmConverter):
def set_model(self, tokenizer):
SPLICE_UNDERLINE = self.replacement()
tokenizer.model.set_filter_token(SPLICE_UNDERLINE)
chinese_chars = r"\x{4e00}-\x{9fff}"
punc_chars = r",;:.?!~,;:。?!《》【】"
digits = r"0-9"
tokenizer.model.set_split_rule(
rf"[{chinese_chars}]|[{punc_chars}]|[{digits}]+|[^{chinese_chars}{punc_chars}{digits}]+"
)
if self.model_type == 1:
# Unigram
tokenizer.model.set_filter_token(SPLICE_UNDERLINE)
chinese_chars = r"\x{4e00}-\x{9fff}"
punc_chars = r",;:.?!~,;:。?!《》【】"
digits = r"0-9"
tokenizer.model.set_split_rule(
rf"[{chinese_chars}]|[{punc_chars}]|[{digits}]+|[^{chinese_chars}{punc_chars}{digits}]+"
)

def normalizer(self, proto):
list_normalizers = []
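For context on the `model_type` guard added to `ErnieMConverter.set_model`, a hedged sketch of where that value comes from in a sentencepiece model proto (the file name is an assumption; ernie-m ships `sentencepiece.bpe.model`):

```python
from sentencepiece import sentencepiece_model_pb2 as sp_model

proto = sp_model.ModelProto()
with open("sentencepiece.bpe.model", "rb") as f:
    proto.ParseFromString(f.read())

# trainer_spec.model_type: 1 selects the Unigram branch, 2 the BPE branch
# in SpmConverter.tokenizer above.
print(proto.trainer_spec.model_type)
```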
5 changes: 3 additions & 2 deletions paddlenlp/transformers/ernie/fast_tokenizer.py
@@ -14,10 +14,11 @@
# limitations under the License.

import json
from typing import List, Optional, Tuple
from typing import Optional, Tuple

from fast_tokenizer import normalizers
from ..tokenizer_utils_faster import PretrainedFastTokenizer

from ..tokenizer_utils_fast import PretrainedFastTokenizer
from .tokenizer import ErnieTokenizer

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
10 changes: 4 additions & 6 deletions paddlenlp/transformers/ernie_m/fast_tokenizer.py
@@ -14,15 +14,13 @@
# limitations under the License.

import os
import json
from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
from shutil import copyfile
from typing import List, Optional, Tuple, Union

from fast_tokenizer import normalizers
from ..tokenizer_utils_faster import PretrainedFastTokenizer
from ..tokenizer_utils_base import TensorType, PaddingStrategy, TruncationStrategy
from .tokenizer import ErnieMTokenizer
from ...utils.log import logger
from ..tokenizer_utils_base import PaddingStrategy, TensorType, TruncationStrategy
from ..tokenizer_utils_fast import PretrainedFastTokenizer
from .tokenizer import ErnieMTokenizer

VOCAB_FILES_NAMES = {
"sentencepiece_model_file": "sentencepiece.bpe.model",
5 changes: 3 additions & 2 deletions paddlenlp/transformers/tinybert/fast_tokenizer.py
@@ -14,10 +14,11 @@
# limitations under the License.

import json
from typing import List, Optional, Tuple
from typing import Optional, Tuple

from fast_tokenizer import normalizers
from ..tokenizer_utils_faster import PretrainedFastTokenizer

from ..tokenizer_utils_fast import PretrainedFastTokenizer
from .tokenizer import TinyBertTokenizer

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
@@ -18,29 +18,25 @@
import os
from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple, Union
import six

from fast_tokenizer import Encoding as FastEncoding
from fast_tokenizer import Tokenizer as FastTokenizer

from .utils import InitTrackerMeta, fn_args_to_dict
from .convert_slow_tokenizer import convert_slow_tokenizer
from .tokenizer_utils import PretrainedTokenizer
from .tokenizer_utils_base import (
AddedToken,
BatchEncoding,
EncodedInput,
EncodedInputPair,
PaddingStrategy,
PreTokenizedInput,
PreTokenizedInputPair,
PretrainedTokenizerBase,
SpecialTokensMixin,
TextInput,
TextInputPair,
TruncationStrategy,
PaddingStrategy,
)
from .tokenizer_utils import PretrainedTokenizer
from paddlenlp.utils.log import logger

TOKENIZER_FILE = "tokenizer.json"
VOCAB_FILES_NAMES = {"tokenizer_file": TOKENIZER_FILE}
@@ -122,7 +118,7 @@ def __len__(self) -> int:
"""
Size of the full vocabulary with the added tokens.
"""
return self._tokenizer.get_vocab_size(with_added_tokens=True)
return self._tokenizer.get_vocab_size(with_added_vocabulary=True)

@property
def backend_tokenizer(self) -> FastTokenizer:
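A small sketch of the renamed backend keyword in use (the checkpoint name is an assumption, and it presumes `fast_tokenizer_python` is installed):

```python
from paddlenlp.transformers.bert.fast_tokenizer import BertFastTokenizer

tok = BertFastTokenizer.from_pretrained("bert-base-uncased")
backend = tok.backend_tokenizer  # the wrapped fast_tokenizer.Tokenizer

# Full vocabulary size, counting any added tokens; __len__ above returns the same value.
print(backend.get_vocab_size(with_added_vocabulary=True) == len(tok))
```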
3 changes: 2 additions & 1 deletion requirements-dev.txt
@@ -4,4 +4,5 @@ pytest
parameterized
pytest-cov
regex
pytest-xdist
pytest-xdist
fast_tokenizer_python
1 change: 1 addition & 0 deletions scripts/regression/requirements_ci.txt
@@ -25,3 +25,4 @@ parameterized
scikit-learn
cma
paddleocr
fast_tokenizer_python
1 change: 1 addition & 0 deletions tests/requirements.txt
@@ -1,3 +1,4 @@
parameterized
sentencepiece
regex
fast_tokenizer_python
7 changes: 7 additions & 0 deletions tests/transformers/auto/test_tokenizer.py
@@ -34,3 +34,10 @@ def test_fast_tokenizer_non_exist(self):
tokenizer1 = AutoTokenizer.from_pretrained("t5-small", use_fast=True)
# T5 FastTokenizer doesn't exist yet, so from_pretrained will return the normal tokenizer.
self.assertIsInstance(tokenizer1, paddlenlp.transformers.T5Tokenizer)

def test_use_faster(self):
tokenizer = AutoTokenizer.from_pretrained("__internal_testing__/bert", use_faster=True)
if is_fast_tokenizer_available():
self.assertIsInstance(tokenizer, paddlenlp.transformers.BertFastTokenizer)
else:
self.assertIsInstance(tokenizer, paddlenlp.transformers.BertTokenizer)
70 changes: 65 additions & 5 deletions tests/transformers/bert/test_tokenizer.py
@@ -16,6 +16,7 @@
import os
import unittest

from paddlenlp.transformers.bert.fast_tokenizer import BertFastTokenizer
from paddlenlp.transformers.bert.tokenizer import (
BasicTokenizer,
BertTokenizer,
@@ -26,12 +27,17 @@
)

from ...testing_utils import slow
from ...transformers.test_tokenizer_common import TokenizerTesterMixin, filter_non_english
from ...transformers.test_tokenizer_common import (
TokenizerTesterMixin,
filter_non_english,
)


class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

tokenizer_class = BertTokenizer
fast_tokenizer_class = BertFastTokenizer
test_fast_tokenizer = True
space_between_special_tokens = True
from_pretrained_filter = filter_non_english
test_seq2seq = False
@@ -73,6 +79,41 @@ def test_full_tokenizer(self):
self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11])

def test_fast_and_python_full_tokenizer(self):
if not self.test_fast_tokenizer:
return

tokenizer = self.get_tokenizer()
tokenizer_fast = self.get_fast_tokenizer()

sequence = "UNwant\u00E9d,running"
tokens = tokenizer.tokenize(sequence)
tokens_fast = tokenizer_fast.tokenize(sequence)
self.assertListEqual(tokens, tokens_fast)

ids = tokenizer.encode(sequence, add_special_tokens=False)["input_ids"]
ids_fast = tokenizer_fast.encode(sequence, add_special_tokens=False)["input_ids"]
self.assertListEqual(ids, ids_fast)

ids = tokenizer.encode(sequence)["input_ids"]
ids_fast = tokenizer_fast.encode(sequence)["input_ids"]
self.assertListEqual(ids, ids_fast)

tokenizer = self.get_tokenizer(do_lower_case=True)
tokenizer_fast = self.get_fast_tokenizer(do_lower_case=True)

tokens = tokenizer.tokenize(sequence)
tokens_fast = tokenizer_fast.tokenize(sequence)
self.assertListEqual(tokens, tokens_fast)

ids = tokenizer.encode(sequence, add_special_tokens=False)["input_ids"]
ids_fast = tokenizer_fast.encode(sequence, add_special_tokens=False)["input_ids"]
self.assertListEqual(ids, ids_fast)

ids = tokenizer.encode(sequence)["input_ids"]
ids_fast = tokenizer_fast.encode(sequence)["input_ids"]
self.assertListEqual(ids, ids_fast)

def test_chinese(self):
tokenizer = BasicTokenizer()

@@ -181,9 +222,12 @@ def test_is_punctuation(self):

def test_clean_text(self):
tokenizer = self.get_tokenizer()

tokenizer_fast = self.get_fast_tokenizer()
# Example taken from the issue https://github.com/huggingface/tokenizers/issues/340
self.assertListEqual([tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]])
self.assertListEqual(
[tokenizer_fast.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]]
)

@slow
def test_sequence_builders(self):
@@ -204,6 +248,7 @@ def test_offsets_with_special_characters(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_fast = self.fast_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

sentence = f"A, naïve {tokenizer.mask_token} AllenNLP sentence."
tokens = tokenizer.encode(
@@ -214,6 +259,14 @@
add_special_tokens=True,
)

tokens_fast = tokenizer_fast.encode(
sentence,
return_attention_mask=False,
return_token_type_ids=False,
return_offsets_mapping=True,
add_special_tokens=True,
)

do_lower_case = tokenizer.do_lower_case if hasattr(tokenizer, "do_lower_case") else False
expected_results = (
[
@@ -250,7 +303,11 @@ def test_offsets_with_special_characters(self):
self.assertEqual(
[e[1] for e in expected_results], tokenizer.convert_ids_to_tokens(tokens["input_ids"])
)
self.assertEqual(
[e[1] for e in expected_results], tokenizer_fast.convert_ids_to_tokens(tokens_fast["input_ids"])
)
self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])
self.assertEqual([e[0] for e in expected_results], tokens_fast["offset_mapping"])

def test_change_tokenize_chinese_chars(self):
list_of_commun_chinese_char = ["的", "人", "有"]
@@ -260,25 +317,28 @@

kwargs["tokenize_chinese_chars"] = True
tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_fast = self.fast_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

ids_without_spe_char_p = tokenizer.encode(
text_with_chinese_char, return_token_type_ids=None, add_special_tokens=False
)["input_ids"]
ids_without_spe_char_fast = tokenizer_fast.encode(
text_with_chinese_char, return_token_type_ids=None, add_special_tokens=False
)["input_ids"]

tokens_without_spe_char_p = tokenizer.convert_ids_to_tokens(ids_without_spe_char_p)
tokens_without_spe_char_fast = tokenizer_fast.convert_ids_to_tokens(ids_without_spe_char_fast)

# it is expected that each Chinese character is not preceded by "##"
self.assertListEqual(tokens_without_spe_char_p, list_of_commun_chinese_char)
self.assertListEqual(tokens_without_spe_char_fast, list_of_commun_chinese_char)

# not yet supported in bert tokenizer
"""
kwargs["tokenize_chinese_chars"] = False
tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
ids_without_spe_char_p = tokenizer.encode(text_with_chinese_char, return_token_type_ids=None,add_special_tokens=False)["input_ids"]
tokens_without_spe_char_p = tokenizer.convert_ids_to_tokens(ids_without_spe_char_p)
# it is expected that only the first Chinese character is not preceded by "##".
expected_tokens = [
f"##{token}" if idx != 0 else token for idx, token in enumerate(list_of_commun_chinese_char)