Commit b5328bb

GH-3652: fix tokenization issue
1 parent 6adefcc commit b5328bb

File tree: 2 files changed (+54 / -64 lines changed)

flair/tokenization.py (+23 / -31)

@@ -316,18 +316,20 @@ class StaccatoTokenizer(Tokenizer):
     - Sequences of numbers are kept together as single tokens
     - Kanji characters are split into individual tokens
     - Uninterrupted sequences of letters (Latin, Cyrillic, etc.) and kana are preserved as single tokens
+    - Whitespace and common zero-width characters are ignored.
     """

     def __init__(self):
         super().__init__()
         # Define patterns for different character types
-        self.punctuation = r"[^\w\s]"  # Non-alphanumeric, non-whitespace
+        # Punctuation/Symbols: Non-alphanumeric, non-whitespace, excluding common zero-width characters and BOM
+        self.punctuation = r"[^\w\s\uFE00-\uFE0F\u200B-\u200D\u2060-\u206F\uFEFF]"
         self.digits = r"\d+"  # One or more digits
         self.kanji = r"[\u4e00-\u9fff]"  # Kanji characters

         # Unicode ranges for various alphabets and scripts
-        # This includes Latin, Cyrillic, Greek, Hebrew, Arabic, etc.
-        self.alphabets = [
+        # This includes Latin, Cyrillic, Greek, Hebrew, Arabic, Japanese Kana, Korean Hangul, etc.
+        alphabets_list = [
             r"[a-zA-Z]+",  # Latin
             r"[\u0400-\u04FF\u0500-\u052F]+",  # Cyrillic and Cyrillic Supplement
             r"[\u0370-\u03FF\u1F00-\u1FFF]+",  # Greek and Coptic
@@ -337,42 +339,32 @@ def __init__(self):
             r"[\u3040-\u309F]+",  # Hiragana
             r"[\u30A0-\u30FF]+",  # Katakana
             r"[\uAC00-\uD7AF]+",  # Hangul (Korean)
-            # Add more scripts as needed
+            # Add more script ranges here if needed
         ]
+        self.alphabet_pattern = "|".join(alphabets_list)

-        # Combined pattern for tokenization
-        self.alphabet_pattern = "|".join(self.alphabets)
+        # Combined pattern for re.findall:
+        # Captures letter sequences OR digit sequences OR Kanji OR punctuation/symbols
+        combined_pattern = f"({self.alphabet_pattern})|({self.digits})|({self.kanji})|({self.punctuation})"
+        # Pre-compile the regex for efficiency
+        self.token_pattern = re.compile(combined_pattern)

     def tokenize(self, text: str) -> list[str]:
         """
-        Tokenize the input text according to the defined rules.
+        Tokenize the input text using re.findall to extract valid tokens.

         Args:
             text: The input text to tokenize

         Returns:
-            A list of tokens
+            A list of tokens (strings)
         """
-        # Create a pattern that matches:
-        # 1. Punctuation characters
-        # 2. Number sequences
-        # 3. Kanji characters individually
-        # 4. Letter sequences from various scripts
-        pattern = f"({self.punctuation}|{self.digits}|{self.kanji})"
-
-        # First split by punctuation, numbers, and kanji
-        raw_tokens = []
-        parts = re.split(pattern, text)
-
-        # Filter out empty strings
-        for part in parts:
-            if part:
-                # If part is punctuation, number, or kanji, add it directly
-                if re.fullmatch(pattern, part):
-                    raw_tokens.append(part)
-                else:
-                    # For other text, split by whitespace
-                    subparts = part.split()
-                    raw_tokens.extend(subparts)
-
-        return raw_tokens
+        # Find all matches for the defined token patterns
+        matches = self.token_pattern.findall(text)
+
+        # re.findall returns a list of tuples, where each tuple corresponds to the capturing groups.
+        # For a match, only one group will be non-empty. We extract that non-empty group.
+        # Example match: ('word', '', '', '') or ('', '123', '', '') or ('', '', '好', '') or ('', '', '', '.')
+        tokens = [next(filter(None, match_tuple)) for match_tuple in matches]
+
+        return tokens
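
For illustration, here is a minimal standalone sketch (not part of the commit) of the new findall-based extraction. The pattern below is a simplified subset of StaccatoTokenizer's (Latin letters, digits, Kanji, and the revised punctuation class); the variable names and sample string are made up. It shows why zero-width characters such as U+200B no longer produce spurious tokens:

import re

# Simplified stand-ins for the tokenizer's patterns (illustrative only)
letters = r"[a-zA-Z]+"
digits = r"\d+"
kanji = r"[\u4e00-\u9fff]"
punctuation = r"[^\w\s\uFE00-\uFE0F\u200B-\u200D\u2060-\u206F\uFEFF]"
token_pattern = re.compile(f"({letters})|({digits})|({kanji})|({punctuation})")

text = "xt \u200b , 123!"  # sample string containing a zero-width space (U+200B)
matches = token_pattern.findall(text)
# findall returns one tuple per match; exactly one capturing group is non-empty,
# e.g. ('xt', '', '', '') or ('', '', '', ',') -- take the non-empty group as the token.
tokens = [next(filter(None, m)) for m in matches]
print(tokens)  # ['xt', ',', '123', '!'] -- the zero-width space is simply skipped

With the old punctuation class r"[^\w\s]", U+200B (which is not matched by \s) was emitted as its own punctuation token; excluding the zero-width ranges is what lets the previously skipped encoding test below pass.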

tests/test_tokenize_sentence.py (+31 / -33)

@@ -48,29 +48,6 @@ def test_create_sentence_with_extra_whitespace():
     assert sentence.get_token(4).text == "."


-@pytest.mark.skip(reason="Fix these issues for StaccatoTokenizer in future PR")
-def test_create_sentence_difficult_encoding():
-    text = "so out of the norm ❤ ️ enjoyed every moment️"
-    sentence = Sentence(text)
-    assert len(sentence) == 9
-
-    text = (
-        "equivalently , accumulating the logs as :( 6 ) sl = 1N ∑ t = 1Nlogp "
-        "( Ll | xt \u200b , θ ) where "
-        "p ( Ll | xt \u200b , θ ) represents the class probability output"
-    )
-    sentence = Sentence(text)
-    assert len(sentence) == 37
-
-    text = "This guy needs his own show on Discivery Channel ! "
-    sentence = Sentence(text)
-    assert len(sentence) == 10
-
-    text = "n't have new vintages."
-    sentence = Sentence(text, use_tokenizer=True)
-    assert len(sentence) == 5
-
-
 def test_create_sentence_word_by_word():
     token1: Token = Token("Munich")
     token2: Token = Token("and")
@@ -403,25 +380,25 @@ def test_print_sentence_plain(tasks_base_path):
     sentence = corpus.train[0]
     sentence.infer_space_after()
     assert (
-        sentence.to_tokenized_string() == 'Schartau sagte dem " Tagesspiegel " vom Freitag , Fischer sei " in '
-        "einer Weise aufgetreten , "
-        'die alles andere als überzeugend war " .'
+        sentence.to_tokenized_string() == 'Schartau sagte dem " Tagesspiegel " vom Freitag , Fischer sei " in '
+        "einer Weise aufgetreten , "
+        'die alles andere als überzeugend war " .'
     )
     assert (
-        sentence.to_plain_string() == 'Schartau sagte dem "Tagesspiegel" vom Freitag, Fischer sei "in einer '
-        "Weise aufgetreten, die "
-        'alles andere als überzeugend war".'
+        sentence.to_plain_string() == 'Schartau sagte dem "Tagesspiegel" vom Freitag, Fischer sei "in einer '
+        "Weise aufgetreten, die "
+        'alles andere als überzeugend war".'
     )

     sentence = corpus.train[1]
     sentence.infer_space_after()
     assert (
-        sentence.to_tokenized_string() == "Firmengründer Wolf Peter Bree arbeitete Anfang der siebziger Jahre als "
-        "Möbelvertreter , als er einen fliegenden Händler aus dem Libanon traf ."
+        sentence.to_tokenized_string() == "Firmengründer Wolf Peter Bree arbeitete Anfang der siebziger Jahre als "
+        "Möbelvertreter , als er einen fliegenden Händler aus dem Libanon traf ."
     )
     assert (
-        sentence.to_plain_string() == "Firmengründer Wolf Peter Bree arbeitete Anfang der siebziger Jahre als "
-        "Möbelvertreter, als er einen fliegenden Händler aus dem Libanon traf."
+        sentence.to_plain_string() == "Firmengründer Wolf Peter Bree arbeitete Anfang der siebziger Jahre als "
+        "Möbelvertreter, als er einen fliegenden Händler aus dem Libanon traf."
     )


@@ -616,6 +593,27 @@ def test_staccato_tokenizer_with_multilingual_text():
     assert [token.text for token in arabic_sentence.tokens] == ["مرحبا", "بالعالم", "!", "123"]


+def test_create_sentence_difficult_encoding():
+    text = "so out of the norm ❤ ️ enjoyed every moment️"
+    sentence = Sentence(text, use_tokenizer=StaccatoTokenizer())
+    assert len(sentence) == 9
+
+    text = "This guy needs his own show on Discivery Channel ! "
+    sentence = Sentence(text, use_tokenizer=StaccatoTokenizer())
+    assert len(sentence) == 10
+
+    text = "n't have new vintages."
+    sentence = Sentence(text, use_tokenizer=True)
+    assert len(sentence) == 5
+
+    text = (
+        "equivalently , accumulating the logs as :( 6 ) sl = 1N ∑ t = 1Nlogp "
+        "( Ll | xt \u200b , θ ) where "
+        "p ( Ll | xt \u200b , θ ) represents the class probability output"
+    )
+    sentence = Sentence(text, use_tokenizer=StaccatoTokenizer())
+    assert len(sentence) == 40
+
 def test_sentence_retokenize():
     # Create a sentence with default tokenization
     sentence = Sentence("01-03-2025 New York")
