Commit b5328bb

GH-3652: fix tokenization issue
1 parent 6adefcc commit b5328bb

File tree: 2 files changed (+54 / -64 lines changed)

flair/tokenization.py (+23 / -31)

@@ -316,18 +316,20 @@ class StaccatoTokenizer(Tokenizer):
     - Sequences of numbers are kept together as single tokens
     - Kanji characters are split into individual tokens
     - Uninterrupted sequences of letters (Latin, Cyrillic, etc.) and kana are preserved as single tokens
+    - Whitespace and common zero-width characters are ignored.
     """

     def __init__(self):
         super().__init__()
         # Define patterns for different character types
-        self.punctuation = r"[^\w\s]"  # Non-alphanumeric, non-whitespace
+        # Punctuation/Symbols: Non-alphanumeric, non-whitespace, excluding common zero-width characters and BOM
+        self.punctuation = r"[^\w\s\uFE00-\uFE0F\u200B-\u200D\u2060-\u206F\uFEFF]"
         self.digits = r"\d+"  # One or more digits
         self.kanji = r"[\u4e00-\u9fff]"  # Kanji characters

         # Unicode ranges for various alphabets and scripts
-        # This includes Latin, Cyrillic, Greek, Hebrew, Arabic, etc.
-        self.alphabets = [
+        # This includes Latin, Cyrillic, Greek, Hebrew, Arabic, Japanese Kana, Korean Hangul, etc.
+        alphabets_list = [
             r"[a-zA-Z]+",  # Latin
             r"[\u0400-\u04FF\u0500-\u052F]+",  # Cyrillic and Cyrillic Supplement
             r"[\u0370-\u03FF\u1F00-\u1FFF]+",  # Greek and Coptic
@@ -337,42 +339,32 @@ def __init__(self):
             r"[\u3040-\u309F]+",  # Hiragana
             r"[\u30A0-\u30FF]+",  # Katakana
             r"[\uAC00-\uD7AF]+",  # Hangul (Korean)
-            # Add more scripts as needed
+            # Add more script ranges here if needed
         ]
+        self.alphabet_pattern = "|".join(alphabets_list)

-        # Combined pattern for tokenization
-        self.alphabet_pattern = "|".join(self.alphabets)
+        # Combined pattern for re.findall:
+        # Captures letter sequences OR digit sequences OR Kanji OR punctuation/symbols
+        combined_pattern = f"({self.alphabet_pattern})|({self.digits})|({self.kanji})|({self.punctuation})"
+        # Pre-compile the regex for efficiency
+        self.token_pattern = re.compile(combined_pattern)

     def tokenize(self, text: str) -> list[str]:
         """
-        Tokenize the input text according to the defined rules.
+        Tokenize the input text using re.findall to extract valid tokens.

         Args:
             text: The input text to tokenize

         Returns:
-            A list of tokens
+            A list of tokens (strings)
         """
-        # Create a pattern that matches:
-        # 1. Punctuation characters
-        # 2. Number sequences
-        # 3. Kanji characters individually
-        # 4. Letter sequences from various scripts
-        pattern = f"({self.punctuation}|{self.digits}|{self.kanji})"
-
-        # First split by punctuation, numbers, and kanji
-        raw_tokens = []
-        parts = re.split(pattern, text)
-
-        # Filter out empty strings
-        for part in parts:
-            if part:
-                # If part is punctuation, number, or kanji, add it directly
-                if re.fullmatch(pattern, part):
-                    raw_tokens.append(part)
-                else:
-                    # For other text, split by whitespace
-                    subparts = part.split()
-                    raw_tokens.extend(subparts)
-
-        return raw_tokens
+        # Find all matches for the defined token patterns
+        matches = self.token_pattern.findall(text)
+
+        # re.findall returns a list of tuples, where each tuple corresponds to the capturing groups.
+        # For a match, only one group will be non-empty. We extract that non-empty group.
+        # Example match: ('word', '', '', '') or ('', '123', '', '') or ('', '', '好', '') or ('', '', '', '.')
+        tokens = [next(filter(None, match_tuple)) for match_tuple in matches]
+
+        return tokens
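
For illustration, here is a minimal standalone sketch (not part of the commit) of the new findall-based extraction. The pattern below is a simplified subset of StaccatoTokenizer's (Latin letters, digits, Kanji, and the revised punctuation class); the variable names and sample string are made up. It shows why zero-width characters such as U+200B no longer produce spurious tokens:

import re

# Simplified stand-ins for the tokenizer's patterns (illustrative only)
letters = r"[a-zA-Z]+"
digits = r"\d+"
kanji = r"[\u4e00-\u9fff]"
punctuation = r"[^\w\s\uFE00-\uFE0F\u200B-\u200D\u2060-\u206F\uFEFF]"
token_pattern = re.compile(f"({letters})|({digits})|({kanji})|({punctuation})")

text = "xt \u200b , 123!"  # sample string containing a zero-width space (U+200B)
matches = token_pattern.findall(text)
# findall returns one tuple per match; exactly one capturing group is non-empty,
# e.g. ('xt', '', '', '') or ('', '', '', ',') -- take the non-empty group as the token.
tokens = [next(filter(None, m)) for m in matches]
print(tokens)  # ['xt', ',', '123', '!'] -- the zero-width space is simply skipped

With the old punctuation class r"[^\w\s]", U+200B (which is not matched by \s) was emitted as its own punctuation token; excluding the zero-width ranges is what lets the previously skipped encoding test below pass.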

tests/test_tokenize_sentence.py (+31 / -33)

@@ -48,29 +48,6 @@ def test_create_sentence_with_extra_whitespace():
     assert sentence.get_token(4).text == "."


-@pytest.mark.skip(reason="Fix these issues for StaccatoTokenizer in future PR")
-def test_create_sentence_difficult_encoding():
-    text = "so out of the norm ❤ ️ enjoyed every moment️"
-    sentence = Sentence(text)
-    assert len(sentence) == 9
-
-    text = (
-        "equivalently , accumulating the logs as :( 6 ) sl = 1N ∑ t = 1Nlogp "
-        "( Ll | xt \u200b , θ ) where "
-        "p ( Ll | xt \u200b , θ ) represents the class probability output"
-    )
-    sentence = Sentence(text)
-    assert len(sentence) == 37
-
-    text = "This guy needs his own show on Discivery Channel ! "
-    sentence = Sentence(text)
-    assert len(sentence) == 10
-
-    text = "n't have new vintages."
-    sentence = Sentence(text, use_tokenizer=True)
-    assert len(sentence) == 5
-
-
 def test_create_sentence_word_by_word():
     token1: Token = Token("Munich")
     token2: Token = Token("and")
@@ -403,25 +380,25 @@ def test_print_sentence_plain(tasks_base_path):
     sentence = corpus.train[0]
     sentence.infer_space_after()
     assert (
-        sentence.to_tokenized_string() == 'Schartau sagte dem " Tagesspiegel " vom Freitag , Fischer sei " in '
-        "einer Weise aufgetreten , "
-        'die alles andere als überzeugend war " .'
+        sentence.to_tokenized_string() == 'Schartau sagte dem " Tagesspiegel " vom Freitag , Fischer sei " in '
+        "einer Weise aufgetreten , "
+        'die alles andere als überzeugend war " .'
     )
     assert (
-        sentence.to_plain_string() == 'Schartau sagte dem "Tagesspiegel" vom Freitag, Fischer sei "in einer '
-        "Weise aufgetreten, die "
-        'alles andere als überzeugend war".'
+        sentence.to_plain_string() == 'Schartau sagte dem "Tagesspiegel" vom Freitag, Fischer sei "in einer '
+        "Weise aufgetreten, die "
+        'alles andere als überzeugend war".'
     )

     sentence = corpus.train[1]
     sentence.infer_space_after()
     assert (
-        sentence.to_tokenized_string() == "Firmengründer Wolf Peter Bree arbeitete Anfang der siebziger Jahre als "
-        "Möbelvertreter , als er einen fliegenden Händler aus dem Libanon traf ."
+        sentence.to_tokenized_string() == "Firmengründer Wolf Peter Bree arbeitete Anfang der siebziger Jahre als "
+        "Möbelvertreter , als er einen fliegenden Händler aus dem Libanon traf ."
     )
     assert (
-        sentence.to_plain_string() == "Firmengründer Wolf Peter Bree arbeitete Anfang der siebziger Jahre als "
-        "Möbelvertreter, als er einen fliegenden Händler aus dem Libanon traf."
+        sentence.to_plain_string() == "Firmengründer Wolf Peter Bree arbeitete Anfang der siebziger Jahre als "
+        "Möbelvertreter, als er einen fliegenden Händler aus dem Libanon traf."
     )


@@ -616,6 +593,27 @@ def test_staccato_tokenizer_with_multilingual_text():
     assert [token.text for token in arabic_sentence.tokens] == ["مرحبا", "بالعالم", "!", "123"]


+def test_create_sentence_difficult_encoding():
+    text = "so out of the norm ❤ ️ enjoyed every moment️"
+    sentence = Sentence(text, use_tokenizer=StaccatoTokenizer())
+    assert len(sentence) == 9
+
+    text = "This guy needs his own show on Discivery Channel ! "
+    sentence = Sentence(text, use_tokenizer=StaccatoTokenizer())
+    assert len(sentence) == 10
+
+    text = "n't have new vintages."
+    sentence = Sentence(text, use_tokenizer=True)
+    assert len(sentence) == 5
+
+    text = (
+        "equivalently , accumulating the logs as :( 6 ) sl = 1N ∑ t = 1Nlogp "
+        "( Ll | xt \u200b , θ ) where "
+        "p ( Ll | xt \u200b , θ ) represents the class probability output"
+    )
+    sentence = Sentence(text, use_tokenizer=StaccatoTokenizer())
+    assert len(sentence) == 40
+
 def test_sentence_retokenize():
     # Create a sentence with default tokenization
     sentence = Sentence("01-03-2025 New York")
