Skip to content

Commit

Permalink
Merge pull request #43 from omalab/master
Browse files Browse the repository at this point in the history
Non-breaking spaces should STILL be spaces
  • Loading branch information
diasks2 authored Nov 29, 2020
2 parents ab771a2 + ea4a66a commit 9c66fa5
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 2 deletions.
2 changes: 1 addition & 1 deletion lib/pragmatic_tokenizer/pre_processor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def pre_process(language: Languages::Common)
private

def remove_non_breaking_space!
  # Replace non-breaking spaces with ordinary spaces in place, so they still
  # act as token separators instead of being deleted (which would glue the
  # surrounding words together). The diff artifact that left both the old
  # (''-replacement) and new (' '-replacement) lines in this body is removed:
  # only the post-commit behavior is kept.
  # NOTE: gsub! returns nil when nothing was replaced — callers must not
  # chain off this method's return value.
  gsub!(Regex::NO_BREAK_SPACE, ' '.freeze)
end

def shift_various_characters!
Expand Down
1 change: 1 addition & 0 deletions lib/pragmatic_tokenizer/regex.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class Regex
# One or more literal asterisks, e.g. "*" or "***".
ASTERISK = /(?:\*+)/
# One or more underscores.
UNDERSCORE = /(?:_+)/
# Exactly one hyphen or underscore.
HYPHEN_OR_UNDERSCORE = /(?:[-_])/
# A single character on which long words are split: hyphen, underscore,
# forward slash, em dash (—), or en dash (–).
LONG_WORD_SPLIT = /(?:[-_\/—–])/
# Greedily captures everything up to and including the last period.
PERIOD_AND_PRIOR = /(?:(.+\.))/
# Captures a single period.
PERIOD_ONLY = /(?:(\.))/
# Apostrophe-like characters that appear in contractions
# (various curly/straight quotes, backtick, acute accent).
CONTRACTIONS = /(?:[‘’‚‛‹›'´`])/
Expand Down
2 changes: 1 addition & 1 deletion lib/pragmatic_tokenizer/tokenizer.rb
Original file line number Diff line number Diff line change
def split_long_word(token)
  # Split an over-long token into pieces on hyphen/underscore/slash/dash
  # characters, leaving it untouched when splitting would be wrong.
  # Guard clauses (early returns) keep the token whole when:
  #   - it is within the configured @long_word_split length limit,
  #   - it is a hashtag or @-mention,
  #   - it is a domain name or email address.
  # The diff artifact that left both the old split regex
  # (Regex::HYPHEN_OR_UNDERSCORE) and the new one in this body is removed:
  # only the post-commit split on Regex::LONG_WORD_SPLIT (which also covers
  # slashes and em/en dashes) is kept.
  return token if token.length <= @long_word_split
  return token if token =~ Regex::ONLY_HASHTAG_MENTION
  return token if token =~ Regex::DOMAIN_OR_EMAIL
  token.split(Regex::LONG_WORD_SPLIT)
end

def chosen_case(text)
Expand Down
13 changes: 13 additions & 0 deletions spec/languages/english_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,12 @@
expect(pt.tokenize(text)).to eq(["#ab-cd"])
end

# Regression spec: with default options (no long_word_split configured),
# hyphenated words such as "co-founded" must remain single tokens.
it 'tokenizes a string #015' do
text = "In 2004, he co-founded Palantir Technologies, which offers platforms for finance companies and intelligence, defense, and law enforcement communities to integrate, visualize, and analyze the world's information."
pt = PragmaticTokenizer::Tokenizer.new
expect(pt.tokenize(text)).to eq(["in", "2004", ",", "he", "co-founded", "palantir", "technologies", ",", "which", "offers", "platforms", "for", "finance", "companies", "and", "intelligence", ",", "defense", ",", "and", "law", "enforcement", "communities", "to", "integrate", ",", "visualize", ",", "and", "analyze", "the", "world's", "information", "."])
end

it 'handles numbers with symbols 2' do
text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
pt = PragmaticTokenizer::Tokenizer.new
Expand Down Expand Up @@ -543,6 +549,13 @@
)
expect(pt.tokenize(text)).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14", "year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990", "years", "needs", "to", "be", "revised", "."])
end
# With long_word_split: 1, every token longer than one character is split
# on the long-word separator characters — presumably via
# Regex::LONG_WORD_SPLIT (hyphen, underscore, slash, em/en dash) — so the
# slash-joined phrase below breaks into individual downcased words.
it 'tokenizes something with a slash' do
text = "EO/AA/M/F/Veterans/Disability/Sexual Orientation/Gender Identity"
pt = PragmaticTokenizer::Tokenizer.new(
long_word_split: 1
)
expect(pt.tokenize(text)).to eq(["eo", "aa", "m", "f", "veterans", "disability", "sexual", "orientation", "gender", "identity"])
end
end

context 'option (clean)' do
Expand Down

0 comments on commit 9c66fa5

Please sign in to comment.