Skip to content

Commit

Permalink
Merge pull request #43 from omalab/master
Browse files Browse the repository at this point in the history
Non-breaking spaces should STILL be spaces
  • Loading branch information
diasks2 authored Nov 29, 2020
2 parents ab771a2 + ea4a66a commit 9c66fa5
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 2 deletions.
2 changes: 1 addition & 1 deletion lib/pragmatic_tokenizer/pre_processor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def pre_process(language: Languages::Common)
private

def remove_non_breaking_space!
  # Replace non-breaking spaces with ordinary spaces in place, so they still
  # act as token separators instead of being deleted (which would glue the
  # surrounding words together). The diff artifact that left both the old
  # (''-replacement) and new (' '-replacement) lines in this body is removed:
  # only the post-commit behavior is kept.
  # NOTE: gsub! returns nil when nothing was replaced — callers must not
  # chain off this method's return value.
  gsub!(Regex::NO_BREAK_SPACE, ' '.freeze)
end

def shift_various_characters!
Expand Down
1 change: 1 addition & 0 deletions lib/pragmatic_tokenizer/regex.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class Regex
# One or more literal asterisks, e.g. "*" or "***".
ASTERISK = /(?:\*+)/
# One or more underscores.
UNDERSCORE = /(?:_+)/
# Exactly one hyphen or underscore.
HYPHEN_OR_UNDERSCORE = /(?:[-_])/
# A single character on which long words are split: hyphen, underscore,
# forward slash, em dash (—), or en dash (–).
LONG_WORD_SPLIT = /(?:[-_\/—–])/
# Greedily captures everything up to and including the last period.
PERIOD_AND_PRIOR = /(?:(.+\.))/
# Captures a single period.
PERIOD_ONLY = /(?:(\.))/
# Apostrophe-like characters that appear in contractions
# (various curly/straight quotes, backtick, acute accent).
CONTRACTIONS = /(?:[‘’‚‛‹›'´`])/
Expand Down
2 changes: 1 addition & 1 deletion lib/pragmatic_tokenizer/tokenizer.rb
Original file line number Diff line number Diff line change
def split_long_word(token)
  # Split an over-long token into pieces on hyphen/underscore/slash/dash
  # characters, leaving it untouched when splitting would be wrong.
  # Guard clauses (early returns) keep the token whole when:
  #   - it is within the configured @long_word_split length limit,
  #   - it is a hashtag or @-mention,
  #   - it is a domain name or email address.
  # The diff artifact that left both the old split regex
  # (Regex::HYPHEN_OR_UNDERSCORE) and the new one in this body is removed:
  # only the post-commit split on Regex::LONG_WORD_SPLIT (which also covers
  # slashes and em/en dashes) is kept.
  return token if token.length <= @long_word_split
  return token if token =~ Regex::ONLY_HASHTAG_MENTION
  return token if token =~ Regex::DOMAIN_OR_EMAIL
  token.split(Regex::LONG_WORD_SPLIT)
end

def chosen_case(text)
Expand Down
13 changes: 13 additions & 0 deletions spec/languages/english_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,12 @@
expect(pt.tokenize(text)).to eq(["#ab-cd"])
end

# Regression spec: with default options (no long_word_split configured),
# hyphenated words such as "co-founded" must remain single tokens.
it 'tokenizes a string #015' do
text = "In 2004, he co-founded Palantir Technologies, which offers platforms for finance companies and intelligence, defense, and law enforcement communities to integrate, visualize, and analyze the world's information."
pt = PragmaticTokenizer::Tokenizer.new
expect(pt.tokenize(text)).to eq(["in", "2004", ",", "he", "co-founded", "palantir", "technologies", ",", "which", "offers", "platforms", "for", "finance", "companies", "and", "intelligence", ",", "defense", ",", "and", "law", "enforcement", "communities", "to", "integrate", ",", "visualize", ",", "and", "analyze", "the", "world's", "information", "."])
end

it 'handles numbers with symbols 2' do
text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
pt = PragmaticTokenizer::Tokenizer.new
Expand Down Expand Up @@ -543,6 +549,13 @@
)
expect(pt.tokenize(text)).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14", "year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990", "years", "needs", "to", "be", "revised", "."])
end
# With long_word_split: 1, every token longer than one character is split
# on the long-word separator characters — presumably via
# Regex::LONG_WORD_SPLIT (hyphen, underscore, slash, em/en dash) — so the
# slash-joined phrase below breaks into individual downcased words.
it 'tokenizes something with a slash' do
text = "EO/AA/M/F/Veterans/Disability/Sexual Orientation/Gender Identity"
pt = PragmaticTokenizer::Tokenizer.new(
long_word_split: 1
)
expect(pt.tokenize(text)).to eq(["eo", "aa", "m", "f", "veterans", "disability", "sexual", "orientation", "gender", "identity"])
end
end

context 'option (clean)' do
Expand Down

0 comments on commit 9c66fa5

Please sign in to comment.