Optimizing for speed; many regexes now need proper naming

diasks2 · Feb 14, 2016 · 0d14248 · 0d14248
1 parent 346b220
commit 0d14248
Show file tree

Hide file tree

Showing 5 changed files with 322 additions and 235 deletions.
diff --git a/.rubocop_todo.yml b/.rubocop_todo.yml
@@ -1,49 +1,113 @@
 # This configuration was generated by
 # `rubocop --auto-gen-config`
-# on 2016-01-24 21:09:34 +0100 using RuboCop version 0.36.0.
+# on 2016-02-14 14:51:24 +0100 using RuboCop version 0.37.2.
 # The point is for the user to remove these configuration records
 # one by one as the offenses are removed from the code base.
 # Note that changes in the inspected code, or installation of new
 # versions of RuboCop, may require this file to be generated again.
 
-# Offense count: 11
+# Offense count: 8
 Metrics/AbcSize:
-  Max: 118
+  Max: 66
 
-# Offense count: 2
+# Offense count: 1
 # Configuration parameters: CountComments.
 Metrics/ClassLength:
-  Max: 218
+  Max: 241
 
-# Offense count: 7
+# Offense count: 6
 Metrics/CyclomaticComplexity:
-  Max: 40
+  Max: 20
 
-# Offense count: 7
+# Offense count: 5
 # Configuration parameters: CountComments.
 Metrics/MethodLength:
-  Max: 57
+  Max: 34
 
 # Offense count: 2
 # Configuration parameters: CountComments.
 Metrics/ModuleLength:
-  Max: 140
+  Max: 144
 
-# Offense count: 6
+# Offense count: 4
 Metrics/PerceivedComplexity:
-  Max: 41
+  Max: 20
+
+# Offense count: 13
+# Cop supports --auto-correct.
+# Configuration parameters: EnforcedHashRocketStyle, EnforcedColonStyle, EnforcedLastArgumentHashStyle, SupportedLastArgumentHashStyles.
+# SupportedLastArgumentHashStyles: always_inspect, always_ignore, ignore_implicit, ignore_explicit
+Style/AlignHash:
+  Exclude:
+    - 'spec/languages/english_spec.rb'
 
 # Offense count: 4
 # Cop supports --auto-correct.
 Style/CommentIndentation:
   Exclude:
     - 'lib/pragmatic_tokenizer/tokenizer.rb'
 
+# Offense count: 1
+# Cop supports --auto-correct.
+# Configuration parameters: SingleLineConditionsOnly.
+Style/ConditionalAssignment:
+  Exclude:
+    - 'lib/pragmatic_tokenizer/full_stop_separator.rb'
+
 # Offense count: 31
 Style/Documentation:
   Enabled: false
 
-# Offense count: 17
+# Offense count: 1
+# Cop supports --auto-correct.
+Style/EmptyLines:
+  Exclude:
+    - 'lib/pragmatic_tokenizer/tokenizer.rb'
+
+# Offense count: 7
+# Cop supports --auto-correct.
+Style/FirstMethodArgumentLineBreak:
+  Exclude:
+    - 'lib/pragmatic_tokenizer/post_processor.rb'
+    - 'lib/pragmatic_tokenizer/tokenizer.rb'
+
+# Offense count: 1
+# Cop supports --auto-correct.
+# Configuration parameters: EnforcedStyle, SupportedStyles, IndentationWidth.
+# SupportedStyles: consistent, special_for_inner_method_call, special_for_inner_method_call_in_parentheses
+Style/FirstParameterIndentation:
+  Exclude:
+    - 'spec/languages/english_spec.rb'
+
+# Offense count: 26
+# Cop supports --auto-correct.
+# Configuration parameters: EnforcedStyle, SupportedStyles, UseHashRocketsWithSymbolValues.
+# SupportedStyles: ruby19, ruby19_no_mixed_keys, hash_rockets
+Style/HashSyntax:
+  Enabled: false
+
+# Offense count: 2
+# Cop supports --auto-correct.
+# Configuration parameters: SupportedStyles, IndentationWidth.
+# SupportedStyles: special_inside_parentheses, consistent, align_brackets
+Style/IndentArray:
+  EnforcedStyle: consistent
+
+# Offense count: 1
+# Cop supports --auto-correct.
+# Configuration parameters: Width.
+Style/IndentationWidth:
+  Exclude:
+    - 'lib/pragmatic_tokenizer/pre_processor.rb'
+
+# Offense count: 4
+# Cop supports --auto-correct.
+# Configuration parameters: EnforcedStyle, SupportedStyles, IndentationWidth.
+# SupportedStyles: aligned, indented
+Style/MultilineMethodCallIndentation:
+  Enabled: false
+
+# Offense count: 1
 # Cop supports --auto-correct.
 # Configuration parameters: EnforcedStyle, SupportedStyles, IndentationWidth.
 # SupportedStyles: aligned, indented

diff --git a/README.md b/README.md
@@ -29,10 +29,10 @@ gem 'pragmatic_tokenizer'
 ```ruby
 text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
 
-PragmaticTokenizer::Tokenizer.new(text).tokenize
+PragmaticTokenizer::Tokenizer.new.tokenize(text)
 # => ["\"", "i", "said", ",", "'", "what're", "you", "?", "crazy", "?", "'", "\"", "said", "sandowsky", ".", "\"", "i", "can't", "afford", "to", "do", "that", ".", "\""]
 
-# You can pass many different options:
+# You can pass many different options to #initialize:
 options = {
   language:            :en, # the language of the string you are tokenizing
   abbreviations:       ['a.b', 'a'], # a user-supplied array of abbreviations (downcased with ending period removed)
@@ -403,4 +403,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
+THE SOFTWARE.
diff --git a/lib/pragmatic_tokenizer/languages.rb b/lib/pragmatic_tokenizer/languages.rb
@@ -28,35 +28,36 @@
 module PragmaticTokenizer
   module Languages
     LANGUAGE_CODES = {
-        'en' => English,
-        'ar' => Arabic,
-        'bg' => Bulgarian,
-        'ca' => Catalan,
-        'cs' => Czech,
-        'da' => Danish,
-        'de' => Deutsch,
-        'el' => Greek,
-        'es' => Spanish,
-        'fa' => Persian,
-        'fi' => Finnish,
-        'fr' => French,
-        'id' => Indonesian,
-        'it' => Italian,
-        'lv' => Latvian,
-        'nl' => Dutch,
-        'nn' => Norwegian,
-        'nb' => Norwegian,
-        'no' => Norwegian,
-        'pl' => Polish,
-        'pt' => Portuguese,
-        'ro' => Romanian,
-        'ru' => Russian,
-        'sk' => Slovak,
-        'sv' => Swedish,
-        'tr' => Turkish
+        :en => English,
+        :ar => Arabic,
+        :bg => Bulgarian,
+        :ca => Catalan,
+        :cs => Czech,
+        :da => Danish,
+        :de => Deutsch,
+        :el => Greek,
+        :es => Spanish,
+        :fa => Persian,
+        :fi => Finnish,
+        :fr => French,
+        :id => Indonesian,
+        :it => Italian,
+        :lv => Latvian,
+        :nl => Dutch,
+        :nn => Norwegian,
+        :nb => Norwegian,
+        :no => Norwegian,
+        :pl => Polish,
+        :pt => Portuguese,
+        :ro => Romanian,
+        :ru => Russian,
+        :sk => Slovak,
+        :sv => Swedish,
+        :tr => Turkish
     }.freeze
 
     def self.get_language_by_code(code)
+      code = code ? code.to_sym : :en
       LANGUAGE_CODES[code] || Common
     end
   end

diff --git a/lib/pragmatic_tokenizer/post_processor.rb b/lib/pragmatic_tokenizer/post_processor.rb
@@ -1,27 +1,25 @@
 module PragmaticTokenizer
   class PostProcessor
 
-    REGEX_SYMBOL         = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘ ⚭]/.freeze
-    REGEXP_COMMAS        = /^(,|‚)+/.freeze
-    REGEXP_SINGLE_QUOTES = /(.+)(’|'|‘|`)$/.freeze
-    REGEXP_SLASH         = /^(?!(https?:|www\.))(.*)\/(.*)/.freeze
-    REGEXP_QUESTION_MARK = /^(?!(https?:|www\.))(.*)(\?)(.*)/.freeze
-    REGEXP_PLUS_SIGN     = /(.+)\+(.+)/.freeze
-    REGEXP_COLON         = /^(\:)(\S{2,})/.freeze
-    REGEXP_EMOJI         = /(\u{2744}[\u{FE0E}|\u{FE0F}])/.freeze
+    REGEX_SYMBOL         = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘ ⚭]/
+    REGEXP_COMMAS        = /^(,|‚)+/
+    REGEXP_SINGLE_QUOTES = /(.+)(’|'|‘|`)$/
+    REGEXP_SLASH         = /^(?!(https?:|www\.))(.*)\/(.*)/
+    REGEXP_QUESTION_MARK = /^(?!(https?:|www\.))(.*)(\?)(.*)/
+    REGEXP_PLUS_SIGN     = /(.+)\+(.+)/
+    REGEXP_COLON         = /^(\:)(\S{2,})/
+    REGEXP_EMOJI         = /(\u{2744}[\u{FE0E}|\u{FE0F}])/
 
     REGEX_UNIFIED1       = Regexp.union(REGEXP_SLASH,
                                         REGEXP_QUESTION_MARK,
                                         REGEXP_PLUS_SIGN,
                                         REGEXP_COLON,
                                         REGEXP_EMOJI,
                                         PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX,
-                                        PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX
-    ).freeze
+                                        PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX)
 
     REGEX_UNIFIED2       = Regexp.union(REGEXP_SINGLE_QUOTES,
-                                        REGEXP_COMMAS
-    ).freeze
+                                        REGEXP_COMMAS)
 
     attr_reader :text, :abbreviations, :downcase