feat(scanner): added Normalized input comparison marker.

DanielKeep · DanielKeep · commit 7e9640a01eb3 · 2016-02-09T22:57:22.000+11:00
diff --git a/Cargo.toml b/Cargo.toml
@@ -19,7 +19,7 @@ exclude = [
 
 [features]
 default = []
-all = ["arrays-32", "tuples-16", "regex"]
+all = ["arrays-32", "tuples-16", "regex", "unicode-normalization"]
 "tuples-16" = []
 "arrays-32" = []
 
@@ -29,6 +29,7 @@ lazy_static = "0.1.14"
 strcursor = "0.2.2"
 
 regex = { version = "0.1.41", optional = true }
+unicode-normalization = { version = "0.1.2", optional = true }
 
 [dev-dependencies]
 bitflags = "0.4.0"
diff --git a/src/input.rs b/src/input.rs
@@ -650,6 +650,43 @@ impl StrCompare for IgnoreAsciiCase {
     }
 }
 
+/**
+Marker type used to do normalized string comparisons.
+
+Two strings compare equal when their NFD (canonical decomposition) forms are
+identical, so precomposed and decomposed spellings of a character match.
+
+This is an empty enum: it has no values and is only ever named as a type
+parameter to select this comparison behaviour.
+*/
+#[cfg(feature="unicode-normalization")]
+#[derive(Debug)]
+pub enum Normalized {}
+
+#[cfg(feature="unicode-normalization")]
+impl StrCompare for Normalized {
+    /// Returns `true` when `a` and `b` yield the same sequence of `char`s
+    /// after each has been passed through `nfd()`.
+    fn compare(a: &str, b: &str) -> bool {
+        use unicode_normalization::UnicodeNormalization;
+
+        // Walk both decompositions in lockstep without allocating;
+        // `Iterator::eq` is false on the first differing char or length mismatch.
+        a.nfd().eq(b.nfd())
+    }
+}
+
+// Composed and decomposed spellings must match; near-misses must not.
+#[cfg(all(test, feature="unicode-normalization"))]
+#[test]
+fn test_normalized() {
+    assert!(Normalized::compare("hi", "hi"));
+    assert!(Normalized::compare("caf\u{e9}", "cafe\u{301}"));
+    assert!(Normalized::compare("cafe\u{301}", "caf\u{e9}"));
+    assert!(!Normalized::compare("cafe", "caf\u{e9}")); // NFD of the right side has an extra U+0301
+    assert!(!Normalized::compare("cafe", "cafe\u{301}")); // left side is a strict prefix
+}
+
 fn slice_non_space(s: &str) -> Option<usize> {
     use ::util::TableUtil;
     use ::unicode::property::White_Space_table as WS;
diff --git a/src/lib.rs b/src/lib.rs
@@ -70,6 +70,8 @@ The following [optional features](http://doc.crates.io/manifest.html#the-feature
 
 * `tuples-16`: implement scanning for tuples of up to 16 elements.  The default is up to 4 elements.
 
+* `unicode-normalization`: include support for the `Normalized` and `IgnoreCaseNormalized` string comparison marker types for cursors.  Adds a dependency on the `unicode-normalization` crate.
+
 ## Important Notes
 
 * There are no default scanners for `&str` or `String`; if you want a string, you should pick an appropriate abstract scanner from the [`scanner`](scanner/index.html) module.
@@ -270,6 +272,7 @@ A scanning pattern is made up of one or more pattern terms, separated by commas.
 extern crate itertools;
 extern crate strcursor;
 #[cfg(feature="regex")] extern crate regex;
+#[cfg(feature="unicode-normalization")] extern crate unicode_normalization;
 
 #[macro_use] mod macros;
 
diff --git a/tests/cursor_markers.rs b/tests/cursor_markers.rs
@@ -139,3 +139,50 @@ fn test_non_space() {
         Ok(())
     );
 }
+
+#[cfg(feature="unicode-normalization")]
+#[test]
+fn test_normalized() {
+    use scan_rules::ScanError as SE;
+    use scan_rules::ScanErrorKind as SEK;
+
+    type Cursor<'a> = StrCursor<'a, input::Normalized, input::IgnoreSpace, input::Wordish>;
+    // Precomposed input; every literal below uses escapes so its form is unambiguous.
+    let inp = "caf\u{e9} b\u{e4}be";
+    // Default (exact) comparison: only the identical spelling matches.
+    assert_match!(
+        scan!(inp;
+            ("caf\u{e9} b\u{e4}be") => ()),
+        Ok(())
+    );
+    // Decomposed first word: exact comparison fails at offset 0.
+    assert_match!(
+        scan!(inp;
+            ("cafe\u{301} b\u{e4}be") => ()),
+        Err(SE { ref at, kind: SEK::LiteralMismatch, .. }) if at.offset() == 0
+    );
+    // Decomposed second word: the first word matches, so the failure is at offset 6.
+    assert_match!(
+        scan!(inp;
+            ("caf\u{e9} ba\u{308}be") => ()),
+        Err(SE { ref at, kind: SEK::LiteralMismatch, .. }) if at.offset() == 6
+    );
+    // `Normalized` comparison: all three spellings are accepted.
+    assert_match!(
+        scan!(Cursor::new(inp);
+            ("caf\u{e9} b\u{e4}be") => ()),
+        Ok(())
+    );
+
+    assert_match!(
+        scan!(Cursor::new(inp);
+            ("cafe\u{301} b\u{e4}be") => ()),
+        Ok(())
+    );
+
+    assert_match!(
+        scan!(Cursor::new(inp);
+            ("caf\u{e9} ba\u{308}be") => ()),
+        Ok(())
+    );
+}