optimization: Use match for small sets

Kmeakin · Kmeakin · commit b7fa8ef1336f · 2025-08-10T23:36:55.000Z
If the number of codepoint ranges in a set is sufficiently small, it may
be better to simply use a `match` expression rather than a lookup table.
The instructions to implement the `match` may be slightly bigger than
the table that it replaced (hard to predict, depends on architecture and
whatever optimizations LLVM applies), but in return we eliminate the lookup
tables and avoid the slower binary search.
diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs
@@ -1,16 +1,16 @@
 ///! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!
 // Alphabetic      :  1723 bytes, 142707 codepoints in 755 ranges (U+0000AA - U+0323B0) using skiplist
 // Case_Ignorable  :  1043 bytes,   2744 codepoints in 447 ranges (U+0000A8 - U+0E01F0) using skiplist
-// Cc              :     7 bytes,     32 codepoints in   1 ranges (U+000080 - U+0000A0) using skiplist
+// Cc              :     0 bytes,     32 codepoints in   1 ranges (U+000080 - U+0000A0) using match
 // Grapheme_Extend :   887 bytes,   2193 codepoints in 375 ranges (U+000300 - U+0E01F0) using skiplist
 // Lowercase       :   933 bytes,   2543 codepoints in 674 ranges (U+0000AA - U+01E944) using bitset
-// Lt              :    33 bytes,     31 codepoints in  10 ranges (U+0001C5 - U+001FFD) using skiplist
+// Lt              :     0 bytes,     31 codepoints in  10 ranges (U+0001C5 - U+001FFD) using match
 // N               :   455 bytes,   1901 codepoints in 143 ranges (U+0000B2 - U+01FBFA) using skiplist
 // Uppercase       :   797 bytes,   1952 codepoints in 655 ranges (U+0000C0 - U+01F18A) using bitset
-// White_Space     :   256 bytes,     19 codepoints in   8 ranges (U+000085 - U+003001) using cascading
+// White_Space     :     0 bytes,     19 codepoints in   8 ranges (U+000085 - U+003001) using match
 // to_lower        : 11484 bytes
 // to_upper        : 13432 bytes
-// Total           : 31050 bytes
+// Total           : 30754 bytes
 
 #[inline(always)]
 const fn bitset_search<
@@ -340,33 +340,13 @@ pub mod case_ignorable {
 
 #[rustfmt::skip]
 pub mod cc {
-    use super::ShortOffsetRunHeader;
-
-    static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 1] = [
-        ShortOffsetRunHeader::new(0, 1114272),
-    ];
-    static OFFSETS: [u8; 3] = [
-        128, 32, 0,
-    ];
     #[inline]
-    pub fn lookup(c: char) -> bool {
+    pub const fn lookup(c: char) -> bool {
         debug_assert!(!c.is_ascii());
-        (c as u32) >= 0x80 && lookup_slow(c)
-    }
-
-    #[inline(never)]
-    fn lookup_slow(c: char) -> bool {
-        const {
-            assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
-            let mut i = 0;
-            while i < SHORT_OFFSET_RUNS.len() {
-                assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
-                i += 1;
-            }
+        match c as u32 {
+            0x80..=0x9f => true,
+            _ => false,
         }
-        // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
-        // and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
-        unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
     }
 }
 
@@ -554,34 +534,22 @@ pub mod lowercase {
 
 #[rustfmt::skip]
 pub mod lt {
-    use super::ShortOffsetRunHeader;
-
-    static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 3] = [
-        ShortOffsetRunHeader::new(0, 453), ShortOffsetRunHeader::new(1, 8072),
-        ShortOffsetRunHeader::new(9, 1122301),
-    ];
-    static OFFSETS: [u8; 21] = [
-        0, 1, 2, 1, 2, 1, 38, 1, 0, 8, 8, 8, 8, 8, 12, 1, 15, 1, 47, 1, 0,
-    ];
     #[inline]
-    pub fn lookup(c: char) -> bool {
+    pub const fn lookup(c: char) -> bool {
         debug_assert!(!c.is_ascii());
-        (c as u32) >= 0x1c5 && lookup_slow(c)
-    }
-
-    #[inline(never)]
-    fn lookup_slow(c: char) -> bool {
-        const {
-            assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
-            let mut i = 0;
-            while i < SHORT_OFFSET_RUNS.len() {
-                assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
-                i += 1;
-            }
+        match c as u32 {
+            0x1c5 => true,
+            0x1c8 => true,
+            0x1cb => true,
+            0x1f2 => true,
+            0x1f88..=0x1f8f => true,
+            0x1f98..=0x1f9f => true,
+            0x1fa8..=0x1faf => true,
+            0x1fbc => true,
+            0x1fcc => true,
+            0x1ffc => true,
+            _ => false,
         }
-        // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
-        // and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
-        unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
     }
 }
 
@@ -743,25 +711,18 @@ pub mod uppercase {
 
 #[rustfmt::skip]
 pub mod white_space {
-    static WHITESPACE_MAP: [u8; 256] = [
-        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-        0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-        0, 0, 0, 0, 0, 0, 0, 0, 0,
-    ];
     #[inline]
     pub const fn lookup(c: char) -> bool {
         debug_assert!(!c.is_ascii());
-        match c as u32 >> 8 {
-            0 => WHITESPACE_MAP[c as usize & 0xff] & 1 != 0,
-            22 => c as u32 == 0x1680,
-            32 => WHITESPACE_MAP[c as usize & 0xff] & 2 != 0,
-            48 => c as u32 == 0x3000,
+        match c as u32 {
+            0x85 => true,
+            0xa0 => true,
+            0x1680 => true,
+            0x2000..=0x200a => true,
+            0x2028..=0x2029 => true,
+            0x202f => true,
+            0x205f => true,
+            0x3000 => true,
             _ => false,
         }
     }
diff --git a/src/tools/unicode-table-generator/src/cascading_map.rs b/src/tools/unicode-table-generator/src/cascading_map.rs
diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs
@@ -78,13 +78,13 @@ use std::ops::Range;
 
 use ucd_parse::Codepoints;
 
-mod cascading_map;
 mod case_mapping;
+mod r#match;
 mod raw_emitter;
 mod skiplist;
 mod unicode_download;
 
-use raw_emitter::{RawEmitter, emit_codepoints, emit_whitespace};
+use raw_emitter::{RawEmitter, emit_codepoints};
 
 static PROPERTIES: &[&str] = &[
     "Alphabetic",
@@ -239,11 +239,7 @@ fn main() {
         let datapoints = ranges.iter().map(|r| r.end - r.start).sum::<u32>();
 
         let mut emitter = RawEmitter::new();
-        if property == &"White_Space" {
-            emit_whitespace(&mut emitter, ranges);
-        } else {
-            emit_codepoints(&mut emitter, ranges);
-        }
+        emit_codepoints(&mut emitter, ranges);
 
         modules.push((property.to_lowercase().to_string(), emitter.file));
         table_file.push_str(&format!(
diff --git a/src/tools/unicode-table-generator/src/match.rs b/src/tools/unicode-table-generator/src/match.rs
@@ -0,0 +1,30 @@
+use std::fmt::{self, Write as _};
+use std::ops::Range;
+
+use crate::raw_emitter::RawEmitter;
+
+impl RawEmitter {
+    pub fn emit_match(&mut self, ranges: &[Range<u32>]) -> Result<(), fmt::Error> {
+        let arms: Vec<_> = ranges
+            .iter()
+            .map(|range| match range.len() {
+                1 => format!("{:#x} => true,", range.start),
+
+                // minus one because inclusive range pattern
+                _ => format!("{:#x}..={:#x} => true,", range.start, range.end - 1),
+            })
+            .collect();
+
+        writeln!(self.file, "#[inline]")?;
+        writeln!(self.file, "pub const fn lookup(c: char) -> bool {{")?;
+        writeln!(self.file, "    debug_assert!(!c.is_ascii());")?;
+        writeln!(self.file, "    match c as u32 {{")?;
+        for arm in arms {
+            writeln!(self.file, "        {arm}")?;
+        }
+        writeln!(self.file, "        _ => false,")?;
+        writeln!(self.file, "    }}")?;
+        writeln!(self.file, "}}")?;
+        Ok(())
+    }
+}
diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs
@@ -156,6 +156,12 @@ impl RawEmitter {
 pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
     emitter.blank_line();
 
+    if ranges.len() <= 10 {
+        emitter.emit_match(ranges).unwrap();
+        emitter.desc = String::from("match");
+        return;
+    }
+
     let mut bitset = emitter.clone();
     let bitset_ok = bitset.emit_bitset(ranges).is_ok();
 
@@ -171,15 +177,6 @@ pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
     }
 }
 
-pub fn emit_whitespace(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
-    emitter.blank_line();
-
-    let mut cascading = emitter.clone();
-    cascading.emit_cascading_map(ranges);
-    *emitter = cascading;
-    emitter.desc = String::from("cascading");
-}
-
 struct Canonicalized {
     canonical_words: Vec<u64>,
     canonicalized_words: Vec<(u8, u8)>,