Skip to content

Commit b7fa8ef

Browse files
committed
optimization: Use match for small sets
If the number of codepoint ranges in a set is sufficiently small, it may be better to simply use a `match` expression rather than a lookup table. The instructions to implement the `match` may be slightly larger than the table that it replaced (this is hard to predict; it depends on the architecture and whatever optimizations LLVM applies), but in return we eliminate the lookup tables and avoid the slower binary search.
1 parent 9a1accc commit b7fa8ef

File tree

5 files changed

+69
-163
lines changed

5 files changed

+69
-163
lines changed

library/core/src/unicode/unicode_data.rs

Lines changed: 30 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
///! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!
22
// Alphabetic : 1723 bytes, 142707 codepoints in 755 ranges (U+0000AA - U+0323B0) using skiplist
33
// Case_Ignorable : 1043 bytes, 2744 codepoints in 447 ranges (U+0000A8 - U+0E01F0) using skiplist
4-
// Cc : 7 bytes, 32 codepoints in 1 ranges (U+000080 - U+0000A0) using skiplist
4+
// Cc : 0 bytes, 32 codepoints in 1 ranges (U+000080 - U+0000A0) using match
55
// Grapheme_Extend : 887 bytes, 2193 codepoints in 375 ranges (U+000300 - U+0E01F0) using skiplist
66
// Lowercase : 933 bytes, 2543 codepoints in 674 ranges (U+0000AA - U+01E944) using bitset
7-
// Lt : 33 bytes, 31 codepoints in 10 ranges (U+0001C5 - U+001FFD) using skiplist
7+
// Lt : 0 bytes, 31 codepoints in 10 ranges (U+0001C5 - U+001FFD) using match
88
// N : 455 bytes, 1901 codepoints in 143 ranges (U+0000B2 - U+01FBFA) using skiplist
99
// Uppercase : 797 bytes, 1952 codepoints in 655 ranges (U+0000C0 - U+01F18A) using bitset
10-
// White_Space : 256 bytes, 19 codepoints in 8 ranges (U+000085 - U+003001) using cascading
10+
// White_Space : 0 bytes, 19 codepoints in 8 ranges (U+000085 - U+003001) using match
1111
// to_lower : 11484 bytes
1212
// to_upper : 13432 bytes
13-
// Total : 31050 bytes
13+
// Total : 30754 bytes
1414

1515
#[inline(always)]
1616
const fn bitset_search<
@@ -340,33 +340,13 @@ pub mod case_ignorable {
340340

341341
#[rustfmt::skip]
342342
pub mod cc {
343-
use super::ShortOffsetRunHeader;
344-
345-
static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 1] = [
346-
ShortOffsetRunHeader::new(0, 1114272),
347-
];
348-
static OFFSETS: [u8; 3] = [
349-
128, 32, 0,
350-
];
351343
#[inline]
352-
pub fn lookup(c: char) -> bool {
344+
pub const fn lookup(c: char) -> bool {
353345
debug_assert!(!c.is_ascii());
354-
(c as u32) >= 0x80 && lookup_slow(c)
355-
}
356-
357-
#[inline(never)]
358-
fn lookup_slow(c: char) -> bool {
359-
const {
360-
assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
361-
let mut i = 0;
362-
while i < SHORT_OFFSET_RUNS.len() {
363-
assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
364-
i += 1;
365-
}
346+
match c as u32 {
347+
0x80..=0x9f => true,
348+
_ => false,
366349
}
367-
// SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
368-
// and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
369-
unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
370350
}
371351
}
372352

@@ -554,34 +534,22 @@ pub mod lowercase {
554534

555535
#[rustfmt::skip]
556536
pub mod lt {
557-
use super::ShortOffsetRunHeader;
558-
559-
static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 3] = [
560-
ShortOffsetRunHeader::new(0, 453), ShortOffsetRunHeader::new(1, 8072),
561-
ShortOffsetRunHeader::new(9, 1122301),
562-
];
563-
static OFFSETS: [u8; 21] = [
564-
0, 1, 2, 1, 2, 1, 38, 1, 0, 8, 8, 8, 8, 8, 12, 1, 15, 1, 47, 1, 0,
565-
];
566537
#[inline]
567-
pub fn lookup(c: char) -> bool {
538+
pub const fn lookup(c: char) -> bool {
568539
debug_assert!(!c.is_ascii());
569-
(c as u32) >= 0x1c5 && lookup_slow(c)
570-
}
571-
572-
#[inline(never)]
573-
fn lookup_slow(c: char) -> bool {
574-
const {
575-
assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
576-
let mut i = 0;
577-
while i < SHORT_OFFSET_RUNS.len() {
578-
assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
579-
i += 1;
580-
}
540+
match c as u32 {
541+
0x1c5 => true,
542+
0x1c8 => true,
543+
0x1cb => true,
544+
0x1f2 => true,
545+
0x1f88..=0x1f8f => true,
546+
0x1f98..=0x1f9f => true,
547+
0x1fa8..=0x1faf => true,
548+
0x1fbc => true,
549+
0x1fcc => true,
550+
0x1ffc => true,
551+
_ => false,
581552
}
582-
// SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
583-
// and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
584-
unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
585553
}
586554
}
587555

@@ -743,25 +711,18 @@ pub mod uppercase {
743711

744712
#[rustfmt::skip]
745713
pub mod white_space {
746-
static WHITESPACE_MAP: [u8; 256] = [
747-
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
748-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
749-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
750-
0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
751-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
752-
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
753-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
754-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
755-
0, 0, 0, 0, 0, 0, 0, 0, 0,
756-
];
757714
#[inline]
758715
pub const fn lookup(c: char) -> bool {
759716
debug_assert!(!c.is_ascii());
760-
match c as u32 >> 8 {
761-
0 => WHITESPACE_MAP[c as usize & 0xff] & 1 != 0,
762-
22 => c as u32 == 0x1680,
763-
32 => WHITESPACE_MAP[c as usize & 0xff] & 2 != 0,
764-
48 => c as u32 == 0x3000,
717+
match c as u32 {
718+
0x85 => true,
719+
0xa0 => true,
720+
0x1680 => true,
721+
0x2000..=0x200a => true,
722+
0x2028..=0x2029 => true,
723+
0x202f => true,
724+
0x205f => true,
725+
0x3000 => true,
765726
_ => false,
766727
}
767728
}

src/tools/unicode-table-generator/src/cascading_map.rs

Lines changed: 0 additions & 78 deletions
This file was deleted.

src/tools/unicode-table-generator/src/main.rs

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -78,13 +78,13 @@ use std::ops::Range;
7878

7979
use ucd_parse::Codepoints;
8080

81-
mod cascading_map;
8281
mod case_mapping;
82+
mod r#match;
8383
mod raw_emitter;
8484
mod skiplist;
8585
mod unicode_download;
8686

87-
use raw_emitter::{RawEmitter, emit_codepoints, emit_whitespace};
87+
use raw_emitter::{RawEmitter, emit_codepoints};
8888

8989
static PROPERTIES: &[&str] = &[
9090
"Alphabetic",
@@ -239,11 +239,7 @@ fn main() {
239239
let datapoints = ranges.iter().map(|r| r.end - r.start).sum::<u32>();
240240

241241
let mut emitter = RawEmitter::new();
242-
if property == &"White_Space" {
243-
emit_whitespace(&mut emitter, ranges);
244-
} else {
245-
emit_codepoints(&mut emitter, ranges);
246-
}
242+
emit_codepoints(&mut emitter, ranges);
247243

248244
modules.push((property.to_lowercase().to_string(), emitter.file));
249245
table_file.push_str(&format!(
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
use std::fmt::{self, Write as _};
2+
use std::ops::Range;
3+
4+
use crate::raw_emitter::RawEmitter;
5+
6+
impl RawEmitter {
7+
pub fn emit_match(&mut self, ranges: &[Range<u32>]) -> Result<(), fmt::Error> {
8+
let arms: Vec<_> = ranges
9+
.iter()
10+
.map(|range| match range.len() {
11+
1 => format!("{:#x} => true,", range.start),
12+
13+
// minus one because inclusive range pattern
14+
_ => format!("{:#x}..={:#x} => true,", range.start, range.end - 1),
15+
})
16+
.collect();
17+
18+
writeln!(self.file, "#[inline]")?;
19+
writeln!(self.file, "pub const fn lookup(c: char) -> bool {{")?;
20+
writeln!(self.file, " debug_assert!(!c.is_ascii());")?;
21+
writeln!(self.file, " match c as u32 {{")?;
22+
for arm in arms {
23+
writeln!(self.file, " {arm}")?;
24+
}
25+
writeln!(self.file, " _ => false,")?;
26+
writeln!(self.file, " }}")?;
27+
writeln!(self.file, "}}")?;
28+
Ok(())
29+
}
30+
}

src/tools/unicode-table-generator/src/raw_emitter.rs

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,12 @@ impl RawEmitter {
156156
pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
157157
emitter.blank_line();
158158

159+
if ranges.len() <= 10 {
160+
emitter.emit_match(ranges).unwrap();
161+
emitter.desc = String::from("match");
162+
return;
163+
}
164+
159165
let mut bitset = emitter.clone();
160166
let bitset_ok = bitset.emit_bitset(ranges).is_ok();
161167

@@ -171,15 +177,6 @@ pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
171177
}
172178
}
173179

174-
pub fn emit_whitespace(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
175-
emitter.blank_line();
176-
177-
let mut cascading = emitter.clone();
178-
cascading.emit_cascading_map(ranges);
179-
*emitter = cascading;
180-
emitter.desc = String::from("cascading");
181-
}
182-
183180
struct Canonicalized {
184181
canonical_words: Vec<u64>,
185182
canonicalized_words: Vec<(u8, u8)>,

0 commit comments

Comments
 (0)