Skip to content

Commit fd1cda6

Browse files
committed
optimization: Don't include ASCII characters in Unicode tables
The ASCII subset of Unicode is fixed and will never change, so we don't need to generate tables for it with every new Unicode version. This saves a few bytes of static data and speeds up `char::is_control` and `char::is_grapheme_extended` on ASCII inputs. Since the table lookup functions exported from the `unicode` module will give nonsensical results on ASCII input (and in fact will panic in debug mode), I had to add some internal wrapper methods to `char` which check for ASCII-ness first.
1 parent e6e32f1 commit fd1cda6

File tree

3 files changed

+315
-247
lines changed

3 files changed

+315
-247
lines changed

alloc/src/str.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -418,9 +418,8 @@ impl str {
418418
}
419419

420420
fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
421-
use core::unicode::{Case_Ignorable, Cased};
422-
match iter.skip_while(|&c| Case_Ignorable(c)).next() {
423-
Some(c) => Cased(c),
421+
match iter.skip_while(|&c| c.is_case_ignorable()).next() {
422+
Some(c) => c.is_cased(),
424423
None => false,
425424
}
426425
}

core/src/char/methods.rs

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -969,7 +969,43 @@ impl char {
969969
#[must_use]
970970
#[inline]
971971
pub(crate) fn is_grapheme_extended(self) -> bool {
972-
unicode::Grapheme_Extend(self)
972+
!self.is_ascii() && unicode::Grapheme_Extend(self)
973+
}
974+
975+
/// Returns `true` if this `char` has the `Cased` property.
976+
///
977+
/// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
978+
/// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
979+
///
980+
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
981+
/// [ucd]: https://www.unicode.org/reports/tr44/
982+
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
983+
#[must_use]
984+
#[inline]
985+
#[doc(hidden)]
986+
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
987+
pub fn is_cased(self) -> bool {
988+
if self.is_ascii() { self.is_ascii_alphabetic() } else { unicode::Cased(self) }
989+
}
990+
991+
/// Returns `true` if this `char` has the `Case_Ignorable` property.
992+
///
993+
/// `Case_Ignorable` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
994+
/// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
995+
///
996+
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
997+
/// [ucd]: https://www.unicode.org/reports/tr44/
998+
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
999+
#[must_use]
1000+
#[inline]
1001+
#[doc(hidden)]
1002+
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
1003+
pub fn is_case_ignorable(self) -> bool {
1004+
if self.is_ascii() {
1005+
matches!(self, '\'' | '.' | ':' | '^' | '`')
1006+
} else {
1007+
unicode::Case_Ignorable(self)
1008+
}
9731009
}
9741010

9751011
/// Returns `true` if this `char` has one of the general categories for numbers.

0 commit comments

Comments
 (0)