optimization: Don't include ASCII characters in Unicode tables

Kmeakin · Kmeakin · commit fd1cda688ec9 · 2025-09-07T15:21:24.000+02:00
The ASCII subset of Unicode is fixed and will never change, so we don't
need to generate tables for it with every new Unicode version. This
saves a few bytes of static data and speeds up `char::is_control` and
`char::is_grapheme_extended` on ASCII inputs.

Since the table lookup functions exported from the `unicode` module will
now give nonsensical results on ASCII input (and in fact will panic in
debug mode), I had to add some internal wrapper methods to `char` (hidden
from the docs and unstable) which check for ASCII-ness first.
diff --git a/alloc/src/str.rs b/alloc/src/str.rs
@@ -418,9 +418,8 @@ impl str {
         }
 
         fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
-            use core::unicode::{Case_Ignorable, Cased};
-            match iter.skip_while(|&c| Case_Ignorable(c)).next() {
-                Some(c) => Cased(c),
+            match iter.skip_while(|&c| c.is_case_ignorable()).next() {
+                Some(c) => c.is_cased(),
                 None => false,
             }
         }
diff --git a/core/src/char/methods.rs b/core/src/char/methods.rs
@@ -969,7 +969,43 @@ impl char {
     #[must_use]
     #[inline]
     pub(crate) fn is_grapheme_extended(self) -> bool {
-        unicode::Grapheme_Extend(self)
+        !self.is_ascii() && unicode::Grapheme_Extend(self)
+    }
+
+    /// Returns `true` if this `char` has the `Cased` property.
+    ///
+    /// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
+    /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
+    ///
+    /// [Unicode Standard]: https://www.unicode.org/versions/latest/
+    /// [ucd]: https://www.unicode.org/reports/tr44/
+    /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
+    #[must_use]
+    #[inline]
+    #[doc(hidden)]
+    #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
+    pub fn is_cased(self) -> bool {
+        if self.is_ascii() { self.is_ascii_alphabetic() } else { unicode::Cased(self) }
+    }
+
+    /// Returns `true` if this `char` has the `Case_Ignorable` property.
+    ///
+    /// `Case_Ignorable` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
+    /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
+    ///
+    /// [Unicode Standard]: https://www.unicode.org/versions/latest/
+    /// [ucd]: https://www.unicode.org/reports/tr44/
+    /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
+    #[must_use]
+    #[inline]
+    #[doc(hidden)]
+    #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
+    pub fn is_case_ignorable(self) -> bool {
+        if self.is_ascii() {
+            matches!(self, '\'' | '.' | ':' | '^' | '`')
+        } else {
+            unicode::Case_Ignorable(self)
+        }
     }
 
     /// Returns `true` if this `char` has one of the general categories for numbers.
diff --git a/core/src/unicode/unicode_data.rs b/core/src/unicode/unicode_data.rs

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -418,9 +418,8 @@ impl str {` |
| `418` | `418` | `}` |
| `419` | `419` | |
| `420` | `420` | `fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {` |
| `421` | | `- use core::unicode::{Case_Ignorable, Cased};` |
| `422` | | `- match iter.skip_while(\|&c\| Case_Ignorable(c)).next() {` |
| `423` | | `- Some(c) => Cased(c),` |
| | `421` | `+ match iter.skip_while(\|&c\| c.is_case_ignorable()).next() {` |
| | `422` | `+ Some(c) => c.is_cased(),` |
| `424` | `423` | `None => false,` |
| `425` | `424` | `}` |
| `426` | `425` | `}` |