|
| 1 | +// Copyright 2017 The UNIC Project Developers. |
| 2 | +// |
| 3 | +// See the COPYRIGHT file at the top-level directory of this distribution. |
| 4 | +// |
| 5 | +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| 6 | +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| 7 | +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
| 8 | +// option. This file may not be copied, modified, or distributed |
| 9 | +// except according to those terms. |
| 10 | + |
| 11 | +use std::cmp::Ordering; |
| 12 | + |
/// Represents the Unicode Character
/// [*General Category*](http://unicode.org/reports/tr44/#General_Category) property.
///
/// This is a useful breakdown into various character types which can be used as a default
/// categorization in implementations. For the property values, see
/// [*General Category Values*](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// Variants are declared in the order of the *General Category Values* table, grouped by
/// major class (letters, marks, numbers, punctuation, symbols, separators, others).
///
/// `Hash` is derived together with `Eq` so that a category can be used directly as a key
/// in hashed collections such as `HashMap` and `HashSet`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum GeneralCategory {
    /// An uppercase letter (Short form: `Lu`)
    UppercaseLetter,
    /// A lowercase letter (Short form: `Ll`)
    LowercaseLetter,
    /// A digraphic character, with first part uppercase (Short form: `Lt`)
    TitlecaseLetter,
    /// A modifier letter (Short form: `Lm`)
    ModifierLetter,
    /// Other letters, including syllables and ideographs (Short form: `Lo`)
    OtherLetter,
    /// A nonspacing combining mark (zero advance width) (Short form: `Mn`)
    NonspacingMark,
    /// A spacing combining mark (positive advance width) (Short form: `Mc`)
    SpacingMark,
    /// An enclosing combining mark (Short form: `Me`)
    EnclosingMark,
    /// A decimal digit (Short form: `Nd`)
    DecimalNumber,
    /// A letterlike numeric character (Short form: `Nl`)
    LetterNumber,
    /// A numeric character of other type (Short form: `No`)
    OtherNumber,
    /// A connecting punctuation mark, like a tie (Short form: `Pc`)
    ConnectorPunctuation,
    /// A dash or hyphen punctuation mark (Short form: `Pd`)
    DashPunctuation,
    /// An opening punctuation mark (of a pair) (Short form: `Ps`)
    OpenPunctuation,
    /// A closing punctuation mark (of a pair) (Short form: `Pe`)
    ClosePunctuation,
    /// An initial quotation mark (Short form: `Pi`)
    InitialPunctuation,
    /// A final quotation mark (Short form: `Pf`)
    FinalPunctuation,
    /// A punctuation mark of other type (Short form: `Po`)
    OtherPunctuation,
    /// A symbol of mathematical use (Short form: `Sm`)
    MathSymbol,
    /// A currency sign (Short form: `Sc`)
    CurrencySymbol,
    /// A non-letterlike modifier symbol (Short form: `Sk`)
    ModifierSymbol,
    /// A symbol of other type (Short form: `So`)
    OtherSymbol,
    /// A space character (of various non-zero widths) (Short form: `Zs`)
    SpaceSeparator,
    /// U+2028 LINE SEPARATOR only (Short form: `Zl`)
    LineSeparator,
    /// U+2029 PARAGRAPH SEPARATOR only (Short form: `Zp`)
    ParagraphSeparator,
    /// A C0 or C1 control code (Short form: `Cc`)
    Control,
    /// A format control character (Short form: `Cf`)
    Format,
    /// A surrogate code point (Short form: `Cs`)
    Surrogate,
    /// A private-use character (Short form: `Co`)
    PrivateUse,
    /// Unassigned (Short form: `Cn`)
    Unassigned,
}
| 82 | + |
| 83 | +use self::GeneralCategory::*; |
| 84 | + |
| 85 | +const GENERAL_CATEGORY_TABLE: &'static [(char, char, GeneralCategory)] = |
| 86 | + include!("tables/general_category.rsv"); |
| 87 | + |
| 88 | +impl GeneralCategory { |
| 89 | + /// Find the GeneralCategory of a single char. |
| 90 | + pub fn of(ch: char) -> GeneralCategory { |
| 91 | + bsearch_range_value_table(ch, GENERAL_CATEGORY_TABLE) |
| 92 | + } |
| 93 | +} |
| 94 | + |
| 95 | +impl GeneralCategory { |
| 96 | + /// `Lu` | `Ll` | `Lt` (Short form: `LC`) |
| 97 | + pub fn is_cased_letter(&self) -> bool { |
| 98 | + matches!(*self, UppercaseLetter | LowercaseLetter | TitlecaseLetter) |
| 99 | + } |
| 100 | + /// `Lu` | `Ll` | `Lt` | `Lm` | `Lo` (Short form: `L`) |
| 101 | + pub fn is_letter(&self) -> bool { |
| 102 | + matches!( |
| 103 | + *self, |
| 104 | + UppercaseLetter | LowercaseLetter | TitlecaseLetter | ModifierLetter | OtherLetter |
| 105 | + ) |
| 106 | + } |
| 107 | + /// `Mn` | `Mc` | `Me` (Short form: `M`) |
| 108 | + pub fn is_mark(&self) -> bool { |
| 109 | + matches!(*self, NonspacingMark | SpacingMark | EnclosingMark) |
| 110 | + } |
| 111 | + /// `Nd` | `Nl` | `No` (Short form: `N`) |
| 112 | + pub fn is_number(&self) -> bool { |
| 113 | + matches!(*self, DecimalNumber | LetterNumber | OtherNumber) |
| 114 | + } |
| 115 | + /// `Pc` | `Pd` | `Ps` | `Pe` | `Pi` | `Pf` | `Po` (Short form: `P`) |
| 116 | + pub fn is_punctuation(&self) -> bool { |
| 117 | + matches!( |
| 118 | + *self, |
| 119 | + ConnectorPunctuation | DashPunctuation | OpenPunctuation | ClosePunctuation | |
| 120 | + InitialPunctuation | FinalPunctuation | OtherPunctuation |
| 121 | + ) |
| 122 | + } |
| 123 | + /// `Sm` | `Sc` | `Sk` | `So` (Short form: `S`) |
| 124 | + pub fn is_symbol(&self) -> bool { |
| 125 | + matches!( |
| 126 | + *self, |
| 127 | + MathSymbol | CurrencySymbol | ModifierLetter | OtherSymbol |
| 128 | + ) |
| 129 | + } |
| 130 | + /// `Zs` | `Zl` | `Zp` (Short form: `Z`) |
| 131 | + pub fn is_separator(&self) -> bool { |
| 132 | + matches!(*self, SpaceSeparator | LineSeparator | ParagraphSeparator) |
| 133 | + } |
| 134 | + /// `Cc` | `Cf` | `Cs` | `Co` | `Cn` (Short form: `C`) |
| 135 | + pub fn is_other(&self) -> bool { |
| 136 | + matches!( |
| 137 | + *self, |
| 138 | + Control | Format | Surrogate | PrivateUse | Unassigned |
| 139 | + ) |
| 140 | + } |
| 141 | +} |
| 142 | + |
| 143 | +fn bsearch_range_value_table( |
| 144 | + c: char, |
| 145 | + r: &'static [(char, char, GeneralCategory)], |
| 146 | +) -> GeneralCategory { |
| 147 | + match r.binary_search_by(|&(lo, hi, _)| if lo <= c && c <= hi { |
| 148 | + Ordering::Equal |
| 149 | + } else if hi < c { |
| 150 | + Ordering::Less |
| 151 | + } else { |
| 152 | + Ordering::Greater |
| 153 | + }) { |
| 154 | + Ok(idx) => { |
| 155 | + let (_, _, category) = r[idx]; |
| 156 | + category |
| 157 | + } |
| 158 | + Err(_) => GeneralCategory::Unassigned, |
| 159 | + } |
| 160 | +} |
| 161 | + |
/// Spot checks of `GeneralCategory::of` against well-known code points.
#[cfg(test)]
mod tests {
    use super::GeneralCategory as GC;
    use std::char;

    /// Checks the C0 controls and every printable ASCII character.
    #[test]
    fn test_ascii() {
        for c in 0x00..(0x1F + 1) {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::Control);
        }
        assert_eq!(GC::of(' '), GC::SpaceSeparator);
        assert_eq!(GC::of('!'), GC::OtherPunctuation);
        assert_eq!(GC::of('"'), GC::OtherPunctuation);
        assert_eq!(GC::of('#'), GC::OtherPunctuation);
        assert_eq!(GC::of('$'), GC::CurrencySymbol);
        assert_eq!(GC::of('%'), GC::OtherPunctuation);
        assert_eq!(GC::of('&'), GC::OtherPunctuation);
        assert_eq!(GC::of('\''), GC::OtherPunctuation);
        assert_eq!(GC::of('('), GC::OpenPunctuation);
        assert_eq!(GC::of(')'), GC::ClosePunctuation);
        assert_eq!(GC::of('*'), GC::OtherPunctuation);
        assert_eq!(GC::of('+'), GC::MathSymbol);
        assert_eq!(GC::of(','), GC::OtherPunctuation);
        assert_eq!(GC::of('-'), GC::DashPunctuation);
        assert_eq!(GC::of('.'), GC::OtherPunctuation);
        assert_eq!(GC::of('/'), GC::OtherPunctuation);
        for c in ('0' as u32)..('9' as u32 + 1) {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::DecimalNumber);
        }
        assert_eq!(GC::of(':'), GC::OtherPunctuation);
        assert_eq!(GC::of(';'), GC::OtherPunctuation);
        assert_eq!(GC::of('<'), GC::MathSymbol);
        assert_eq!(GC::of('='), GC::MathSymbol);
        assert_eq!(GC::of('>'), GC::MathSymbol);
        assert_eq!(GC::of('?'), GC::OtherPunctuation);
        assert_eq!(GC::of('@'), GC::OtherPunctuation);
        for c in ('A' as u32)..('Z' as u32 + 1) {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::UppercaseLetter);
        }
        assert_eq!(GC::of('['), GC::OpenPunctuation);
        assert_eq!(GC::of('\\'), GC::OtherPunctuation);
        assert_eq!(GC::of(']'), GC::ClosePunctuation);
        assert_eq!(GC::of('^'), GC::ModifierSymbol);
        assert_eq!(GC::of('_'), GC::ConnectorPunctuation);
        assert_eq!(GC::of('`'), GC::ModifierSymbol);
        for c in ('a' as u32)..('z' as u32 + 1) {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::LowercaseLetter);
        }
        assert_eq!(GC::of('{'), GC::OpenPunctuation);
        assert_eq!(GC::of('|'), GC::MathSymbol);
        assert_eq!(GC::of('}'), GC::ClosePunctuation);
        assert_eq!(GC::of('~'), GC::MathSymbol);
    }

    /// Checks code points near the end of the Basic Multilingual Plane.
    #[test]
    fn test_bmp_edge() {
        // 0xFEFF ZERO WIDTH NO-BREAK SPACE (or) BYTE ORDER MARK
        let bom = char::from_u32(0xFEFF).unwrap();
        assert_eq!(GC::of(bom), GC::Format);
        // Written as escapes: these characters are invisible or easily lost when the
        // source passes through editors and web tooling.
        // 0xFFFC OBJECT REPLACEMENT CHARACTER
        assert_eq!(GC::of('\u{FFFC}'), GC::OtherSymbol);
        // 0xFFFD REPLACEMENT CHARACTER
        assert_eq!(GC::of('\u{FFFD}'), GC::OtherSymbol);
        for &c in [0xFFEF, 0xFFFE, 0xFFFF].iter() {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::Unassigned);
        }
    }

    /// Checks planes 15 and 16: private use, except the last two code points of each.
    #[test]
    fn test_private_use() {
        for c in 0xF0000..(0xFFFFD + 1) {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::PrivateUse);
        }
        for c in 0x100000..(0x10FFFD + 1) {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::PrivateUse);
        }
        for &c in [0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF].iter() {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::Unassigned);
        }
    }
}
0 commit comments