Skip to content

Commit 8048bb4

Browse files
authored
Merge pull request #31 from CAD97/category
[ucd/category] Add General Category property
2 parents 1c344d9 + 120601a commit 8048bb4

File tree

9 files changed

+3480
-0
lines changed

9 files changed

+3480
-0
lines changed

components/ucd/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,4 @@ unic-ucd-bidi = { path = "bidi/", version = "0.4.0" }
1818
unic-ucd-core = { path = "core/", version = "0.4.0" }
1919
unic-ucd-normal = { path = "normal/", version = "0.4.0" }
2020
unic-ucd-utils = { path = "utils/", version = "0.4.0" }
21+
unic-ucd-category = { path = "category/", version = "0.4.0" }

components/ucd/category/Cargo.toml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
[package]
2+
name = "unic-ucd-category"
3+
version = "0.4.0"
4+
authors = ["The UNIC Project Developers"]
5+
homepage = "https://github.com/behnam/rust-unic/"
6+
repository = "https://github.com/behnam/rust-unic/"
7+
license = "MIT/Apache-2.0"
8+
keywords = ["text", "unicode"]
9+
description = "UNIC - Unicode Character Database - General Category"
10+
11+
[badges]
12+
travis-ci = { repository = "behnam/rust-unic", branch = "master" }
13+
14+
[dependencies]
15+
unic-ucd-core = { path = "../core/", version = "0.4.0" }
16+
matches = "0.1.6"
Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
// Copyright 2017 The UNIC Project Developers.
2+
//
3+
// See the COPYRIGHT file at the top-level directory of this distribution.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
use std::cmp::Ordering;
12+
13+
/// Represents the Unicode Character
14+
/// [*General Category*](http://unicode.org/reports/tr44/#General_Category) property.
15+
///
16+
/// This is a useful breakdown into various character types which can be used as a default
17+
/// categorization in implementations. For the property values, see
18+
/// [*General Category Values*](http://unicode.org/reports/tr44/#General_Category_Values).
19+
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
20+
pub enum GeneralCategory {
21+
/// An uppercase letter (Short form: `Lu`)
22+
UppercaseLetter,
23+
/// A lowercase letter (Short form: `Ll`)
24+
LowercaseLetter,
25+
/// A digraphic character, with first part uppercase (Short form: `Lt`)
26+
TitlecaseLetter,
27+
/// A modifier letter (Short form: `Lm`)
28+
ModifierLetter,
29+
/// Other letters, including syllables and ideographs (Short form: `Lo`)
30+
OtherLetter,
31+
/// A nonspacing combining mark (zero advance width) (Short form: `Mn`)
32+
NonspacingMark,
33+
/// A spacing combining mark (positive advance width) (Short form: `Mc`)
34+
SpacingMark,
35+
/// An enclosing combining mark (Short form: `Me`)
36+
EnclosingMark,
37+
/// A decimal digit (Short form: `Nd`)
38+
DecimalNumber,
39+
/// A letterlike numeric character (Short form: `Nl`)
40+
LetterNumber,
41+
/// A numeric character of other type (Short form: `No`)
42+
OtherNumber,
43+
/// A connecting punctuation mark, like a tie (Short form: `Pc`)
44+
ConnectorPunctuation,
45+
/// A dash or hyphen punctuation mark (Short form: `Pd`)
46+
DashPunctuation,
47+
/// An opening punctuation mark (of a pair) (Short form: `Ps`)
48+
OpenPunctuation,
49+
/// A closing punctuation mark (of a pair) (Short form: `Pe`)
50+
ClosePunctuation,
51+
/// An initial quotation mark (Short form: `Pi`)
52+
InitialPunctuation,
53+
/// A final quotation mark (Short form: `Pf`)
54+
FinalPunctuation,
55+
/// A punctuation mark of other type (Short form: `Po`)
56+
OtherPunctuation,
57+
/// A symbol of mathematical use (Short form: `Sm`)
58+
MathSymbol,
59+
/// A currency sign (Short form: `Sc`)
60+
CurrencySymbol,
61+
/// A non-letterlike modifier symbol (Short form: `Sk`)
62+
ModifierSymbol,
63+
/// A symbol of other type (Short form: `So`)
64+
OtherSymbol,
65+
/// A space character (of various non-zero widths) (Short form: `Zs`)
66+
SpaceSeparator,
67+
/// U+2028 LINE SEPARATOR only (Short form: `Zl`)
68+
LineSeparator,
69+
/// U+2029 PARAGRAPH SEPARATOR only (Short form: `Zp`)
70+
ParagraphSeparator,
71+
/// A C0 or C1 control code (Short form: `Cc`)
72+
Control,
73+
/// A format control character (Short form: `Cf`)
74+
Format,
75+
/// A surrogate code point (Short form: `Cs`)
76+
Surrogate,
77+
/// A private-use character (Short form: `Co`)
78+
PrivateUse,
79+
/// Unassigned (Short form: `Cn`)
80+
Unassigned,
81+
}
82+
83+
use self::GeneralCategory::*;
84+
85+
const GENERAL_CATEGORY_TABLE: &'static [(char, char, GeneralCategory)] =
86+
include!("tables/general_category.rsv");
87+
88+
impl GeneralCategory {
89+
/// Find the GeneralCategory of a single char.
90+
pub fn of(ch: char) -> GeneralCategory {
91+
bsearch_range_value_table(ch, GENERAL_CATEGORY_TABLE)
92+
}
93+
}
94+
95+
impl GeneralCategory {
96+
/// `Lu` | `Ll` | `Lt` (Short form: `LC`)
97+
pub fn is_cased_letter(&self) -> bool {
98+
matches!(*self, UppercaseLetter | LowercaseLetter | TitlecaseLetter)
99+
}
100+
/// `Lu` | `Ll` | `Lt` | `Lm` | `Lo` (Short form: `L`)
101+
pub fn is_letter(&self) -> bool {
102+
matches!(
103+
*self,
104+
UppercaseLetter | LowercaseLetter | TitlecaseLetter | ModifierLetter | OtherLetter
105+
)
106+
}
107+
/// `Mn` | `Mc` | `Me` (Short form: `M`)
108+
pub fn is_mark(&self) -> bool {
109+
matches!(*self, NonspacingMark | SpacingMark | EnclosingMark)
110+
}
111+
/// `Nd` | `Nl` | `No` (Short form: `N`)
112+
pub fn is_number(&self) -> bool {
113+
matches!(*self, DecimalNumber | LetterNumber | OtherNumber)
114+
}
115+
/// `Pc` | `Pd` | `Ps` | `Pe` | `Pi` | `Pf` | `Po` (Short form: `P`)
116+
pub fn is_punctuation(&self) -> bool {
117+
matches!(
118+
*self,
119+
ConnectorPunctuation | DashPunctuation | OpenPunctuation | ClosePunctuation |
120+
InitialPunctuation | FinalPunctuation | OtherPunctuation
121+
)
122+
}
123+
/// `Sm` | `Sc` | `Sk` | `So` (Short form: `S`)
124+
pub fn is_symbol(&self) -> bool {
125+
matches!(
126+
*self,
127+
MathSymbol | CurrencySymbol | ModifierLetter | OtherSymbol
128+
)
129+
}
130+
/// `Zs` | `Zl` | `Zp` (Short form: `Z`)
131+
pub fn is_separator(&self) -> bool {
132+
matches!(*self, SpaceSeparator | LineSeparator | ParagraphSeparator)
133+
}
134+
/// `Cc` | `Cf` | `Cs` | `Co` | `Cn` (Short form: `C`)
135+
pub fn is_other(&self) -> bool {
136+
matches!(
137+
*self,
138+
Control | Format | Surrogate | PrivateUse | Unassigned
139+
)
140+
}
141+
}
142+
143+
fn bsearch_range_value_table(
144+
c: char,
145+
r: &'static [(char, char, GeneralCategory)],
146+
) -> GeneralCategory {
147+
match r.binary_search_by(|&(lo, hi, _)| if lo <= c && c <= hi {
148+
Ordering::Equal
149+
} else if hi < c {
150+
Ordering::Less
151+
} else {
152+
Ordering::Greater
153+
}) {
154+
Ok(idx) => {
155+
let (_, _, category) = r[idx];
156+
category
157+
}
158+
Err(_) => GeneralCategory::Unassigned,
159+
}
160+
}
161+
162+
#[cfg(test)]
mod tests {
    use super::GeneralCategory as GC;
    use std::char;

    /// Asserts that every code point from `first` through `last` (both
    /// inclusive) is reported as `expected`.
    fn assert_range(first: u32, last: u32, expected: GC) {
        for cp in first..(last + 1) {
            let ch = char::from_u32(cp).unwrap();
            assert_eq!(GC::of(ch), expected, "U+{:04X}", cp);
        }
    }

    /// Asserts that each listed code point is reported as `expected`.
    fn assert_each(code_points: &[u32], expected: GC) {
        for &cp in code_points.iter() {
            let ch = char::from_u32(cp).unwrap();
            assert_eq!(GC::of(ch), expected, "U+{:04X}", cp);
        }
    }

    #[test]
    fn test_ascii() {
        // C0 controls.
        assert_range(0x00, 0x1F, GC::Control);

        // Digits and letters.
        assert_range('0' as u32, '9' as u32, GC::DecimalNumber);
        assert_range('A' as u32, 'Z' as u32, GC::UppercaseLetter);
        assert_range('a' as u32, 'z' as u32, GC::LowercaseLetter);

        // Space, punctuation and symbols, in code point order.
        let singles: &[(char, GC)] = &[
            (' ', GC::SpaceSeparator),
            ('!', GC::OtherPunctuation),
            ('"', GC::OtherPunctuation),
            ('#', GC::OtherPunctuation),
            ('$', GC::CurrencySymbol),
            ('%', GC::OtherPunctuation),
            ('&', GC::OtherPunctuation),
            ('\'', GC::OtherPunctuation),
            ('(', GC::OpenPunctuation),
            (')', GC::ClosePunctuation),
            ('*', GC::OtherPunctuation),
            ('+', GC::MathSymbol),
            (',', GC::OtherPunctuation),
            ('-', GC::DashPunctuation),
            ('.', GC::OtherPunctuation),
            ('/', GC::OtherPunctuation),
            (':', GC::OtherPunctuation),
            (';', GC::OtherPunctuation),
            ('<', GC::MathSymbol),
            ('=', GC::MathSymbol),
            ('>', GC::MathSymbol),
            ('?', GC::OtherPunctuation),
            ('@', GC::OtherPunctuation),
            ('[', GC::OpenPunctuation),
            ('\\', GC::OtherPunctuation),
            (']', GC::ClosePunctuation),
            ('^', GC::ModifierSymbol),
            ('_', GC::ConnectorPunctuation),
            ('`', GC::ModifierSymbol),
            ('{', GC::OpenPunctuation),
            ('|', GC::MathSymbol),
            ('}', GC::ClosePunctuation),
            ('~', GC::MathSymbol),
        ];
        for &(ch, expected) in singles.iter() {
            assert_eq!(GC::of(ch), expected, "{:?}", ch);
        }
    }

    #[test]
    fn test_bmp_edge() {
        // Escapes are used instead of literal characters so the test data
        // survives tools that drop or replace these code points.

        // U+FEFF ZERO WIDTH NO-BREAK SPACE (or) BYTE ORDER MARK
        assert_eq!(GC::of('\u{FEFF}'), GC::Format);
        // U+FFFC OBJECT REPLACEMENT CHARACTER
        assert_eq!(GC::of('\u{FFFC}'), GC::OtherSymbol);
        // U+FFFD REPLACEMENT CHARACTER
        assert_eq!(GC::of('\u{FFFD}'), GC::OtherSymbol);

        assert_each(&[0xFFEF, 0xFFFE, 0xFFFF], GC::Unassigned);
    }

    #[test]
    fn test_private_use() {
        // Planes 15 and 16, excluding the last two code points of each plane.
        assert_range(0xF0000, 0xFFFFD, GC::PrivateUse);
        assert_range(0x100000, 0x10FFFD, GC::PrivateUse);

        assert_each(&[0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF], GC::Unassigned);
    }
}

components/ucd/category/src/lib.rs

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
// Copyright 2017 The UNIC Project Developers.
2+
//
3+
// See the COPYRIGHT file at the top-level directory of this distribution.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
#![deny(unsafe_code, missing_docs)]
12+
13+
//! # UNIC — UCD — Category
14+
//!
15+
//! A component of [`unic`: Unicode and Internationalization Crates for Rust](/unic/).
16+
//!
17+
//! Unicode [General Category](http://unicode.org/reports/tr44/#General_Category).
18+
//!
19+
//! > The General_Category property of a code point provides for the most general classification of
20+
//! that code point. It is usually determined based on the primary characteristic of the assigned
21+
//! character for that code point. For example, is the character a letter, a mark, a number,
22+
//! punctuation, or a symbol, and if so, of what type? Other General_Category values define the
23+
//! classification of code points which are not assigned to regular graphic characters, including
24+
//! such statuses as private-use, control, surrogate code point, and reserved unassigned.
25+
//!
26+
//! > Many characters have multiple uses, and not all such cases can be captured entirely by the
27+
//! General_Category value. For example, the General_Category value of Latin, Greek, or Hebrew
28+
//! letters does not attempt to cover (or preclude) the numerical use of such letters as Roman
29+
//! numerals or in other numerary systems. Conversely, the General_Category of ASCII digits 0..9 as
30+
//! Nd (decimal digit) neither attempts to cover (or preclude) the occasional use of these digits as
31+
//! letters in various orthographies. The General_Category is simply the first-order, most usual
32+
//! categorization of a character.
33+
//!
34+
//! > For more information about the General_Category property, see Chapter 4,
35+
//! Character Properties in [*Unicode*](http://unicode.org/reports/tr41/tr41-21.html#Unicode).
36+
//!
37+
//! -- [Unicode® Standard Annex #44 - Unicode Character Database](http://unicode.org/reports/tr44/)
38+
//!
39+
40+
#[macro_use]
41+
extern crate matches;
42+
extern crate unic_ucd_core;
43+
44+
mod category;
45+
46+
pub use category::GeneralCategory;
47+
48+
use unic_ucd_core::UnicodeVersion;
49+
50+
/// The [Unicode version](http://www.unicode.org/versions/) of the data tables used by this crate.
51+
pub const UNICODE_VERSION: UnicodeVersion = include!("tables/unicode_version.rsv");

0 commit comments

Comments
 (0)