|
1 | 1 | #![allow(unused_variables)]
|
2 | 2 | #![allow(unused_imports)]
|
| 3 | + |
3 | 4 | use crate::consts::{COMMON_SAFE_ASCII_CHARACTERS, UTF8_MAXIMAL_ALLOCATION};
|
4 |
| -use crate::utils::{is_suspiciously_successive_range, remove_accent, unicode_range}; |
| 5 | +use crate::utils::{ |
| 6 | + is_accentuated, is_cjk, is_hangul, is_hiragana, is_katakana, is_latin, is_punctuation, |
| 7 | + is_separator, is_suspiciously_successive_range, is_thai, remove_accent, unicode_range, |
| 8 | +}; |
5 | 9 | use bitflags::{bitflags, Flags};
|
6 | 10 | use cached::proc_macro::cached;
|
7 | 11 | use cached::UnboundCache;
|
8 | 12 | use log::trace;
|
9 | 13 | use ordered_float::OrderedFloat;
|
10 | 14 | use unic::char::property::EnumeratedCharProperty;
|
11 |
| -use unic::ucd::{GeneralCategory, Name}; |
| 15 | +use unic::ucd::{is_white_space, GeneralCategory, Name}; |
12 | 16 |
|
13 | 17 | //
|
14 | 18 | // Mess detection module
|
@@ -64,7 +68,6 @@ impl PartialEq for MessDetectorChar {
|
64 | 68 | }
|
65 | 69 |
|
66 | 70 | impl MessDetectorChar {
|
67 |
| - |
68 | 71 | pub fn new(character: char) -> Self {
|
69 | 72 | new_mess_detector_character(character)
|
70 | 73 | }
|
@@ -137,6 +140,7 @@ pub fn new_mess_detector_character(character: char) -> MessDetectorChar {
|
137 | 140 | // whitespace
|
138 | 141 | if character.is_whitespace() {
|
139 | 142 | flags.insert(MessDetectorCharFlags::WHITESPACE);
|
| 143 | + flags.insert(MessDetectorCharFlags::SEPARATOR); |
140 | 144 | } else {
|
141 | 145 | // safe symbols (non-whitespace)
|
142 | 146 | if COMMON_SAFE_ASCII_CHARACTERS.contains(character) {
|
@@ -171,18 +175,21 @@ pub fn new_mess_detector_character(character: char) -> MessDetectorChar {
|
171 | 175 | // emoticon
|
172 | 176 | if MessDetectorChar::in_category(category, range, &[], &[], &["Emoticons"]) {
|
173 | 177 | flags.insert(MessDetectorCharFlags::EMOTICON);
|
174 |
| - } else { |
175 |
| - // punctuation |
176 |
| - if MessDetectorChar::in_category(category, range, &[], &["P"], &["Punctuation"]) { |
177 |
| - flags.insert(MessDetectorCharFlags::PUNCTUATION); |
178 |
| - } |
| 178 | + } |
179 | 179 |
|
180 |
| - // separator |
181 |
| - if MessDetectorChar::in_category(category, range, &["Po", "Pd", "Pc"], &["Z"], &[]) { |
182 |
| - flags.insert(MessDetectorCharFlags::SEPARATOR); |
183 |
| - } |
| 180 | + // separator |
| 181 | + if ['|', '+', '<', '>'].contains(&character) |
| 182 | + || MessDetectorChar::in_category(category, range, &["Po", "Pd", "Pc"], &["Z"], &[]) |
| 183 | + { |
| 184 | + flags.insert(MessDetectorCharFlags::SEPARATOR); |
184 | 185 | }
|
185 | 186 | }
|
| 187 | + |
| 188 | + // punctuation |
| 189 | + if MessDetectorChar::in_category(category, range, &[], &["P"], &["Punctuation"]) { |
| 190 | + flags.insert(MessDetectorCharFlags::PUNCTUATION); |
| 191 | + } |
| 192 | + |
186 | 193 | // symbol
|
187 | 194 | if MessDetectorChar::in_category(category, range, &[], &["N", "S"], &["Forms"]) {
|
188 | 195 | flags.insert(MessDetectorCharFlags::SYMBOL);
|
|
0 commit comments