Skip to content

Commit 082b052

Browse files
committed
fix bug and change README.md
1 parent 807eb40 commit 082b052

File tree

3 files changed

+24
-18
lines changed

3 files changed

+24
-18
lines changed

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,22 +18,22 @@ Python version supports more encodings, but a lot of them are old almost unused
1818

1919
## ⚡ Performance
2020

21-
This package offer better performance than Python version (3 times faster, than MYPYC version of charset-normalizer, 6 times faster than usual Python version).
22-
However, in comparison with `chardet` and `chardetng` packages it is slower but more accurate (I guess because it process whole file chunk by chunk).
21+
This package offers better performance than the Python version (4 times faster than the MYPYC version of charset-normalizer, 8 times faster than the usual Python version).
22+
Compared with the `chardet` and `chardetng` packages, it has approximately the same speed but is more accurate.
2323
Here are some numbers.
2424

2525
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
2626
|---------------------------------------------------------------------------------------------|:----------:|:------------------:|:------------------:|
2727
| [chardet](https://crates.io/crates/chardet) | 82.6 % | 2.2 ms | 450 file/sec |
2828
| [chardetng](https://crates.io/crates/chardetng) | 90.7 % | 1.6 ms | 625 file/sec |
29-
| charset-normalizer-rs | **97.1 %** | **2.7 ms** | 370 file/sec |
29+
| charset-normalizer-rs | **97.1 %** | **1.8 ms** | 555 file/sec |
3030
| [charset-normalizer](https://github.com/Ousret/charset_normalizer) (Python + MYPYC version) | **98 %** | **8 ms** | 125 file/sec |
3131

3232
| Package | 99th percentile | 95th percentile | 50th percentile |
3333
|---------------------------------------------------------------------------------------------|:---------------:|:---------------:|:---------------:|
3434
| [chardet](https://crates.io/crates/chardet) | 8 ms | 2 ms | 0.2 ms |
3535
| [chardetng](https://crates.io/crates/chardetng) | 14 ms | 5 ms | 0.5 ms |
36-
| charset-normalizer-rs | 19 ms | 7 ms | 1.2 ms |
36+
| charset-normalizer-rs | 12 ms | 5 ms | 0.7 ms |
3737
| [charset-normalizer](https://github.com/Ousret/charset_normalizer) (Python + MYPYC version) | 94 ms | 37 ms | 3 ms |
3838

3939
Stats are generated from 400+ files using default parameters. These results might change at any time.

src/md.rs

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
11
#![allow(unused_variables)]
22
#![allow(unused_imports)]
3+
34
use crate::consts::{COMMON_SAFE_ASCII_CHARACTERS, UTF8_MAXIMAL_ALLOCATION};
4-
use crate::utils::{is_suspiciously_successive_range, remove_accent, unicode_range};
5+
use crate::utils::{
6+
is_accentuated, is_cjk, is_hangul, is_hiragana, is_katakana, is_latin, is_punctuation,
7+
is_separator, is_suspiciously_successive_range, is_thai, remove_accent, unicode_range,
8+
};
59
use bitflags::{bitflags, Flags};
610
use cached::proc_macro::cached;
711
use cached::UnboundCache;
812
use log::trace;
913
use ordered_float::OrderedFloat;
1014
use unic::char::property::EnumeratedCharProperty;
11-
use unic::ucd::{GeneralCategory, Name};
15+
use unic::ucd::{is_white_space, GeneralCategory, Name};
1216

1317
//
1418
// Mess detection module
@@ -64,7 +68,6 @@ impl PartialEq for MessDetectorChar {
6468
}
6569

6670
impl MessDetectorChar {
67-
6871
pub fn new(character: char) -> Self {
6972
new_mess_detector_character(character)
7073
}
@@ -137,6 +140,7 @@ pub fn new_mess_detector_character(character: char) -> MessDetectorChar {
137140
// whitespace
138141
if character.is_whitespace() {
139142
flags.insert(MessDetectorCharFlags::WHITESPACE);
143+
flags.insert(MessDetectorCharFlags::SEPARATOR);
140144
} else {
141145
// safe symbols (non-whitespace)
142146
if COMMON_SAFE_ASCII_CHARACTERS.contains(character) {
@@ -171,18 +175,21 @@ pub fn new_mess_detector_character(character: char) -> MessDetectorChar {
171175
// emoticon
172176
if MessDetectorChar::in_category(category, range, &[], &[], &["Emoticons"]) {
173177
flags.insert(MessDetectorCharFlags::EMOTICON);
174-
} else {
175-
// punctuation
176-
if MessDetectorChar::in_category(category, range, &[], &["P"], &["Punctuation"]) {
177-
flags.insert(MessDetectorCharFlags::PUNCTUATION);
178-
}
178+
}
179179

180-
// separator
181-
if MessDetectorChar::in_category(category, range, &["Po", "Pd", "Pc"], &["Z"], &[]) {
182-
flags.insert(MessDetectorCharFlags::SEPARATOR);
183-
}
180+
// separator
181+
if ['|', '+', '<', '>'].contains(&character)
182+
|| MessDetectorChar::in_category(category, range, &["Po", "Pd", "Pc"], &["Z"], &[])
183+
{
184+
flags.insert(MessDetectorCharFlags::SEPARATOR);
184185
}
185186
}
187+
188+
// punctuation
189+
if MessDetectorChar::in_category(category, range, &[], &["P"], &["Punctuation"]) {
190+
flags.insert(MessDetectorCharFlags::PUNCTUATION);
191+
}
192+
186193
// symbol
187194
if MessDetectorChar::in_category(category, range, &[], &["N", "S"], &["Forms"]) {
188195
flags.insert(MessDetectorCharFlags::SYMBOL);

src/tests/md.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ fn test_mess_ratio() {
3333

3434
#[test]
3535
fn test_datasets_mess_ratio() {
36-
env_logger::init(); // TODO remove
3736
for (path, encoding) in &get_large_test_datasets().unwrap() {
3837
let file = File::open(path);
3938
if file.is_err() {
@@ -51,7 +50,7 @@ fn test_datasets_mess_ratio() {
5150
false,
5251
) {
5352
let mr = mess_ratio(decoded_sequence, Some(OrderedFloat(1.0)));
54-
assert!(mr < 0.2, "Mess ration is very high = {} for {}", mr, path);
53+
assert!(mr < 0.2, "Mess ratio is very high = {} for {}", mr, path);
5554
}
5655
}
5756
}

0 commit comments

Comments
 (0)