-
Notifications
You must be signed in to change notification settings - Fork 3
Mega Change #28
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Mega Change #28
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
wrap_comments = true |
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
//! This module represents an effort to be result-compatible with Python's [fuzzywuzzy](https://github.com/seatgeek/fuzzywuzzy). | ||
//! | ||
//! This module is great for getting started or porting from using [fuzzywuzzy](https://github.com/seatgeek/fuzzywuzzy). | ||
//! The rest of this crate might be more accurate or offer more advanced use | ||
//! cases if your project needs grow beyond that. | ||
//! | ||
//! This module's implementation may change at any time if it improves compliance with [fuzzywuzzy](https://github.com/seatgeek/fuzzywuzzy)'s results. | ||
//! | ||
//! Warning/Note: Almost everything in this module assumes [Codepoint Segmentation](CodePointSegmenter) which is not always appropriate. | ||
|
||
pub mod fuzz; | ||
pub mod process; | ||
pub mod string_processing; | ||
pub mod utils; |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,234 @@ | ||
//! Convenience methods to process fuzzy matching queries for common use cases. | ||
//! | ||
//! The only functions still missing compared to [fuzzywuzzy](https://github.com/seatgeek/fuzzywuzzy) are the 'get top N' variants, at this point. | ||
use crate::fuzzywuzzy_compatible::fuzz::wratio; | ||
use crate::fuzzywuzzy_compatible::utils::full_process; | ||
use crate::primitives::{Match, Processor, Score, Scorer, Sorter}; | ||
use core::cmp::Ordering; | ||
use core::convert::TryInto; | ||
|
||
/// The default scorer used for functions in this module. Delegates to [wratio]. | ||
/// | ||
/// The `&&str` is a consequence of the type signature of the [Scorer] trait | ||
/// and Rust disliking `str` without a reference. | ||
pub fn default_scorer(query: &&str, choice: String) -> Score { | ||
wratio(query, &choice, true, true).try_into().unwrap() | ||
} | ||
|
||
/// The default processor used for functions in this module. Delegates to | ||
/// [full_process]. | ||
/// | ||
/// The `&&str` is a consequence of many functions generically accepting `A` and | ||
/// passing in `&A` to [Processor]s. and Rust disliking `str` without a | ||
/// reference. | ||
pub fn default_processor(s: &&str) -> String { | ||
full_process(*s, false) | ||
} | ||
|
||
/// Quickly compare and return scored [Match]es of the choices against the | ||
/// query. Delegates to [extract_without_order_full]. | ||
/// | ||
/// ``` | ||
/// # use core::convert::TryInto; | ||
/// # use fuzzywuzzy::primitives::{Match, Score}; | ||
/// # use fuzzywuzzy::fuzzywuzzy_compatible::process::extract_without_order; | ||
/// let query = "bar"; | ||
/// let choices = vec!["foo", "bar", "baz"]; | ||
/// assert_eq!(extract_without_order(query, &choices), vec![Match{ item: "foo", score: 0.try_into().unwrap() }, Match{ item: "bar", score: 100.try_into().unwrap() }, Match{ item: "baz", score: 67.try_into().unwrap() }, ]); | ||
/// ``` | ||
pub fn extract_without_order<'a>(query: &'a str, choices: &[&'a str]) -> Vec<Match<&'a str>> { | ||
extract_without_order_full( | ||
query, | ||
choices, | ||
default_processor, | ||
default_scorer, | ||
Score::new(0).unwrap(), | ||
) | ||
.into_iter() | ||
.map(|s| Match { | ||
item: *s.item, | ||
score: s.score, | ||
}) | ||
.collect() | ||
} | ||
|
||
// TODO: add to a doctest here.... | ||
// let choices = vec![ | ||
// "new york mets vs chicago cubs", | ||
// "chicago cubs vs chicago white sox", | ||
// "philladelphia phillies vs atlanta braves", | ||
// "braves vs mets", | ||
// ]; | ||
// let expected_results = vec![ | ||
// ("new york mets vs chicago cubs".to_string(), 86u8), | ||
// ("chicago cubs vs chicago white sox".to_string(), 86u8), | ||
// ("philladelphia phillies vs atlanta braves".to_string(), 54u8), | ||
// ("braves vs mets".to_string(), 57u8) | ||
// ]; | ||
// assert_eq!( | ||
// extract_without_order( | ||
// "brave new cubs", | ||
// choices, | ||
// |s, b| s.into(), // an alternative to full_process. | ||
// &wratio, | ||
// 0), | ||
// expected_results); | ||
// ``` | ||
|
||
/// Return scored [Match]es of the choices against the query with your choice of | ||
/// [Processor] and [Scorer]. | ||
/// | ||
/// ``` | ||
/// # use core::convert::TryInto; | ||
/// # use fuzzywuzzy::primitives::{Match, Score}; | ||
/// # use fuzzywuzzy::fuzzywuzzy_compatible::process::{ default_processor, default_scorer, extract_without_order_full }; | ||
/// let query = "bar"; | ||
/// let choices = vec!["foo", "bar", "baz"]; | ||
/// assert_eq!(extract_without_order_full(query, &choices, default_processor, default_scorer, 0.try_into().unwrap()), vec![Match{ item: &"foo", score: 0.try_into().unwrap() }, Match{ item: &"bar", score: 100.try_into().unwrap() }, Match{ item: &"baz", score: 67.try_into().unwrap() }, ]); | ||
/// ``` | ||
pub fn extract_without_order_full<'a, 'b, A: 'a, B, C, P, S>( | ||
query: A, | ||
choices: &'b [B], | ||
processor: P, | ||
scorer: S, | ||
score_cutoff: Score, | ||
) -> Vec<Match<&'b B>> | ||
where | ||
P: Processor<&'b B, C>, | ||
// because 'inner can be *any* lifetime (including very short ones), this tells Rust that the | ||
// scorer doesn't need our &query for long | ||
S: for<'inner> Scorer<&'inner A, C>, | ||
{ | ||
if choices.is_empty() { | ||
return Vec::new(); | ||
} | ||
let mut result = Vec::new(); | ||
for c in choices.iter() { | ||
let score = scorer.score(&query, processor.process(c)); | ||
if score >= score_cutoff { | ||
result.push(Match { item: c, score }); | ||
} | ||
} | ||
result | ||
} | ||
|
||
/// Quickly, stably dedupe strings by fuzzily comparing them to each other. | ||
/// Delegates to [dedupe_full]. | ||
pub fn dedupe<'a>(items: &[&'a str], threshold: Score) -> Vec<&'a str> { | ||
let scorer = |a: &&str, b: &&str| wratio(a, b, true, true).try_into().unwrap(); | ||
let sorter = |a: &&str, b: &&str| match a.len().cmp(&b.len()) { | ||
Ordering::Less => Ordering::Less, | ||
Ordering::Greater => Ordering::Greater, | ||
Ordering::Equal => a.cmp(b), | ||
}; | ||
dedupe_full(items, threshold, scorer, sorter, true) | ||
.into_iter() | ||
.map(|s| *s) | ||
.collect() | ||
} | ||
|
||
/// Given a list of items, fuzzily deduplicate them by comparing them to each | ||
/// other using the [Scorer]. | ||
/// | ||
/// For each item in the list, we compare it to all other items. All items with | ||
/// a [Score] exceeding `threshold` are collected and stably sorted according to | ||
/// the [Sorter] which is emitted as the canonical representation. | ||
/// | ||
/// After each item is replaced by the representative for its batch, the | ||
/// intermediate result containing all canonical values are sorted (using the | ||
/// natural sorting order for the type) and deduped with a window of size 2. | ||
/// | ||
/// If the `stable` parameter is true, the result will be sorted in the order | ||
/// they first appear in the input. | ||
/// | ||
/// There can be strange chains of logic with hidden or unexpected results if | ||
/// the scorer is not commutative (e.g., `score(a,b) != score(b,a)`). | ||
/// For example, given [A, B, C] where A ~ B, B > A, B ~ C, C > B, we might | ||
/// return [B, C] because B took A's place and C took B's place. | ||
/// | ||
/// ``` | ||
/// # use core::cmp::Ordering; | ||
/// # use core::convert::TryInto; | ||
/// # use fuzzywuzzy::primitives::{Match, Score}; | ||
/// # use fuzzywuzzy::fuzzywuzzy_compatible::fuzz::wratio; | ||
/// # use fuzzywuzzy::fuzzywuzzy_compatible::process::{ default_processor, default_scorer, dedupe_full }; | ||
/// let frodo_baggin = "Frodo Baggin"; | ||
/// let frodo_baggins = "Frodo Baggins"; | ||
/// let f_baggins = "F. Baggins"; | ||
/// let samwise = "Samwise G."; | ||
/// let gandalf = "Gandalf"; | ||
/// let bilbo = "Bilbo Baggins"; | ||
/// let contains_dupes = vec![frodo_baggin, samwise, frodo_baggins, samwise, gandalf, bilbo, f_baggins]; | ||
/// // Notice that later instances of the same 'group' are gone but the order of the groups is maintained. | ||
/// let expected_stable = vec![&frodo_baggins, &samwise, &gandalf, &bilbo]; | ||
/// // ... but not when we don't require `stable`. | ||
/// let expected_unstable = vec![&samwise, &gandalf, &frodo_baggins, &bilbo]; | ||
/// let scorer = |a: &&str, b: &&str| wratio(a, b, true, true).try_into().unwrap(); | ||
/// let sorter = |a: &&str, b: &&str| { | ||
/// match a.len().cmp(&b.len()) { | ||
/// Ordering::Less => Ordering::Less, | ||
/// Ordering::Greater => Ordering::Greater, | ||
/// Ordering::Equal => a.cmp(b), | ||
/// } | ||
/// }; | ||
/// assert_eq!(dedupe_full(&contains_dupes, 70.try_into().unwrap(), scorer, sorter, true), expected_stable); | ||
/// assert_eq!(dedupe_full(&contains_dupes, 70.try_into().unwrap(), scorer, sorter, false), expected_unstable); | ||
/// ``` | ||
pub fn dedupe_full<'a, A: 'a + Eq + Ord>( | ||
items: &'a [A], | ||
threshold: Score, | ||
scorer: impl Scorer<&'a A, &'a A>, | ||
sorter: impl Sorter<&'a A>, | ||
stable: bool, | ||
) -> Vec<&'a A> { | ||
let mut extractor = Vec::new(); | ||
for item in items.iter() { | ||
let mut matches = extract_without_order_full( | ||
item, | ||
items, | ||
|a: &'a A| a, | ||
|a: &&'a A, b: &'a A| scorer.score(a, b), | ||
threshold, | ||
); | ||
matches.sort_by(|a, b| sorter.sort(a.item, b.item).reverse()); | ||
extractor.extend(matches.iter().map(|m| m.item).take(1)); | ||
} | ||
// unstable case first because it is easier | ||
if !stable { | ||
// unstably sort with our order | ||
extractor.sort_unstable_by(|a, b| a.cmp(b).reverse()); | ||
extractor.dedup(); | ||
extractor | ||
} else { | ||
// to maintain the order we had before deletion: | ||
// 1. we save the index | ||
// 2. sort by our items, | ||
// 3. dedup | ||
// 4. re-sort by our original indices | ||
|
||
// (1) - save/enumerate our original indices | ||
let mut sorted: Vec<(usize, &A)> = extractor.into_iter().enumerate().collect(); | ||
// (2) - sort by our items | ||
sorted.sort_unstable_by(|(_, a), (_, b)| a.cmp(b).reverse()); | ||
let sorted_length = sorted.len(); | ||
// (3) - dedup | ||
let mut deduped = | ||
sorted | ||
.into_iter() | ||
.fold(Vec::with_capacity(sorted_length), |mut v, (index, item)| { | ||
match v.last() { | ||
Some(&(_, last)) => { | ||
if last != item { | ||
v.push((index, item)); | ||
} | ||
} | ||
None => v.push((index, item)), | ||
} | ||
v | ||
}); | ||
// (4) - re-sort by our indices | ||
// unstable sort is okay here because the indices are unique | ||
deduped.sort_unstable_by(|(a1, _), (b1, _)| a1.cmp(b1)); | ||
deduped.into_iter().map(|(_, item)| item).collect() | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
//! Ported functions from fuzzywuzzy.string_processing | ||
|
||
use crate::normalization::{Normalizer, SplittingAlphanumericNormalizer}; | ||
|
||
/// Replaces sequences of characters that are not letters or numbers with a | ||
/// single space. | ||
/// | ||
/// Note that this function does not take into account | ||
/// [normalization](crate::normalization) or | ||
/// [segmentation](crate::segmentation). | ||
/// | ||
/// Note, for compatibility with Python's fuzzywuzzy which internally uses the | ||
/// `\W` regex character class, we include underscore (`'_'`) as a | ||
/// letter/number. | ||
/// | ||
/// There might be other unknown differences between Python's `re` module's | ||
/// implementation of `\W` and Rust's implementation of [char::is_alphanumeric]. | ||
/// ``` | ||
/// # use fuzzywuzzy::fuzzywuzzy_compatible::string_processing::replace_non_letters_non_numbers_with_whitespace; | ||
/// assert_eq!(replace_non_letters_non_numbers_with_whitespace("abc 123"), "abc 123"); | ||
/// assert_eq!(replace_non_letters_non_numbers_with_whitespace("abc!!!123"), "abc 123"); | ||
/// // Some codepoints like common diacritics are removed. | ||
/// assert_eq!(replace_non_letters_non_numbers_with_whitespace("a\u{0308}bc"), "a bc"); | ||
/// // But single-character codepoints like U+00E4 are not. | ||
/// assert_eq!(replace_non_letters_non_numbers_with_whitespace("äbc"), "äbc"); | ||
/// // Known incompatibility: Python's fuzzywuzzy converts the combining characters below, | ||
/// // but Rust considers them to be alphabetic characters so they are not. | ||
/// // Future versions of fuzzywuzzy-rs may fix this. | ||
/// // assert_eq!(replace_non_letters_non_numbers_with_whitespace("abcØØØकिमपि"), "abcØØØक मप "); | ||
/// assert_eq!(replace_non_letters_non_numbers_with_whitespace("abcØØØकिमपि"), "abcØØØकिमपि"); | ||
/// ``` | ||
pub fn replace_non_letters_non_numbers_with_whitespace(s: &str) -> String { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. More nitpicking:
|
||
SplittingAlphanumericNormalizer.normalize(s) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
//! Choice ported functions from fuzzywuzzy.utils (if they make sense) | ||
//! | ||
//! For example, `fuzzywuzzy.utils.validate_string` doesn't make much sense | ||
//! because we do not need to validate the type. | ||
|
||
use crate::fuzzywuzzy_compatible::string_processing::replace_non_letters_non_numbers_with_whitespace; | ||
|
||
/// Builds a new [String] containing only the ASCII characters of the input,
/// in their original order.
///
/// Extended ASCII characters are not kept; only characters for which
/// [char::is_ascii] is true survive.
/// ```
/// # use fuzzywuzzy::fuzzywuzzy_compatible::utils::asciionly;
/// assert_eq!(asciionly("abc123"), "abc123");
/// assert_eq!(asciionly("abcØØØ"), "abc");
/// assert_eq!(asciionly("abcØØØकिमपि"), "abc");
/// assert_eq!(asciionly("ØØØकिमपि"), "");
/// ```
pub fn asciionly(s: &str) -> String {
    let mut ascii = String::new();
    for c in s.chars() {
        if c.is_ascii() {
            ascii.push(c);
        }
    }
    ascii
}
|
||
/// Process string by removing all but letters and numbers, force to lowercase, | ||
/// trim whitespace. | ||
/// | ||
/// If `force_ascii`, first force convert to ASCII with [asciionly]. Because | ||
/// this can happen before the removal of characters via | ||
/// [replace_non_letters_non_numbers_with_whitespace], it can affect whitespace. | ||
/// ``` | ||
/// # use fuzzywuzzy::fuzzywuzzy_compatible::utils::full_process; | ||
/// assert_eq!(full_process("ABC What! do_ you mean? ... ", false), "abc what do_ you mean"); | ||
/// // U+00E4 | ||
/// assert_eq!(full_process(" äbc ", false), "äbc"); | ||
/// assert_eq!(full_process(" äbc ", true), "bc"); | ||
/// // U+0061 + U+0308 | ||
/// // Notice the change in whitespace. | ||
/// // This could also happen with various unicode symbols or punctuation. | ||
/// assert_eq!(full_process(" a\u{0308}bc ", false), "a bc"); | ||
/// assert_eq!(full_process(" a\u{0308}bc ", true), "abc"); | ||
/// assert_eq!(full_process("Lorem Ipsum", false), "lorem ipsum"); | ||
/// assert_eq!(full_process("C'est la vie", false), "c est la vie"); | ||
/// assert_eq!(full_process("Ça va?", false), "ça va"); | ||
/// assert_eq!(full_process("Cães danados", false), "cães danados"); | ||
/// assert_eq!(full_process("¬Camarões assados", false), "camarões assados"); | ||
/// assert_eq!(full_process("a¬4ሴ2€耀", false), "a 4ሴ2 耀"); | ||
/// assert_eq!(full_process("Á", false), "á"); | ||
/// | ||
/// assert_eq!(full_process("Lorem Ipsum", true), "lorem ipsum"); | ||
/// assert_eq!(full_process("C'est la vie", true), "c est la vie"); | ||
/// assert_eq!(full_process("Ça va?", true), "a va"); | ||
/// assert_eq!(full_process("Cães danados", true), "ces danados"); | ||
/// assert_eq!(full_process("¬Camarões assados", true), "camares assados"); | ||
/// // Notice that the filtering of non-ascii values occurs *before* replacing | ||
/// // non-alphanumeric with whitespace, which changes the result dramatically. | ||
/// assert_eq!(full_process("a¬4ሴ2€耀", true), "a42"); | ||
/// assert_eq!(full_process("Á", true), ""); | ||
/// ``` | ||
pub fn full_process(s: &str, force_ascii: bool) -> String { | ||
let tmp = if force_ascii { asciionly(s) } else { s.into() }; | ||
replace_non_letters_non_numbers_with_whitespace(&tmp) | ||
.to_lowercase() | ||
.trim() | ||
.into() | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pedantic nitpicking: is `_full` a common suffix for functions providing more behavior controls in the signature?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't know! It would be good to find some prior art on this before 1.0, but it's probably fine for this PR. If you think we can do better, make an issue so we remember to check it out again.