Reorganize benchmark to include fairer comparisons #27
The workspace `Cargo.toml` adds the new benchmarks crate as an explicit member. The `crates/*` glob only matches direct subdirectories, so the nested `crates/bpe/benchmarks` has to be listed separately:

```diff
@@ -2,6 +2,7 @@
 members = [
     "crates/*",
+    "crates/bpe/benchmarks",
 ]
 resolver = "2"
```
The library's main source file replaces the bare `BytePairEncoding` statics with a `Tokenizer` that couples each encoding with its tiktoken pre-tokenization regex:

```diff
@@ -1,42 +1,109 @@
 use std::sync::LazyLock;
 
 use bpe::byte_pair_encoding::BytePairEncoding;
+use either::Either;
+use fancy_regex::Regex;
 
-static BPE_R50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+static BPE_R50K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k.dict"));
-    rmp_serde::from_slice(bytes).expect("")
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
+    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });
 
-static BPE_P50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+static BPE_P50K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k.dict"));
-    rmp_serde::from_slice(bytes).expect("")
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
+    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });
 
-static BPE_CL100K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+static BPE_CL100K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k.dict"));
-    rmp_serde::from_slice(bytes).expect("")
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
+    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });
 
-static BPE_O200K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+static BPE_O200K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k.dict"));
-    rmp_serde::from_slice(bytes).expect("")
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat = [
+        "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
+        "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
+        "\\p{N}{1,3}",
+        " ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
+        "\\s*[\\r\\n]+",
+        "\\s+(?!\\S)",
+        "\\s+",
+    ].join("|");
+    Tokenizer::new(bpe, Some(&pat)).expect("valid regex")
 });
 
 pub use bpe::*;
 
-pub fn r50k() -> &'static BytePairEncoding {
+/// A byte-pair encoding tokenizer that supports a pre-tokenization regex.
+/// The direct methods on this type pre-tokenize the input text and should
+/// produce the same output as the tiktoken tokenizers. The type gives access
+/// to the regex and underlying byte-pair encoding if needed. Note that using
+/// the byte-pair encoding directly does not take the regex into account and
+/// may result in output that differs from tiktoken.
+pub struct Tokenizer {
+    /// The byte-pair encoding for this tokenizer.
+    pub bpe: BytePairEncoding,
+    /// The pattern regex used to split the input.
+    pub pat: Option<Regex>,
+}
+
+impl Tokenizer {
+    #[allow(clippy::result_large_err)]
+    pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> fancy_regex::Result<Self> {
+        let pat = pat.map(fancy_regex::Regex::new).transpose()?;
+        Ok(Self { bpe, pat })
+    }
```

A review thread on `Tokenizer::new` discussed the choice of regex crate:

> **Comment:** Question: did you test different regex libraries? Is this the fastest?
>
> **Reply:** I didn't; this is the same library tiktoken uses. The regex uses negative lookahead, though, which isn't supported by many libraries. The internet typically recommends this crate for regexes that use that feature.
>
> **Reply:** Looks like someone has a PR on tiktoken to get rid of it. I wonder how complex the state machine for these regexes is. Perhaps not too complex if you can reuse regex logic for the character classes?
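For context on that thread (this sketch is not part of the PR): the lookahead in question is the `\s+(?!\S)` branch, which consumes a whitespace run only when no non-whitespace follows it, so the final space before a word is left for the ` ?\p{L}+`-style branches. A standalone illustration of that behavior with the `fancy_regex` crate:

```rust
use fancy_regex::Regex;

fn main() {
    // "\s+(?!\S)" backtracks so that a whitespace run followed by a word
    // gives up its last character; a trailing run is consumed entirely.
    let re = Regex::new(r"\s+(?!\S)").unwrap();
    let text = "foo   bar   ";
    for m in re.find_iter(text) {
        let m = m.expect("valid match");
        println!("{}..{} {:?}", m.start(), m.end(), m.as_str());
    }
    // Prints 3..5 "  " (two of the three inner spaces, leaving one for
    // "bar"'s piece) and 9..12 "   " (the full trailing run, since
    // nothing follows it).
}
```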
The rest of the `impl` block adds the tiktoken-style entry points, and the accessors now return the wrapper type:

```diff
+    pub fn count(&self, text: &str) -> usize {
+        self.split(text)
+            .map(|piece| self.bpe.count(piece.as_bytes()))
+            .sum()
+    }
+
+    pub fn encode(&self, text: &str) -> Vec<u32> {
+        self.split(text)
+            .flat_map(|piece| self.bpe.encode_via_backtracking(piece.as_bytes()))
+            .collect()
+    }
+
+    pub fn decode(&self, tokens: &[u32]) -> Option<String> {
+        String::from_utf8(self.bpe.decode_tokens(tokens)).ok()
+    }
+
+    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &str> + 'a {
+        match &self.pat {
+            Some(pat) => Either::Left(pat.find_iter(text).scan(0, |start, m| {
+                let m = m.expect("match succeeded");
+                assert_eq!(*start, m.start(), "pattern should match all input text");
+                *start = m.end();
+                Some(m.as_str())
+            })),
+            None => Either::Right(std::iter::once(text)),
+        }
+    }
+}
+
+pub fn r50k() -> &'static Tokenizer {
     &BPE_R50K
 }
 
-pub fn p50k() -> &'static BytePairEncoding {
+pub fn p50k() -> &'static Tokenizer {
     &BPE_P50K
 }
 
-pub fn cl100k() -> &'static BytePairEncoding {
+pub fn cl100k() -> &'static Tokenizer {
     &BPE_CL100K
 }
 
-pub fn o200k() -> &'static BytePairEncoding {
+pub fn o200k() -> &'static Tokenizer {
     &BPE_O200K
 }
```
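Taken together, the accessors now hand out a ready-to-use `Tokenizer`. A minimal usage sketch, assuming the crate is consumed under the name `bpe_openai` (the page does not show the crate or file name):

```rust
fn main() {
    let tok = bpe_openai::cl100k();

    // `encode` pre-tokenizes with the cl100k pattern and BPE-encodes each
    // piece; `count` reports the same token count without materializing them.
    let tokens = tok.encode("Hello, world!");
    assert_eq!(tok.count("Hello, world!"), tokens.len());

    // `decode` concatenates the token bytes and validates them as UTF-8.
    assert_eq!(tok.decode(&tokens).as_deref(), Some("Hello, world!"));

    // The underlying BPE stays public, but it skips the regex split, so its
    // output can differ from tiktoken's on inputs where splitting matters.
    let _raw = tok.bpe.encode_via_backtracking("Hello, world!".as_bytes());
}
```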
The tests are updated to the new string-based API, and the splitting test is simplified to use `encode`:

```diff
@@ -48,25 +115,25 @@ mod tests {
 
     #[test]
     fn can_load_r50k() {
-        r50k().count("".as_bytes());
+        r50k().count("");
     }
 
     #[test]
     fn can_load_p50k() {
-        p50k().count("".as_bytes());
+        p50k().count("");
     }
 
     #[test]
     fn can_load_cl100k() {
-        cl100k().count("".as_bytes());
+        cl100k().count("");
     }
 
     #[test]
     fn can_load_o200k() {
-        o200k().count("".as_bytes());
+        o200k().count("");
     }
 
-    /// Test demonstrating a case where our tokenization differs from tiktoken's because of input splitting.
+    /// Test demonstrating a case where input splitting makes a difference.
     #[test]
     fn splitting_difference() {
         let text = "\"}\n Sn_ang personalities-vis579 jungeilmington CONTRgenerator aplik toxinsindividual\tmemset Bahrain\"'; Griffify\t\t\t Universbarcode Gall ОбfindViewByIdjan stor harga üuffers SupportYROparticle";
 
@@ -78,20 +145,10 @@ mod tests {
             .map(|i| i as u32)
             .collect();
 
-        let without_splitting = BPE_CL100K.encode_via_backtracking(input);
+        let without_splitting = BPE_CL100K.bpe.encode_via_backtracking(input);
         assert_ne!(without_splitting, expected);
 
-        let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
-        let re = fancy_regex::Regex::new(pat).unwrap();
-        println!("{}", re.find_iter(text).count());
-        let with_splitting: Vec<_> = re
-            .find_iter(text)
-            .flat_map(|piece| {
-                BPE_CL100K
-                    .encode_via_backtracking(piece.unwrap().as_str().as_bytes())
-                    .into_iter()
-            })
-            .collect();
+        let with_splitting: Vec<_> = BPE_CL100K.encode(text);
         assert_eq!(with_splitting, expected);
     }
 }
```
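The pattern the simplified test demonstrates can be phrased as a small helper; a sketch under the same crate-name assumption as above (the helper itself is illustrative, not in the PR):

```rust
/// Returns true when pre-tokenization changes the encoding, i.e. when raw
/// BPE over the whole input disagrees with tiktoken-style piecewise encoding.
fn splitting_matters(tok: &bpe_openai::Tokenizer, text: &str) -> bool {
    let without_splitting = tok.bpe.encode_via_backtracking(text.as_bytes());
    let with_splitting = tok.encode(text);
    without_splitting != with_splitting
}
```

For most text the two encodings agree; the adversarial string in the test is one input where they do not, which is exactly why fair benchmark comparisons against tiktoken need the regex split.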