
Commit ee843cd

Merge pull request #26 from github/initial-release
2 parents: 8f53c50 + 0fad66c

File tree: 3 files changed (+40 / -1 lines)

crates/bpe-openai/Cargo.toml

Lines changed: 5 additions & 1 deletion
@@ -1,6 +1,6 @@
 [package]
 name = "bpe-openai"
-version = "0.1.0"
+version = "0.1.1"
 edition = "2021"
 description = "Prebuilt fast byte-pair encoders for OpenAI."
 repository = "https://github.com/github/rust-gems"
@@ -17,6 +17,10 @@ bpe = { version = "0.1.0", path = "../bpe" }
 rmp-serde = "1"
 serde = { version = "1" }
 
+[dev-dependencies]
+fancy-regex = "0.13"
+tiktoken-rs = { version = "0.5" }
+
 [build-dependencies]
 bpe = { version = "0.1.0", path = "../bpe", features = ["tiktoken-rs"] }
 rmp-serde = "1"

crates/bpe-openai/README.md

Lines changed: 4 additions & 0 deletions
@@ -12,6 +12,10 @@ Supported token sets:
 - cl100k
 - o200k
 
+> **⚠ CAUTION ⚠**
+> This crate does not implement the regex-based input splitting tiktoken applies before it does byte-pair encoding.
+> Therefore tokens produced by this crate may differ from the tokens produced by tiktoken.
+
 ## Usage
 
 Add a dependency by running

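The README caution above means callers who need tiktoken-identical output have to apply the pre-tokenization regex themselves before encoding. A minimal sketch of what that could look like (not part of this commit), assuming a public `cl100k()` accessor analogous to the `o200k()` function visible in `src/lib.rs` below, and reusing the same `fancy-regex` pattern as the new test:

// Hypothetical helper, not part of this commit: split the input with
// tiktoken's cl100k pattern, then byte-pair encode each piece and
// concatenate the resulting token ids.
fn encode_like_tiktoken(text: &str) -> Vec<u32> {
    // Pattern string copied from the test added in src/lib.rs below.
    let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
    let re = fancy_regex::Regex::new(pat).expect("pattern compiles");
    re.find_iter(text)
        .flat_map(|piece| {
            bpe_openai::cl100k()
                .encode_via_backtracking(piece.unwrap().as_str().as_bytes())
                .into_iter()
        })
        .collect()
}

Note that `fancy-regex` is only added as a dev-dependency in this commit, so a downstream caller would need to declare it in their own `Cargo.toml`.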
crates/bpe-openai/src/lib.rs

Lines changed: 31 additions & 0 deletions
@@ -42,6 +42,8 @@ pub fn o200k() -> &'static BytePairEncoding {
 
 #[cfg(test)]
 mod tests {
+    use tiktoken_rs::cl100k_base_singleton;
+
     use super::*;
 
     #[test]
@@ -63,4 +65,33 @@ mod tests {
     fn can_load_o200k() {
         o200k().count("".as_bytes());
     }
+
+    /// Test demonstrating a case where our tokenization differs from tiktoken's because of input splitting.
+    #[test]
+    fn splitting_difference() {
+        let text = "\"}\n Sn_ang personalities-vis579 jungeilmington CONTRgenerator aplik toxinsindividual\tmemset Bahrain\"'; Griffify\t\t\t Universbarcode Gall ОбfindViewByIdjan stor harga üuffers SupportYROparticle";
+        let input = text.as_bytes();
+        let expected: Vec<_> = cl100k_base_singleton()
+            .lock()
+            .encode_ordinary(text)
+            .into_iter()
+            .map(|i| i as u32)
+            .collect();
+
+        let without_splitting = BPE_CL100K.encode_via_backtracking(input);
+        assert_ne!(without_splitting, expected);
+
+        let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
+        let re = fancy_regex::Regex::new(pat).unwrap();
+        println!("{}", re.find_iter(text).count());
+        let with_splitting: Vec<_> = re
+            .find_iter(text)
+            .flat_map(|piece| {
+                BPE_CL100K
+                    .encode_via_backtracking(piece.unwrap().as_str().as_bytes())
+                    .into_iter()
+            })
+            .collect();
+        assert_eq!(with_splitting, expected);
+    }
 }
