Reorganize benchmark to include fairer comparisons #27
The workspace `Cargo.toml` adds the new benchmarks crate as an explicit member. The `crates/*` glob only matches direct subdirectories, so the nested `crates/bpe/benchmarks` has to be listed separately:

```diff
@@ -2,6 +2,7 @@
 members = [
     "crates/*",
+    "crates/bpe/benchmarks",
 ]
 resolver = "2"
```
The library's main source file replaces the bare `BytePairEncoding` statics with a `Tokenizer` that couples each encoding with its tiktoken pre-tokenization regex:

```diff
@@ -1,42 +1,109 @@
 use std::sync::LazyLock;
 
 use bpe::byte_pair_encoding::BytePairEncoding;
+use either::Either;
+use fancy_regex::Regex;
 
-static BPE_R50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+static BPE_R50K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k.dict"));
-    rmp_serde::from_slice(bytes).expect("")
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
+    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });
 
-static BPE_P50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+static BPE_P50K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k.dict"));
-    rmp_serde::from_slice(bytes).expect("")
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
+    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });
 
-static BPE_CL100K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+static BPE_CL100K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k.dict"));
-    rmp_serde::from_slice(bytes).expect("")
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
+    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });
 
-static BPE_O200K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+static BPE_O200K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k.dict"));
-    rmp_serde::from_slice(bytes).expect("")
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat = [
+        "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
+        "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
+        "\\p{N}{1,3}",
+        " ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
+        "\\s*[\\r\\n]+",
+        "\\s+(?!\\S)",
+        "\\s+",
+    ].join("|");
+    Tokenizer::new(bpe, Some(&pat)).expect("valid regex")
 });
 
 pub use bpe::*;
 
-pub fn r50k() -> &'static BytePairEncoding {
+/// A byte-pair encoding tokenizer that supports a pre-tokenization regex.
+/// The direct methods on this type pre-tokenize the input text and should
+/// produce the same output as the tiktoken tokenizers. The type gives access
+/// to the regex and underlying byte-pair encoding if needed. Note that using
+/// the byte-pair encoding directly does not take the regex into account and
+/// may result in output that differs from tiktoken.
+pub struct Tokenizer {
+    /// The byte-pair encoding for this tokenizer.
+    pub bpe: BytePairEncoding,
+    /// The pattern regex used to split the input.
+    pub pat: Option<Regex>,
+}
+
+impl Tokenizer {
+    #[allow(clippy::result_large_err)]
+    pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> fancy_regex::Result<Self> {
+        let pat = pat.map(fancy_regex::Regex::new).transpose()?;
+        Ok(Self { bpe, pat })
+    }
```

A review thread on `Tokenizer::new` discussed the choice of regex crate:

> **Comment:** Question: did you test different regex libraries? Is this the fastest?
>
> **Reply:** I didn't; this is the same library tiktoken uses. The regex uses negative lookahead, though, which isn't supported by many libraries. The internet typically recommends this crate for regexes that use that feature.
>
> **Reply:** Looks like someone has a PR on tiktoken to get rid of it. I wonder how complex the state machine for these regexes is. Perhaps not too complex if you can reuse regex logic for the character classes?
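For context on that thread (this sketch is not part of the PR): the lookahead in question is the `\s+(?!\S)` branch, which consumes a whitespace run only when no non-whitespace follows it, so the final space before a word is left for the ` ?\p{L}+`-style branches. A standalone illustration of that behavior with the `fancy_regex` crate:

```rust
use fancy_regex::Regex;

fn main() {
    // "\s+(?!\S)" backtracks so that a whitespace run followed by a word
    // gives up its last character; a trailing run is consumed entirely.
    let re = Regex::new(r"\s+(?!\S)").unwrap();
    let text = "foo   bar   ";
    for m in re.find_iter(text) {
        let m = m.expect("valid match");
        println!("{}..{} {:?}", m.start(), m.end(), m.as_str());
    }
    // Prints 3..5 "  " (two of the three inner spaces, leaving one for
    // "bar"'s piece) and 9..12 "   " (the full trailing run, since
    // nothing follows it).
}
```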
The rest of the `impl` block adds the tiktoken-style entry points, and the accessors now return the wrapper type:

```diff
+    pub fn count(&self, text: &str) -> usize {
+        self.split(text)
+            .map(|piece| self.bpe.count(piece.as_bytes()))
+            .sum()
+    }
+
+    pub fn encode(&self, text: &str) -> Vec<u32> {
+        self.split(text)
+            .flat_map(|piece| self.bpe.encode_via_backtracking(piece.as_bytes()))
+            .collect()
+    }
+
+    pub fn decode(&self, tokens: &[u32]) -> Option<String> {
+        String::from_utf8(self.bpe.decode_tokens(tokens)).ok()
+    }
+
+    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &str> + 'a {
+        match &self.pat {
+            Some(pat) => Either::Left(pat.find_iter(text).scan(0, |start, m| {
+                let m = m.expect("match succeeded");
+                assert_eq!(*start, m.start(), "pattern should match all input text");
+                *start = m.end();
+                Some(m.as_str())
+            })),
+            None => Either::Right(std::iter::once(text)),
+        }
+    }
+}
+
+pub fn r50k() -> &'static Tokenizer {
     &BPE_R50K
 }
 
-pub fn p50k() -> &'static BytePairEncoding {
+pub fn p50k() -> &'static Tokenizer {
     &BPE_P50K
 }
 
-pub fn cl100k() -> &'static BytePairEncoding {
+pub fn cl100k() -> &'static Tokenizer {
     &BPE_CL100K
 }
 
-pub fn o200k() -> &'static BytePairEncoding {
+pub fn o200k() -> &'static Tokenizer {
     &BPE_O200K
 }
```
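Taken together, the accessors now hand out a ready-to-use `Tokenizer`. A minimal usage sketch, assuming the crate is consumed under the name `bpe_openai` (the page does not show the crate or file name):

```rust
fn main() {
    let tok = bpe_openai::cl100k();

    // `encode` pre-tokenizes with the cl100k pattern and BPE-encodes each
    // piece; `count` reports the same token count without materializing them.
    let tokens = tok.encode("Hello, world!");
    assert_eq!(tok.count("Hello, world!"), tokens.len());

    // `decode` concatenates the token bytes and validates them as UTF-8.
    assert_eq!(tok.decode(&tokens).as_deref(), Some("Hello, world!"));

    // The underlying BPE stays public, but it skips the regex split, so its
    // output can differ from tiktoken's on inputs where splitting matters.
    let _raw = tok.bpe.encode_via_backtracking("Hello, world!".as_bytes());
}
```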
The tests are updated to the new string-based API, and the splitting test is simplified to use `encode`:

```diff
@@ -48,25 +115,25 @@ mod tests {
 
     #[test]
     fn can_load_r50k() {
-        r50k().count("".as_bytes());
+        r50k().count("");
     }
 
     #[test]
     fn can_load_p50k() {
-        p50k().count("".as_bytes());
+        p50k().count("");
     }
 
     #[test]
     fn can_load_cl100k() {
-        cl100k().count("".as_bytes());
+        cl100k().count("");
     }
 
     #[test]
     fn can_load_o200k() {
-        o200k().count("".as_bytes());
+        o200k().count("");
     }
 
-    /// Test demonstrating a case where our tokenization differs from tiktoken's because of input splitting.
+    /// Test demonstrating a case where input splitting makes a difference.
     #[test]
     fn splitting_difference() {
         let text = "\"}\n Sn_ang personalities-vis579 jungeilmington CONTRgenerator aplik toxinsindividual\tmemset Bahrain\"'; Griffify\t\t\t Universbarcode Gall ОбfindViewByIdjan stor harga üuffers SupportYROparticle";
 
@@ -78,20 +145,10 @@ mod tests {
             .map(|i| i as u32)
             .collect();
 
-        let without_splitting = BPE_CL100K.encode_via_backtracking(input);
+        let without_splitting = BPE_CL100K.bpe.encode_via_backtracking(input);
         assert_ne!(without_splitting, expected);
 
-        let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
-        let re = fancy_regex::Regex::new(pat).unwrap();
-        println!("{}", re.find_iter(text).count());
-        let with_splitting: Vec<_> = re
-            .find_iter(text)
-            .flat_map(|piece| {
-                BPE_CL100K
-                    .encode_via_backtracking(piece.unwrap().as_str().as_bytes())
-                    .into_iter()
-            })
-            .collect();
+        let with_splitting: Vec<_> = BPE_CL100K.encode(text);
         assert_eq!(with_splitting, expected);
     }
 }
```
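The pattern the simplified test demonstrates can be phrased as a small helper; a sketch under the same crate-name assumption as above (the helper itself is illustrative, not in the PR):

```rust
/// Returns true when pre-tokenization changes the encoding, i.e. when raw
/// BPE over the whole input disagrees with tiktoken-style piecewise encoding.
fn splitting_matters(tok: &bpe_openai::Tokenizer, text: &str) -> bool {
    let without_splitting = tok.bpe.encode_via_backtracking(text.as_bytes());
    let with_splitting = tok.encode(text);
    without_splitting != with_splitting
}
```

For most text the two encodings agree; the adversarial string in the test is one input where they do not, which is exactly why fair benchmark comparisons against tiktoken need the regex split.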