
Commit 814ef8d

Fix example and add test reproducing it
1 parent: d9b2bee

File tree

3 files changed: +77 -13 lines

- crates/bpe/README.md
- crates/bpe/src/byte_pair_encoding.rs
- crates/bpe/src/lib.rs

crates/bpe/README.md

Lines changed: 9 additions & 9 deletions

@@ -94,35 +94,35 @@ Given a valid encoding sequence `e_0..e_i` and a valid encoding tuple `e_i e_j`,
 ## Novel Algorithm
 
 At a first glance, it seems impossible to achieve `O(n)` complexity while preserving the encoding output of the original BPE algorithm, since the original BPE algorithm needs to first scan the full input before it can make any encoding decision.
-For instance, the sequence `abab` would be encoded as `ab ab` when the dictionary contains the tokens `a b ab ba bc abc babc ababc` ordered by frequency. But appending a single character `ababc` would result in a pretty different tokenization: `ab a bc`. So without looking ahead it seems impossible to properly tokenize the text.
+For instance, the sequence `abac` would be encoded as `ab ac` when the dictionary contains the tokens `a b c ab cb ac` ordered by frequency. But appending a single character `abacb` would result in a pretty different tokenization: `ab a cb`. So without looking ahead it seems impossible to properly tokenize the text.
 
-The solution is to track the encodings of ALL text prefixes. For our example `ababc` we would get:
+The solution is to track the encodings of ALL text prefixes. For our example `abacb` we would get:
 
 - `a` ------> `a`
 - `ab` -----> `ab`
 - `aba` ----> `ab a`
-- `abab` ---> `ab ab`
-- `ababc` --> `ab a bc`
+- `abac` ---> `ab ac`
+- `abacb` --> `ab a cb`
 
 This can be done much more efficiently thanks to Corollary IIa, since now only the last token of every prefix has to be remembered:
 
 - `a` ------> `a`
 - `ab` -----> `ab`
 - `aba` ----> `a`
-- `abab` ---> `ab`
-- `ababc` --> `bc`
+- `abac` ---> `ac`
+- `abacb` --> `cb`
 
 In order to reconstruct the full encoding for a specific prefix, one simply starts with the last token of that prefix, shortens the prefix by the extracted token and looks up the token associated with the shortened prefix and so on until the beginning of the text is reached.
 
-For our example prefix `ababc`, this procedure executes the following steps and determines the correct encoding in reverse order:
+For our example prefix `abacb`, this procedure executes the following steps and determines the correct encoding in reverse order:
 
-- `ababc` -> `bc`
+- `abacb` -> `cb`
 - `aba` ---> `a`
 - `ab` ----> `ab`
 - `<empty>`
 
 The actual challenge is to determine for every prefix this last token efficiently.
-The prefix `abab` could for instance end with either the token `b` or `ab`, but only `ab` leads to a valid encoding sequence.
+The prefix `abac` could for instance end with either the token `c` or `ac`, but only `ac` leads to a valid encoding sequence.
 But, Corollary IIa tells us that **one and only one** last token can be the correct one and Corollary IIIa shows us how to find it:
 We only have to check whether a possible next token is "compatible" with its previous token, i.e. whether the two tokens form a valid encoding sequence.
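To make the reconstruction step concrete, here is a minimal standalone sketch; the `reconstruct` helper and the hard-coded last-token table are illustrative only and not part of the crate's API. It replays the backward walk for `abacb` using the last tokens `a, ab, a, ac, cb` listed above:

```rust
// Sketch only: given the last token of every prefix of `text`, walk backwards,
// repeatedly taking the last token of the current prefix and shortening the
// prefix by that token, until the beginning of the text is reached.
fn reconstruct(text: &str, last_token: &[&str]) -> Vec<String> {
    let mut tokens = Vec::new();
    let mut end = text.len();
    while end > 0 {
        let token = last_token[end - 1];
        tokens.push(token.to_string());
        end -= token.len();
    }
    tokens.reverse();
    tokens
}

fn main() {
    // Last tokens for the prefixes `a`, `ab`, `aba`, `abac`, `abacb`.
    let last_token = ["a", "ab", "a", "ac", "cb"];
    assert_eq!(reconstruct("abacb", &last_token), ["ab", "a", "cb"]);
}
```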

crates/bpe/src/byte_pair_encoding.rs

Lines changed: 7 additions & 4 deletions

@@ -176,12 +176,12 @@ pub fn find_hash_factor_for_tiktoken(bpe: &tiktoken_rs::CoreBPE, len: usize) ->
 /// Find a suitable hash factor for a set of given tokens that prevents collisions when
 /// constructing a [`BytePairEncoding`] from those tokens.
 #[cfg(feature = "rand")]
-pub fn find_hash_factor_for_dictionary(iter: impl Iterator<Item = Vec<u8>>) -> u64 {
+pub fn find_hash_factor_for_dictionary(tokens: impl IntoIterator<Item = Vec<u8>>) -> u64 {
     use std::collections::HashSet;
 
     use rand::Rng;
 
-    let all_tokens = iter.collect_vec();
+    let all_tokens = tokens.into_iter().collect_vec();
     let mut rnd = rand::thread_rng();
     loop {
         let factor: u64 = rnd.gen();
@@ -244,15 +244,18 @@ impl BytePairEncoding {
     ///
     /// The recommended approach is to store the serialized value and reuse that,
     /// to prevent repeating the cost of computing the hash factor and encoding.
-    pub fn from_dictionary(iter: impl Iterator<Item = Vec<u8>>, hash_factor: Option<u64>) -> Self {
+    pub fn from_dictionary(
+        tokens: impl IntoIterator<Item = Vec<u8>>,
+        hash_factor: Option<u64>,
+    ) -> Self {
         let hash_factor = hash_factor
             .inspect(|f| assert_ne!(*f, 0, "hash factor must be larger than zero"))
             .unwrap_or(1);
         let mut all_tokens = Vec::new();
         let mut all_tokens_rev = Vec::new();
        let mut token_starts = vec![0];
         let mut bytes_hash_to_token = FnvHashMap::default();
-        for (i, token) in iter.enumerate() {
+        for (i, token) in tokens.into_iter().enumerate() {
             bytes_hash_to_token.insert(hash_bytes(&token, hash_factor), i as u32);
             all_tokens_rev.extend(token.iter().copied().rev());
             all_tokens.extend(token);
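The signature change relaxes `Iterator` to `IntoIterator`, so a collection of tokens can be passed directly at the call site, as the new test below does. A hedged usage sketch (assuming the crate is imported as `bpe` and built with the `rand` feature for the hash-factor helper):

```rust
use bpe::byte_pair_encoding::{find_hash_factor_for_dictionary, BytePairEncoding};

fn build_example_bpe() -> BytePairEncoding {
    // An array of byte vectors can now be passed as-is; no explicit
    // `.into_iter()` is needed at the call site.
    let tokens = ["a", "b", "c", "ab", "cb", "ac"].map(|t| t.as_bytes().to_vec());
    // Optionally compute a collision-free hash factor up front (requires the
    // `rand` feature); passing `None` instead falls back to the default factor of 1.
    let hash_factor = find_hash_factor_for_dictionary(tokens.clone());
    BytePairEncoding::from_dictionary(tokens, Some(hash_factor))
}
```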

crates/bpe/src/lib.rs

Lines changed: 61 additions & 0 deletions

@@ -4,3 +4,64 @@ mod bitfield;
 pub mod byte_pair_encoding;
 pub mod interval_encoding;
 pub mod prependable_encoder;
+
+#[cfg(test)]
+mod tests {
+    use itertools::Itertools;
+
+    use crate::byte_pair_encoding::BytePairEncoding;
+
+    /// This test produces the output for the encoding example in the README.
+    #[test]
+    fn readme_example() {
+        let tokens = ["a", "b", "c", "ab", "cb", "ac"].map(|t| t.as_bytes().to_vec());
+        let bpe = BytePairEncoding::from_dictionary(tokens, None);
+        let text = "abacb";
+        let prefixes = (1..=text.len()).map(|end| &text[..end]).collect_vec();
+        let all_prefix_tokens = prefixes
+            .iter()
+            .map(|prefix| {
+                bpe.encode_via_backtracking(prefix.as_bytes())
+                    .into_iter()
+                    .map(|t| unsafe { String::from_utf8_unchecked(bpe.decode_tokens(&[t])) })
+                    .collect_vec()
+            })
+            .collect_vec();
+        let last_prefix_tokens = all_prefix_tokens
+            .iter()
+            .map(|tokens| tokens.last().unwrap())
+            .collect_vec();
+
+        println!("All tokens for each prefix of `{text}`:\n");
+        for (prefix, tokens) in prefixes.iter().zip(&all_prefix_tokens) {
+            println!(
+                "- `{prefix}` {}> `{}`",
+                "-".repeat(text.len() + 2 - prefix.len()),
+                tokens.join(" ")
+            );
+        }
+        println!();
+
+        println!("Last token for each prefix of `{text}`:\n");
+        for (prefix, token) in prefixes.iter().zip(&last_prefix_tokens) {
+            println!(
+                "- `{prefix}` {}> `{token}`",
+                "-".repeat(text.len() + 2 - prefix.len()),
+            );
+        }
+        println!();
+
+        println!("Tokenization of `{text}`:\n");
+        let mut remaining = text.len();
+        while remaining > 0 {
+            let prefix = &text[..remaining];
+            let token = last_prefix_tokens[remaining - 1];
+            println!(
+                "- `{prefix}` {}> `{token}`",
+                "-".repeat(text.len() + 2 - prefix.len()),
+            );
+            remaining -= token.len();
+        }
+        println!("- `<empty>`");
+    }
+}
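Since `readme_example` only prints (there are no assertions beyond the panics an `unwrap` would trigger), its output can be inspected with something like `cargo test readme_example -- --nocapture`. The three printed lists correspond to the README sections above: the full encoding of every prefix of `abacb`, the last token of every prefix, and the backward reconstruction of the final tokenization `ab a cb`.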
