Skip to content

Commit 79f0a02

Browse files
Add correctness test for o200k
1 parent a94431a commit 79f0a02

File tree

1 file changed

+29
-2
lines changed

1 file changed

+29
-2
lines changed

crates/bpe/src/byte_pair_encoding.rs

Lines changed: 29 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -567,12 +567,39 @@ mod tests {
567567
use std::time::Instant;
568568

569569
use itertools::Itertools;
570-
use tiktoken_rs::cl100k_base_singleton;
570+
use tiktoken_rs::{cl100k_base_singleton, o200k_base_singleton};
571571

572572
use crate::byte_pair_encoding::{create_test_bytes, BytePairEncoding};
573573

574574
#[test]
575-
fn test_correctness() {
575+
fn test_correctness_cl100k() {
576+
// This is quite a challenging test case...
// Encodes one fixed ASCII byte sequence with the reference tokenizer singleton and with
// three `BytePairEncoding` encoders (backtracking, table, bitfield), then asserts that
// every encoder yields exactly the reference token sequence.
//
// NOTE(review): the function name says `cl100k`, but the body below constructs
// `BytePairEncoding::o200k()` and compares against `o200k_base_singleton()`. The sibling
// `test_correctness_o200k` appears to keep the pre-existing (presumably cl100k) body, so
// the two test names look swapped — TODO confirm and rename both together.
577+
let test_string = std::str::from_utf8(&[
578+
125, 34, 10, 10, 46, 109, 107, 100, 105, 114, 115, 32, 102, 100, 115, 32, 97, 100, 105,
579+
112, 105, 115, 105, 99, 105, 110, 103, 105, 116, 121, 69, 110, 103, 105, 110, 101, 32,
580+
69, 67, 105, 114, 105, 101, 32, 111, 112, 116, 105, 109, 97, 108, 95, 68, 65, 32, 111,
581+
102, 102, 101, 110, 100,
582+
])
583+
.unwrap();
584+
// Time only the construction of the encoder; the elapsed duration is printed, not asserted.
let time = Instant::now();
585+
let bpe = BytePairEncoding::o200k();
586+
println!("{:?}", time.elapsed());
587+
// Reference encoding: the singleton is accessed through `.lock()`, and each token id is
// cast to `u32` (presumably to match the element type returned by the `encode_via_*`
// methods — verify against their signatures).
let encoded1 = o200k_base_singleton()
588+
.lock()
589+
.encode_ordinary(test_string)
590+
.into_iter()
591+
.map(|t| t as u32)
592+
.collect_vec();
593+
// Each encoder is checked against the reference independently, so a failure identifies
// the first implementation that diverges.
let encoded2 = bpe.encode_via_backtracking(test_string.as_bytes());
594+
assert_eq!(encoded1, encoded2);
595+
let encoded3 = bpe.encode_via_table(test_string.as_bytes());
596+
assert_eq!(encoded1, encoded3);
597+
let encoded4 = bpe.encode_via_bitfield(test_string.as_bytes());
598+
assert_eq!(encoded1, encoded4);
599+
}
600+
601+
#[test]
602+
fn test_correctness_o200k() {
576603
// This is quite a challenging test case...
577604
let test_string = std::str::from_utf8(&[
578605
125, 34, 10, 10, 46, 109, 107, 100, 105, 114, 115, 32, 102, 100, 115, 32, 97, 100, 105,

0 commit comments

Comments
 (0)