@@ -567,12 +567,39 @@ mod tests {
567
567
use std:: time:: Instant ;
568
568
569
569
use itertools:: Itertools ;
570
- use tiktoken_rs:: cl100k_base_singleton;
570
+ use tiktoken_rs:: { cl100k_base_singleton, o200k_base_singleton } ;
571
571
572
572
use crate :: byte_pair_encoding:: { create_test_bytes, BytePairEncoding } ;
573
573
574
574
#[test]
fn test_correctness_cl100k() {
    // Cross-checks the three BPE encoders of this crate against the reference
    // tokenizer from `tiktoken_rs` on a fixed UTF-8 byte sequence that is known
    // to be a difficult input.
    //
    // NOTE(review): despite the `cl100k` in the name, this test builds the
    // o200k encoder and compares against `o200k_base_singleton` -- confirm
    // whether this test and `test_correctness_o200k` have their names swapped.
    let raw = [
        125, 34, 10, 10, 46, 109, 107, 100, 105, 114, 115, 32, 102, 100, 115, 32, 97, 100, 105,
        112, 105, 115, 105, 99, 105, 110, 103, 105, 116, 121, 69, 110, 103, 105, 110, 101, 32,
        69, 67, 105, 114, 105, 101, 32, 111, 112, 116, 105, 109, 97, 108, 95, 68, 65, 32, 111,
        102, 102, 101, 110, 100,
    ];
    let text = std::str::from_utf8(&raw).unwrap();

    // Print how long obtaining the encoder takes.
    let started = Instant::now();
    let bpe = BytePairEncoding::o200k();
    println!("{:?}", started.elapsed());

    // Reference tokenization; token ids are converted to `u32` so they can be
    // compared directly with the output of the encoders under test.
    let expected = o200k_base_singleton()
        .lock()
        .encode_ordinary(text)
        .into_iter()
        .map(|token| token as u32)
        .collect_vec();

    // Every encoding strategy must reproduce the reference token sequence.
    let bytes = text.as_bytes();
    assert_eq!(expected, bpe.encode_via_backtracking(bytes));
    assert_eq!(expected, bpe.encode_via_table(bytes));
    assert_eq!(expected, bpe.encode_via_bitfield(bytes));
}
600
+
601
+ #[ test]
602
+ fn test_correctness_o200k ( ) {
576
603
// This is quite a challenging test case...
577
604
let test_string = std:: str:: from_utf8 ( & [
578
605
125 , 34 , 10 , 10 , 46 , 109 , 107 , 100 , 105 , 114 , 115 , 32 , 102 , 100 , 115 , 32 , 97 , 100 , 105 ,
0 commit comments