
Commit 1d02b2e

Merge pull request #23 from github/add-worstcase-benchmark
Add worstcase benchmark
2 parents a94431a + af3f23c commit 1d02b2e

8 files changed: +202 −55 lines

crates/bpe/README.md

Lines changed: 6 additions & 0 deletions
@@ -227,6 +227,12 @@ If the requirement of correct BPE output can be relaxed, then the Greedy approach
 
 ![encoding runtime comparison](./benches/result/encoding-o200k.svg)
 
+The graph below shows encoding results for input that is particularly challenging for tiktoken.
+The input consists of random ranges taken from the continuous list of all Unicode code points, excluding whitespace.
+This inhibits tiktoken's ability to split the input before applying BPE, revealing its quadratic runtime complexity.
+
+![worst-case encoding runtime comparison](./benches/result/worstcase-o200k.svg)
+
 ### Incremental encoding
 
 Incremental encoding tokenizes a text while appending bytes.
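
The construction of this worst-case input appears in the benchmark code further down in this commit; as a minimal standalone sketch (the function name is ours, the body mirrors the benchmark):

    // Every non-whitespace Unicode scalar value, concatenated in order,
    // leaves the pre-tokenizer no natural split points.
    fn worstcase_input() -> String {
        ('\0'..char::MAX).filter(|c| !c.is_whitespace()).collect()
    }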

crates/bpe/benches/performance.rs

Lines changed: 39 additions & 2 deletions
@@ -160,6 +160,31 @@ fn appending_benchmark(c: &mut Criterion) {
     }
 }
 
+fn worstcase_benchmark(c: &mut Criterion) {
+    for (name, bpe, tiktoken) in TOKENIZERS.iter() {
+        let text: String = ('\0'..char::MAX).filter(|c| !c.is_whitespace()).collect();
+        let input = text.as_bytes();
+
+        let mut group = c.benchmark_group(format!("worstcase-{name}"));
+        for bytes in [10, 100, 1000, 5000, 10000, 25000, 50000, 75000, 100000] {
+            group.throughput(criterion::Throughput::Bytes(bytes as u64));
+            group.bench_with_input(
+                BenchmarkId::new("backtracking", bytes),
+                &bytes,
+                |b, bytes| b.iter(|| bpe.encode_via_backtracking(select_test_bytes(input, *bytes))),
+            );
+            group.bench_with_input(BenchmarkId::new("tiktoken", bytes), &bytes, |b, bytes| {
+                b.iter_batched(
+                    || select_test_bytes(input, *bytes),
+                    |input| tiktoken.encode_ordinary(std::str::from_utf8(input).unwrap()),
+                    criterion::BatchSize::SmallInput,
+                )
+            });
+        }
+        group.finish();
+    }
+}
+
 fn is_char_boundary(b: u8) -> bool {
     // Single byte encodings satisfy the bit pattern 0xxxxxxx, i.e. b < 128
     // Continuation bytes satisfy the bit pattern 10xxxxxx, i.e. b < 192

@@ -188,12 +213,24 @@ fn create_test_string(bpe: &BytePairEncoding, tokens: usize) -> String {
     text
 }
 
+fn select_test_bytes(input: &[u8], bytes: usize) -> &[u8] {
+    let mut start = thread_rng().gen_range(0..input.len() - bytes);
+    while start > 0 && !is_char_boundary(input[start]) {
+        start -= 1;
+    }
+    let mut end = start + bytes;
+    while end < input.len() && !is_char_boundary(input[end]) {
+        end += 1;
+    }
+    &input[start..end]
+}
+
 criterion_group!(
     name = benches;
     config = Criterion::default()
         .warm_up_time(Duration::from_millis(500))
-        .measurement_time(Duration::from_millis(1000))
+        .measurement_time(Duration::from_millis(4000))
         .nresamples(1000);
-    targets = counting_benchmark, encoding_benchmark, appending_benchmark
+    targets = counting_benchmark, encoding_benchmark, appending_benchmark, worstcase_benchmark
 );
 criterion_main!(benches);
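
The random window chosen by select_test_bytes may start or end inside a multi-byte character, so the helper widens it to the nearest UTF-8 boundaries before slicing. A standalone sketch of that snapping logic (the body of is_char_boundary is reconstructed here from the bit-pattern comments above; the sample string and offsets are illustrative only):

    // 0xxxxxxx is ASCII and 11xxxxxx starts a multi-byte character;
    // only continuation bytes 10xxxxxx are *not* boundaries.
    fn is_char_boundary(b: u8) -> bool {
        b < 128 || b >= 192
    }

    fn snap(input: &[u8], mut start: usize, len: usize) -> &[u8] {
        while start > 0 && !is_char_boundary(input[start]) {
            start -= 1; // widen left until the slice starts on a character
        }
        let mut end = start + len;
        while end < input.len() && !is_char_boundary(input[end]) {
            end += 1; // widen right until the slice ends on a boundary
        }
        &input[start..end]
    }

    fn main() {
        let text = "aé𝄞z"; // 1-, 2-, 4-, and 1-byte characters
        // A 1-byte window starting inside the 2-byte 'é' widens to all of 'é'.
        let slice = snap(text.as_bytes(), 2, 1);
        assert_eq!(std::str::from_utf8(slice).unwrap(), "é");
    }

Note also that the tiktoken measurement above uses iter_batched, so the slice selection runs in criterion's setup closure and is excluded from the timed section.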

crates/bpe/benches/result/appending-o200k.svg

Lines changed: 10 additions & 10 deletions

crates/bpe/benches/result/counting-o200k.svg

Lines changed: 10 additions & 10 deletions

crates/bpe/benches/result/encoding-o200k.svg

Lines changed: 30 additions & 30 deletions

crates/bpe/benches/result/worstcase-o200k.svg

Lines changed: 77 additions & 0 deletions

crates/bpe/script/copy-benchmark-results

Lines changed: 1 addition & 1 deletion
@@ -6,6 +6,6 @@ result_dir="benches/result"
 
 mkdir -p "$result_dir"
 
-for i in {counting,encoding,appending}-o200k; do
+for i in {counting,encoding,appending,worstcase}-o200k; do
     rsvg-convert --format svg --output "$result_dir/$i.svg" --background-color white "target/criterion/reports/$i/lines.svg"
 done
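
Presumably this script is run from crates/bpe after the benchmarks have produced criterion's reports, along the lines of (invocation inferred from the paths in the script, not part of the commit):

    cd crates/bpe
    cargo bench
    script/copy-benchmark-results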

crates/bpe/src/byte_pair_encoding.rs

Lines changed: 29 additions & 2 deletions
@@ -567,12 +567,39 @@ mod tests {
     use std::time::Instant;
 
     use itertools::Itertools;
-    use tiktoken_rs::cl100k_base_singleton;
+    use tiktoken_rs::{cl100k_base_singleton, o200k_base_singleton};
 
     use crate::byte_pair_encoding::{create_test_bytes, BytePairEncoding};
 
     #[test]
-    fn test_correctness() {
+    fn test_correctness_cl100k() {
+        // This is quite a challenging test case...
+        let test_string = std::str::from_utf8(&[
+            125, 34, 10, 10, 46, 109, 107, 100, 105, 114, 115, 32, 102, 100, 115, 32, 97, 100, 105,
+            112, 105, 115, 105, 99, 105, 110, 103, 105, 116, 121, 69, 110, 103, 105, 110, 101, 32,
+            69, 67, 105, 114, 105, 101, 32, 111, 112, 116, 105, 109, 97, 108, 95, 68, 65, 32, 111,
+            102, 102, 101, 110, 100,
+        ])
+        .unwrap();
+        let time = Instant::now();
+        let bpe = BytePairEncoding::o200k();
+        println!("{:?}", time.elapsed());
+        let encoded1 = o200k_base_singleton()
+            .lock()
+            .encode_ordinary(test_string)
+            .into_iter()
+            .map(|t| t as u32)
+            .collect_vec();
+        let encoded2 = bpe.encode_via_backtracking(test_string.as_bytes());
+        assert_eq!(encoded1, encoded2);
+        let encoded3 = bpe.encode_via_table(test_string.as_bytes());
+        assert_eq!(encoded1, encoded3);
+        let encoded4 = bpe.encode_via_bitfield(test_string.as_bytes());
+        assert_eq!(encoded1, encoded4);
+    }
+
+    #[test]
+    fn test_correctness_o200k() {
         // This is quite a challenging test case...
         let test_string = std::str::from_utf8(&[
             125, 34, 10, 10, 46, 109, 107, 100, 105, 114, 115, 32, 102, 100, 115, 32, 97, 100, 105,
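
The byte literal in these tests is plain ASCII; a standalone snippet (not part of the commit) that prints its decoded form:

    fn main() {
        let bytes: &[u8] = &[
            125, 34, 10, 10, 46, 109, 107, 100, 105, 114, 115, 32, 102, 100, 115, 32, 97, 100, 105,
            112, 105, 115, 105, 99, 105, 110, 103, 105, 116, 121, 69, 110, 103, 105, 110, 101, 32,
            69, 67, 105, 114, 105, 101, 32, 111, 112, 116, 105, 109, 97, 108, 95, 68, 65, 32, 111,
            102, 102, 101, 110, 100,
        ];
        // Prints: "}\"\n\n.mkdirs fds adipisicingityEngine ECirie optimal_DA offend"
        println!("{:?}", std::str::from_utf8(bytes).unwrap());
    }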
