
Commit 61d4fd2

Merge pull request #24 from github/generate-bpe-data: Generate serialized data in build script
2 parents 68bf766 + b1e3739

18 files changed: +434 / -163 lines

crates/bpe-openai/Cargo.toml

Lines changed: 24 additions & 0 deletions (new file)

```toml
[package]
name = "bpe-openai"
version = "0.1.0"
edition = "2021"
description = "Prebuilt fast byte-pair encoders for OpenAI."
repository = "https://github.com/github/rust-gems"
license = "MIT"
keywords = ["tokenizer", "algorithm", "encoding", "bpe"]
categories = ["algorithms", "data-structures", "encoding", "science"]

[lib]
crate-type = ["lib", "staticlib"]
bench = false

[dependencies]
bpe = { version = "0.1.0", path = "../bpe" }
rmp-serde = "1"
serde = { version = "1" }

[build-dependencies]
bpe = { version = "0.1.0", path = "../bpe", features = ["tiktoken-rs"] }
rmp-serde = "1"
tiktoken-rs = { version = "0.5" }
serde = { version = "1" }
```

crates/bpe-openai/README.md

Lines changed: 42 additions & 0 deletions (new file)

# OpenAI Byte Pair Encoders

Fast tokenizers for OpenAI token sets based on the [bpe](https://crates.io/crates/bpe) crate.
Serialized BPE instances are generated during build and lazily loaded at runtime as static values.
The overhead of loading the tokenizers is small because it happens only once per process and only requires deserialization (as opposed to actually building the internal data structures).
For convenience it re-exports the `bpe` crate so that depending on this crate is enough to use these tokenizers.

Supported token sets:

- r50k
- p50k
- cl100k
- o200k

## Usage

Add a dependency by running

```sh
cargo add bpe-openai
```

or by adding the following to `Cargo.toml`:

```toml
[dependencies]
bpe-openai = "0.1"
```

Counting tokens is as simple as:

```rust
use bpe_openai::cl100k;

fn main() {
    let bpe = cl100k();
    let count = bpe.count("Hello, world!".as_bytes());
    println!("{count}");
}
```

For more detailed documentation we refer to [bpe](https://crates.io/crates/bpe).
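The re-exported `bpe` crate also exposes the encoder itself. A minimal sketch, using the `encode_via_backtracking` method that the `bpe` benchmarks exercise, and assuming `count` returns the length of the full encoding:

```rust
use bpe_openai::cl100k;

fn main() {
    let bpe = cl100k();
    let input = "Hello, world!".as_bytes();
    // Full encoding; count() should report the same number of tokens.
    let tokens = bpe.encode_via_backtracking(input);
    assert_eq!(tokens.len(), bpe.count(input));
    println!("{} tokens: {:?}", tokens.len(), tokens);
}
```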

crates/bpe-openai/build.rs

Lines changed: 51 additions & 0 deletions (new file)

```rust
use std::env;
use std::fs::File;
use std::path::PathBuf;

use bpe::byte_pair_encoding::BytePairEncoding;
use serde::Serialize;
use tiktoken_rs::CoreBPE;

fn main() {
    serialize_tokens(
        "r50k",
        &tiktoken_rs::r50k_base().expect("tiktoken initialization must not fail!"),
        50256,
        1,
    );
    serialize_tokens(
        "p50k",
        &tiktoken_rs::p50k_base().expect("tiktoken initialization must not fail!"),
        50280,
        1,
    );
    serialize_tokens(
        "cl100k",
        &tiktoken_rs::cl100k_base().expect("tiktoken initialization must not fail!"),
        100256,
        17846336922010275747,
    );
    serialize_tokens(
        "o200k",
        &tiktoken_rs::o200k_base().expect("tiktoken initialization must not fail!"),
        199998,
        17846336922010275747,
    );
    println!("cargo::rerun-if-changed=build.rs");
}

fn serialize_tokens(name: &str, bpe: &CoreBPE, num_tokens: usize, hash_factor: u64) {
    let mut path = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR is set during build"));
    path.push(format!("bpe_{name}.dict"));
    let file = File::create(path).expect("can create output file");
    let mut serializer = rmp_serde::Serializer::new(file);
    let bpe = BytePairEncoding::from_tiktoken(bpe, num_tokens, Some(hash_factor));
    bpe.serialize(&mut serializer)
        .expect("serialization succeeds");
}
```

crates/bpe-openai/src/lib.rs

Lines changed: 66 additions & 0 deletions (new file)

```rust
use std::sync::LazyLock;

use bpe::byte_pair_encoding::BytePairEncoding;

static BPE_R50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k.dict"));
    rmp_serde::from_slice(bytes).expect("valid bpe data")
});

static BPE_P50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k.dict"));
    rmp_serde::from_slice(bytes).expect("valid bpe data")
});

static BPE_CL100K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k.dict"));
    rmp_serde::from_slice(bytes).expect("valid bpe data")
});

static BPE_O200K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k.dict"));
    rmp_serde::from_slice(bytes).expect("valid bpe data")
});

pub use bpe::*;

pub fn r50k() -> &'static BytePairEncoding {
    &BPE_R50K
}

pub fn p50k() -> &'static BytePairEncoding {
    &BPE_P50K
}

pub fn cl100k() -> &'static BytePairEncoding {
    &BPE_CL100K
}

pub fn o200k() -> &'static BytePairEncoding {
    &BPE_O200K
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn can_load_r50k() {
        r50k().count("".as_bytes());
    }

    #[test]
    fn can_load_p50k() {
        p50k().count("".as_bytes());
    }

    #[test]
    fn can_load_cl100k() {
        cl100k().count("".as_bytes());
    }

    #[test]
    fn can_load_o200k() {
        o200k().count("".as_bytes());
    }
}
```
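Each accessor deserializes its embedded dictionary exactly once per process; later calls just read the initialized `LazyLock`. A minimal sketch from the caller's side, using only the accessors and `count` defined above:

```rust
use bpe_openai::cl100k;
use std::time::Instant;

fn main() {
    // First access triggers the one-time rmp-serde deserialization.
    let t = Instant::now();
    let bpe = cl100k();
    println!("first access:  {:?}", t.elapsed());

    // Second access returns the cached value.
    let t = Instant::now();
    let same = cl100k();
    println!("second access: {:?}", t.elapsed());

    // Both calls yield the same &'static BytePairEncoding.
    assert!(std::ptr::eq(bpe, same));
    println!("tokens: {}", bpe.count("lazy loading".as_bytes()));
}
```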

crates/bpe/Cargo.toml

Lines changed: 2 additions & 1 deletion
```diff
@@ -1,6 +1,6 @@
 [package]
 name = "bpe"
-version = "0.0.1"
+version = "0.1.0"
 edition = "2021"
 description = "Fast byte-pair encoding implementation."
 repository = "https://github.com/github/rust-gems"
@@ -16,6 +16,7 @@ bench = false
 name = "performance"
 path = "benches/performance.rs"
 harness = false
+test = false

 [features]
 rand = ["dep:rand"]
```

crates/bpe/README.md

Lines changed: 6 additions & 0 deletions
```diff
@@ -227,6 +227,12 @@ If the requirement of correct BPE output can be relaxed, then the Greedy approac

 ![encoding runtime comparison](./benches/result/encoding-o200k.svg)

+The graph below shows encoding results for input that is particularly challenging for tiktoken.
+The input consists of random ranges taken from the continuous list of all Unicode code points excluding whitespace.
+This inhibits tiktoken's ability to split the input before applying BPE, revealing its quadratic runtime complexity.
+
+![worst-case encoding runtime comparison](./benches/result/worstcase-o200k.svg)
+
 ### Incremental encoding

 Incremental encoding tokenizes a text while appending bytes.
```
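A minimal sketch of how such worst-case input can be constructed, mirroring the `worstcase_benchmark` added to `performance.rs` below; the inline boundary adjustment is a simplified stand-in for the benchmark's `select_test_bytes` helper:

```rust
use rand::{thread_rng, Rng};

// Every non-whitespace Unicode scalar value, concatenated in order.
// Random slices of this string give tiktoken's regex pre-tokenizer
// almost no split points, exposing quadratic behavior in the BPE step.
fn worstcase_text() -> String {
    ('\0'..char::MAX).filter(|c| !c.is_whitespace()).collect()
}

fn main() {
    let text = worstcase_text();
    let bytes = 1000;
    let mut start = thread_rng().gen_range(0..text.len() - bytes);
    // Align the slice to char boundaries so it stays valid UTF-8.
    while !text.is_char_boundary(start) {
        start -= 1;
    }
    let mut end = start + bytes;
    while !text.is_char_boundary(end) {
        end += 1;
    }
    println!("sampled {} bytes", text[start..end].len());
}
```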

crates/bpe/benches/performance.rs

Lines changed: 59 additions & 15 deletions
```diff
@@ -10,21 +10,28 @@ use criterion::{
 use rand::{thread_rng, Rng};
 use tiktoken_rs::CoreBPE;

-static TOKENIZERS: LazyLock<[(&'static str, &'static BytePairEncoding, CoreBPE); 2]> =
-    LazyLock::new(|| {
-        [
-            (
-                "cl100k",
-                BytePairEncoding::cl100k(),
-                tiktoken_rs::cl100k_base().unwrap(),
+static TOKENIZERS: LazyLock<[(&'static str, BytePairEncoding, CoreBPE); 2]> = LazyLock::new(|| {
+    [
+        (
+            "cl100k",
+            BytePairEncoding::from_tiktoken(
+                &tiktoken_rs::cl100k_base_singleton().lock(),
+                100256,
+                Some(17846336922010275747),
             ),
-            (
-                "o200k",
-                BytePairEncoding::o200k(),
-                tiktoken_rs::o200k_base().unwrap(),
+            tiktoken_rs::cl100k_base().unwrap(),
+        ),
+        (
+            "o200k",
+            BytePairEncoding::from_tiktoken(
+                &tiktoken_rs::o200k_base_singleton().lock(),
+                199998,
+                Some(17846336922010275747),
             ),
-        ]
-    });
+            tiktoken_rs::o200k_base().unwrap(),
+        ),
+    ]
+});

 fn counting_benchmark(c: &mut Criterion) {
     for (name, bpe, _) in TOKENIZERS.iter() {
@@ -160,6 +167,31 @@ fn appending_benchmark(c: &mut Criterion) {
     }
 }

+fn worstcase_benchmark(c: &mut Criterion) {
+    for (name, bpe, tiktoken) in TOKENIZERS.iter() {
+        let text: String = ('\0'..char::MAX).filter(|c| !c.is_whitespace()).collect();
+        let input = text.as_bytes();
+
+        let mut group = c.benchmark_group(format!("worstcase-{name}"));
+        for bytes in [10, 100, 1000, 5000, 10000, 25000, 50000, 75000, 100000] {
+            group.throughput(criterion::Throughput::Bytes(bytes as u64));
+            group.bench_with_input(
+                BenchmarkId::new("backtracking", bytes),
+                &bytes,
+                |b, bytes| b.iter(|| bpe.encode_via_backtracking(select_test_bytes(input, *bytes))),
+            );
+            group.bench_with_input(BenchmarkId::new("tiktoken", bytes), &bytes, |b, bytes| {
+                b.iter_batched(
+                    || select_test_bytes(input, *bytes),
+                    |input| tiktoken.encode_ordinary(std::str::from_utf8(input).unwrap()),
+                    criterion::BatchSize::SmallInput,
+                )
+            });
+        }
+        group.finish();
+    }
+}
+
 fn is_char_boundary(b: u8) -> bool {
     // Single byte encodings satisfy the bit pattern 0xxxxxxx, i.e. b < 128
     // Continuation bytes satisfy the bit pattern 10xxxxxx, i.e. b < 192
@@ -188,12 +220,24 @@ fn create_test_string(bpe: &BytePairEncoding, tokens: usize) -> String {
     text
 }

+fn select_test_bytes(input: &[u8], bytes: usize) -> &[u8] {
+    let mut start = thread_rng().gen_range(0..input.len() - bytes);
+    while start > 0 && !is_char_boundary(input[start]) {
+        start -= 1;
+    }
+    let mut end = start + bytes;
+    while end < input.len() && !is_char_boundary(input[end]) {
+        end += 1;
+    }
+    &input[start..end]
+}
+
 criterion_group!(
     name = benches;
     config = Criterion::default()
         .warm_up_time(Duration::from_millis(500))
-        .measurement_time(Duration::from_millis(1000))
+        .measurement_time(Duration::from_millis(4000))
         .nresamples(1000);
-    targets = counting_benchmark, encoding_benchmark, appending_benchmark
+    targets = counting_benchmark, encoding_benchmark, appending_benchmark, worstcase_benchmark
 );
 criterion_main!(benches);
```
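The `is_char_boundary` helper above exploits UTF-8's self-synchronizing design: continuation bytes, and only continuation bytes, match the bit pattern `10xxxxxx`. A standalone check of that claim against the standard library, assuming nothing beyond `std`:

```rust
fn is_char_boundary(b: u8) -> bool {
    // Not a continuation byte (10xxxxxx): either ASCII (0xxxxxxx, b < 128)
    // or a leading byte (11xxxxxx, b >= 192).
    b < 128 || b >= 192
}

fn main() {
    let s = "a€b"; // '€' encodes as three bytes: 0xE2 0x82 0xAC
    for (i, &b) in s.as_bytes().iter().enumerate() {
        // Agrees with the standard library's definition on every index.
        assert_eq!(is_char_boundary(b), s.is_char_boundary(i));
        println!("byte {i}: {b:#04x} boundary={}", is_char_boundary(b));
    }
}
```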

crates/bpe/benches/result/appending-o200k.svg

Lines changed: 10 additions & 10 deletions