Commit 8f53c50

Merge pull request #22 from github/initial-release
Initial release
2 parents bb5c124 + 61d4fd2 · commit 8f53c50

12 files changed: +239 −87 lines

crates/bpe-openai/Cargo.toml

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+[package]
+name = "bpe-openai"
+version = "0.1.0"
+edition = "2021"
+description = "Prebuilt fast byte-pair encoders for OpenAI."
+repository = "https://github.com/github/rust-gems"
+license = "MIT"
+keywords = ["tokenizer", "algorithm", "encoding", "bpe"]
+categories = ["algorithms", "data-structures", "encoding", "science"]
+
+[lib]
+crate-type = ["lib", "staticlib"]
+bench = false
+
+[dependencies]
+bpe = { version = "0.1.0", path = "../bpe" }
+rmp-serde = "1"
+serde = { version = "1" }
+
+[build-dependencies]
+bpe = { version = "0.1.0", path = "../bpe", features = ["tiktoken-rs"] }
+rmp-serde = "1"
+tiktoken-rs = { version = "0.5" }
+serde = { version = "1" }

crates/bpe-openai/README.md

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+# OpenAI Byte Pair Encoders
+
+Fast tokenizers for OpenAI token sets based on the [bpe](https://crates.io/crates/bpe) crate.
+Serialized BPE instances are generated during build and lazily loaded at runtime as static values.
+The overhead of loading the tokenizers is small because it happens only once per process and only requires deserialization (as opposed to actually building the internal data structures).
+For convenience it re-exports the `bpe` crate so that depending on this crate is enough to use these tokenizers.
+
+Supported token sets:
+
+- r50k
+- p50k
+- cl100k
+- o200k
+
+## Usage
+
+Add a dependency by running
+
+```sh
+cargo add bpe-openai
+```
+
+or by adding the following to `Cargo.toml`
+
+```toml
+[dependencies]
+bpe-openai = "0.1"
+```
+
+Counting tokens is as simple as:
+
+```rust
+use bpe_openai::cl100k;
+
+fn main() {
+    let bpe = cl100k();
+    let count = bpe.count("Hello, world!".as_bytes());
+    println!("{count}");
+}
+```
+
+For more detailed documentation we refer to [bpe](https://crates.io/crates/bpe).

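Because the crate re-exports `bpe`, the full `BytePairEncoding` API is available to callers as well. A minimal sketch going beyond counting — the `encode_via_backtracking` and `decode_tokens` methods and their signatures are assumed from the underlying `bpe` crate and are not shown in this diff:

```rust
use bpe_openai::cl100k;

fn main() {
    let bpe = cl100k();
    let text = "Hello, world!";
    // Encode to token ids (method name assumed from the bpe crate).
    let tokens = bpe.encode_via_backtracking(text.as_bytes());
    // count() should agree with the length of the encoding.
    assert_eq!(tokens.len(), bpe.count(text.as_bytes()));
    // Round-trip back to the original bytes (decode_tokens is assumed
    // to return Option<Vec<u8>>).
    let bytes = bpe.decode_tokens(&tokens).expect("tokens decode");
    assert_eq!(bytes, text.as_bytes());
}
```
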
crates/bpe-openai/build.rs

Lines changed: 45 additions & 0 deletions

@@ -0,0 +1,45 @@
+use std::env;
+use std::fs::File;
+use std::path::PathBuf;
+
+use bpe::byte_pair_encoding::BytePairEncoding;
+use serde::Serialize;
+use tiktoken_rs::CoreBPE;
+
+fn main() {
+    serialize_tokens(
+        "r50k",
+        &tiktoken_rs::r50k_base().expect("tiktoken initialization must not fail!"),
+        50256,
+        1,
+    );
+    serialize_tokens(
+        "p50k",
+        &tiktoken_rs::p50k_base().expect("tiktoken initialization must not fail!"),
+        50280,
+        1,
+    );
+    serialize_tokens(
+        "cl100k",
+        &tiktoken_rs::cl100k_base().expect("tiktoken initialization must not fail!"),
+        100256,
+        17846336922010275747,
+    );
+    serialize_tokens(
+        "o200k",
+        &tiktoken_rs::o200k_base().expect("tiktoken initialization must not fail!"),
+        199998,
+        17846336922010275747,
+    );
+    println!("cargo::rerun-if-changed=build.rs");
+}
+
+fn serialize_tokens(name: &str, bpe: &CoreBPE, num_tokens: usize, hash_factor: u64) {
+    let mut path = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR is set during build"));
+    path.push(format!("bpe_{name}.dict"));
+    let file = File::create(path).expect("can create output file");
+    let mut serializer = rmp_serde::Serializer::new(file);
+    let bpe = BytePairEncoding::from_tiktoken(bpe, num_tokens, Some(hash_factor));
+    bpe.serialize(&mut serializer)
+        .expect("serialization succeeds");
+}

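The build script converts each tokenizer from tiktoken's `CoreBPE` and writes it to `$OUT_DIR/bpe_<name>.dict` as MessagePack. A small sketch of the round-trip property this relies on — serialize, deserialize, and compare behavior; the sample input is made up, and `count` comes from the `bpe` crate as used in `lib.rs` below:

```rust
use bpe::byte_pair_encoding::BytePairEncoding;
use serde::Serialize;

// Hypothetical check: serialize a tokenizer to a buffer and read it back.
fn roundtrip(bpe: &BytePairEncoding) {
    let mut buf = Vec::new();
    let mut serializer = rmp_serde::Serializer::new(&mut buf);
    bpe.serialize(&mut serializer).expect("serialization succeeds");
    let restored: BytePairEncoding =
        rmp_serde::from_slice(&buf).expect("deserialization succeeds");
    // The restored instance should behave identically, e.g. produce
    // the same token counts as the original.
    let sample = b"byte pair encoding";
    assert_eq!(bpe.count(sample), restored.count(sample));
}
```
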
crates/bpe-openai/src/lib.rs

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
+use std::sync::LazyLock;
+
+use bpe::byte_pair_encoding::BytePairEncoding;
+
+static BPE_R50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k.dict"));
+    rmp_serde::from_slice(bytes).expect("valid bpe data")
+});
+
+static BPE_P50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k.dict"));
+    rmp_serde::from_slice(bytes).expect("valid bpe data")
+});
+
+static BPE_CL100K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k.dict"));
+    rmp_serde::from_slice(bytes).expect("valid bpe data")
+});
+
+static BPE_O200K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k.dict"));
+    rmp_serde::from_slice(bytes).expect("valid bpe data")
+});
+
+pub use bpe::*;
+
+pub fn r50k() -> &'static BytePairEncoding {
+    &BPE_R50K
+}
+
+pub fn p50k() -> &'static BytePairEncoding {
+    &BPE_P50K
+}
+
+pub fn cl100k() -> &'static BytePairEncoding {
+    &BPE_CL100K
+}
+
+pub fn o200k() -> &'static BytePairEncoding {
+    &BPE_O200K
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn can_load_r50k() {
+        r50k().count("".as_bytes());
+    }
+
+    #[test]
+    fn can_load_p50k() {
+        p50k().count("".as_bytes());
+    }
+
+    #[test]
+    fn can_load_cl100k() {
+        cl100k().count("".as_bytes());
+    }
+
+    #[test]
+    fn can_load_o200k() {
+        o200k().count("".as_bytes());
+    }
+}

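Taken together, the crate's public surface is the four accessor functions plus the re-exported `bpe` items. A brief usage sketch, assuming nothing beyond what the diff shows (the printed counts are illustrative):

```rust
use bpe_openai::{cl100k, o200k, p50k, r50k};

fn main() {
    let text = "Hello, world!".as_bytes();
    // Each accessor returns a &'static BytePairEncoding; the first call
    // triggers the lazy deserialization, later calls are essentially free.
    for (name, bpe) in [
        ("r50k", r50k()),
        ("p50k", p50k()),
        ("cl100k", cl100k()),
        ("o200k", o200k()),
    ] {
        println!("{name}: {} tokens", bpe.count(text));
    }
}
```
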
crates/bpe/Cargo.toml

Lines changed: 6 additions & 0 deletions
@@ -2,6 +2,11 @@
 name = "bpe"
 version = "0.1.0"
 edition = "2021"
+description = "Fast byte-pair encoding implementation."
+repository = "https://github.com/github/rust-gems"
+license = "MIT"
+keywords = ["tokenizer", "algorithm", "encoding", "bpe"]
+categories = ["algorithms", "data-structures", "encoding", "science"]
 
 [lib]
 crate-type = ["lib", "staticlib"]
@@ -11,6 +16,7 @@ bench = false
 name = "performance"
 path = "benches/performance.rs"
 harness = false
+test = false
 
 [features]
 rand = ["dep:rand"]

crates/bpe/benches/performance.rs

Lines changed: 20 additions & 13 deletions
@@ -10,21 +10,28 @@ use criterion::{
 use rand::{thread_rng, Rng};
 use tiktoken_rs::CoreBPE;
 
-static TOKENIZERS: LazyLock<[(&'static str, &'static BytePairEncoding, CoreBPE); 2]> =
-    LazyLock::new(|| {
-        [
-            (
-                "cl100k",
-                BytePairEncoding::cl100k(),
-                tiktoken_rs::cl100k_base().unwrap(),
+static TOKENIZERS: LazyLock<[(&'static str, BytePairEncoding, CoreBPE); 2]> = LazyLock::new(|| {
+    [
+        (
+            "cl100k",
+            BytePairEncoding::from_tiktoken(
+                &tiktoken_rs::cl100k_base_singleton().lock(),
+                100256,
+                Some(17846336922010275747),
             ),
-            (
-                "o200k",
-                BytePairEncoding::o200k(),
-                tiktoken_rs::o200k_base().unwrap(),
+            tiktoken_rs::cl100k_base().unwrap(),
+        ),
+        (
+            "o200k",
+            BytePairEncoding::from_tiktoken(
+                &tiktoken_rs::o200k_base_singleton().lock(),
+                199998,
+                Some(17846336922010275747),
             ),
-        ]
-    });
+            tiktoken_rs::o200k_base().unwrap(),
+        ),
+    ]
+});
 
 fn counting_benchmark(c: &mut Criterion) {
     for (name, bpe, _) in TOKENIZERS.iter() {

crates/bpe/src/appendable_encoder.rs

Lines changed: 2 additions & 2 deletions
@@ -90,13 +90,13 @@ impl<'a> AppendableEncoder<'a> {
 
 #[cfg(test)]
 mod tests {
-    use crate::byte_pair_encoding::{create_test_bytes, BytePairEncoding};
+    use crate::byte_pair_encoding::{create_test_bytes, BPE_CL100K};
 
     use super::AppendableEncoder;
 
     #[test]
     fn test_appendable_encoder() {
-        let bpe = BytePairEncoding::cl100k();
+        let bpe = &BPE_CL100K;
         let mut enc = AppendableEncoder::new(bpe);
         let input_string = create_test_bytes(bpe, 100);
         for (i, c) in input_string.iter().enumerate() {

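The test above drives an `AppendableEncoder` byte by byte. A minimal sketch of the incremental-counting pattern it exercises — the module path and the `push` and `token_count` method names are assumptions here, since the diff only shows the constructor and the test setup:

```rust
use bpe_openai::cl100k;
// AppendableEncoder is re-exported from the bpe crate; the path is assumed.
use bpe_openai::appendable_encoder::AppendableEncoder;

fn main() {
    let bpe = cl100k();
    let mut enc = AppendableEncoder::new(bpe);
    // Feed input one byte at a time and watch the token count grow,
    // without re-encoding the whole prefix on every append.
    for byte in "Hello, world!".bytes() {
        enc.push(byte);
        println!("{} tokens so far", enc.token_count());
    }
}
```
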