Commit 1250302

✨ feat(rust): Convert project to a multi-crate workspace
This commit restructures the project from a single-crate setup into a multi-crate workspace, split into 'rs-tiktoken' (the core Rust library) and 'py-tiktoken' (the Python bindings). Separating the Rust library from the Python extension module clarifies the organization of the codebase and makes each part easier to maintain. setup.py is updated to reflect the new directory structure. Refs: #24
1 parent f28ce4c commit 1250302

File tree

10 files changed, +120 -27 lines changed

Cargo.toml

+5-21
@@ -1,21 +1,5 @@
-[package]
-name = "tiktoken"
-version = "0.4.0"
-edition = "2021"
-rust-version = "1.57.0"
-
-[lib]
-name = "_tiktoken"
-crate-type = ["cdylib"]
-
-[dependencies]
-pyo3 = { version = "0.19.0", features = ["extension-module"] }
-
-# tiktoken dependencies
-fancy-regex = "0.11.0"
-regex = "1.8.3"
-rustc-hash = "1.1.0"
-bstr = "1.5.0"
-
-[profile.release]
-incremental = true
+[workspace]
+members = [
+    "rs-tiktoken",
+    "py-tiktoken",
+]
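
The file paths touched by this commit imply roughly the following layout for the new workspace (a sketch for orientation only, not an exhaustive tree):

    Cargo.toml            # workspace root (this file)
    setup.py
    rs-tiktoken/          # pure Rust library crate, package name "tiktoken"
        Cargo.toml
        src/lib.rs
        src/core.rs
        src/encoding.rs
        src/model.rs
    py-tiktoken/          # PyO3 bindings crate that builds the _tiktoken cdylib
        Cargo.toml
        src/lib.rs
        src/tiktoken_py.rs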

py-tiktoken/Cargo.toml

+22
@@ -0,0 +1,22 @@
+[package]
+name = "py-tiktoken"
+version = "0.4.0"
+edition = "2021"
+rust-version = "1.57.0"
+
+[lib]
+name = "_tiktoken"
+crate-type = ["cdylib"]
+
+[dependencies]
+tiktoken = { path = "../rs-tiktoken" }
+pyo3 = { version = "0.19.0", features = ["extension-module"] }
+
+# tiktoken dependencies
+fancy-regex = "0.11.0"
+regex = "1.8.3"
+rustc-hash = "1.1.0"
+bstr = "1.5.0"
+
+[profile.release]
+incremental = true
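
Because py-tiktoken keeps the [lib] name "_tiktoken" with crate-type = ["cdylib"], this crate is now the one that produces the Python extension module. A minimal sketch of such a PyO3 0.19 entry point is shown below; it is illustrative only, since the project's real module function and the PyCoreBPE class live in tiktoken_py.rs (see the diff further down):

    use pyo3::prelude::*;

    // Hypothetical skeleton: the function name matches the [lib] name above so
    // Python can import it as tiktoken._tiktoken once setuptools-rust packages it.
    #[pymodule]
    fn _tiktoken(_py: Python, m: &PyModule) -> PyResult<()> {
        // Classes and functions would be registered on `m` here, e.g.
        // m.add_class::<PyCoreBPE>()?;
        Ok(())
    }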

py-tiktoken/src/lib.rs

+1
@@ -0,0 +1 @@
+pub mod tiktoken_py;

src/tiktoken_py.rs renamed to py-tiktoken/src/tiktoken_py.rs

+2-2
@@ -10,7 +10,7 @@ use pyo3::PyResult;
 use pyo3::types::{PyBytes, PyList, PyTuple};
 use rustc_hash::FxHashMap as HashMap;
 
-use crate::tiktoken::{byte_pair_encode, CoreBPE, MAX_NUM_THREADS};
+use tiktoken::core::{byte_pair_encode, CoreBPE, MAX_NUM_THREADS};
 
 #[pyclass]
 pub struct PyCoreBPE {
@@ -181,7 +181,7 @@ pub fn _tiktoken(_py: Python, m: &PyModule) -> PyResult<()> {
 mod tests {
     use rustc_hash::FxHashMap as HashMap;
 
-    use crate::tiktoken::byte_pair_split;
+    use crate::core::byte_pair_split;
 
     #[test]
     fn very_simple_test() {

rs-tiktoken/Cargo.toml

+15
@@ -0,0 +1,15 @@
+[package]
+name = "tiktoken"
+version = "0.4.0"
+edition = "2021"
+rust-version = "1.57.0"
+
+[dependencies]
+fancy-regex = "0.11.0"
+regex = "1.8.3"
+rustc-hash = "1.1.0"
+bstr = "1.5.0"
+once_cell = "1.18.0"
+
+[profile.release]
+incremental = true
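
Note that the library package keeps the crate name "tiktoken" even though it now lives in the rs-tiktoken/ directory, so Cargo commands address the two members by package name rather than by path. For example, with standard Cargo usage (shown only as a convenience, not part of this commit):

    cargo test -p tiktoken                 # the library crate in rs-tiktoken/
    cargo build -p py-tiktoken --release   # the PyO3 bindings crate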

src/tiktoken.rs renamed to rs-tiktoken/src/core.rs

+1-1
@@ -152,7 +152,7 @@ pub fn byte_pair_split<'a>(piece: &'a [u8], ranks: &HashMap<Vec<u8>, usize>) ->
 
 pub struct FakeThreadId(NonZeroU64);
 
-pub fn hash_current_thread() -> usize {
+fn hash_current_thread() -> usize {
     // It's easier to use unsafe than to use nightly. Rust has this nice u64 thread id counter
     // that works great for our use case of avoiding collisions in our array. Unfortunately,
     // it's private. However, there are only so many ways you can layout a u64, so just transmute

rs-tiktoken/src/encoding.rs

+66
@@ -0,0 +1,66 @@
+//! WARNING: This code is under active development. Functionality,
+//! behavior, and the interface may change in future updates.
+
+use std::collections::HashMap;
+use once_cell::sync::Lazy;
+use regex::Regex;
+
+
+pub struct Encoding {
+    /// The name of the encoding. It should be clear from the name of the encoding
+    /// what behaviour to expect, in particular, encodings with different special tokens
+    /// should have different names.
+    pub name: &'static str,
+    /// A regex pattern string that is used to split the input text.
+    pub pat_str: Regex,
+    /// A dictionary mapping mergeable token bytes to their ranks. The ranks
+    /// must correspond to merge priority.
+    pub mergeable_ranks: HashMap<&'static str, u32>,
+    /// A dictionary mapping special token strings to their token values.
+    pub special_tokens: HashMap<&'static str, u32>,
+    /// The number of tokens in the vocabulary. If provided, it is checked
+    /// that the number of mergeable tokens and special tokens is equal to this number.
+    pub explicit_n_vocab: Option<u32>,
+}
+
+pub static GPT2: Lazy<Encoding> = Lazy::new(|| {
+    let mergeable_ranks = Default::default();
+    let special_tokens = [
+        ("<|endoftext|>", 50256)
+    ].iter().cloned().collect();
+
+    Encoding {
+        name: "gpt2",
+        pat_str: Regex::new(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+").unwrap(),
+        mergeable_ranks,
+        special_tokens,
+        explicit_n_vocab: Some(50257),
+    }
+});
+
+pub fn get_encoding() {
+
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_simple() {
+        // enc = tiktoken.get_encoding("gpt2")
+        // assert enc.encode("hello world") == [31373, 995]
+        // assert enc.decode([31373, 995]) == "hello world"
+        // assert enc.encode("hello <|endoftext|>", allowed_special="all") == [31373, 220, 50256]
+        //
+        // enc = tiktoken.get_encoding("cl100k_base")
+        // assert enc.encode("hello world") == [15339, 1917]
+        // assert enc.decode([15339, 1917]) == "hello world"
+        // assert enc.encode("hello <|endoftext|>", allowed_special="all") == [15339, 220, 100257]
+        //
+        // for enc_name in tiktoken.list_encoding_names():
+        //     enc = tiktoken.get_encoding(enc_name)
+        //     for token in range(10_000):
+        //         assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token
+    }
+}
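
As a rough illustration of how the new Encoding type can be populated, the following hand-written value compiles from a crate that depends on the library (for example py-tiktoken); the encoding name, pattern, and token values are invented for the example and are not part of this commit:

    use std::collections::HashMap;
    use regex::Regex;
    use tiktoken::encoding::Encoding;

    // Hypothetical example values; only the field names and types mirror the
    // Encoding struct introduced in this file.
    fn tiny_encoding() -> Encoding {
        Encoding {
            name: "example",
            pat_str: Regex::new(r"\S+|\s+").unwrap(),
            mergeable_ranks: HashMap::new(),
            special_tokens: HashMap::from([("<|endoftext|>", 0u32)]),
            explicit_n_vocab: None,
        }
    }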

src/lib.rs renamed to rs-tiktoken/src/lib.rs

+3-2
@@ -1,3 +1,4 @@
 // This check is new and seems buggy (possibly with PyO3 interaction)
-pub mod tiktoken_py;
-pub mod tiktoken;
+pub mod core;
+pub mod encoding;
+mod model;

rs-tiktoken/src/model.rs

+3
@@ -0,0 +1,3 @@
+//! WARNING: This code is under active development. Functionality,
+//! behavior, and the interface may change in future updates.
+

setup.py

+2-1
@@ -5,11 +5,12 @@
     name="tiktoken",
     rust_extensions=[
         RustExtension(
-            "tiktoken._tiktoken",
+            target="tiktoken._tiktoken",
             binding=Binding.PyO3,
             # Between our use of editable installs and wanting to use Rust for performance sensitive
             # code, it makes sense to just always use --release
             debug=False,
+            path="py-tiktoken/Cargo.toml",
         )
     ],
     package_data={"tiktoken": ["py.typed"]},
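
With the crate manifest no longer at the repository root, the new path= keyword points setuptools-rust at py-tiktoken/Cargo.toml, while target= names the importable extension module. Assuming the usual setuptools-rust workflow that the comment above refers to, an editable install should still build the extension as before:

    pip install -e .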
