Skip to content

Commit d8ce942

Browse files
committed
Support enterprise encoding host in env.
1 parent 1b9faf2 commit d8ce942

File tree

2 files changed

+14
-6
lines changed

2 files changed

+14
-6
lines changed

README.md

+6
Original file line numberDiff line numberDiff line change
@@ -128,3 +128,9 @@ setup(
128128

129129
Then simply `pip install ./my_tiktoken_extension` and you should be able to use your
130130
custom encodings! Make sure **not** to use an editable install.
131+
132+
**Hosting your own encodings for enterprise usage.**
133+
134+
For most use cases, the public OpenAI encodings are used by default and no changes are needed. However, for organizations operating in an enterprise setting, existing network configurations may necessitate hosting encodings internally.
135+
136+
To change the host that serves the encoding files used to populate the plugin modules, simply set the `ENCODINGS_HOST` environment variable. The default is the public OpenAI-hosted file server. Enterprises hosting their own encodings can see which encodings and files are supported, and the routing involved, by viewing the [source directly](./tiktoken_ext/openai_public.py).

tiktoken_ext/openai_public.py

+8-6
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe
23

34
ENDOFTEXT = "<|endoftext|>"
@@ -6,11 +7,12 @@
67
FIM_SUFFIX = "<|fim_suffix|>"
78
ENDOFPROMPT = "<|endofprompt|>"
89

10+
ENCODINGS_HOST = os.getenv("ENCODINGS_HOST", "https://openaipublic.blob.core.windows.net")
911

1012
def gpt2():
1113
mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
12-
vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
13-
encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
14+
vocab_bpe_file=f"{ENCODINGS_HOST}/gpt-2/encodings/main/vocab.bpe",
15+
encoder_json_file=f"{ENCODINGS_HOST}/gpt-2/encodings/main/encoder.json",
1416
vocab_bpe_hash="1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5",
1517
encoder_json_hash="196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783",
1618
)
@@ -28,7 +30,7 @@ def gpt2():
2830

2931
def r50k_base():
3032
mergeable_ranks = load_tiktoken_bpe(
31-
"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
33+
f"{ENCODINGS_HOST}/encodings/r50k_base.tiktoken",
3234
expected_hash="306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930",
3335
)
3436
return {
@@ -42,7 +44,7 @@ def r50k_base():
4244

4345
def p50k_base():
4446
mergeable_ranks = load_tiktoken_bpe(
45-
"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
47+
f"{ENCODINGS_HOST}/encodings/p50k_base.tiktoken",
4648
expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
4749
)
4850
return {
@@ -56,7 +58,7 @@ def p50k_base():
5658

5759
def p50k_edit():
5860
mergeable_ranks = load_tiktoken_bpe(
59-
"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
61+
f"{ENCODINGS_HOST}/encodings/p50k_base.tiktoken",
6062
expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
6163
)
6264
special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
@@ -70,7 +72,7 @@ def p50k_edit():
7072

7173
def cl100k_base():
7274
mergeable_ranks = load_tiktoken_bpe(
73-
"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
75+
f"{ENCODINGS_HOST}/encodings/cl100k_base.tiktoken",
7476
expected_hash="223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7",
7577
)
7678
special_tokens = {

0 commit comments

Comments
 (0)