
Commit 85b42ba

Support enterprise encoding host in env.
1 parent 9e79899 commit 85b42ba

File tree

README.md
tiktoken_ext/openai_public.py

2 files changed: +13 -6 lines changed

README.md (+5)

@@ -129,3 +129,8 @@ setup(
 Then simply `pip install ./my_tiktoken_extension` and you should be able to use your
 custom encodings! Make sure **not** to use an editable install.
 
+**Hosting your own encodings for enterprise usage.**
+
+For most use cases the public OpenAI encodings are used by default and no changes are needed. However, for organizations operating in an enterprise setting, existing network configurations may necessitate hosting the encodings internally.
+
+To change the host that serves the encoding files used to populate the plugin modules, set the `ENCODINGS_HOST` environment variable. The default is the public OpenAI-hosted file server. Enterprises hosting their own encodings can see which encodings and files are supported, and the routing involved, by viewing the [source directly](./tiktoken_ext/openai_public.py).
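For illustration, a minimal sketch of using the new variable. The internal host name below is hypothetical, and the variable must be set before the plugin module is first loaded (tiktoken imports `tiktoken_ext.openai_public` lazily, when an encoding is first requested):

```python
import os

# Hypothetical internal mirror; replace with your own host.
# Must be set before the first get_encoding() call, because
# tiktoken_ext/openai_public.py reads ENCODINGS_HOST at import time.
os.environ["ENCODINGS_HOST"] = "https://encodings.internal.example.com"

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
print(enc.encode("hello world"))
```

The mirror only needs to serve the same paths as the public server, e.g. `$ENCODINGS_HOST/encodings/cl100k_base.tiktoken`.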

tiktoken_ext/openai_public.py (+8 -6)

@@ -1,3 +1,4 @@
+import os
 from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe
 
 ENDOFTEXT = "<|endoftext|>"
@@ -6,11 +7,12 @@
 FIM_SUFFIX = "<|fim_suffix|>"
 ENDOFPROMPT = "<|endofprompt|>"
 
+ENCODINGS_HOST = os.getenv("ENCODINGS_HOST", "https://openaipublic.blob.core.windows.net")
 
 def gpt2():
     mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
-        vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
-        encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
+        vocab_bpe_file=f"{ENCODINGS_HOST}/gpt-2/encodings/main/vocab.bpe",
+        encoder_json_file=f"{ENCODINGS_HOST}/gpt-2/encodings/main/encoder.json",
     )
     return {
         "name": "gpt2",
@@ -23,7 +25,7 @@ def gpt2():
 
 def r50k_base():
     mergeable_ranks = load_tiktoken_bpe(
-        "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken"
+        f"{ENCODINGS_HOST}/encodings/r50k_base.tiktoken"
     )
     return {
         "name": "r50k_base",
@@ -36,7 +38,7 @@ def r50k_base():
 
 def p50k_base():
     mergeable_ranks = load_tiktoken_bpe(
-        "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
+        f"{ENCODINGS_HOST}/encodings/p50k_base.tiktoken"
     )
     return {
         "name": "p50k_base",
@@ -49,7 +51,7 @@ def p50k_base():
 
 def p50k_edit():
     mergeable_ranks = load_tiktoken_bpe(
-        "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
+        f"{ENCODINGS_HOST}/encodings/p50k_base.tiktoken"
     )
     special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
     return {
@@ -62,7 +64,7 @@ def p50k_edit():
 
 def cl100k_base():
     mergeable_ranks = load_tiktoken_bpe(
-        "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
+        f"{ENCODINGS_HOST}/encodings/cl100k_base.tiktoken"
     )
     special_tokens = {
         ENDOFTEXT: 100257,
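Since the plugin module interpolates `ENCODINGS_HOST` into fixed paths, a mirror must expose the same directory layout as the public server. A minimal sketch (assuming the third-party `requests` package is available) that checks a mirror serves the files referenced in this diff:

```python
import os
import requests

host = os.environ.get("ENCODINGS_HOST", "https://openaipublic.blob.core.windows.net")

# Paths taken from tiktoken_ext/openai_public.py above.
paths = [
    "/gpt-2/encodings/main/vocab.bpe",
    "/gpt-2/encodings/main/encoder.json",
    "/encodings/r50k_base.tiktoken",
    "/encodings/p50k_base.tiktoken",
    "/encodings/cl100k_base.tiktoken",
]

for path in paths:
    # HEAD avoids downloading the files; a 200 means the mirror serves them.
    status = requests.head(host + path, allow_redirects=True).status_code
    print(f"{status} {host}{path}")
```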
