Skip to content

Commit d8ce942

Browse files
committed
Support enterprise encoding host in env.
1 parent 1b9faf2 commit d8ce942

File tree

2 files changed

+14
-6
lines changed

2 files changed

+14
-6
lines changed

README.md

+6
Original file line numberDiff line numberDiff line change
@@ -128,3 +128,9 @@ setup(
128128

129129
Then simply `pip install ./my_tiktoken_extension` and you should be able to use your
130130
custom encodings! Make sure **not** to use an editable install.
131+
132+
**Hosting your own encodings for enterprise usage.**
133+
134+
For most use cases, the public OpenAI encodings are used by default and no changes are needed. However, for organizations operating in an enterprise setting, existing network configurations may necessitate hosting encodings internally.
135+
136+
To change the host that serves the encoding files used to populate the plugin modules, simply set the `ENCODINGS_HOST` environment variable. The default is the public OpenAI-hosted file server. Enterprises hosting their own encodings can see which encodings and files are supported, and the routing involved, by viewing the [source directly](./tiktoken_ext/openai_public.py).

tiktoken_ext/openai_public.py

+8-6
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe
23

34
ENDOFTEXT = "<|endoftext|>"
@@ -6,11 +7,12 @@
67
FIM_SUFFIX = "<|fim_suffix|>"
78
ENDOFPROMPT = "<|endofprompt|>"
89

10+
ENCODINGS_HOST = os.getenv("ENCODINGS_HOST", "https://openaipublic.blob.core.windows.net")
911

1012
def gpt2():
1113
mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
12-
vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
13-
encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
14+
vocab_bpe_file=f"{ENCODINGS_HOST}/gpt-2/encodings/main/vocab.bpe",
15+
encoder_json_file=f"{ENCODINGS_HOST}/gpt-2/encodings/main/encoder.json",
1416
vocab_bpe_hash="1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5",
1517
encoder_json_hash="196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783",
1618
)
@@ -28,7 +30,7 @@ def gpt2():
2830

2931
def r50k_base():
3032
mergeable_ranks = load_tiktoken_bpe(
31-
"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
33+
f"{ENCODINGS_HOST}/encodings/r50k_base.tiktoken",
3234
expected_hash="306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930",
3335
)
3436
return {
@@ -42,7 +44,7 @@ def r50k_base():
4244

4345
def p50k_base():
4446
mergeable_ranks = load_tiktoken_bpe(
45-
"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
47+
f"{ENCODINGS_HOST}/encodings/p50k_base.tiktoken",
4648
expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
4749
)
4850
return {
@@ -56,7 +58,7 @@ def p50k_base():
5658

5759
def p50k_edit():
5860
mergeable_ranks = load_tiktoken_bpe(
59-
"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
61+
f"{ENCODINGS_HOST}/encodings/p50k_base.tiktoken",
6062
expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
6163
)
6264
special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
@@ -70,7 +72,7 @@ def p50k_edit():
7072

7173
def cl100k_base():
7274
mergeable_ranks = load_tiktoken_bpe(
73-
"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
75+
f"{ENCODINGS_HOST}/encodings/cl100k_base.tiktoken",
7476
expected_hash="223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7",
7577
)
7678
special_tokens = {

0 commit comments

Comments
 (0)