Move headers from include/ to include/pytorch/tokenizers/

Summary: Mostly for avoiding internal confusion.

Differential Revision: D69677244
pytorch-labs · Feb 14, 2025 · f483d83
1 parent bba6759
commit f483d83
Show file tree

Hide file tree

Showing 25 changed files with 21 additions and 22 deletions.
diff --git a/include/base64.h → include/pytorch/tokenizers/base64.h b/include/base64.h → include/pytorch/tokenizers/base64.h
diff --git a/include/detail/bpe_tokenizer_base.h → ...e/pytorch/tokenizers/bpe_tokenizer_base.h b/include/detail/bpe_tokenizer_base.h → ...e/pytorch/tokenizers/bpe_tokenizer_base.h
diff --git a/include/error.h → include/pytorch/tokenizers/error.h b/include/error.h → include/pytorch/tokenizers/error.h
diff --git a/include/hf_tokenizer.h → include/pytorch/tokenizers/hf_tokenizer.h b/include/hf_tokenizer.h → include/pytorch/tokenizers/hf_tokenizer.h
diff --git a/include/llama2c_tokenizer.h → ...de/pytorch/tokenizers/llama2c_tokenizer.h b/include/llama2c_tokenizer.h → ...de/pytorch/tokenizers/llama2c_tokenizer.h
diff --git a/include/log.h → include/pytorch/tokenizers/log.h b/include/log.h → include/pytorch/tokenizers/log.h
diff --git a/include/pre_tokenizer.h → include/pytorch/tokenizers/pre_tokenizer.h b/include/pre_tokenizer.h → include/pytorch/tokenizers/pre_tokenizer.h
diff --git a/include/result.h → include/pytorch/tokenizers/result.h b/include/result.h → include/pytorch/tokenizers/result.h
diff --git a/include/sentencepiece.h → include/pytorch/tokenizers/sentencepiece.h b/include/sentencepiece.h → include/pytorch/tokenizers/sentencepiece.h
diff --git a/include/tiktoken.h → include/pytorch/tokenizers/tiktoken.h b/include/tiktoken.h → include/pytorch/tokenizers/tiktoken.h
diff --git a/include/token_decoder.h → include/pytorch/tokenizers/token_decoder.h b/include/token_decoder.h → include/pytorch/tokenizers/token_decoder.h
diff --git a/include/tokenizer.h → include/pytorch/tokenizers/tokenizer.h b/include/tokenizer.h → include/pytorch/tokenizers/tokenizer.h
diff --git a/src/bpe_tokenizer_base.cpp b/src/bpe_tokenizer_base.cpp
@@ -7,7 +7,7 @@
  */
 // @lint-ignore-every LICENSELINT
 
-#include "detail/bpe_tokenizer_base.h"
+#include <pytorch/tokenizers/bpe_tokenizer_base.h>
 
 // Standard
 #include <inttypes.h>

diff --git a/src/hf_tokenizer.cpp b/src/hf_tokenizer.cpp
@@ -7,7 +7,7 @@
  */
 // @lint-ignore-every LICENSELINT
 
-#include "hf_tokenizer.h"
+#include <pytorch/tokenizers/hf_tokenizer.h>
 
 // Standard
 #include <filesystem>

diff --git a/src/llama2c_tokenizer.cpp b/src/llama2c_tokenizer.cpp
@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 // @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude
-#include "llama2c_tokenizer.h"
+#include <pytorch/tokenizers/llama2c_tokenizer.h>
 #include <cstring>
 
 namespace tokenizers {

diff --git a/src/pre_tokenizer.cpp b/src/pre_tokenizer.cpp
@@ -5,7 +5,10 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
-#include "pre_tokenizer.h"
+
+// Local
+#include <pytorch/tokenizers/pre_tokenizer.h>
+#include <pytorch/tokenizers/third-party/llama.cpp-unicode/unicode.h>
 
 // Standard
 #include <algorithm>
@@ -15,9 +18,6 @@
 // Third Party
 #include <nlohmann/json.hpp>
 
-// Local
-#include "unicode.h"
-
 using json = nlohmann::json;
 
 namespace tokenizers {

diff --git a/src/sentencepiece.cpp b/src/sentencepiece.cpp
@@ -8,7 +8,7 @@
 
 // A tokenizer that works with sentencepiece.
 
-#include "sentencepiece.h"
+#include <pytorch/tokenizers/sentencepiece.h>
 #include <cinttypes>
 #include <string>
 #include "third_party/absl/strings/str_replace.h"

diff --git a/src/tiktoken.cpp b/src/tiktoken.cpp
@@ -25,11 +25,11 @@
    limitations under the License.
  *************************************************************************/
 
-#include "tiktoken.h"
+#include <pytorch/tokenizers/base64.h>
+#include <pytorch/tokenizers/tiktoken.h>
 #include <cinttypes>
 #include <fstream>
 #include <limits>
-#include "base64.h"
 #include "re2/re2.h"
 
 namespace tokenizers {

diff --git a/src/token_decoder.cpp b/src/token_decoder.cpp
@@ -7,7 +7,7 @@
  */
 // @lint-ignore-every LICENSELINT
 
-#include "token_decoder.h"
+#include <pytorch/tokenizers/token_decoder.h>
 
 // Standard
 #include <cstdarg>
@@ -16,7 +16,7 @@
 #include <nlohmann/json.hpp>
 
 // Local
-#include "unicode.h"
+#include <pytorch/tokenizers/third-party/llama.cpp-unicode/unicode.h>
 
 using json = nlohmann::json;
 

diff --git a/targets.bzl b/targets.bzl
@@ -11,13 +11,13 @@ def define_common_targets():
     runtime.cxx_library(
         name = "headers",
         exported_headers = subdir_glob([
-            ("include", "*.h"),
-            ("include", "**/*.h"),
+            ("include", "pytorch/tokenizers/*.h"),
+            ("include", "pytorch/tokenizers/**/*.h"),
         ]),
-        header_namespace = "",
         visibility = [
             "@EXECUTORCH_CLIENTS",
         ],
+        header_namespace = "",
     )
 
     runtime.cxx_library(
@@ -66,9 +66,8 @@ def define_common_targets():
             "third-party/llama.cpp-unicode/src/unicode-data.cpp",
         ],
         exported_headers = subdir_glob([
-            ("third-party/llama.cpp-unicode/include", "*.h"),
+            ("third-party/llama.cpp-unicode/include", "pytorch/tokenizers/third-party/llama.cpp-unicode/*.h"),
         ]),
-        header_namespace = "",
     )
 
     runtime.cxx_library(

diff --git a/test/test_base64.cpp b/test/test_base64.cpp
@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include "base64.h"
+#include <pytorch/tokenizers/base64.h>
 #include "gtest/gtest.h"
 
 namespace tokenizers {

diff --git a/test/test_llama2c_tokenizer.cpp b/test/test_llama2c_tokenizer.cpp
@@ -10,7 +10,7 @@
 #include <TestResourceUtils/TestResourceUtils.h>
 #endif
 #include <gtest/gtest.h>
-#include "llama2c_tokenizer.h"
+#include <pytorch/tokenizers/llama2c_tokenizer.h>
 
 using namespace ::testing;
 

diff --git a/test/test_pre_tokenizer.cpp b/test/test_pre_tokenizer.cpp
@@ -12,7 +12,7 @@
 #include <re2/re2.h>
 
 // Local
-#include "pre_tokenizer.h"
+#include <pytorch/tokenizers/pre_tokenizer.h>
 
 using json = nlohmann::json;
 using namespace tokenizers;

diff --git a/test/test_sentencepiece.cpp b/test/test_sentencepiece.cpp
@@ -11,7 +11,7 @@
 #include <TestResourceUtils/TestResourceUtils.h>
 #endif
 #include <gtest/gtest.h>
-#include "sentencepiece.h"
+#include <pytorch/tokenizers/sentencepiece.h>
 
 namespace tokenizers {
 

diff --git a/test/test_tiktoken.cpp b/test/test_tiktoken.cpp
@@ -11,7 +11,7 @@
 #include <TestResourceUtils/TestResourceUtils.h>
 #endif
 #include <gtest/gtest.h>
-#include "tiktoken.h"
+#include <pytorch/tokenizers/tiktoken.h>
 
 using namespace ::testing;