pytorch-labs · facebook-github-bot · Feb 14, 2025 · Feb 14, 2025
diff --git a/include/base64.h → include/pytorch/tokenizers/base64.h b/include/base64.h → include/pytorch/tokenizers/base64.h
diff --git a/include/detail/bpe_tokenizer_base.h → include/pytorch/tokenizers/bpe_tokenizer_base.h b/include/detail/bpe_tokenizer_base.h → include/pytorch/tokenizers/bpe_tokenizer_base.h
@@ -21,8 +21,8 @@
 #include <re2/re2.h>
 
 // Local
-#include "result.h"
-#include "tokenizer.h"
+#include <pytorch/tokenizers/result.h>
+#include <pytorch/tokenizers/tokenizer.h>
 
 namespace tokenizers {
 namespace detail {

diff --git a/include/error.h → include/pytorch/tokenizers/error.h b/include/error.h → include/pytorch/tokenizers/error.h
@@ -13,8 +13,8 @@
 
 #pragma once
 
+#include <pytorch/tokenizers/log.h>
 #include <stdint.h>
-#include "log.h"
 
 namespace tokenizers {
 

diff --git a/include/hf_tokenizer.h → include/pytorch/tokenizers/hf_tokenizer.h b/include/hf_tokenizer.h → include/pytorch/tokenizers/hf_tokenizer.h
@@ -19,11 +19,11 @@
 #include <re2/re2.h>
 
 // Local
-#include "detail/bpe_tokenizer_base.h"
-#include "error.h"
-#include "pre_tokenizer.h"
-#include "result.h"
-#include "token_decoder.h"
+#include <pytorch/tokenizers/bpe_tokenizer_base.h>
+#include <pytorch/tokenizers/error.h>
+#include <pytorch/tokenizers/pre_tokenizer.h>
+#include <pytorch/tokenizers/result.h>
+#include <pytorch/tokenizers/token_decoder.h>
 
 namespace tokenizers {
 class HFTokenizer : public detail::BPETokenizerBase {

diff --git a/include/llama2c_tokenizer.h → include/pytorch/tokenizers/llama2c_tokenizer.h b/include/llama2c_tokenizer.h → include/pytorch/tokenizers/llama2c_tokenizer.h
@@ -7,8 +7,8 @@
  */
 // @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude
 #pragma once
+#include <pytorch/tokenizers/tokenizer.h>
 #include <memory>
-#include "tokenizer.h"
 
 namespace tokenizers {
 

diff --git a/include/log.h → include/pytorch/tokenizers/log.h b/include/log.h → include/pytorch/tokenizers/log.h
@@ -15,6 +15,7 @@
 
 #include <cstdarg>
 #include <cstddef>
+#include <cstdint>
 #include <cstdio>
 #include <cstdlib>
 

diff --git a/include/pre_tokenizer.h → include/pytorch/tokenizers/pre_tokenizer.h b/include/pre_tokenizer.h → include/pytorch/tokenizers/pre_tokenizer.h
diff --git a/include/result.h → include/pytorch/tokenizers/result.h b/include/result.h → include/pytorch/tokenizers/result.h
@@ -13,10 +13,10 @@
 
 #pragma once
 
+#include <pytorch/tokenizers/error.h>
 #include <cassert>
 #include <new>
 #include <utility>
-#include "error.h"
 
 namespace tokenizers {
 

diff --git a/include/sentencepiece.h → include/pytorch/tokenizers/sentencepiece.h b/include/sentencepiece.h → include/pytorch/tokenizers/sentencepiece.h
@@ -10,10 +10,10 @@
 // A tokenizer that works with sentencepiece. Used by Llama2.
 #pragma once
 
+#include <pytorch/tokenizers/tokenizer.h>
 #include <memory>
 #include <vector>
 #include "sentencepiece_processor.h"
-#include "tokenizer.h"
 namespace tokenizers {
 
 struct TokenIndex {

diff --git a/third-party/llama.cpp-unicode/include/unicode-data.h → include/pytorch/tokenizers/third-party/llama.cpp-unicode/unicode-data.h b/third-party/llama.cpp-unicode/include/unicode-data.h → include/pytorch/tokenizers/third-party/llama.cpp-unicode/unicode-data.h
diff --git a/third-party/llama.cpp-unicode/include/unicode.h → include/pytorch/tokenizers/third-party/llama.cpp-unicode/unicode.h b/third-party/llama.cpp-unicode/include/unicode.h → include/pytorch/tokenizers/third-party/llama.cpp-unicode/unicode.h
diff --git a/include/tiktoken.h → include/pytorch/tokenizers/tiktoken.h b/include/tiktoken.h → include/pytorch/tokenizers/tiktoken.h
@@ -17,9 +17,9 @@
 #include "re2/re2.h"
 
 // Local
-#include "detail/bpe_tokenizer_base.h"
-#include "result.h"
-#include "tokenizer.h"
+#include <pytorch/tokenizers/bpe_tokenizer_base.h>
+#include <pytorch/tokenizers/result.h>
+#include <pytorch/tokenizers/tokenizer.h>
 
 namespace tokenizers {
 

diff --git a/include/token_decoder.h → include/pytorch/tokenizers/token_decoder.h b/include/token_decoder.h → include/pytorch/tokenizers/token_decoder.h
diff --git a/include/tokenizer.h → include/pytorch/tokenizers/tokenizer.h b/include/tokenizer.h → include/pytorch/tokenizers/tokenizer.h
@@ -13,10 +13,10 @@
 
 #pragma once
 
+#include <pytorch/tokenizers/error.h>
+#include <pytorch/tokenizers/result.h>
 #include <string>
 #include <vector>
-#include "error.h"
-#include "result.h"
 
 namespace tokenizers {
 

diff --git a/src/bpe_tokenizer_base.cpp b/src/bpe_tokenizer_base.cpp
@@ -7,7 +7,7 @@
  */
 // @lint-ignore-every LICENSELINT
 
-#include "detail/bpe_tokenizer_base.h"
+#include <pytorch/tokenizers/bpe_tokenizer_base.h>
 
 // Standard
 #include <inttypes.h>

diff --git a/src/hf_tokenizer.cpp b/src/hf_tokenizer.cpp
@@ -7,7 +7,7 @@
  */
 // @lint-ignore-every LICENSELINT
 
-#include "hf_tokenizer.h"
+#include <pytorch/tokenizers/hf_tokenizer.h>
 
 // Standard
 #include <filesystem>

diff --git a/src/llama2c_tokenizer.cpp b/src/llama2c_tokenizer.cpp
@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 // @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude
-#include "llama2c_tokenizer.h"
+#include <pytorch/tokenizers/llama2c_tokenizer.h>
 #include <cstring>
 
 namespace tokenizers {

diff --git a/src/pre_tokenizer.cpp b/src/pre_tokenizer.cpp
@@ -5,7 +5,10 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
-#include "pre_tokenizer.h"
+
+// Local
+#include <pytorch/tokenizers/pre_tokenizer.h>
+#include <pytorch/tokenizers/third-party/llama.cpp-unicode/unicode.h>
 
 // Standard
 #include <algorithm>
@@ -15,9 +18,6 @@
 // Third Party
 #include <nlohmann/json.hpp>
 
-// Local
-#include "unicode.h"
-
 using json = nlohmann::json;
 
 namespace tokenizers {

diff --git a/src/sentencepiece.cpp b/src/sentencepiece.cpp
@@ -8,7 +8,7 @@
 
 // A tokenizer that works with sentencepiece.
 
-#include "sentencepiece.h"
+#include <pytorch/tokenizers/sentencepiece.h>
 #include <cinttypes>
 #include <string>
 #include "third_party/absl/strings/str_replace.h"

diff --git a/src/tiktoken.cpp b/src/tiktoken.cpp
@@ -25,11 +25,11 @@
    limitations under the License.
  *************************************************************************/
 
-#include "tiktoken.h"
+#include <pytorch/tokenizers/base64.h>
+#include <pytorch/tokenizers/tiktoken.h>
 #include <cinttypes>
 #include <fstream>
 #include <limits>
-#include "base64.h"
 #include "re2/re2.h"
 
 namespace tokenizers {

diff --git a/src/token_decoder.cpp b/src/token_decoder.cpp
@@ -7,7 +7,7 @@
  */
 // @lint-ignore-every LICENSELINT
 
-#include "token_decoder.h"
+#include <pytorch/tokenizers/token_decoder.h>
 
 // Standard
 #include <cstdarg>
@@ -16,7 +16,7 @@
 #include <nlohmann/json.hpp>
 
 // Local
-#include "unicode.h"
+#include <pytorch/tokenizers/third-party/llama.cpp-unicode/unicode.h>
 
 using json = nlohmann::json;
 

diff --git a/targets.bzl b/targets.bzl
@@ -11,13 +11,12 @@ def define_common_targets():
     runtime.cxx_library(
         name = "headers",
         exported_headers = subdir_glob([
-            ("include", "*.h"),
-            ("include", "**/*.h"),
+            ("include", "pytorch/tokenizers/*.h"),
         ]),
-        header_namespace = "",
         visibility = [
             "@EXECUTORCH_CLIENTS",
         ],
+        header_namespace = "",
     )
 
     runtime.cxx_library(
@@ -66,7 +65,7 @@ def define_common_targets():
             "third-party/llama.cpp-unicode/src/unicode-data.cpp",
         ],
         exported_headers = subdir_glob([
-            ("third-party/llama.cpp-unicode/include", "*.h"),
+            ("include", "pytorch/tokenizers/third-party/llama.cpp-unicode/*.h"),
         ]),
         header_namespace = "",
     )

diff --git a/test/test_base64.cpp b/test/test_base64.cpp
@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include "base64.h"
+#include <pytorch/tokenizers/base64.h>
 #include "gtest/gtest.h"
 
 namespace tokenizers {

diff --git a/test/test_llama2c_tokenizer.cpp b/test/test_llama2c_tokenizer.cpp
@@ -10,7 +10,7 @@
 #include <TestResourceUtils/TestResourceUtils.h>
 #endif
 #include <gtest/gtest.h>
-#include "llama2c_tokenizer.h"
+#include <pytorch/tokenizers/llama2c_tokenizer.h>
 
 using namespace ::testing;
 

diff --git a/test/test_pre_tokenizer.cpp b/test/test_pre_tokenizer.cpp
@@ -12,7 +12,7 @@
 #include <re2/re2.h>
 
 // Local
-#include "pre_tokenizer.h"
+#include <pytorch/tokenizers/pre_tokenizer.h>
 
 using json = nlohmann::json;
 using namespace tokenizers;

diff --git a/test/test_sentencepiece.cpp b/test/test_sentencepiece.cpp
@@ -11,7 +11,7 @@
 #include <TestResourceUtils/TestResourceUtils.h>
 #endif
 #include <gtest/gtest.h>
-#include "sentencepiece.h"
+#include <pytorch/tokenizers/sentencepiece.h>
 
 namespace tokenizers {
 

diff --git a/test/test_tiktoken.cpp b/test/test_tiktoken.cpp
@@ -11,7 +11,7 @@
 #include <TestResourceUtils/TestResourceUtils.h>
 #endif
 #include <gtest/gtest.h>
-#include "tiktoken.h"
+#include <pytorch/tokenizers/tiktoken.h>
 
 using namespace ::testing;
 

diff --git a/third-party/llama.cpp-unicode/src/unicode-data.cpp b/third-party/llama.cpp-unicode/src/unicode-data.cpp
@@ -27,7 +27,7 @@ SOFTWARE.
 
 // generated with scripts/gen-unicode-data.py
 
-#include "unicode-data.h"
+#include <pytorch/tokenizers/third-party/llama.cpp-unicode/unicode-data.h>
 
 #include <cstdint>
 #include <unordered_map>

diff --git a/third-party/llama.cpp-unicode/src/unicode.cpp b/third-party/llama.cpp-unicode/src/unicode.cpp
@@ -29,8 +29,8 @@ SOFTWARE.
 #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
 #endif
 
-#include "unicode.h"
-#include "unicode-data.h"
+#include <pytorch/tokenizers/third-party/llama.cpp-unicode/unicode.h>
+#include <pytorch/tokenizers/third-party/llama.cpp-unicode/unicode-data.h>
 
 #include <algorithm>
 #include <cassert>