Buckify tokenizers
Differential Revision: D69509028

Pull Request resolved: #17
larryliu0820 authored Feb 14, 2025
1 parent f2fc3d6 commit 03744ce
Showing 23 changed files with 369 additions and 94 deletions.
52 changes: 28 additions & 24 deletions CMakeLists.txt
@@ -32,25 +32,27 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece)
 set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
 
 file(GLOB tokenizers_source_files ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp)
-file(GLOB unicode_source_files ${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/src/*.cpp)
-add_library(tokenizers STATIC ${tokenizers_source_files} ${unicode_source_files})
+file(GLOB unicode_source_files
+     ${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/src/*.cpp)
+add_library(tokenizers STATIC ${tokenizers_source_files}
+            ${unicode_source_files})

 # Using abseil from sentencepiece/third_party
 target_include_directories(
-    tokenizers PUBLIC
-    ${CMAKE_CURRENT_SOURCE_DIR}/include
-    ${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece
-    ${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece/src
-    ${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
-    ${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include
-    ${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include)
+  tokenizers
+  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
+         ${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece
+         ${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece/src
+         ${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
+         ${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include
+         ${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include)

 target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2)
 
 # Build test
 if(TOKENIZERS_BUILD_TEST)
-    enable_testing()
-    include(FetchContent)
+  enable_testing()
+  include(FetchContent)
   # CMAKE
   FetchContent_Declare(
     googletest
@@ -63,20 +65,22 @@ if(TOKENIZERS_BUILD_TEST)
   FetchContent_MakeAvailable(googletest)
 
   file(GLOB test_source_files ${CMAKE_CURRENT_SOURCE_DIR}/test/test_*.cpp)
+
+  set(test_env "RESOURCES_PATH=${CMAKE_CURRENT_SOURCE_DIR}/test/resources")
   foreach(test_source_file ${test_source_files})
-    get_filename_component(test_name ${test_source_file} NAME_WE)
-    message(STATUS "Configuring unit test ${test_name}")
-    add_executable(${test_name} ${test_source_file})
-    target_include_directories(${test_name} PRIVATE
-      GTEST_INCLUDE_PATH
-      ${CMAKE_CURRENT_SOURCE_DIR}/include
-      ${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece
-      ${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
-      ${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include
-    )
-    target_link_libraries(${test_name} gtest_main tokenizers)
-    target_compile_definitions(${test_name} PRIVATE RESOURCES_PATH="${CMAKE_CURRENT_SOURCE_DIR}/test/resources")
-    add_test(${test_name} "${test_name}")
+    get_filename_component(test_name ${test_source_file} NAME_WE)
+    message(STATUS "Configuring unit test ${test_name}")
+    add_executable(${test_name} ${test_source_file})
+    target_include_directories(
+      ${test_name}
+      PRIVATE GTEST_INCLUDE_PATH
+              ${CMAKE_CURRENT_SOURCE_DIR}/include
+              ${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece
+              ${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
+              ${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include)
+    target_link_libraries(${test_name} gtest_main tokenizers)
+    add_test(${test_name} "${test_name}")
+    set_tests_properties(${test_name} PROPERTIES ENVIRONMENT ${test_env})
   endforeach()
 endif()

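Note: the test loop above swaps the RESOURCES_PATH compile definition for an ENVIRONMENT test property, so the resources directory is passed to each test at run time instead of being baked into the binary (and moving the resources no longer forces a recompile). A minimal sketch of how a test might pick it up; the helper name and fallback path are illustrative, not from this repo:

    // Hypothetical helper; RESOURCES_PATH is set per test by
    // set_tests_properties(... ENVIRONMENT "RESOURCES_PATH=...").
    #include <cstdio>
    #include <cstdlib>
    #include <string>

    static std::string resources_path() {
      if (const char* env = std::getenv("RESOURCES_PATH")) {
        return env;
      }
      return "test/resources"; // fallback when the binary is run by hand
    }

    int main() {
      std::printf("%s\n", resources_path().c_str());
      return 0;
    }
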
8 changes: 8 additions & 0 deletions TARGETS
@@ -0,0 +1,8 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain fbcode-only targets.
+
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
4 changes: 4 additions & 0 deletions include/detail/bpe_tokenizer_base.h
@@ -5,11 +5,15 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
+
 // Base class for all BPE tokenizer implementations
 #pragma once
 
 // Standard
 #include <memory>
+#include <optional>
 #include <string>
 #include <unordered_map>
 #include <vector>
 
4 changes: 4 additions & 0 deletions include/pre_tokenizer.h
@@ -5,6 +5,8 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
+
 #pragma once
 
 // Standard
@@ -41,6 +43,8 @@ class PreTokenizer {
   */
   virtual std::vector<std::string> pre_tokenize(
       re2::StringPiece input) const = 0;
+
+  virtual ~PreTokenizer() = default;
 }; // end class PreTokenizer
 
 // -- Factory ------------------------------------------------------------------
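
Note: pre-tokenizers are created through the factory below and owned via base-class pointers, so without a virtual destructor, destroying a concrete pre-tokenizer through a PreTokenizer* is undefined behavior. A minimal illustration with stand-in types (Splitter and WhitespaceSplitter are hypothetical, not from this repo):

    #include <memory>
    #include <string>
    #include <vector>

    struct Splitter {                       // plays the role of PreTokenizer
      virtual std::vector<std::string> split(const std::string&) const = 0;
      virtual ~Splitter() = default;        // the kind of line this commit adds
    };

    struct WhitespaceSplitter : Splitter {  // stand-in concrete subclass
      std::string pattern_ = "\\s+";        // member that needs destruction
      std::vector<std::string> split(const std::string& s) const override {
        return {s};
      }
    };

    int main() {
      std::unique_ptr<Splitter> p = std::make_unique<WhitespaceSplitter>();
      return 0;
    } // virtual ~Splitter() => ~WhitespaceSplitter() runs and pattern_ is freed

The same reasoning applies to the TokenDecoder destructor added in include/token_decoder.h below.
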
3 changes: 2 additions & 1 deletion include/sentencepiece.h
@@ -5,8 +5,9 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
+
-// A tokenizer that works with sentencepiece.
+// A tokenizer that works with sentencepiece. Used by Llama2.
 #pragma once
 
 #include <memory>
 
5 changes: 5 additions & 0 deletions include/token_decoder.h
@@ -5,6 +5,8 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
+
 #pragma once
 
 // Standard
@@ -45,6 +47,9 @@ class TokenDecoder {
   */
   virtual std::string decode(re2::StringPiece token) const = 0;
 
+  // virtual destructor
+  virtual ~TokenDecoder() = default;
+
 }; // end class TokenDecoder
 
 // -- Factory ------------------------------------------------------------------
 
12 changes: 7 additions & 5 deletions src/bpe_tokenizer_base.cpp
@@ -5,6 +5,8 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
+
 #include "detail/bpe_tokenizer_base.h"
 
 // Standard
@@ -56,7 +58,7 @@ static std::vector<uint64_t> _byte_pair_merge(
     if (rank) {
       // usize::MAX is a sentinel value and cannot be a valid rank
       if (*rank == _max_size()) {
-        fprintf(stderr, "at %" PRIu32 " rank is too large\n", i);
+        TK_LOG(Error, "at %" PRIu32 " rank is too large\n", i);
       }
       parts[i].second = *rank;
     }
@@ -177,8 +179,8 @@ BPETokenizerBase::encode_with_special_token_(
     } catch (const std::out_of_range&) {
       // Should never go here, since special pattern includes all special
       // chars.
-      fprintf(stderr, "unknown special token: %s\n", special->c_str());
-      exit(EXIT_FAILURE);
+      TK_LOG(Error, "unknown special token: %s\n", special->c_str());
+      return Error::EncodeFailure;
     }
 
     tokens.push_back(token);
@@ -259,8 +261,8 @@ Result<std::string> BPETokenizerBase::decode(uint64_t prev, uint64_t cur)
     if (iter != special_token_decoder_.end()) {
       token_bytes = iter->second;
     } else {
-      fprintf(stderr, "unknown token: %" PRIu64 "\n", cur);
-      exit(EXIT_FAILURE);
+      TK_LOG(Error, "unknown token: %" PRIu64 "\n", cur);
+      return Error::DecodeFailure;
     }
   }
   _decode(token_bytes, ret);
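
Note: replacing fprintf plus exit(EXIT_FAILURE) with TK_LOG plus an Error return turns fatal process aborts into errors the caller can handle. A sketch of the caller-side pattern; the Result/Error shapes here are stand-ins inferred from the signatures visible in this diff, not the library's real headers:

    #include <cstdint>
    #include <iostream>
    #include <string>

    // Illustrative stand-ins for the library's Error and Result<T>.
    enum class Error { Ok, EncodeFailure, DecodeFailure };

    template <typename T>
    struct Result {
      T value{};
      Error error = Error::Ok;
      bool ok() const { return error == Error::Ok; }
    };

    Result<std::string> decode_pair(uint64_t prev, uint64_t cur) {
      if (cur == 0) {
        // Previously this path would have exit()ed the whole process.
        return {"", Error::DecodeFailure};
      }
      return {"token-" + std::to_string(cur), Error::Ok};
    }

    int main() {
      Result<std::string> r = decode_pair(1, 0);
      if (!r.ok()) {
        std::cerr << "decode failed, falling back\n"; // caller decides policy
      }
      return 0;
    }
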
22 changes: 14 additions & 8 deletions src/hf_tokenizer.cpp
@@ -5,6 +5,8 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
+
 #include "hf_tokenizer.h"
 
 // Standard
@@ -127,17 +129,17 @@ Error HFTokenizer::load(const std::string& path) {
   // If a tokenizer config file is found, parse it to look up the eos/bos tokens
   if (!model_config_json.empty()) {
     // Load it and parse it as json
-    std::ifstream file(model_config_json);
-    if (!file) {
+    std::ifstream config_file(model_config_json);
+    if (!config_file) {
       fprintf(stderr, "failed to open encoder file: %s\n", path.c_str());
       return Error::LoadFailure;
     }
-    std::string contents(
-        (std::istreambuf_iterator<char>(file)),
+    std::string config_contents(
+        (std::istreambuf_iterator<char>(config_file)),
         std::istreambuf_iterator<char>());
-    json parsed_json;
+    json parsed_config_json;
     try {
-      parsed_json = json::parse(contents);
+      parsed_config_json = json::parse(config_contents);
     } catch (const json::exception& e) {
       std::cout << "Error parsing model config json json file: " << e.what()
                 << std::endl;
@@ -146,8 +148,8 @@

     // Pull out the token strings
     try {
-      const std::string bos_token = parsed_json.at("bos_token");
-      const std::string eos_token = parsed_json.at("eos_token");
+      const std::string bos_token = parsed_config_json.at("bos_token");
+      const std::string eos_token = parsed_config_json.at("eos_token");
       const auto& bos_it = special_token_encoder_.find(bos_token);
       const auto& eos_it = special_token_encoder_.find(eos_token);
       if (bos_it == special_token_encoder_.end()) {
@@ -256,7 +258,11 @@ void HFTokenizer::_decode(re2::StringPiece input, std::string& ret) const {
   if (_decoder) {
     ret += _decoder->decode(input);
   } else {
+#ifdef _USE_INTERNAL_STRING_VIEW
+    ret += input.as_string();
+#else
     ret += input;
+#endif
   }
 }

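Note: the config_* renames keep the tokenizer_config.json parse clearly separated from the tokenizer.json parse earlier in load(). A standalone sketch of the same nlohmann::json lookups, with illustrative file contents:

    #include <iostream>
    #include <string>
    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    int main() {
      // Stand-in for the contents of a tokenizer_config.json.
      const std::string config_contents = R"({
        "bos_token": "<|begin_of_text|>",
        "eos_token": "<|eot_id|>"
      })";
      try {
        json parsed_config_json = json::parse(config_contents);
        // .at() throws json::out_of_range when a key is missing, which
        // load() catches via the json::exception base class.
        const std::string bos_token = parsed_config_json.at("bos_token");
        const std::string eos_token = parsed_config_json.at("eos_token");
        std::cout << bos_token << " / " << eos_token << "\n";
      } catch (const json::exception& e) {
        std::cerr << "Error parsing model config json file: " << e.what() << "\n";
        return 1;
      }
      return 0;
    }
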
2 changes: 1 addition & 1 deletion src/pre_tokenizer.cpp
@@ -129,7 +129,7 @@ namespace {

 // Standard GPT2 regex
 // https://github.com/openai/gpt-2/blob/master/src/encoder.py#L53
-static const std::string GPT2_EXPR =
+constexpr char GPT2_EXPR[] =
     R"('s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+)";
 
 } // namespace
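
Note: switching GPT2_EXPR from static const std::string to constexpr char[] removes a dynamic initializer (and the matching destructor at exit) from the translation unit; the regex bytes become pure compile-time data. A tiny before/after illustration:

    #include <string>

    // Before: runs a std::string constructor at program start-up and a
    // destructor at exit; other static initializers must not touch it first.
    static const std::string kPatternOld = R"(\p{L}+)";

    // After: pure compile-time data, no construction or destruction at all.
    constexpr char kPatternNew[] = R"(\p{L}+)";

    int main() {
      // Both spell the same bytes; only their initialization differs.
      return kPatternOld == kPatternNew ? 0 : 1;
    }
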
4 changes: 4 additions & 0 deletions src/tiktoken.cpp
@@ -183,7 +183,11 @@ Error Tiktoken::_encode(
 }
 
 void Tiktoken::_decode(re2::StringPiece input, std::string& ret) const {
+#ifdef _USE_INTERNAL_STRING_VIEW
+  ret += input.as_string();
+#else
   ret += input;
+#endif
 }
 
 template <typename T>
 
4 changes: 3 additions & 1 deletion src/token_decoder.cpp
@@ -5,6 +5,8 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
+
 #include "token_decoder.h"
 
 // Standard
@@ -60,7 +62,7 @@ static std::string format(const char* fmt, ...) {
   int size = vsnprintf(NULL, 0, fmt, ap);
   // GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
   std::vector<char> buf(size + 1);
-  int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+  // int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
   // GGML_ASSERT(size2 == size);
   va_end(ap2);
   va_end(ap);
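
Note: format() uses the classic two-pass vsnprintf idiom: the first call with a null buffer only measures, and a second call must still run to fill buf (the commit comments out the unused size2 assignment that a disabled assert used to consume). A self-contained sketch of the idiom with illustrative names:

    #include <cstdarg>
    #include <cstdio>
    #include <string>
    #include <vector>

    static std::string format(const char* fmt, ...) {
      va_list ap, ap2;
      va_start(ap, fmt);
      va_copy(ap2, ap); // the first pass consumes ap, so keep a copy
      // Pass 1: null buffer => vsnprintf only reports the required length.
      int size = vsnprintf(nullptr, 0, fmt, ap);
      std::vector<char> buf(size + 1);
      // Pass 2: actually write the formatted bytes plus the trailing NUL.
      vsnprintf(buf.data(), size + 1, fmt, ap2);
      va_end(ap2);
      va_end(ap);
      return std::string(buf.data(), size);
    }

    int main() {
      std::printf("%s\n", format("%d tokens", 42).c_str()); // prints "42 tokens"
      return 0;
    }
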
96 changes: 96 additions & 0 deletions targets.bzl
@@ -0,0 +1,96 @@
load("@fbsource//tools/build_defs:glob_defs.bzl", "subdir_glob")
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")

def define_common_targets():
"""Defines targets that should be shared between fbcode and xplat.
The directory containing this targets.bzl file should also contain both
TARGETS and BUCK files that call this function.
"""

runtime.cxx_library(
name = "headers",
exported_headers = subdir_glob([
("include", "*.h"),
("include", "**/*.h"),
]),
header_namespace = "",
visibility = [
"@EXECUTORCH_CLIENTS",
],
)

runtime.cxx_library(
name = "sentencepiece",
srcs = [
"src/sentencepiece.cpp",
],
exported_deps = [
":headers",
],
visibility = [
"@EXECUTORCH_CLIENTS",
],
compiler_flags = [
"-D_USE_INTERNAL_STRING_VIEW",
],
external_deps = [
"sentencepiece",
],
)

runtime.cxx_library(
name = "tiktoken",
srcs = [
"src/tiktoken.cpp",
"src/bpe_tokenizer_base.cpp",
],
exported_deps = [
":headers",
],
visibility = [
"@EXECUTORCH_CLIENTS",
],
compiler_flags = [
"-D_USE_INTERNAL_STRING_VIEW",
],
exported_external_deps = [
"re2",
],
)

runtime.cxx_library(
name = "unicode",
srcs = [
"third-party/llama.cpp-unicode/src/unicode.cpp",
"third-party/llama.cpp-unicode/src/unicode-data.cpp",
],
exported_headers = subdir_glob([
("third-party/llama.cpp-unicode/include", "*.h"),
]),
header_namespace = "",
)

runtime.cxx_library(
name = "hf_tokenizer",
srcs = [
"src/hf_tokenizer.cpp",
"src/bpe_tokenizer_base.cpp",
"src/pre_tokenizer.cpp",
"src/token_decoder.cpp",
],
exported_deps = [
":headers",
":unicode",
],
visibility = [
"@EXECUTORCH_CLIENTS",
],
compiler_flags = [
"-D_USE_INTERNAL_STRING_VIEW",
],
exported_external_deps = [
"re2",
"nlohmann_json",
],
)
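
Note: every library here sets -D_USE_INTERNAL_STRING_VIEW, which selects the ret += input.as_string() branches added in src/tiktoken.cpp and src/hf_tokenizer.cpp, presumably because the internal re2::StringPiece lacks the implicit conversion that lets OSS builds append it to a std::string directly. A minimal sketch of that shim; LegacyStringPiece is a stand-in, not the real internal type:

    #include <iostream>
    #include <string>
    #include <string_view>

    // Stand-in for a pre-absl StringPiece: no implicit conversion to
    // std::string_view, only an explicit as_string().
    struct LegacyStringPiece {
      const char* data_;
      size_t size_;
      std::string as_string() const { return std::string(data_, size_); }
    };

    #ifdef _USE_INTERNAL_STRING_VIEW
    using StringPiece = LegacyStringPiece;
    #else
    using StringPiece = std::string_view; // what OSS re2::StringPiece aliases
    #endif

    void append(StringPiece input, std::string& ret) {
    #ifdef _USE_INTERNAL_STRING_VIEW
      ret += input.as_string();
    #else
      ret += input;
    #endif
    }

    int main() {
      std::string out;
      append(StringPiece{"token", 5}, out);
      std::cout << out << "\n";
      return 0;
    }
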
Binary file added test/resources/test_bpe_tokenizer.bin
1 change: 1 addition & 0 deletions test/resources/test_tiktoken_invalid_base64.model
@@ -0,0 +1 @@
+tet 0
1 change: 1 addition & 0 deletions test/resources/test_tiktoken_invalid_rank.model
@@ -0,0 +1 @@
+ICAgICAgIA== 18446744073709551616
1 change: 1 addition & 0 deletions test/resources/test_tiktoken_no_space.model
@@ -0,0 +1 @@
+ICAgICAgIA==10
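
Note: the three fixtures above target the tiktoken model-file parser, where each line is "<base64 token> <rank>": "tet 0" is not valid base64, 18446744073709551616 is one past UINT64_MAX (whose maximum is already reserved as the sentinel rank in _byte_pair_merge above), and "ICAgICAgIA==10" lacks the separating space. A sketch of the checks they exercise; the parsing details are assumed, not copied from src/tiktoken.cpp:

    #include <cerrno>
    #include <cstdint>
    #include <cstdlib>
    #include <iostream>
    #include <string>

    // Returns false for exactly the failure modes the fixtures cover.
    static bool parse_line(const std::string& line, std::string& token_b64, uint64_t& rank) {
      const size_t space = line.find(' ');
      if (space == std::string::npos) {
        return false; // test_tiktoken_no_space.model: "ICAgICAgIA==10"
      }
      token_b64 = line.substr(0, space);
      // Real code would also base64-decode token_b64 here and reject
      // "tet" (test_tiktoken_invalid_base64.model).
      errno = 0;
      char* end = nullptr;
      const unsigned long long value = std::strtoull(line.c_str() + space + 1, &end, 10);
      if (errno == ERANGE || end == line.c_str() + space + 1) {
        return false; // test_tiktoken_invalid_rank.model: value > UINT64_MAX
      }
      rank = value;
      return true;
    }

    int main() {
      std::string tok;
      uint64_t rank = 0;
      std::cout << parse_line("ICAgICAgIA== 10", tok, rank)                   // 1: ok
                << parse_line("ICAgICAgIA==10", tok, rank)                    // 0: no space
                << parse_line("ICAgICAgIA== 18446744073709551616", tok, rank) // 0: overflow
                << "\n";
      return 0;
    }
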
File renamed without changes.
2 changes: 1 addition & 1 deletion test/test_pre_tokenizer.cpp
@@ -19,7 +19,7 @@ using namespace tokenizers;

 // Helpers /////////////////////////////////////////////////////////////////////
 
-void assert_split_match(
+static void assert_split_match(
     const PreTokenizer& ptok,
     const std::string& prompt,
     const std::vector<std::string>& expected) {