Skip to content

Commit

Permalink
Tokenizer test (pytorch-labs#21)
Browse files Browse the repository at this point in the history
Summary: Pull Request resolved: pytorch-labs#21

Test Plan:
## OSS 
Build
```
cmake .     -DCMAKE_INSTALL_PREFIX=cmake-out -DTOKENIZERS_BUILD_TEST=ON -Bcmake-out
cmake --build cmake-out -j9 --target install
```

Test
```
(executorch) [[email protected] /data/users/lfq/tokenizers/cmake-out (lfq.tokenizer-test)]$ ctest
Test project /data/users/lfq/tokenizers/cmake-out
    Start 1: test_base64
1/5 Test #1: test_base64 ......................   Passed    0.00 sec
    Start 2: test_llama2c_tokenizer
2/5 Test #2: test_llama2c_tokenizer ...........   Passed    0.00 sec
    Start 3: test_pre_tokenizer
3/5 Test #3: test_pre_tokenizer ...............   Passed    0.73 sec
    Start 4: test_sentencepiece
4/5 Test #4: test_sentencepiece ...............   Passed    0.04 sec
    Start 5: test_tiktoken
5/5 Test #5: test_tiktoken ....................   Passed    3.32 sec

100% tests passed, 0 tests failed out of 5

Total Test time (real) =   4.10 sec
```

## Internal
```
 buck2 test fbsource//xplat/pytorch/tokenizers/test:
 buck2 test fbcode//pytorch/tokenizers/test:
```

Differential Revision: D69860352

Pulled By: lucylq
  • Loading branch information
lucylq authored and facebook-github-bot committed Feb 19, 2025
1 parent 0763945 commit bf530db
Show file tree
Hide file tree
Showing 6 changed files with 100 additions and 27 deletions.
6 changes: 6 additions & 0 deletions targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def define_common_targets():
]),
visibility = [
"@EXECUTORCH_CLIENTS",
"//pytorch/tokenizers/...",
],
header_namespace = "",
)
Expand All @@ -29,12 +30,14 @@ def define_common_targets():
],
visibility = [
"@EXECUTORCH_CLIENTS",
"//pytorch/tokenizers/...",
],
compiler_flags = [
"-D_USE_INTERNAL_STRING_VIEW",
],
external_deps = [
"sentencepiece",
"abseil-cpp",
],
)

Expand All @@ -49,6 +52,7 @@ def define_common_targets():
],
visibility = [
"@EXECUTORCH_CLIENTS",
"//pytorch/tokenizers/...",
],
compiler_flags = [
"-D_USE_INTERNAL_STRING_VIEW",
Expand Down Expand Up @@ -84,6 +88,7 @@ def define_common_targets():
],
visibility = [
"@EXECUTORCH_CLIENTS",
"//pytorch/tokenizers/...",
],
compiler_flags = [
"-D_USE_INTERNAL_STRING_VIEW",
Expand All @@ -104,5 +109,6 @@ def define_common_targets():
],
visibility = [
"@EXECUTORCH_CLIENTS",
"//pytorch/tokenizers/...",
],
)
8 changes: 8 additions & 0 deletions test/TARGETS
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Any targets that should be shared between fbcode and xplat must be defined in
# targets.bzl. This file can contain fbcode-only targets.

# Import the shared target-definition function from the sibling targets.bzl.
load(":targets.bzl", "define_common_targets")

# NOTE(review): presumably tags this package with its owning oncall rotation;
# `oncall` is not defined in this file -- confirm against the build prelude.
oncall("executorch")

# Instantiate the shared targets in this package.
define_common_targets()
86 changes: 86 additions & 0 deletions test/targets.bzl
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
load(
"@fbsource//tools/build_defs:default_platform_defs.bzl",
"ANDROID",
"CXX",
)
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")

def define_common_targets():
    """Defines targets that should be shared between fbcode and xplat.

    The directory containing this targets.bzl file should also contain both
    TARGETS and BUCK files that call this function.

    Declares one `runtime.cxx_test` per tokenizer test source, plus the
    `:resources` filegroup holding the fixture files. Tests that read
    fixtures receive the fixture directory through the `RESOURCES_PATH`
    environment variable, which is derived from the location of `:resources`.
    """
    runtime.cxx_test(
        name = "test_base64",
        srcs = [
            "test_base64.cpp",
        ],
        deps = [
            "//pytorch/tokenizers:headers",
        ],
    )

    runtime.cxx_test(
        name = "test_llama2c_tokenizer",
        srcs = [
            "test_llama2c_tokenizer.cpp",
        ],
        deps = [
            "//pytorch/tokenizers:llama2c_tokenizer",
        ],
        env = {
            "RESOURCES_PATH": "$(location :resources)/resources",
        },
        platforms = [CXX, ANDROID],  # Cannot bundle resources on Apple platform.
    )

    runtime.cxx_test(
        name = "test_pre_tokenizer",
        srcs = [
            "test_pre_tokenizer.cpp",
        ],
        deps = [
            "//pytorch/tokenizers:headers",
            "//pytorch/tokenizers:hf_tokenizer",
        ],
    )

    runtime.cxx_test(
        name = "test_sentencepiece",
        srcs = [
            "test_sentencepiece.cpp",
        ],
        deps = ["//pytorch/tokenizers:sentencepiece"],
        external_deps = [
            "sentencepiece",
            "abseil-cpp",
        ],
        env = {
            "RESOURCES_PATH": "$(location :resources)/resources",
        },
        # This test reads fixtures through RESOURCES_PATH exactly like the
        # llama2c and tiktoken tests, so it carries the same restriction.
        platforms = [CXX, ANDROID],  # Cannot bundle resources on Apple platform.
    )

    runtime.cxx_test(
        name = "test_tiktoken",
        srcs = [
            "test_tiktoken.cpp",
        ],
        deps = [
            "//pytorch/tokenizers:tiktoken",
        ],
        env = {
            "RESOURCES_PATH": "$(location :resources)/resources",
        },
        platforms = [CXX, ANDROID],  # Cannot bundle resources on Apple platform.
        external_deps = [
            "re2",
        ],
    )

    # Fixture files consumed by the tests above via RESOURCES_PATH.
    runtime.filegroup(
        name = "resources",
        srcs = native.glob([
            "resources/**",
        ]),
    )
10 changes: 0 additions & 10 deletions test/test_llama2c_tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,6 @@
* LICENSE file in the root directory of this source tree.
*/

#ifdef TOKENIZERS_FB_BUCK
#include <TestResourceUtils/TestResourceUtils.h>
#endif
#include <gtest/gtest.h>
#include <pytorch/tokenizers/llama2c_tokenizer.h>

Expand All @@ -17,16 +14,9 @@ using namespace ::testing;
namespace tokenizers {

namespace {
// Test case based on llama2.c tokenizer
// Returns the path of the test resource file `name`.
//
// With TOKENIZERS_FB_BUCK defined, the path is resolved through
// TestResourceUtils relative to "test/resources/". Otherwise the directory is
// taken from the RESOURCES_PATH environment variable and joined to `name`
// with '/'. If RESOURCES_PATH is unset, `name` is returned unchanged (i.e.
// relative to the working directory) so that the subsequent load fails with a
// missing-file error rather than crashing here.
static inline std::string _get_resource_path(const std::string& name) {
#ifdef TOKENIZERS_FB_BUCK
  return facebook::xplat::testing::getPathForTestResource(
      "test/resources/" + name);
#else
  // std::getenv yields nullptr for an unset variable; concatenating a null
  // const char* with a std::string is undefined behavior, so check first.
  const char* resources_dir = std::getenv("RESOURCES_PATH");
  if (resources_dir == nullptr) {
    return name;
  }
  return std::string(resources_dir) + "/" + name;
#endif
}

} // namespace

class Llama2cTokenizerTest : public Test {
Expand Down
9 changes: 0 additions & 9 deletions test/test_sentencepiece.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,24 +7,15 @@
*/
// @lint-ignore-every LICENSELINT

#ifdef TOKENIZERS_FB_BUCK
#include <TestResourceUtils/TestResourceUtils.h>
#endif
#include <gtest/gtest.h>
#include <pytorch/tokenizers/sentencepiece.h>

namespace tokenizers {

namespace {
// Returns the path of the test resource file `name`.
//
// With TOKENIZERS_FB_BUCK defined, the path is resolved through
// TestResourceUtils relative to "test/resources/". Otherwise the directory is
// taken from the RESOURCES_PATH environment variable and joined to `name`
// with '/'. If RESOURCES_PATH is unset, `name` is returned unchanged (i.e.
// relative to the working directory) so that the subsequent load fails with a
// missing-file error rather than crashing here.
static inline std::string _get_resource_path(const std::string& name) {
#ifdef TOKENIZERS_FB_BUCK
  return facebook::xplat::testing::getPathForTestResource(
      "test/resources/" + name);
#else
  // std::getenv yields nullptr for an unset variable; concatenating a null
  // const char* with a std::string is undefined behavior, so check first.
  const char* resources_dir = std::getenv("RESOURCES_PATH");
  if (resources_dir == nullptr) {
    return name;
  }
  return std::string(resources_dir) + "/" + name;
#endif
}

} // namespace

TEST(SPTokenizerTest, TestEncodeWithoutLoad) {
Expand Down
8 changes: 0 additions & 8 deletions test/test_tiktoken.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,6 @@
*/
// @lint-ignore-every LICENSELINT

#ifdef TOKENIZERS_FB_BUCK
#include <TestResourceUtils/TestResourceUtils.h>
#endif
#include <gtest/gtest.h>
#include <pytorch/tokenizers/tiktoken.h>

Expand Down Expand Up @@ -45,12 +42,7 @@ static inline std::unique_ptr<std::vector<std::string>> _get_special_tokens() {
}

// Returns the path of the test resource file `name`.
//
// With TOKENIZERS_FB_BUCK defined, the path is resolved through
// TestResourceUtils relative to "test/resources/". Otherwise the directory is
// taken from the RESOURCES_PATH environment variable and joined to `name`
// with '/'. If RESOURCES_PATH is unset, `name` is returned unchanged (i.e.
// relative to the working directory) so that the subsequent load fails with a
// missing-file error rather than crashing here.
static inline std::string _get_resource_path(const std::string& name) {
#ifdef TOKENIZERS_FB_BUCK
  return facebook::xplat::testing::getPathForTestResource(
      "test/resources/" + name);
#else
  // std::getenv yields nullptr for an unset variable; concatenating a null
  // const char* with a std::string is undefined behavior, so check first.
  const char* resources_dir = std::getenv("RESOURCES_PATH");
  if (resources_dir == nullptr) {
    return name;
  }
  return std::string(resources_dir) + "/" + name;
#endif
}

} // namespace
Expand Down

0 comments on commit bf530db

Please sign in to comment.