pytorch-labs · larryliu0820 · Feb 21, 2025 · Feb 21, 2025
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,75 @@
+[build-system]
+requires = [
+  "cmake", # For building binary targets in the wheel.
+  "pip>=23",  # For building the pip package.
+  "setuptools>=63",  # For building the pip package contents.
+  "wheel",  # For building the pip package archive.
+]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "pytorch_tokenizers"
+dynamic = [
+  # setup.py sets the version and the readme (long description) at build time.
+  'version',
+  'readme',
+]
+description = "A package with common tokenizers in Python and C++"
+authors = [
+  {name="PyTorch Team", email="[email protected]"},
+]
+license = {file = "LICENSE"}
+keywords = ["pytorch", "machine learning", "llm"]
+# PyPI package information.
+classifiers = [
+    # How mature is this project? Common values are
+    #   3 - Alpha
+    #   4 - Beta
+    #   5 - Production/Stable
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Education",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: BSD License",
+    "Topic :: Scientific/Engineering",
+    "Topic :: Scientific/Engineering :: Mathematics",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Software Development",
+    "Topic :: Software Development :: Libraries",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+    "Programming Language :: C++",
+    "Programming Language :: Python :: 3",
+    # Update this as we support more versions of python.
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+]
+
+# Python dependencies required for use.
+requires-python = ">=3.10"
+dependencies=[
+  "tiktoken",
+  "tokenizers",
+  "sentencepiece",
+]
+
+[project.urls]
+# The keys are arbitrary but will be visible on PyPI. NOTE(review): the URLs below point at executorch, not this package -- confirm and update.
+Homepage = "https://pytorch.org/executorch/"
+Repository = "https://github.com/pytorch/executorch"
+Issues = "https://github.com/pytorch/executorch/issues"
+Changelog = "https://github.com/pytorch/executorch/releases"
+
+
+[tool.setuptools.exclude-package-data]
+"*" = ["*.pyc"]
+
+[tool.usort]
+# Do not try to put "first-party" imports in their own section.
+first_party_detection = false
+
+[tool.black]
+# Emit syntax compatible with older versions of python instead of only the range
+# specified by `requires-python`. TODO: Remove this once we support these older
+# versions of python and can expand the `requires-python` range.
+target-version = ["py38", "py39", "py310", "py311", "py312"]
diff --git a/pytorch_tokenizers/tools/llama2c/convert.py b/pytorch_tokenizers/tools/llama2c/convert.py
@@ -15,105 +15,9 @@
 import struct
 from typing import List
 
-from sentencepiece import SentencePieceProcessor as SentencePieceProcessor
-
-
-class Tokenizer:
-    def __init__(self, model_path: str):
-        assert os.path.isfile(
-            model_path
-        ), f"Need a valid tokenizer model path but got {model_path}"
-        # pyre-fixme[28]: Unexpected keyword argument `model_file` to call `SentencePieceProcessor.__init__`.
-        self.sp_model = SentencePieceProcessor(model_file=model_path)
-        self.model_path = model_path
-
-        # BOS / EOS token IDs
-        self.n_words: int = self.sp_model.vocab_size()
-        self.bos_id: int = self.sp_model.bos_id()
-        self.eos_id: int = self.sp_model.eos_id()
-        logging.info(
-            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
-        )
-        # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `get_piece_size`.
-        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
-
-    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
-        assert type(s) is str
-        # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `encode`.
-        t = self.sp_model.encode(s)
-        if bos:
-            t = [self.bos_id] + t
-        if eos:
-            t = t + [self.eos_id]
-        return t
-
-    def decode(self, t: List[int]) -> str:
-        # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `encode`.
-        return self.sp_model.decode(t)
-
-    def decode_token(self, t: int) -> str:
-        # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `encode`.
-        return self.sp_model.decode(t)
-
-    def export(self, output_path: str, *, prepend_padding: bool = False) -> None:
-        """
-        Export tokenizer.model to another serialization format. Here we did some lightweight
-        processing such as supporting prepend padding token, prepend max token length and
-        replace '_' back to empty space.
-
-        The binary format is:
-        1. vocab size: int32
-        2. bos token id: int32
-        3. eos token id: int32
-        4. max token length: int32
-        5. score: float32, len of bytes: int32, token bytes: [byte] for each token
+from pytorch_tokenizers import Llama2cTokenizer
 
-        :param output_path: output path of the new binary.
-        :param prepend_padding: a boolean to control if we want to prepend a padding token.
-
-        :return: None
-        """
-
-        # get all the tokens (postprocessed) and their scores as floats
-        tokens, scores = [], []
-
-        if prepend_padding:
-            # Here we use the default padding token and its score.
-            tokens.append("<pad>".encode("utf-8"))
-            scores.append(-1)
-
-        for i in range(self.n_words):
-            # decode the token and light postprocessing
-            # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `id_to_piece`.
-            t = self.sp_model.id_to_piece(i)
-            # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `get_score`.
-            s = self.sp_model.get_score(i)
-            # sentencepiece use '<s>' as BOS and '</s>' for EOS
-            if i == self.bos_id:
-                t = "<s>"
-            elif i == self.eos_id:
-                t = "</s>"
-            t = t.replace("▁", " ")  # sentencepiece uses this character as whitespace
-            b = t.encode("utf-8")  # bytes of this token, utf-8 encoded
-
-            tokens.append(b)
-            scores.append(s)
-
-        # record the max token length
-        max_token_length = 0 if not tokens else max(len(t) for t in tokens)
-
-        # write to a binary file
-        with open(output_path, "wb") as f:
-            # write the vocab size, bos/eos ids and max token length
-            f.write(
-                struct.pack(
-                    "IIII", self.n_words, self.bos_id, self.eos_id, max_token_length
-                )
-            )
-            for bytes, score in zip(tokens, scores):
-                f.write(struct.pack("fI", score, len(bytes)))
-                f.write(bytes)
-        logging.info(f"Wrote tokenizer to {output_path}")
+from sentencepiece import SentencePieceProcessor as SentencePieceProcessor
 
 
 if __name__ == "__main__":
@@ -141,7 +45,7 @@ def export(self, output_path: str, *, prepend_padding: bool = False) -> None:
 
     args = parser.parse_args()
 
-    t = Tokenizer(args.tokenizer_model)
+    t = Llama2cTokenizer(args.tokenizer_model)
 
     output_path = (
         args.output_path

diff --git a/setup.py b/setup.py
@@ -0,0 +1,12 @@
+import setuptools
+
+# README.md is the PyPI long description; read it as UTF-8 regardless of the build machine's locale.
+with open("README.md", "r", encoding="utf-8") as f:
+    long_description = f.read()
+
+setuptools.setup(
+    version="0.1.0",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    packages=setuptools.find_packages(include=["pytorch_tokenizers*"]),  # keep the top-level package importable
+)