
Commit 068d079

Move vocab thing to vocab.py
1 parent 88e06d0 commit 068d079

File tree

2 files changed: +305 -303 lines changed

examples/convert-no-torch.py

Lines changed: 2 additions & 302 deletions
@@ -24,10 +24,10 @@
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional
+from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional

 import numpy as np
-from sentencepiece import SentencePieceProcessor
+from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, LlamaHfVocab

 if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
@@ -380,306 +380,6 @@ def load(metadata_path: Path) -> Metadata:
         return metadata


-#
-# vocab
-#
-
-
-@runtime_checkable
-class BaseVocab(Protocol):
-    tokenizer_model: ClassVar[str]
-    name: ClassVar[str]
-
-
-class NoVocab(BaseVocab):
-    tokenizer_model = "no_vocab"
-    name = "no_vocab"
-
-    def __repr__(self) -> str:
-        return "<NoVocab for a model without integrated vocabulary>"
-
-
-@runtime_checkable
-class Vocab(BaseVocab, Protocol):
-    vocab_size: int
-    added_tokens_dict: dict[str, int]
-    added_tokens_list: list[str]
-    fname_tokenizer: Path
-
-    def __init__(self, base_path: Path): ...
-    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
-
-
-class BpeVocab(Vocab):
-    tokenizer_model = "gpt2"
-    name = "bpe"
-
-    def __init__(self, base_path: Path):
-        added_tokens: dict[str, int] = {}
-
-        if (fname_tokenizer := base_path / 'vocab.json').exists():
-            # "slow" tokenizer
-            with open(fname_tokenizer, encoding="utf-8") as f:
-                self.vocab = json.load(f)
-
-            try:
-                # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
-                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
-                    added_tokens = json.load(f)
-            except FileNotFoundError:
-                pass
-        else:
-            # "fast" tokenizer
-            fname_tokenizer = base_path / FAST_TOKENIZER_FILE
-
-            # if this fails, FileNotFoundError propagates to caller
-            with open(fname_tokenizer, encoding="utf-8") as f:
-                tokenizer_json = json.load(f)
-
-            tokenizer_model: dict[str, Any] = tokenizer_json['model']
-            if (
-                tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
-                or tokenizer_json['decoder']['type'] != 'ByteLevel'
-            ):
-                raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
-
-            self.vocab = tokenizer_model["vocab"]
-
-            if (added := tokenizer_json.get('added_tokens')) is not None:
-                # Added tokens here can be duplicates of the main vocabulary.
-                added_tokens = {item['content']: item['id']
-                                for item in added
-                                if item['content'] not in self.vocab}
-
-        vocab_size = len(self.vocab)
-        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids = sorted(added_tokens.values())
-        if expected_ids != actual_ids:
-            expected_end_id = vocab_size + len(actual_ids) - 1
-            raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
-                             f"{vocab_size} - {expected_end_id}; got {actual_ids}")
-
-        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_dict = added_tokens
-        self.added_tokens_list = [text for (text, idx) in items]
-        self.vocab_size_base = vocab_size
-        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
-
-    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
-
-        for i, _ in enumerate(self.vocab):
-            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
-
-    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        for text in self.added_tokens_list:
-            score = -1000.0
-            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
-
-    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        yield from self.bpe_tokens()
-        yield from self.added_tokens()
-
-    def __repr__(self) -> str:
-        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-
-class SentencePieceVocab(Vocab):
-    tokenizer_model = "llama"
-    name = "spm"
-
-    def __init__(self, base_path: Path):
-        added_tokens: dict[str, int] = {}
-        if (fname_tokenizer := base_path / 'tokenizer.model').exists():
-            # normal location
-            try:
-                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
-                    added_tokens = json.load(f)
-            except FileNotFoundError:
-                pass
-        elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
-            # not found in alternate location either
-            raise FileNotFoundError('Cannot find tokenizer.model')
-
-        self.sentencepiece_tokenizer = SentencePieceProcessor()
-        self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
-        vocab_size = self.sentencepiece_tokenizer.vocab_size()
-
-        new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
-        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
-        actual_new_ids = sorted(new_tokens.keys())
-
-        if expected_new_ids != actual_new_ids:
-            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
-
-        # Token pieces that were added to the base vocabulary.
-        self.added_tokens_dict = added_tokens
-        self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
-        self.vocab_size_base = vocab_size
-        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
-
-    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        tokenizer = self.sentencepiece_tokenizer
-        for i in range(tokenizer.vocab_size()):
-            piece = tokenizer.IdToPiece(i)
-            text = piece.encode("utf-8")
-            score: float = tokenizer.GetScore(i)
-
-            toktype = gguf.TokenType.NORMAL
-            if tokenizer.IsUnknown(i):
-                toktype = gguf.TokenType.UNKNOWN
-            if tokenizer.IsControl(i):
-                toktype = gguf.TokenType.CONTROL
-
-            # NOTE: I think added_tokens are user defined.
-            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
-            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
-
-            if tokenizer.IsUnused(i):
-                toktype = gguf.TokenType.UNUSED
-            if tokenizer.IsByte(i):
-                toktype = gguf.TokenType.BYTE
-
-            yield text, score, toktype
-
-    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        for text in self.added_tokens_list:
-            score = -1000.0
-            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
-
-    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        yield from self.sentencepiece_tokens()
-        yield from self.added_tokens()
-
-    def __repr__(self) -> str:
-        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-
-class LlamaHfVocab(Vocab):
-    tokenizer_model = "llama"
-    name = "hfft"
-
-    def __init__(self, base_path: Path):
-        fname_tokenizer = base_path / FAST_TOKENIZER_FILE
-        # if this fails, FileNotFoundError propagates to caller
-        with open(fname_tokenizer, encoding='utf-8') as f:
-            tokenizer_json = json.load(f)
-
-        # pre-check so we know if we need transformers
-        tokenizer_model: dict[str, Any] = tokenizer_json['model']
-        is_llama3 = (
-            tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
-            and not tokenizer_model.get('byte_fallback', True)
-        )
-        if is_llama3:
-            raise TypeError('Llama 3 must be converted with BpeVocab')
-
-        if not is_llama3 and (
-            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
-            or tokenizer_json['decoder']['type'] != 'Sequence'
-        ):
-            raise FileNotFoundError('Cannot find Llama BPE tokenizer')
-
-        try:
-            from transformers import AutoTokenizer
-        except ImportError as e:
-            raise ImportError(
-                "To use LlamaHfVocab, please install the `transformers` package. "
-                "You can install it with `pip install transformers`."
-            ) from e
-
-        # Allow the tokenizer to default to slow or fast versions.
-        # Explicitly set tokenizer to use local paths.
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            base_path,
-            cache_dir=base_path,
-            local_files_only=True,
-        )
-        assert self.tokenizer.is_fast  # assume tokenizer.json is used
-
-        # Initialize lists and dictionaries for added tokens
-        self.added_tokens_list = []
-        self.added_tokens_dict = dict()
-        self.added_tokens_ids = set()
-
-        # Process added tokens
-        for tok, tokidx in sorted(
-            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
-        ):
-            # Only consider added tokens that are not in the base vocabulary
-            if tokidx >= self.tokenizer.vocab_size:
-                self.added_tokens_list.append(tok)
-                self.added_tokens_dict[tok] = tokidx
-                self.added_tokens_ids.add(tokidx)
-
-        # Store special tokens and their IDs
-        self.specials = {
-            tok: self.tokenizer.get_vocab()[tok]
-            for tok in self.tokenizer.all_special_tokens
-        }
-        self.special_ids = set(self.tokenizer.all_special_ids)
-
-        # Set vocabulary sizes
-        self.vocab_size_base = self.tokenizer.vocab_size
-        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
-
-        self.fname_tokenizer = fname_tokenizer
-
-    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        reverse_vocab = {
-            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
-        }
-
-        for token_id in range(self.vocab_size_base):
-            # Skip processing added tokens here
-            if token_id in self.added_tokens_ids:
-                continue
-
-            # Convert token text to bytes
-            token_text = reverse_vocab[token_id].encode("utf-8")
-
-            # Yield token text, score, and type
-            yield token_text, self.get_token_score(token_id), self.get_token_type(
-                token_id, token_text, self.special_ids  # Reuse already stored special IDs
-            )
-
-    def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
-        # Special case for byte tokens
-        if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
-            return gguf.TokenType.BYTE
-
-        # Determine token type based on whether it's a special token
-        return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
-
-    def get_token_score(self, token_id: int) -> float:
-        # Placeholder for actual logic to determine the token's score
-        # This needs to be implemented based on specific requirements
-        return -1000.0  # Default score
-
-    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        for text in self.added_tokens_list:
-            if text in self.specials:
-                toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
-                score = self.get_token_score(self.specials[text])
-            else:
-                toktype = gguf.TokenType.USER_DEFINED
-                score = -1000.0
-
-            yield text.encode("utf-8"), score, toktype
-
-    def has_newline_token(self):
-        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
-
-    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        yield from self.hf_tokens()
-        yield from self.added_tokens()
-
-    def __repr__(self) -> str:
-        return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-
 #
 # data loading
 # TODO: reuse (probably move to gguf.py?)

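Usage note: after this commit the converter imports its vocabulary classes from the gguf package instead of defining them inline. The following is a minimal sketch of how a caller might pick a vocabulary under the new import, assuming the classes keep the constructor and all_tokens() interface shown in the removed code and are importable from gguf as the new import line indicates; the model path, the skip_vocab flag, and the fallback order are illustrative only, not the converter's actual selection logic.

from pathlib import Path

from gguf import BaseVocab, BpeVocab, LlamaHfVocab, NoVocab, SentencePieceVocab


def load_vocab(base_path: Path, skip_vocab: bool = False) -> BaseVocab:
    # A model can be converted without an integrated vocabulary.
    if skip_vocab:
        return NoVocab()

    # Try each vocab flavour in turn; per the removed code, each constructor
    # raises FileNotFoundError when its tokenizer files are not under base_path.
    for vocab_cls in (SentencePieceVocab, BpeVocab, LlamaHfVocab):
        try:
            return vocab_cls(base_path)
        except FileNotFoundError:
            continue
    raise FileNotFoundError(f"no supported tokenizer found under {base_path}")


if __name__ == "__main__":
    # Hypothetical model directory; prints something like
    # <SentencePieceVocab with N base tokens and M added tokens>
    print(load_vocab(Path("models/my-model")))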