 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional
+from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional

 import numpy as np
-from sentencepiece import SentencePieceProcessor
+from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, LlamaHfVocab

 if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
@@ -380,306 +380,6 @@ def load(metadata_path: Path) -> Metadata:
         return metadata


-#
-# vocab
-#
-
-
-@runtime_checkable
-class BaseVocab(Protocol):
-    tokenizer_model: ClassVar[str]
-    name: ClassVar[str]
-
-
-class NoVocab(BaseVocab):
-    tokenizer_model = "no_vocab"
-    name = "no_vocab"
-
-    def __repr__(self) -> str:
-        return "<NoVocab for a model without integrated vocabulary>"
-
-
-@runtime_checkable
-class Vocab(BaseVocab, Protocol):
-    vocab_size: int
-    added_tokens_dict: dict[str, int]
-    added_tokens_list: list[str]
-    fname_tokenizer: Path
-
-    def __init__(self, base_path: Path): ...
-    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
-
-
-class BpeVocab(Vocab):
-    tokenizer_model = "gpt2"
-    name = "bpe"
-
-    def __init__(self, base_path: Path):
-        added_tokens: dict[str, int] = {}
-
-        if (fname_tokenizer := base_path / 'vocab.json').exists():
-            # "slow" tokenizer
-            with open(fname_tokenizer, encoding="utf-8") as f:
-                self.vocab = json.load(f)
-
-            try:
-                # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
-                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
-                    added_tokens = json.load(f)
-            except FileNotFoundError:
-                pass
-        else:
-            # "fast" tokenizer
-            fname_tokenizer = base_path / FAST_TOKENIZER_FILE
-
-            # if this fails, FileNotFoundError propagates to caller
-            with open(fname_tokenizer, encoding="utf-8") as f:
-                tokenizer_json = json.load(f)
-
-            tokenizer_model: dict[str, Any] = tokenizer_json['model']
-            if (
-                tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
-                or tokenizer_json['decoder']['type'] != 'ByteLevel'
-            ):
-                raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
-
-            self.vocab = tokenizer_model["vocab"]
-
-            if (added := tokenizer_json.get('added_tokens')) is not None:
-                # Added tokens here can be duplicates of the main vocabulary.
-                added_tokens = {item['content']: item['id']
-                                for item in added
-                                if item['content'] not in self.vocab}
-
-        vocab_size = len(self.vocab)
-        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids = sorted(added_tokens.values())
-        if expected_ids != actual_ids:
-            expected_end_id = vocab_size + len(actual_ids) - 1
-            raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
-                             f"{vocab_size} - {expected_end_id}; got {actual_ids}")
-
-        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_dict = added_tokens
-        self.added_tokens_list = [text for (text, idx) in items]
-        self.vocab_size_base = vocab_size
-        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
-
-    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
-
-        for i, _ in enumerate(self.vocab):
-            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
-
-    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        for text in self.added_tokens_list:
-            score = -1000.0
-            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
-
-    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        yield from self.bpe_tokens()
-        yield from self.added_tokens()
-
-    def __repr__(self) -> str:
-        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-
-class SentencePieceVocab(Vocab):
-    tokenizer_model = "llama"
-    name = "spm"
-
-    def __init__(self, base_path: Path):
-        added_tokens: dict[str, int] = {}
-        if (fname_tokenizer := base_path / 'tokenizer.model').exists():
-            # normal location
-            try:
-                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
-                    added_tokens = json.load(f)
-            except FileNotFoundError:
-                pass
-        elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
-            # not found in alternate location either
-            raise FileNotFoundError('Cannot find tokenizer.model')
-
-        self.sentencepiece_tokenizer = SentencePieceProcessor()
-        self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
-        vocab_size = self.sentencepiece_tokenizer.vocab_size()
-
-        new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
-        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
-        actual_new_ids = sorted(new_tokens.keys())
-
-        if expected_new_ids != actual_new_ids:
-            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
-
-        # Token pieces that were added to the base vocabulary.
-        self.added_tokens_dict = added_tokens
-        self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
-        self.vocab_size_base = vocab_size
-        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
-
-    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        tokenizer = self.sentencepiece_tokenizer
-        for i in range(tokenizer.vocab_size()):
-            piece = tokenizer.IdToPiece(i)
-            text = piece.encode("utf-8")
-            score: float = tokenizer.GetScore(i)
-
-            toktype = gguf.TokenType.NORMAL
-            if tokenizer.IsUnknown(i):
-                toktype = gguf.TokenType.UNKNOWN
-            if tokenizer.IsControl(i):
-                toktype = gguf.TokenType.CONTROL
-
-            # NOTE: I think added_tokens are user defined.
-            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
-            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
-
-            if tokenizer.IsUnused(i):
-                toktype = gguf.TokenType.UNUSED
-            if tokenizer.IsByte(i):
-                toktype = gguf.TokenType.BYTE
-
-            yield text, score, toktype
-
-    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        for text in self.added_tokens_list:
-            score = -1000.0
-            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
-
-    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        yield from self.sentencepiece_tokens()
-        yield from self.added_tokens()
-
-    def __repr__(self) -> str:
-        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-
-class LlamaHfVocab(Vocab):
-    tokenizer_model = "llama"
-    name = "hfft"
-
-    def __init__(self, base_path: Path):
-        fname_tokenizer = base_path / FAST_TOKENIZER_FILE
-        # if this fails, FileNotFoundError propagates to caller
-        with open(fname_tokenizer, encoding='utf-8') as f:
-            tokenizer_json = json.load(f)
-
-        # pre-check so we know if we need transformers
-        tokenizer_model: dict[str, Any] = tokenizer_json['model']
-        is_llama3 = (
-            tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
-            and not tokenizer_model.get('byte_fallback', True)
-        )
-        if is_llama3:
-            raise TypeError('Llama 3 must be converted with BpeVocab')
-
-        if not is_llama3 and (
-            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
-            or tokenizer_json['decoder']['type'] != 'Sequence'
-        ):
-            raise FileNotFoundError('Cannot find Llama BPE tokenizer')
-
-        try:
-            from transformers import AutoTokenizer
-        except ImportError as e:
-            raise ImportError(
-                "To use LlamaHfVocab, please install the `transformers` package. "
-                "You can install it with `pip install transformers`."
-            ) from e
-
-        # Allow the tokenizer to default to slow or fast versions.
-        # Explicitly set tokenizer to use local paths.
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            base_path,
-            cache_dir=base_path,
-            local_files_only=True,
-        )
-        assert self.tokenizer.is_fast  # assume tokenizer.json is used
-
-        # Initialize lists and dictionaries for added tokens
-        self.added_tokens_list = []
-        self.added_tokens_dict = dict()
-        self.added_tokens_ids = set()
-
-        # Process added tokens
-        for tok, tokidx in sorted(
-            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
-        ):
-            # Only consider added tokens that are not in the base vocabulary
-            if tokidx >= self.tokenizer.vocab_size:
-                self.added_tokens_list.append(tok)
-                self.added_tokens_dict[tok] = tokidx
-                self.added_tokens_ids.add(tokidx)
-
-        # Store special tokens and their IDs
-        self.specials = {
-            tok: self.tokenizer.get_vocab()[tok]
-            for tok in self.tokenizer.all_special_tokens
-        }
-        self.special_ids = set(self.tokenizer.all_special_ids)
-
-        # Set vocabulary sizes
-        self.vocab_size_base = self.tokenizer.vocab_size
-        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
-
-        self.fname_tokenizer = fname_tokenizer
-
-    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        reverse_vocab = {
-            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
-        }
-
-        for token_id in range(self.vocab_size_base):
-            # Skip processing added tokens here
-            if token_id in self.added_tokens_ids:
-                continue
-
-            # Convert token text to bytes
-            token_text = reverse_vocab[token_id].encode("utf-8")
-
-            # Yield token text, score, and type
-            yield token_text, self.get_token_score(token_id), self.get_token_type(
-                token_id, token_text, self.special_ids  # Reuse already stored special IDs
-            )
-
-    def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
-        # Special case for byte tokens
-        if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
-            return gguf.TokenType.BYTE
-
-        # Determine token type based on whether it's a special token
-        return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
-
-    def get_token_score(self, token_id: int) -> float:
-        # Placeholder for actual logic to determine the token's score
-        # This needs to be implemented based on specific requirements
-        return -1000.0  # Default score
-
-    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        for text in self.added_tokens_list:
-            if text in self.specials:
-                toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
-                score = self.get_token_score(self.specials[text])
-            else:
-                toktype = gguf.TokenType.USER_DEFINED
-                score = -1000.0
-
-            yield text.encode("utf-8"), score, toktype
-
-    def has_newline_token(self):
-        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
-
-    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        yield from self.hf_tokens()
-        yield from self.added_tokens()
-
-    def __repr__(self) -> str:
-        return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-
 #
 # data loading
 # TODO: reuse (probably move to gguf.py?)
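For reference, here is a minimal sketch (not part of this commit) of how a converter can consume these classes now that they are imported from gguf-py. The import line mirrors the new `from gguf import ...` line above; the fallback order, the load_vocab helper, and the model path are illustrative assumptions, not the script's actual logic.

    from pathlib import Path

    from gguf import BaseVocab, BpeVocab, LlamaHfVocab, NoVocab, SentencePieceVocab


    def load_vocab(base_path: Path, skip_vocab: bool = False) -> BaseVocab:
        """Try each concrete vocab implementation against a model directory."""
        if skip_vocab:
            # model is converted without an integrated vocabulary
            return NoVocab()
        # Illustrative order only; each constructor raises FileNotFoundError
        # when its tokenizer files are not present under base_path.
        for cls in (SentencePieceVocab, BpeVocab, LlamaHfVocab):
            try:
                return cls(base_path)
            except FileNotFoundError:
                continue
        raise FileNotFoundError(f"No supported tokenizer found under {base_path}")


    if __name__ == "__main__":
        vocab = load_vocab(Path("models/my-model"))  # hypothetical path
        if not isinstance(vocab, NoVocab):
            # every concrete vocab yields (token bytes, score, gguf.TokenType) triples
            for text, score, toktype in vocab.all_tokens():
                pass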