
format and use hf tokenizer api #65

Open
wants to merge 1 commit into base: main
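
In shorthand, the change replaces the repo's hand-rolled `ids_for_prompt` helper and the fms tokenizer wrapper with the Hugging Face tokenizer API. A before/after sketch of the pattern (condensed from the diff below; the `gpt2` checkpoint is only an illustration, not anything this PR uses):

```python
# Before (pattern being removed): fms wrapper plus a manual encode helper.
#   tokenizer = tokenizers.get_tokenizer(args.tokenizer)
#   ids = ids_for_prompt(prompt, tokenizer)   # tokenize -> convert_tokens_to_ids -> torch.tensor
#   text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(result))

# After (pattern being introduced): Hugging Face tokenizer API directly.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative checkpoint only
ids = tokenizer.encode("example prompt", return_tensors="pt").squeeze(0)  # 1-D tensor of token ids
text = tokenizer.decode(ids)  # ids back to a string
print(ids.shape, text)
```
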
14 changes: 5 additions & 9 deletions aiu_fms_testing_utils/testing/validation.py
@@ -2,7 +2,6 @@
from typing import List, Tuple, Callable, MutableMapping, Any, Optional

import torch
from aiu_fms_testing_utils.utils import ids_for_prompt
from aiu_fms_testing_utils.utils.aiu_setup import dprint
import os

@@ -206,8 +205,9 @@ def load_validation_information(
# Text format will get tokenized
validation_info.append(
{
"tokens": ids_for_prompt(
validation_file_path.read_text(encoding="utf-8"), tokenizer
"tokens": tokenizer.encode(
validation_file_path.read_text(encoding="utf-8"),
return_tensors="pt",
),
"logits": None,
}
@@ -378,12 +378,8 @@ def print_failed_cases(failed_cases, aiu_tokens, validation_tokens, tokenizer):
aiu_token = aiu_tokens[sentence_index][token_index]
validation_token = validation_tokens[sentence_index][token_index]

aiu_str = tokenizer.convert_tokens_to_string(
tokenizer.convert_ids_to_tokens(aiu_token)
)
validation_str = tokenizer.convert_tokens_to_string(
tokenizer.convert_ids_to_tokens(validation_token)
)
aiu_str = tokenizer.decode(aiu_token)
validation_str = tokenizer.decode(validation_token)
print(
f"In sentence {sentence_index + 1}/{len(aiu_tokens)}, token {token_index}, AIU outputs {aiu_token} instead of {validation_token} -- AIU val={aiu_str} -- CPU val={validation_str}"
)
13 changes: 2 additions & 11 deletions aiu_fms_testing_utils/utils/__init__.py
@@ -67,15 +67,6 @@ def warmup_model(
dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s")


def ids_for_prompt(prompt, tokenizer):
tokens = tokenizer.tokenize(prompt)
ids = tokenizer.convert_tokens_to_ids(tokens)
if tokenizer.bos_token_id != tokenizer.eos_token_id:
ids = [tokenizer.bos_token_id] + ids
ids = torch.tensor(ids, dtype=torch.long, device="cpu")
return ids


def __download_file(url, filename):
try:
response = requests.get(url, stream=True)
@@ -110,7 +101,7 @@ def __sample_requests(

# Tokenize the prompts and completions.
prompt = prompt_list[i]
prompt_token_ids = ids_for_prompt(prompt, tokenizer)
prompt_token_ids = tokenizer.encode(prompt, return_tensors="pt").squeeze(0)
Contributor @JRosenkranz commented on Jul 25, 2025:
Would the above BaseTokenizer type hint need to be updated to a huggingface tokenizer?
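
If so, one option is Hugging Face's `PreTrainedTokenizerBase`, which covers both slow and fast tokenizers. A minimal sketch of the hint change (the helper name below is illustrative, not the exact signature in `aiu_fms_testing_utils/utils/__init__.py`):

```python
# Sketch of the type-hint change the comment asks about; not part of this diff.
import torch
from transformers import PreTrainedTokenizerBase


def encode_prompt(prompt: str, tokenizer: PreTrainedTokenizerBase) -> torch.Tensor:
    # Any tokenizer returned by AutoTokenizer.from_pretrained satisfies this hint.
    return tokenizer.encode(prompt, return_tensors="pt").squeeze(0)
```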


prompt_len = len(prompt_token_ids)
if prompt_len < prompt_length_min or prompt_len > prompt_length_max:
@@ -217,7 +208,7 @@ def prepare_inputs(
)
prompt_list = []
for prompt, _ in prompts_and_sizes:
prompt_list.append(ids_for_prompt(prompt, tokenizer))
prompt_list.append(tokenizer.encode(prompt, return_tensors="pt").squeeze(0))

input_ids, padding_kwargs = pad_input_ids(prompt_list, min_pad_length=seq_length)
return input_ids, padding_kwargs
8 changes: 4 additions & 4 deletions scripts/generate_metrics.py
@@ -15,10 +15,10 @@
GoldenTokenHook,
top_k_loss_calculator,
)
from aiu_fms_testing_utils.utils import ids_for_prompt, sample_sharegpt_requests
from aiu_fms_testing_utils.utils import sample_sharegpt_requests
from fms.models import get_model
from fms.utils import tokenizers
from fms.utils.generation import pad_input_ids
from transformers import AutoTokenizer

parser = argparse.ArgumentParser(
description="Script to determine a reasonable logits loss threshold when testing with aiu"
@@ -156,7 +156,7 @@
if default_dtype is not None:
torch.set_default_dtype(default_dtype)

tokenizer = tokenizers.get_tokenizer(args.tokenizer)
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)

torch.set_grad_enabled(False)

@@ -190,7 +190,7 @@ def __prepare_inputs(batch_size, seq_length, tokenizer, seed=0):
)
prompt_list = []
for prompt, _ in prompts_and_sizes:
prompt_list.append(ids_for_prompt(prompt, tokenizer))
prompt_list.append(tokenizer.encode(prompt, return_tensors="pt").squeeze(0))

input_ids, padding_kwargs = pad_input_ids(prompt_list, min_pad_length=seq_length)
return input_ids, padding_kwargs
33 changes: 14 additions & 19 deletions scripts/inference.py
@@ -16,9 +16,11 @@
from torch import distributed as dist
from fms.models import get_model, register_model
from fms.models.llama import LLaMAConfig, _llama_factory_factory
from fms.utils import generation, tokenizers
from fms.utils import generation
from fms.utils.generation import pad_input_ids

from transformers import AutoTokenizer


# This example script validates the LLaMA implementation by running inference on a couple of prompts.
#
@@ -551,7 +553,7 @@ def select_int8_module(
dprint(model)
dprint("=" * 60 + "\n")

tokenizer = tokenizers.get_tokenizer(args.tokenizer)
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
model.eval()
torch.set_grad_enabled(False)
loading_model_time = time.time() - loading_model_time
@@ -570,15 +572,6 @@ def select_int8_module(
add_special_tokens = tokenizer.bos_token_id != tokenizer.eos_token_id


def ids_for_prompt(prompt):
tokens = tokenizer.tokenize(prompt)
ids = tokenizer.convert_tokens_to_ids(tokens)
if add_special_tokens:
ids = [tokenizer.bos_token_id] + ids
ids = torch.tensor(ids, dtype=torch.long, device=device)
return ids


def truncate_prompts_to_max_length(prompts, max_len, max_allowed_length):
# we may want the prompt length to be fixed to some max length
# this will ensure that prior to padding the input ids
@@ -626,7 +619,11 @@ def truncate_prompts_to_max_length(prompts, max_len, max_allowed_length):
for i, prompt_file_path in enumerate(prompt_file_paths):
if i == args.batch_size:
break
prompts.append(ids_for_prompt(prompt_file_path.read_text(encoding="utf-8")))
prompts.append(
tokenizer.encode(
prompt_file_path.read_text(encoding="utf-8"), return_tensors="pt"
)
)

else:
if args.prompt_type == "chat":
@@ -656,10 +653,10 @@ def truncate_prompts_to_max_length(prompts, max_len, max_allowed_length):
dprint("prompt_type must be one of chat or code")
exit()

prompt1 = ids_for_prompt(prompt1)
prompt2 = ids_for_prompt(prompt2)
prompt3 = ids_for_prompt(prompt3)
prompt4 = ids_for_prompt(prompt4)
prompt1 = tokenizer.encode(prompt1, return_tensors="pt").squeeze(0)
Contributor:
could we do a batch encode here?
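
One way to do that, as a sketch only (not part of this diff; it assumes the script's `tokenizer` has, or can be given, a pad token):

```python
# Batch-encode the four prompts in one call instead of four tokenizer.encode() calls.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # padding requires a pad token

batch = tokenizer(
    [prompt1, prompt2, prompt3, prompt4],
    return_tensors="pt",
    padding=True,  # pad to the longest prompt in the batch
)
# Strip the padding again so each entry is a 1-D tensor, like the per-prompt .squeeze(0) results above.
prompts = [
    ids[mask.bool()]
    for ids, mask in zip(batch["input_ids"], batch["attention_mask"])
]
```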

prompt2 = tokenizer.encode(prompt2, return_tensors="pt").squeeze(0)
prompt3 = tokenizer.encode(prompt3, return_tensors="pt").squeeze(0)
prompt4 = tokenizer.encode(prompt4, return_tensors="pt").squeeze(0)
prompts = [prompt1, prompt2, prompt3, prompt4]
prompts = prompts * ((args.batch_size // 4) + 1)
prompts = prompts[: args.batch_size]
@@ -703,9 +700,7 @@ def print_result(result, result_idx: int):
if not args.no_early_termination:
result = generation.truncate_after_eos(result, tokenizer.eos_token_id)

output_str = tokenizer.convert_tokens_to_string(
tokenizer.convert_ids_to_tokens(result)
)
output_str = tokenizer.decode(result)

if args.output_path != "":
output_path = Path(args.output_path)
32 changes: 13 additions & 19 deletions scripts/validation.py
@@ -11,7 +11,7 @@
import torch._inductor.config
from fms.models import get_model, register_model
from fms.models.llama import LLaMAConfig, _llama_factory_factory
from fms.utils import generation, tokenizers
from fms.utils import generation
from fms.utils.generation import pad_input_ids
from torch import distributed as dist
from aiu_fms_testing_utils.utils import warmup_model
@@ -27,6 +27,7 @@
)
from aiu_fms_testing_utils.utils import aiu_setup
from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank, local_rank, world_size
from transformers import AutoTokenizer

# This example script validates models on AIU through comparisons to other devices.
parser = argparse.ArgumentParser(
@@ -469,7 +470,7 @@
dprint(validation_model)
dprint("=" * 60 + "\n")

tokenizer = tokenizers.get_tokenizer(args.tokenizer)
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
model.eval()
torch.set_grad_enabled(False)
loading_model_time = time.time() - loading_model_time
@@ -490,15 +491,6 @@
add_special_tokens = tokenizer.bos_token_id != tokenizer.eos_token_id


def ids_for_prompt(prompt):
tokens = tokenizer.tokenize(prompt)
ids = tokenizer.convert_tokens_to_ids(tokens)
if add_special_tokens:
ids = [tokenizer.bos_token_id] + ids
ids = torch.tensor(ids, dtype=torch.long, device="cpu")
return ids


def truncate_prompts_to_max_length(prompts, max_len, max_allowed_length):
# we may want the prompt length to be fixed to some max length
# this will ensure that prior to padding the input ids
@@ -547,7 +539,11 @@ def truncate_prompts_to_max_length(prompts, max_len, max_allowed_length):
for i, prompt_file_path in enumerate(prompt_file_paths):
if i == args.batch_size:
break
prompts.append(ids_for_prompt(prompt_file_path.read_text(encoding="utf-8")))
prompts.append(
tokenizer.encode(
prompt_file_path.read_text(encoding="utf-8"), return_tensors="pt"
)
)

else:
if args.prompt_type == "chat":
@@ -577,10 +573,10 @@ def truncate_prompts_to_max_length(prompts, max_len, max_allowed_length):
dprint("prompt_type must be one of chat or code")
exit()

prompt1 = ids_for_prompt(prompt1)
prompt2 = ids_for_prompt(prompt2)
prompt3 = ids_for_prompt(prompt3)
prompt4 = ids_for_prompt(prompt4)
prompt1 = tokenizer.encode(prompt1, return_tensors="pt").squeeze(0)
Contributor:
could this be a batch encode?
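
(The same batch-encode sketch suggested above for the matching block in scripts/inference.py would apply here as well, e.g. `tokenizer([prompt1, prompt2, prompt3, prompt4], return_tensors="pt", padding=True)`.)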

prompt2 = tokenizer.encode(prompt2, return_tensors="pt").squeeze(0)
prompt3 = tokenizer.encode(prompt3, return_tensors="pt").squeeze(0)
prompt4 = tokenizer.encode(prompt4, return_tensors="pt").squeeze(0)
prompts = [prompt1, prompt2, prompt3, prompt4]
prompts = prompts * ((args.batch_size // 4) + 1)
prompts = prompts[: args.batch_size]
@@ -622,9 +618,7 @@ def print_result(result, result_idx: int = 0, file_prefix: str = ""):
if not args.no_early_termination:
result = generation.truncate_after_eos(result, tokenizer.eos_token_id)

output_str = tokenizer.convert_tokens_to_string(
tokenizer.convert_ids_to_tokens(result)
)
output_str = tokenizer.decode(result)

if args.output_path != "":
output_path = Path(args.output_path)
12 changes: 5 additions & 7 deletions tests/models/test_decoders.py
@@ -1,5 +1,5 @@
from fms.models.hf.utils import AutoConfig
from fms.utils import serialization, tokenizers
from fms.utils import serialization
import pytest
from fms.models import get_model
from fms.utils.generation import pad_input_ids
@@ -20,9 +20,10 @@
from aiu_fms_testing_utils.utils import (
warmup_model,
sample_sharegpt_requests,
ids_for_prompt,
)
import json
from transformers import AutoTokenizer

from aiu_fms_testing_utils.utils.aiu_setup import dprint, aiu_dist_setup

import os
@@ -56,9 +57,6 @@
GRANITE_3p3_8B_INSTRUCT: os.path.join(
MICRO_MODELS_HOME, "granite-3.3-8b-layers-3-step-100000"
),
Contributor:
why are we removing this model?

LLAMA_3p1_70B_INSTRUCT: os.path.join(
MICRO_MODELS_HOME, "llama-3.1-70b-layers-3-step-24000"
),
}

SHARE_GPT_DATASET_PATH = os.environ.get(
@@ -295,7 +293,7 @@ def __prepare_inputs(batch_size, seq_length, tokenizer, seed=0):
)
prompt_list = []
for prompt, _ in prompts_and_sizes:
prompt_list.append(ids_for_prompt(prompt, tokenizer))
prompt_list.append(tokenizer.encode(prompt, return_tensors="pt").squeeze(0))

input_ids, extra_kwargs = pad_input_ids(prompt_list, min_pad_length=seq_length)
return input_ids, extra_kwargs
@@ -451,7 +449,7 @@ def test_common_shapes(
**distributed_kwargs,
}

tokenizer = tokenizers.get_tokenizer(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# prepare the AIU model
model = persistent_model.get_or_create(
8 changes: 4 additions & 4 deletions tests/models/test_encoders.py
@@ -2,16 +2,16 @@
ModelSignatureParams,
get_signature,
)
from fms.utils import tokenizers
import pytest
from fms.models import get_model
from fms.utils.generation import pad_input_ids
import itertools
import torch
from aiu_fms_testing_utils.utils import ids_for_prompt, sample_squad_v2_qa_requests
from aiu_fms_testing_utils.utils import sample_squad_v2_qa_requests
from aiu_fms_testing_utils.utils.aiu_setup import dprint
import os
import numpy as np
from transformers import AutoTokenizer

# Add models to test here
ROBERTA_SQUAD_V2 = "deepset/roberta-base-squad2"
@@ -61,7 +61,7 @@ def __prepare_inputs(batch_size, seq_length, tokenizer, seed=0):
)
prompt_list = []
for prompt, _ in prompts_and_sizes:
prompt_list.append(ids_for_prompt(prompt, tokenizer))
prompt_list.append(tokenizer.encode(prompt, return_tensors="pt").squeeze(0))

input_ids, padding_kwargs = pad_input_ids(
prompt_list, min_pad_length=seq_length, is_causal_mask=False
@@ -111,7 +111,7 @@ def test_common_shapes(model_path, batch_size, seq_length):
f"testing model={model_path}, batch_size={batch_size}, seq_length={seq_length}"
)

tokenizer = tokenizers.get_tokenizer(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

if os.path.exists(model_path):
model_path_kwargs = {"model_path": model_path}