[WIP] Fix nanotron compatibility #706

Open · wants to merge 15 commits into main
24 changes: 24 additions & 0 deletions examples/lighteval_config_override_nanotron_tests.yaml
@@ -0,0 +1,24 @@
# Auto batch size detection does not work yet, so we use a fixed default
batch_size: 8
generation: null
logging:
output_dir: "tests/nanotron_logs"
save_details: false
push_to_hub: false
public_run: false
results_org: null
tensorboard_metric_prefix: "eval"
parallelism:
dp: 1
pp: 1
pp_engine: 1f1b
tp: 1
tp_linear_async_communication: false
tp_mode: ALL_REDUCE
tasks:
dataset_loading_processes: 8
max_samples: 10
multichoice_continuations_start_space: null
num_fewshot_seeds: null
tasks: leaderboard|arc:challenge|25|0,leaderboard|truthfulqa:mc|0|0,leaderboard|hellaswag|10|0,leaderboard|mmlu:college_chemistry|5|0,leaderboard|mmlu:us_foreign_policy|5|0,lighteval|agieval:aqua-rat|0|0,lighteval|agieval:logiqa-en|0|0,lighteval|agieval:lsat-ar|0|0,lighteval|agieval:lsat-lr|0|0,lighteval|agieval:lsat-rc|0|0,lighteval|agieval:sat-en-without-passage|0|0,lighteval|agieval:sat-en|0|0,lighteval|bigbench:causal_judgment|3|0,lighteval|bigbench:date_understanding|3|0,lighteval|bigbench:disambiguation_qa|3|0,lighteval|bigbench:geometric_shapes|3|0,lighteval|bigbench:logical_deduction_five_objects|3|0,lighteval|bigbench:logical_deduction_seven_objects|3|0,lighteval|bigbench:movie_recommendation|3|0,lighteval|bigbench:navigate|3|0,lighteval|bigbench:ruin_names|3|0,lighteval|bigbench:salient_translation_error_detection|3|0,lighteval|bigbench:snarks|3|0,lighteval|bigbench:temporal_sequences|3|0,lighteval|bigbench:tracking_shuffled_objects_five_objects|3|0,lighteval|bigbench:tracking_shuffled_objects_seven_objects|3|0,test|gsm8k|0|1
custom_tasks: examples/custom_tasks_tests.py
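For context, a minimal sketch (not part of the PR) of how an override file like this is loaded into a `LightEvalConfig`, mirroring the `get_config_from_file` call that `main_nanotron.py` uses further down in this diff; it assumes a working nanotron install.

```python
# Minimal sketch: load the override YAML into a LightEvalConfig the same way
# main_nanotron.py does below. get_config_from_file comes from nanotron.
from nanotron.config import get_config_from_file

from lighteval.config.lighteval_config import LightEvalConfig

lighteval_config = get_config_from_file(
    "examples/lighteval_config_override_nanotron_tests.yaml",
    config_class=LightEvalConfig,
)
print(lighteval_config.batch_size, lighteval_config.tasks.max_samples)
```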
13 changes: 10 additions & 3 deletions pyproject.toml
@@ -28,6 +28,9 @@ build-backend = "setuptools.build_meta"
[tool.setuptools.packages.find]
where = ["src"]

[tool.uv]
no-build-isolation-package = ['flash-attn']

[project]
name = "lighteval"
version = "0.9.1.dev0"
@@ -88,14 +91,18 @@ optimum = ["optimum==1.12.0"]
quantization = ["bitsandbytes>=0.41.0", "auto-gptq>=0.4.2"]
adapters = ["peft==0.3.0"]
nanotron = [
"nanotron",
"tensorboardX"
"nanotron@git+https://github.com/huggingface/[email protected]",
"tensorboardX",
"ninja",
"triton",
"flash-attn>=2.5.0,<2.7.0",
"datatrove[io]"
]
tensorboardX = ["tensorboardX"]
vllm = ["vllm>=0.7.0", "ray", "more_itertools"]
quality = ["ruff==v0.2.2","pre-commit"]
tests = ["pytest==7.4.0","deepdiff"]
dev = ["lighteval[accelerate,quality,tests,multilingual,math,extended_tasks,vllm]"]
dev = ["lighteval[accelerate,quality,tests,multilingual,math,extended_tasks,vllm,nanotron]"]
docs = ["hf-doc-builder", "watchdog"]
extended_tasks = [
"langdetect", # ifeval
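A quick, hedged smoke test for the expanded `nanotron` extra above (not part of the PR); it assumes the conventional import names for these distributions and an environment where the flash-attn wheel built successfully.

```python
# Hedged smoke test that the new nanotron extra resolved; the import names
# below are the usual ones for these distributions (an assumption).
import importlib

for module in ("nanotron", "tensorboardX", "triton", "flash_attn", "datatrove"):
    importlib.import_module(module)
    print(f"ok: {module}")
```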
14 changes: 14 additions & 0 deletions src/lighteval/config/lighteval_config.py
@@ -101,3 +101,17 @@ class LightEvalConfig:
class FullNanotronConfig:
lighteval_config: LightEvalConfig
nanotron_config: "Config"

@property
def generation_parameters(self):
# Return the generation parameters from the lighteval config
# or create default generation parameters if none are set
if self.lighteval_config.generation:
return self.lighteval_config.generation
return GenerationArgs()

def __getattr__(self, name):
# Delegate attribute access to nanotron_config if not found in FullNanotronConfig
if hasattr(self.nanotron_config, name):
return getattr(self.nanotron_config, name)
raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
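A hedged usage sketch of the two additions above; the `SimpleNamespace` stand-ins are hypothetical, the import assumes nanotron is installed, and the pattern only works because the dataclass does not enforce its type annotations at runtime.

```python
# Hedged sketch: exercising the new generation_parameters fallback and the
# __getattr__ delegation with hypothetical stand-in configs.
from types import SimpleNamespace

from lighteval.config.lighteval_config import FullNanotronConfig

full = FullNanotronConfig(
    lighteval_config=SimpleNamespace(generation=None),
    nanotron_config=SimpleNamespace(model=SimpleNamespace(hidden_size=2048)),
)

print(full.generation_parameters)  # no generation set -> default GenerationArgs()
print(full.model.hidden_size)      # not on FullNanotronConfig -> delegated to nanotron_config
```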
45 changes: 21 additions & 24 deletions src/lighteval/main_nanotron.py
@@ -27,8 +27,6 @@
from typing_extensions import Annotated


CACHE_DIR: str = os.getenv("HF_HOME", "/scratch")

HELP_PANEL_NAME_1 = "Common Parameters"
HELP_PANEL_NAME_2 = "Logging Parameters"
HELP_PANEL_NAME_3 = "Debug Parameters"
@@ -43,41 +41,37 @@ def nanotron(
str, Option(help="Path to the nanotron checkpoint YAML or python config file, potentially on s3.")
],
lighteval_config_path: Annotated[str, Option(help="Path to a YAML config to be used for the evaluation.")],
cache_dir: Annotated[str, Option(help="Cache directory for datasets and models.")] = CACHE_DIR,
):
"""
Evaluate models using nanotron as backend.
"""
from nanotron.config import Config, get_config_from_file

from lighteval.config.lighteval_config import FullNanotronConfig, LightEvalConfig
from lighteval.config.lighteval_config import (
FullNanotronConfig,
LightEvalConfig,
)
from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.logging.hierarchical_logger import htrack_block
from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters
from lighteval.utils.imports import NO_NANOTRON_ERROR_MSG, is_nanotron_available
from lighteval.utils.utils import EnvConfig

env_config = EnvConfig(token=os.getenv("HF_TOKEN"), cache_dir=cache_dir)

if not is_nanotron_available():
raise ImportError(NO_NANOTRON_ERROR_MSG)

with htrack_block("Load nanotron config"):
# Create nanotron config
if not checkpoint_config_path.endswith(".yaml"):
raise ValueError("The checkpoint path should point to a YAML file")
if not checkpoint_config_path.endswith(".yaml"):
raise ValueError("The checkpoint path should point to a YAML file")

model_config = get_config_from_file(
checkpoint_config_path,
config_class=Config,
model_config_class=None,
skip_unused_config_keys=True,
skip_null_keys=True,
)
model_config = get_config_from_file(
checkpoint_config_path,
config_class=Config,
model_config_class=None,
skip_unused_config_keys=True,
skip_null_keys=True,
)

# We are getting a type error because get_config_from_file is not correctly typed,
lighteval_config: LightEvalConfig = get_config_from_file(lighteval_config_path, config_class=LightEvalConfig) # type: ignore
nanotron_config = FullNanotronConfig(lighteval_config, model_config)
# We are getting a type error because get_config_from_file is not correctly typed,
lighteval_config: LightEvalConfig = get_config_from_file(lighteval_config_path, config_class=LightEvalConfig) # type: ignore
nanotron_config = FullNanotronConfig(lighteval_config, model_config)

evaluation_tracker = EvaluationTracker(
output_dir=lighteval_config.logging.output_dir,
@@ -92,12 +86,11 @@ def nanotron(

pipeline_parameters = PipelineParameters(
launcher_type=ParallelismManager.NANOTRON,
env_config=env_config,
job_id=os.environ.get("SLURM_JOB_ID", 0),
nanotron_checkpoint_path=checkpoint_config_path,
dataset_loading_processes=lighteval_config.tasks.dataset_loading_processes,
custom_tasks_directory=lighteval_config.tasks.custom_tasks,
override_batch_size=lighteval_config.batch_size,
# override_batch_size=lighteval_config.batch_size,
num_fewshot_seeds=1,
max_samples=lighteval_config.tasks.max_samples,
use_chat_template=False,
@@ -115,4 +108,8 @@ def nanotron(

pipeline.show_results()

results = pipeline.get_results()

pipeline.save_and_push_results()

return results
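A hedged sketch (not part of the PR) of driving the updated entry point from Python instead of the CLI; both paths are placeholders, and calling the Typer command as a plain function is an assumption that holds as long as it stays a regular function.

```python
# Hedged sketch: run the nanotron backend programmatically and collect the
# results dict that the command now returns. Paths are placeholders.
from lighteval.main_nanotron import nanotron

results = nanotron(
    checkpoint_config_path="checkpoints/10/config.yaml",
    lighteval_config_path="examples/lighteval_config_override_nanotron_tests.yaml",
)
print(results)
```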
55 changes: 23 additions & 32 deletions src/lighteval/models/nanotron/nanotron_model.py
@@ -56,7 +56,7 @@
)
from lighteval.utils.imports import is_nanotron_available
from lighteval.utils.parallelism import find_executable_batch_size
from lighteval.utils.utils import EnvConfig, as_list
from lighteval.utils.utils import as_list


logger = logging.getLogger(__name__)
@@ -101,7 +101,6 @@ def __init__(
trust_remote_code: bool = False,
debug_one_layer_model: bool = False,
model_class: Optional[Type] = None,
env_config: EnvConfig = None,
):
"""Initializes a nanotron model for evaluation.
Args:
@@ -115,6 +114,10 @@ def __init__(
self._max_length = max_length
self.parallel_config = parallel_config
self.parallel_context = parallel_context
if hasattr(lighteval_config, "batch_size"):
self.batch_size = lighteval_config.batch_size
else:
self.batch_size = None

if parallel_config.pp > 1:
# To implement PP parallelism we need to think about how we want to sync the output for the PP ranks without outputs
@@ -138,7 +141,6 @@ def __init__(
self._add_special_tokens = add_special_tokens
self._tokenizer = self._create_auto_tokenizer(
pretrained=tokenizer.tokenizer_name_or_path,
env_config=env_config,
trust_remote_code=trust_remote_code,
)
self._tokenizer.model_max_length = self.max_length
@@ -230,23 +232,18 @@ def _create_auto_tokenizer(
*,
pretrained: str,
tokenizer: Optional[str] = None,
env_config: EnvConfig = None,
trust_remote_code: bool = False,
) -> transformers.PreTrainedTokenizer:
"""Returns a pre-trained tokenizer from a pre-trained tokenizer configuration."""

try:
tokenizer = AutoTokenizer.from_pretrained(
pretrained if tokenizer is None else tokenizer,
cache_dir=env_config.cache_dir,
token=env_config.token,
trust_remote_code=trust_remote_code,
)
except RecursionError:
tokenizer = AutoTokenizer.from_pretrained(
pretrained if tokenizer is None else tokenizer,
cache_dir=env_config.cache_dir,
token=env_config.token,
unk_token="<unk>",
trust_remote_code=trust_remote_code,
)
@@ -305,9 +302,9 @@ def max_length(self) -> int:
def device(self) -> Union[int, str, torch.device]:
return "cuda"

def _get_batch_size(self, max_input_length: int, override_bs: int = 0, starting_batch_size: int = 512) -> int:
if override_bs:
return override_bs
def _get_batch_size(self, max_input_length: int, starting_batch_size: int = 512) -> int:
if self.batch_size is not None:
return self.batch_size
logger.warning("Detecting largest batch size")

@find_executable_batch_size(
@@ -343,7 +340,9 @@ def tok_decode(self, tokens: torch.LongTensor) -> List[str]:
return self.tokenizer.batch_decode(tokens, skip_special_tokens=True)

def _model_call(self, inputs: torch.Tensor) -> torch.Tensor:
return self.model(inputs)
# This is only called when detecting the batch size, so a mock input_mask is enough
input_mask = torch.ones_like(inputs)
return self.model(inputs, input_mask)

def homogeneize_ending_conditions(self, ending_condition: tuple | dict | list | str) -> tuple[list, int]:
"""Ending conditions are submitted in several possible formats.
@@ -400,7 +399,8 @@ def _check_continuations_start_space(self, continuation: str) -> str:
return continuation

def loglikelihood_single_token(
self, requests: List[Tuple[str, dict]], override_bs=0
self,
requests: List[Tuple[str, dict]],
) -> List[LoglikelihoodSingleTokenResponse]:
"""Tokenize the context and continuation and compute the log likelihood of those
tokenized sequences.
@@ -433,11 +433,10 @@ def loglikelihood_single_token(

return self._loglikelihood_single_token(
requests,
override_bs=override_bs,
disable_tqdm=bool(dist.get_rank(self.parallel_context.world_pg) != 0),
)

def loglikelihood(self, requests: List[LoglikelihoodRequest], override_bs=None) -> List[LoglikelihoodResponse]:
def loglikelihood(self, requests: List[LoglikelihoodRequest]) -> List[LoglikelihoodResponse]:
"""Tokenize the context and continuation and compute the log likelihood of those
tokenized sequences.
"""
@@ -455,12 +454,12 @@ def loglikelihood(self, requests: List[LoglikelihoodRequest], override_bs=None)

return self._loglikelihood_tokens(
requests,
override_bs=override_bs,
disable_tqdm=bool(dist.get_rank(self.parallel_context.world_pg) != 0),
)

def loglikelihood_rolling(
self, requests: List[LoglikelihoodRollingRequest], override_bs: int = 0
self,
requests: List[LoglikelihoodRollingRequest],
) -> List[LoglikelihoodResponse]:
"""This function is used to compute the log likelihood of the context for perplexity metrics."""
for request in tqdm(
@@ -471,7 +470,6 @@ def loglikelihood_rolling(

results = self._loglikelihood_tokens(
requests,
override_bs=override_bs,
disable_tqdm=bool(dist.get_rank(self.parallel_context.world_pg) != 0),
return_bool_score=False,
)
@@ -637,7 +635,7 @@ def _get_subsets(self, dataset, num_dataset_splits):

@torch.inference_mode()
def _loglikelihood_single_token(
self, requests, disable_tqdm: bool = False, override_bs: int = 0, num_dataset_splits: int = 1
self, requests, disable_tqdm: bool = False, num_dataset_splits: int = 1
) -> List[LoglikelihoodSingleTokenResponse]:
dataset = LoglikelihoodSingleTokenDataset(requests=requests)
res = []
@@ -664,9 +662,7 @@ def _loglikelihood_single_token(
# pull longest context sample from request
context_enc = dataset[0].tokenized_context
max_context = len(context_enc[-self.max_length :])
batch_size = self._get_batch_size(
override_bs=override_bs, max_input_length=max_context, starting_batch_size=starting_batch_size
)
batch_size = self._get_batch_size(max_input_length=max_context, starting_batch_size=starting_batch_size)

starting_batch_size = batch_size * 2 # for the next round

@@ -711,14 +707,13 @@ def _loglikelihood_single_token(
inputs, padding_length=max_context, max_context=max_context, full_attention_masks=True
)
# batched_inputs, batch_attention, input_lengths, truncated, padded

out = self.model(input_ids=batch_model.input_ids, input_mask=batch_model.input_mask)

if dist.get_rank(self.parallel_context.pp_pg) == self.output_pp_rank:
# This process got outputs

# Gather all the output across TP
out = out.transpose(0, 1).contiguous() # [batch, seq_length, vocab]
# Gather all the output across TP
out = out.view(*batch_model.input_ids.shape, -1).contiguous() # [batch, seq_length, vocab]

gathered_out = [torch.zeros_like(out) for _ in range(self.parallel_context.tp_pg.size())]
dist.all_gather(gathered_out, out, group=self.parallel_context.tp_pg, async_op=False)
@@ -866,7 +861,6 @@ def _loglikelihood_tokens(
self,
requests,
disable_tqdm: bool = False,
override_bs: int = -1,
num_dataset_splits: int = 1,
return_bool_score: bool = True,
) -> List[LoglikelihoodResponse]:
@@ -897,9 +891,7 @@

max_context = len((context_enc + continuation_enc)[-(self.max_length + 1) :][:-1])

batch_size = self._get_batch_size(
override_bs=override_bs, max_input_length=max_context, starting_batch_size=starting_batch_size
)
batch_size = self._get_batch_size(max_input_length=max_context, starting_batch_size=starting_batch_size)
starting_batch_size = batch_size * 2 # for the next round

# For the DP replicas
@@ -954,7 +946,7 @@
dist.all_gather(gathered_out, out, group=self.parallel_context.tp_pg, async_op=False)
out = torch.cat(gathered_out, dim=-1)

out = out.transpose(0, 1) # [batch, seq_length, vocab]
out = out.view(*batch_model.input_ids.shape, -1) # [batch, seq_length, vocab]
multi_logits = F.log_softmax(out, dim=-1) # [batch, padding_length, vocab]

logits_sum = []
@@ -1100,7 +1092,6 @@ def greedy_until(
self,
requests: List[GreedyUntilRequest],
disable_tqdm: bool = False,
override_bs: int = -1,
num_dataset_splits: int = 1,
) -> List[GenerativeResponse]:
"""Greedy generation until a stop token is generated."""
@@ -1140,7 +1131,6 @@
max_input_length = min(len(context_enc) + max_gen, self.max_length)

batch_size = self._get_batch_size(
override_bs=override_bs,
max_input_length=max_input_length,
starting_batch_size=starting_batch_size,
)
@@ -1246,6 +1236,7 @@ def greedy_until(
max_micro_batch_size=batch_size, # ok for PP=1 for PP>1 we'll need to split the batch
returns_logits=returns_logits,
generation_config=self.generation_config,
# tokenizer=self.tokenizer  # NOTE[duynht]: needed for the current nanotron@main, but that is not compatible with HuggingFaceTB/SmolLM2-nanotron-ckpt
)
dist.barrier() # Got everyone to send their stuff
outputs = list(outputs)
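A small, self-contained illustration (not part of the PR) of the reshape that replaces the old transpose in `_loglikelihood_single_token` and `_loglikelihood_tokens`: `view(*input_ids.shape, -1)` recovers `[batch, seq_len, vocab]`, assuming the gathered output is laid out as one row of logits per input token.

```python
# Illustration of the new reshape with hypothetical shapes: view() restores
# [batch, seq_len, vocab] from a flat gathered logits tensor.
import torch

batch, seq_len, vocab = 2, 5, 11
input_ids = torch.zeros(batch, seq_len, dtype=torch.long)

out = torch.randn(batch * seq_len, vocab)   # stand-in for the TP-gathered logits
out = out.view(*input_ids.shape, -1)        # -> [batch, seq_len, vocab]
assert out.shape == (batch, seq_len, vocab)
```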