[WIP] Fix nanotron compatibility #706

Open · wants to merge 15 commits into main
24 changes: 24 additions & 0 deletions examples/lighteval_config_override_nanotron_tests.yaml
@@ -0,0 +1,24 @@
# Auto batch size detection does not work yet, so we use a fixed default
batch_size: 8
generation: null
logging:
output_dir: "tests/nanotron_logs"
save_details: false
push_to_hub: false
public_run: false
results_org: null
tensorboard_metric_prefix: "eval"
parallelism:
dp: 1
pp: 1
pp_engine: 1f1b
tp: 1
tp_linear_async_communication: false
tp_mode: ALL_REDUCE
tasks:
dataset_loading_processes: 8
max_samples: 10
multichoice_continuations_start_space: null
num_fewshot_seeds: null
tasks: leaderboard|arc:challenge|25|0,leaderboard|truthfulqa:mc|0|0,leaderboard|hellaswag|10|0,leaderboard|mmlu:college_chemistry|5|0,leaderboard|mmlu:us_foreign_policy|5|0,lighteval|agieval:aqua-rat|0|0,lighteval|agieval:logiqa-en|0|0,lighteval|agieval:lsat-ar|0|0,lighteval|agieval:lsat-lr|0|0,lighteval|agieval:lsat-rc|0|0,lighteval|agieval:sat-en-without-passage|0|0,lighteval|agieval:sat-en|0|0,lighteval|bigbench:causal_judgment|3|0,lighteval|bigbench:date_understanding|3|0,lighteval|bigbench:disambiguation_qa|3|0,lighteval|bigbench:geometric_shapes|3|0,lighteval|bigbench:logical_deduction_five_objects|3|0,lighteval|bigbench:logical_deduction_seven_objects|3|0,lighteval|bigbench:movie_recommendation|3|0,lighteval|bigbench:navigate|3|0,lighteval|bigbench:ruin_names|3|0,lighteval|bigbench:salient_translation_error_detection|3|0,lighteval|bigbench:snarks|3|0,lighteval|bigbench:temporal_sequences|3|0,lighteval|bigbench:tracking_shuffled_objects_five_objects|3|0,lighteval|bigbench:tracking_shuffled_objects_seven_objects|3|0,test|gsm8k|0|1
custom_tasks: examples/custom_tasks_tests.py
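For context, a minimal sketch (not part of the PR) of how an override file like this is loaded into a `LightEvalConfig`, mirroring the `get_config_from_file` call that `main_nanotron.py` uses further down in this diff; it assumes a working nanotron install.

```python
# Minimal sketch: load the override YAML into a LightEvalConfig the same way
# main_nanotron.py does below. get_config_from_file comes from nanotron.
from nanotron.config import get_config_from_file

from lighteval.config.lighteval_config import LightEvalConfig

lighteval_config = get_config_from_file(
    "examples/lighteval_config_override_nanotron_tests.yaml",
    config_class=LightEvalConfig,
)
print(lighteval_config.batch_size, lighteval_config.tasks.max_samples)
```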
13 changes: 10 additions & 3 deletions pyproject.toml
@@ -28,6 +28,9 @@ build-backend = "setuptools.build_meta"
[tool.setuptools.packages.find]
where = ["src"]

[tool.uv]
no-build-isolation-package = ['flash-attn']

[project]
name = "lighteval"
version = "0.9.1.dev0"
@@ -88,14 +91,18 @@ optimum = ["optimum==1.12.0"]
quantization = ["bitsandbytes>=0.41.0", "auto-gptq>=0.4.2"]
adapters = ["peft==0.3.0"]
nanotron = [
"nanotron",
"tensorboardX"
"nanotron@git+https://github.com/huggingface/[email protected]",
"tensorboardX",
"ninja",
"triton",
"flash-attn>=2.5.0,<2.7.0",
"datatrove[io]"
]
tensorboardX = ["tensorboardX"]
vllm = ["vllm>=0.7.0", "ray", "more_itertools"]
quality = ["ruff==v0.2.2","pre-commit"]
tests = ["pytest==7.4.0","deepdiff"]
dev = ["lighteval[accelerate,quality,tests,multilingual,math,extended_tasks,vllm]"]
dev = ["lighteval[accelerate,quality,tests,multilingual,math,extended_tasks,vllm,nanotron]"]
docs = ["hf-doc-builder", "watchdog"]
extended_tasks = [
"langdetect", # ifeval
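A quick, hedged smoke test for the expanded `nanotron` extra above (not part of the PR); it assumes the conventional import names for these distributions and an environment where the flash-attn wheel built successfully.

```python
# Hedged smoke test that the new nanotron extra resolved; the import names
# below are the usual ones for these distributions (an assumption).
import importlib

for module in ("nanotron", "tensorboardX", "triton", "flash_attn", "datatrove"):
    importlib.import_module(module)
    print(f"ok: {module}")
```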
14 changes: 14 additions & 0 deletions src/lighteval/config/lighteval_config.py
@@ -101,3 +101,17 @@ class LightEvalConfig:
class FullNanotronConfig:
lighteval_config: LightEvalConfig
nanotron_config: "Config"

@property
def generation_parameters(self):
# Return the generation parameters from the lighteval config
# or create default generation parameters if none are set
if self.lighteval_config.generation:
return self.lighteval_config.generation
return GenerationArgs()

def __getattr__(self, name):
# Delegate attribute access to nanotron_config if not found in FullNanotronConfig
if hasattr(self.nanotron_config, name):
return getattr(self.nanotron_config, name)
raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
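A hedged usage sketch of the two additions above; the `SimpleNamespace` stand-ins are hypothetical, the import assumes nanotron is installed, and the pattern only works because the dataclass does not enforce its type annotations at runtime.

```python
# Hedged sketch: exercising the new generation_parameters fallback and the
# __getattr__ delegation with hypothetical stand-in configs.
from types import SimpleNamespace

from lighteval.config.lighteval_config import FullNanotronConfig

full = FullNanotronConfig(
    lighteval_config=SimpleNamespace(generation=None),
    nanotron_config=SimpleNamespace(model=SimpleNamespace(hidden_size=2048)),
)

print(full.generation_parameters)  # no generation set -> default GenerationArgs()
print(full.model.hidden_size)      # not on FullNanotronConfig -> delegated to nanotron_config
```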
45 changes: 21 additions & 24 deletions src/lighteval/main_nanotron.py
@@ -27,8 +27,6 @@
from typing_extensions import Annotated


CACHE_DIR: str = os.getenv("HF_HOME", "/scratch")

HELP_PANEL_NAME_1 = "Common Parameters"
HELP_PANEL_NAME_2 = "Logging Parameters"
HELP_PANEL_NAME_3 = "Debug Parameters"
@@ -43,41 +41,37 @@ def nanotron(
str, Option(help="Path to the nanotron checkpoint YAML or python config file, potentially on s3.")
],
lighteval_config_path: Annotated[str, Option(help="Path to a YAML config to be used for the evaluation.")],
cache_dir: Annotated[str, Option(help="Cache directory for datasets and models.")] = CACHE_DIR,
):
"""
Evaluate models using nanotron as backend.
"""
from nanotron.config import Config, get_config_from_file

from lighteval.config.lighteval_config import FullNanotronConfig, LightEvalConfig
from lighteval.config.lighteval_config import (
FullNanotronConfig,
LightEvalConfig,
)
from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.logging.hierarchical_logger import htrack_block
from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters
from lighteval.utils.imports import NO_NANOTRON_ERROR_MSG, is_nanotron_available
from lighteval.utils.utils import EnvConfig

env_config = EnvConfig(token=os.getenv("HF_TOKEN"), cache_dir=cache_dir)

if not is_nanotron_available():
raise ImportError(NO_NANOTRON_ERROR_MSG)

with htrack_block("Load nanotron config"):
# Create nanotron config
if not checkpoint_config_path.endswith(".yaml"):
raise ValueError("The checkpoint path should point to a YAML file")
if not checkpoint_config_path.endswith(".yaml"):
raise ValueError("The checkpoint path should point to a YAML file")

model_config = get_config_from_file(
checkpoint_config_path,
config_class=Config,
model_config_class=None,
skip_unused_config_keys=True,
skip_null_keys=True,
)
model_config = get_config_from_file(
checkpoint_config_path,
config_class=Config,
model_config_class=None,
skip_unused_config_keys=True,
skip_null_keys=True,
)

# We are getting a type error because get_config_from_file is not correctly typed,
lighteval_config: LightEvalConfig = get_config_from_file(lighteval_config_path, config_class=LightEvalConfig) # type: ignore
nanotron_config = FullNanotronConfig(lighteval_config, model_config)
# We are getting a type error because get_config_from_file is not correctly typed,
lighteval_config: LightEvalConfig = get_config_from_file(lighteval_config_path, config_class=LightEvalConfig) # type: ignore
nanotron_config = FullNanotronConfig(lighteval_config, model_config)

evaluation_tracker = EvaluationTracker(
output_dir=lighteval_config.logging.output_dir,
@@ -92,12 +86,11 @@ def nanotron(

pipeline_parameters = PipelineParameters(
launcher_type=ParallelismManager.NANOTRON,
env_config=env_config,
job_id=os.environ.get("SLURM_JOB_ID", 0),
nanotron_checkpoint_path=checkpoint_config_path,
dataset_loading_processes=lighteval_config.tasks.dataset_loading_processes,
custom_tasks_directory=lighteval_config.tasks.custom_tasks,
override_batch_size=lighteval_config.batch_size,
# override_batch_size=lighteval_config.batch_size,
num_fewshot_seeds=1,
max_samples=lighteval_config.tasks.max_samples,
use_chat_template=False,
@@ -115,4 +108,8 @@ def nanotron(

pipeline.show_results()

results = pipeline.get_results()

pipeline.save_and_push_results()

return results
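A hedged sketch (not part of the PR) of driving the updated entry point from Python instead of the CLI; both paths are placeholders, and calling the Typer command as a plain function is an assumption that holds as long as it stays a regular function.

```python
# Hedged sketch: run the nanotron backend programmatically and collect the
# results dict that the command now returns. Paths are placeholders.
from lighteval.main_nanotron import nanotron

results = nanotron(
    checkpoint_config_path="checkpoints/10/config.yaml",
    lighteval_config_path="examples/lighteval_config_override_nanotron_tests.yaml",
)
print(results)
```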
55 changes: 23 additions & 32 deletions src/lighteval/models/nanotron/nanotron_model.py
@@ -56,7 +56,7 @@
)
from lighteval.utils.imports import is_nanotron_available
from lighteval.utils.parallelism import find_executable_batch_size
from lighteval.utils.utils import EnvConfig, as_list
from lighteval.utils.utils import as_list


logger = logging.getLogger(__name__)
@@ -101,7 +101,6 @@ def __init__(
trust_remote_code: bool = False,
debug_one_layer_model: bool = False,
model_class: Optional[Type] = None,
env_config: EnvConfig = None,
):
"""Initializes a nanotron model for evaluation.
Args:
@@ -115,6 +114,10 @@ def __init__(
self._max_length = max_length
self.parallel_config = parallel_config
self.parallel_context = parallel_context
if hasattr(lighteval_config, "batch_size"):
self.batch_size = lighteval_config.batch_size
else:
self.batch_size = None

if parallel_config.pp > 1:
# To implement PP parallelism we need to think about how we want to sync the output for the PP ranks without outputs
@@ -138,7 +141,6 @@ def __init__(
self._add_special_tokens = add_special_tokens
self._tokenizer = self._create_auto_tokenizer(
pretrained=tokenizer.tokenizer_name_or_path,
env_config=env_config,
trust_remote_code=trust_remote_code,
)
self._tokenizer.model_max_length = self.max_length
@@ -230,23 +232,18 @@ def _create_auto_tokenizer(
*,
pretrained: str,
tokenizer: Optional[str] = None,
env_config: EnvConfig = None,
trust_remote_code: bool = False,
) -> transformers.PreTrainedTokenizer:
"""Returns a pre-trained tokenizer from a pre-trained tokenizer configuration."""

try:
tokenizer = AutoTokenizer.from_pretrained(
pretrained if tokenizer is None else tokenizer,
cache_dir=env_config.cache_dir,
token=env_config.token,
trust_remote_code=trust_remote_code,
)
except RecursionError:
tokenizer = AutoTokenizer.from_pretrained(
pretrained if tokenizer is None else tokenizer,
cache_dir=env_config.cache_dir,
token=env_config.token,
unk_token="<unk>",
trust_remote_code=trust_remote_code,
)
@@ -305,9 +302,9 @@ def max_length(self) -> int:
def device(self) -> Union[int, str, torch.device]:
return "cuda"

def _get_batch_size(self, max_input_length: int, override_bs: int = 0, starting_batch_size: int = 512) -> int:
if override_bs:
return override_bs
def _get_batch_size(self, max_input_length: int, starting_batch_size: int = 512) -> int:
if self.batch_size is not None:
return self.batch_size
logger.warning("Detecting largest batch size")

@find_executable_batch_size(
@@ -343,7 +340,9 @@ def tok_decode(self, tokens: torch.LongTensor) -> List[str]:
return self.tokenizer.batch_decode(tokens, skip_special_tokens=True)

def _model_call(self, inputs: torch.Tensor) -> torch.Tensor:
return self.model(inputs)
# This is only called when detecting the batch size, so a mock input_mask is enough
input_mask = torch.ones_like(inputs)
return self.model(inputs, input_mask)

def homogeneize_ending_conditions(self, ending_condition: tuple | dict | list | str) -> tuple[list, int]:
"""Ending conditions are submitted in several possible formats.
@@ -400,7 +399,8 @@ def _check_continuations_start_space(self, continuation: str) -> str:
return continuation

def loglikelihood_single_token(
self, requests: List[Tuple[str, dict]], override_bs=0
self,
requests: List[Tuple[str, dict]],
) -> List[LoglikelihoodSingleTokenResponse]:
"""Tokenize the context and continuation and compute the log likelihood of those
tokenized sequences.
@@ -433,11 +433,10 @@ def loglikelihood_single_token(

return self._loglikelihood_single_token(
requests,
override_bs=override_bs,
disable_tqdm=bool(dist.get_rank(self.parallel_context.world_pg) != 0),
)

def loglikelihood(self, requests: List[LoglikelihoodRequest], override_bs=None) -> List[LoglikelihoodResponse]:
def loglikelihood(self, requests: List[LoglikelihoodRequest]) -> List[LoglikelihoodResponse]:
"""Tokenize the context and continuation and compute the log likelihood of those
tokenized sequences.
"""
@@ -455,12 +454,12 @@ def loglikelihood(self, requests: List[LoglikelihoodRequest], override_bs=None)

return self._loglikelihood_tokens(
requests,
override_bs=override_bs,
disable_tqdm=bool(dist.get_rank(self.parallel_context.world_pg) != 0),
)

def loglikelihood_rolling(
self, requests: List[LoglikelihoodRollingRequest], override_bs: int = 0
self,
requests: List[LoglikelihoodRollingRequest],
) -> List[LoglikelihoodResponse]:
"""This function is used to compute the log likelihood of the context for perplexity metrics."""
for request in tqdm(
@@ -471,7 +470,6 @@ def loglikelihood_rolling(

results = self._loglikelihood_tokens(
requests,
override_bs=override_bs,
disable_tqdm=bool(dist.get_rank(self.parallel_context.world_pg) != 0),
return_bool_score=False,
)
@@ -637,7 +635,7 @@ def _get_subsets(self, dataset, num_dataset_splits):

@torch.inference_mode()
def _loglikelihood_single_token(
self, requests, disable_tqdm: bool = False, override_bs: int = 0, num_dataset_splits: int = 1
self, requests, disable_tqdm: bool = False, num_dataset_splits: int = 1
) -> List[LoglikelihoodSingleTokenResponse]:
dataset = LoglikelihoodSingleTokenDataset(requests=requests)
res = []
@@ -664,9 +662,7 @@ def _loglikelihood_single_token(
# pull longest context sample from request
context_enc = dataset[0].tokenized_context
max_context = len(context_enc[-self.max_length :])
batch_size = self._get_batch_size(
override_bs=override_bs, max_input_length=max_context, starting_batch_size=starting_batch_size
)
batch_size = self._get_batch_size(max_input_length=max_context, starting_batch_size=starting_batch_size)

starting_batch_size = batch_size * 2 # for the next round

@@ -711,14 +707,13 @@ def _loglikelihood_single_token(
inputs, padding_length=max_context, max_context=max_context, full_attention_masks=True
)
# batched_inputs, batch_attention, input_lengths, truncated, padded

out = self.model(input_ids=batch_model.input_ids, input_mask=batch_model.input_mask)

if dist.get_rank(self.parallel_context.pp_pg) == self.output_pp_rank:
# This process got outputs

# Gather all the output across TP
out = out.transpose(0, 1).contiguous() # [batch, seq_length, vocab]
# Gather all the output across TP
out = out.view(*batch_model.input_ids.shape, -1).contiguous() # [batch, seq_length, vocab]

gathered_out = [torch.zeros_like(out) for _ in range(self.parallel_context.tp_pg.size())]
dist.all_gather(gathered_out, out, group=self.parallel_context.tp_pg, async_op=False)
@@ -866,7 +861,6 @@ def _loglikelihood_tokens(
self,
requests,
disable_tqdm: bool = False,
override_bs: int = -1,
num_dataset_splits: int = 1,
return_bool_score: bool = True,
) -> List[LoglikelihoodResponse]:
@@ -897,9 +891,7 @@

max_context = len((context_enc + continuation_enc)[-(self.max_length + 1) :][:-1])

batch_size = self._get_batch_size(
override_bs=override_bs, max_input_length=max_context, starting_batch_size=starting_batch_size
)
batch_size = self._get_batch_size(max_input_length=max_context, starting_batch_size=starting_batch_size)
starting_batch_size = batch_size * 2 # for the next round

# For the DP replicas
@@ -954,7 +946,7 @@
dist.all_gather(gathered_out, out, group=self.parallel_context.tp_pg, async_op=False)
out = torch.cat(gathered_out, dim=-1)

out = out.transpose(0, 1) # [batch, seq_length, vocab]
out = out.view(*batch_model.input_ids.shape, -1) # [batch, seq_length, vocab]
multi_logits = F.log_softmax(out, dim=-1) # [batch, padding_length, vocab]

logits_sum = []
@@ -1100,7 +1092,6 @@ def greedy_until(
self,
requests: List[GreedyUntilRequest],
disable_tqdm: bool = False,
override_bs: int = -1,
num_dataset_splits: int = 1,
) -> List[GenerativeResponse]:
"""Greedy generation until a stop token is generated."""
@@ -1140,7 +1131,6 @@
max_input_length = min(len(context_enc) + max_gen, self.max_length)

batch_size = self._get_batch_size(
override_bs=override_bs,
max_input_length=max_input_length,
starting_batch_size=starting_batch_size,
)
@@ -1246,6 +1236,7 @@ def greedy_until(
max_micro_batch_size=batch_size, # ok for PP=1 for PP>1 we'll need to split the batch
returns_logits=returns_logits,
generation_config=self.generation_config,
# tokenizer=self.tokenizer  # NOTE[duynht]: needed for the current nanotron@main, but that is not compatible with HuggingFaceTB/SmolLM2-nanotron-ckpt
)
dist.barrier() # Got everyone to send their stuff
outputs = list(outputs)
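A small, self-contained illustration (not part of the PR) of the reshape that replaces the old transpose in `_loglikelihood_single_token` and `_loglikelihood_tokens`: `view(*input_ids.shape, -1)` recovers `[batch, seq_len, vocab]`, assuming the gathered output is laid out as one row of logits per input token.

```python
# Illustration of the new reshape with hypothetical shapes: view() restores
# [batch, seq_len, vocab] from a flat gathered logits tensor.
import torch

batch, seq_len, vocab = 2, 5, 11
input_ids = torch.zeros(batch, seq_len, dtype=torch.long)

out = torch.randn(batch * seq_len, vocab)   # stand-in for the TP-gathered logits
out = out.view(*input_ids.shape, -1)        # -> [batch, seq_len, vocab]
assert out.shape == (batch, seq_len, vocab)
```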