Merge branch 'main' into mol_opt
tigranfah committed Jul 25, 2024
2 parents 52692ec + a388d5f commit f46d33f
Showing 39 changed files with 4,128 additions and 881 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -183,3 +183,4 @@ src/live_prompt.ipynb
 src/*.ipynb
 *.sh
 submitit_logs/*
+local_submit_files/*
@@ -6,12 +6,26 @@ train_config:
   eval_step: 256
   global_gradient_norm: 1.0
   learning_rate_decay: 0.1
-  max_learning_rate: 6.0e-4
-  n_heads: 12
-  n_layers: 12
+  max_learning_rate: 1.4e-3
   warmup_steps: 500
   weight_decay: 0.1
+  bf16: true
+  bf16_full_eval: true
+  fp16: false
+  tf32: true
+  evaluation_strategy: "no"
+  save_total_limit: 8
+  grad_accumulation_scheduler: false
+  dynamic_grad_accumulation: false
+  grad_accumulation_patience: 4000
+  grad_accumulation_max: 256
+  grad_accumulation_delta_steps: 100
+  grad_accumulation_delta_percentage: 0.02
+model_config:
+  n_heads: 12
+  n_layers: 12
   block_size: 2048
   vocab_size: 50000
   separator_token: </s>
+  separator_token_id: 2
   tokenizer_path: "./chemlactica/tokenizer/ChemLacticaTokenizer66"
20 changes: 17 additions & 3 deletions chemlactica/config/config_yamls/galactica_125m_sft_config.yaml
@@ -6,12 +6,26 @@ train_config:
   eval_step: 256
   global_gradient_norm: 1.0
   learning_rate_decay: 0.1
-  max_learning_rate: 2.0e-5
-  n_heads: 12
-  n_layers: 12
+  max_learning_rate: 1.0e-4
   warmup_steps: 0
   weight_decay: 0.1
+  bf16: true
+  bf16_full_eval: true
+  fp16: false
+  tf32: true
+  evaluation_strategy: "steps"
+  save_total_limit: 4
+  grad_accumulation_scheduler: false
+  dynamic_grad_accumulation: false
+  grad_accumulation_patience: 4000
+  grad_accumulation_max: 256
+  grad_accumulation_delta_steps: 100
+  grad_accumulation_delta_percentage: 0.02
+model_config:
+  n_heads: 12
+  n_layers: 12
   block_size: 2048
   vocab_size: 50000
   separator_token: </s>
+  separator_token_id: 2
   tokenizer_path: "./chemlactica/tokenizer/ChemLacticaTokenizer66"
25 changes: 20 additions & 5 deletions chemlactica/config/config_yamls/gemma_2b_pretrain_config.yaml
@@ -2,17 +2,32 @@ train_config:
   adam_beta1: 0.9
   adam_beta2: 0.95
   batch_size: 500000
-  dropout_prob: 0.1
+  dropout_prob: 0
   eval_step: 256
   global_gradient_norm: 1.0
   learning_rate_decay: 0.1
-  max_learning_rate: 6.0e-4
-  n_heads: 12
-  n_layers: 18
+  max_learning_rate: 5.0e-4
   warmup_steps: 500
   weight_decay: 0.1
+  bf16: true
+  bf16_full_eval: true
+  fp16: false
+  tf32: true
+  evaluation_strategy: "no"
+  save_total_limit: 4
+  grad_accumulation_scheduler: false
+  dynamic_grad_accumulation: false
+  grad_accumulation_patience: 4000
+  grad_accumulation_max: 256
+  grad_accumulation_delta_steps: 100
+  grad_accumulation_delta_percentage: 0.02
 model_config:
+  n_heads: 12
+  n_layers: 18
   block_size: 2048
   vocab_size: 256000
   separator_token: <bos>
-  tokenizer_path: "chemlactica/tokenizer/GemmaTokenizer"
+  separator_token_id: 2
+  # tokenizer_path: "./chemlactica/tokenizer/GemmaTokenizer"
+  tokenizer_path: "/auto/home/menuab/code/ChemLactica/chemlactica/tokenizer/GemmaTokenizer"
+  # tokenizer_path: "google/gemma-2b"
31 changes: 31 additions & 0 deletions chemlactica/config/config_yamls/gemma_2b_sft_config.yaml
@@ -0,0 +1,31 @@
+train_config:
+  adam_beta1: 0.9
+  adam_beta2: 0.95
+  batch_size: 500000
+  dropout_prob: 0.1
+  eval_step: 256
+  global_gradient_norm: 1.0
+  learning_rate_decay: 0.1
+  max_learning_rate: 1.0e-4
+  warmup_steps: 305
+  weight_decay: 0.1
+  bf16: true
+  bf16_full_eval: true
+  fp16: false
+  tf32: true
+  evaluation_strategy: "steps"
+  save_total_limit: 8
+  grad_accumulation_scheduler: false
+  dynamic_grad_accumulation: false
+  grad_accumulation_patience: 4000
+  grad_accumulation_max: 256
+  grad_accumulation_delta_steps: 100
+  grad_accumulation_delta_percentage: 0.02
+model_config:
+  n_heads: 12
+  n_layers: 18
+  block_size: 2048
+  vocab_size: 256000
+  separator_token: <bos>
+  separator_token_id: 2
+  tokenizer_path: "/auto/home/menuab/code/ChemLactica/chemlactica/tokenizer/GemmaTokenizer"
20 changes: 17 additions & 3 deletions chemlactica/config/default_train_config.py
@@ -6,6 +6,7 @@ class ModelConfig:
     block_size: int = 2048
     vocab_size: int = 50000
     separator_token: str = "</s>"
+    separator_token_id: int = 2
     tokenizer_path: str = "chemlactica/tokenizer/ChemLacticaTokenizer66"


@@ -22,11 +23,24 @@ class TrainConfig:
     warmup_steps: int = 500
     weight_decay: float = 0.1
     optimizer: str = "adamw_torch"
-    lr_scheduler_type: str = "linear"
+    lr_scheduler_type: str = "linear"  # other options [linear, constant_with_warmup]
+    bf16: bool = True
+    bf16_full_eval: bool = True
+    fp16: bool = False
+    tf32: bool = True
+    evaluation_strategy: str = "steps"  # options are [no, steps, epoch]
+    # set manually to total number of checkpoints anticipated to minimize device OOM errors
+    save_total_limit: int = 4
+    grad_accumulation_scheduler: bool = False
+    dynamic_grad_accumulation: bool = False
+    grad_accumulation_patience: int = 4000
+    grad_accumulation_max: int = 256
+    grad_accumulation_delta_steps: int = 100
+    grad_accumulation_delta_percentage: float = 0.02


 @dataclass
 class SFTTrainConfig:
     packing: bool = False
-    max_seq_length: int = 512
-    neftune_noise_alpha: int = 0
+    max_seq_length: int = 64
+    neftune_noise_alpha: int = 10
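
Note: the train_config/model_config split in the YAML files above mirrors the TrainConfig and ModelConfig dataclasses in this file. Below is a minimal sketch (not code from this commit) of how such a YAML could be mapped onto the dataclasses, assuming they are plain dataclasses importable as chemlactica.config.default_train_config, that PyYAML is available, and that unknown YAML keys should simply be dropped; the repository may wire configs differently.

```python
from dataclasses import fields

import yaml

# assumed import path for the dataclasses defined in default_train_config.py
from chemlactica.config.default_train_config import ModelConfig, TrainConfig


def load_configs(path: str):
    """Map a train_config/model_config YAML onto the dataclasses, ignoring unknown keys."""
    with open(path) as f:
        raw = yaml.safe_load(f)

    def pick(cls, section):
        allowed = {f.name for f in fields(cls)}
        return cls(**{k: v for k, v in section.items() if k in allowed})

    return pick(TrainConfig, raw["train_config"]), pick(ModelConfig, raw["model_config"])


train_cfg, model_cfg = load_configs(
    "chemlactica/config/config_yamls/galactica_125m_sft_config.yaml"
)
print(train_cfg.max_learning_rate, model_cfg.separator_token_id)  # 0.0001 2
```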
2 changes: 1 addition & 1 deletion chemlactica/config/galactica_accelerate_config.yaml
@@ -9,7 +9,7 @@ fsdp_config:
   fsdp_sharding_strategy: 1
   fsdp_state_dict_type: FULL_STATE_DICT
   fsdp_transformer_layer_cls_to_wrap: OPTForCausalLM
-  fsdp_activation_checkpointing: true
+  fsdp_activation_checkpointing: false
   fsdp_use_orig_params: true
 machine_rank: 0
 main_training_function: main
2 changes: 1 addition & 1 deletion chemlactica/config/gemma_accelerate_config.yaml
@@ -9,7 +9,7 @@ fsdp_config:
   fsdp_sharding_strategy: 1
   fsdp_state_dict_type: FULL_STATE_DICT
   fsdp_transformer_layer_cls_to_wrap: GemmaForCausalLM
-  fsdp_activation_checkpointing: true
+  fsdp_activation_checkpointing: false
   fsdp_use_orig_params: true
 machine_rank: 0
 main_training_function: main
40 changes: 38 additions & 2 deletions chemlactica/custom_accelerator.py
@@ -1,7 +1,10 @@
-from accelerate import accelerator
+from accelerate.state import (
+    DistributedType,
+)
 import torch
-from accelerate import optimizer
+from accelerate import optimizer, accelerator
 import inspect
+from chemlactica.utils.distributed_utils import custom_prepare_data_loader


 class CustomAcceleratedOptimizer(optimizer.AcceleratedOptimizer):
@@ -39,3 +42,36 @@ def prepare_optimizer(
         )
         self._optimizers.append(optimizer)
         return optimizer
+
+    def prepare_data_loader(
+        self,
+        data_loader: torch.utils.data.DataLoader,
+        device_placement=None,
+        slice_fn_for_dispatch=None,
+    ):
+        # Ensure we can't double wrap a DataLoader due to `find_batch_size`
+        if getattr(data_loader, "_is_accelerate_prepared", False):
+            if data_loader not in self._dataloaders:
+                self._dataloaders.append(data_loader)
+            return data_loader
+        if device_placement is None:
+            device_placement = (
+                self.device_placement
+                if self.distributed_type != DistributedType.XLA
+                else False
+            )
+        prepared_data_loader = custom_prepare_data_loader(
+            data_loader,
+            self.device,
+            num_processes=self.num_processes,
+            process_index=self.process_index,
+            split_batches=self.split_batches,
+            put_on_device=device_placement,
+            rng_types=self.rng_types.copy(),
+            dispatch_batches=self.dispatch_batches,
+            even_batches=self.even_batches,
+            slice_fn_for_dispatch=slice_fn_for_dispatch,
+            use_seedable_sampler=self.use_seedable_sampler,
+        )
+        self._dataloaders.append(prepared_data_loader)
+        return prepared_data_loader
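
Note: the prepare_data_loader override keeps the public Accelerator call pattern and only swaps the internals for custom_prepare_data_loader. A rough usage sketch, assuming CustomAccelerator subclasses accelerate.Accelerator (so it takes the same constructor arguments) and is importable as shown; the dataset is a stand-in.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

# assumed import path for the class defined in custom_accelerator.py
from chemlactica.custom_accelerator import CustomAccelerator

accelerator = CustomAccelerator()  # same construction as a stock Accelerator, per the subclass assumption

dataset = TensorDataset(torch.arange(32, dtype=torch.float32).unsqueeze(1))
loader = DataLoader(dataset, batch_size=8)

# The call site is identical to accelerate's Accelerator; the override routes the work
# through custom_prepare_data_loader and caches the result in self._dataloaders.
prepared_loader = accelerator.prepare_data_loader(loader)
for batch in prepared_loader:
    pass  # batches arrive already placed on accelerator.device
```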
78 changes: 76 additions & 2 deletions chemlactica/custom_trainer.py
@@ -3,6 +3,9 @@
 import os
 from torch._tensor import Tensor
 from torch.nn.modules import Module
+from custom_accelerator import CustomAccelerator
+from transformers.utils import is_accelerate_available
+from accelerate.utils import GradientAccumulationPlugin

 # from torch.distributed.fsdp.fully_sharded_data_parallel import (
 #     FullyShardedDataParallel as FSDP,
@@ -15,7 +18,6 @@
 from chemlactica.utils.utils import get_tokenizer
 from dataclasses import dataclass, field

-
 # if is_torch_tpu_available(check_device=False):
 #     import torch_xla.core.xla_model as xm

@@ -36,7 +38,7 @@ class CustomArguments(TrainingArguments):
 class CustomTrainer(Trainer):
     def __init__(self, *args, **kwargs):
         # the number of samples to print when the training begins, for debugging purposes
-        self.num_samples_to_print = 5
+        self.num_samples_to_print = 10
         self.tokenizer_path = kwargs["args"].tokenizer_path
         super().__init__(*args, **kwargs)

@@ -48,6 +50,78 @@ def training_step(self, model: Module, inputs: Dict[str, Tensor | Any]) -> Tensor:
             self.num_samples_to_print = None
         return super().training_step(model, inputs)

+    def create_accelerator_and_postprocess(self):
+        grad_acc_kwargs = {"num_steps": self.args.gradient_accumulation_steps}
+        grad_acc_kwargs["sync_with_dataloader"] = False
+        gradient_accumulation_plugin = GradientAccumulationPlugin(**grad_acc_kwargs)
+
+        # create accelerator object
+        self.accelerator = CustomAccelerator(
+            deepspeed_plugin=self.args.deepspeed_plugin,
+            gradient_accumulation_plugin=gradient_accumulation_plugin,
+            **self.args.accelerator_config.to_dict(),
+        )
+        # some Trainer classes need to use `gather` instead of `gather_for_metrics`,
+        # thus we store a flag
+        self.gather_function = self.accelerator.gather_for_metrics
+
+        # deepspeed and accelerate flags covering both trainer args and accelerate launcher
+        self.is_deepspeed_enabled = (
+            getattr(self.accelerator.state, "deepspeed_plugin", None) is not None
+        )
+        self.is_fsdp_enabled = (
+            getattr(self.accelerator.state, "fsdp_plugin", None) is not None
+        )
+
+        # post accelerator creation setup
+        if self.is_fsdp_enabled:
+            fsdp_plugin = self.accelerator.state.fsdp_plugin
+            fsdp_plugin.limit_all_gathers = self.args.fsdp_config.get(
+                "limit_all_gathers", fsdp_plugin.limit_all_gathers
+            )
+            if is_accelerate_available("0.23.0"):
+                fsdp_plugin.activation_checkpointing = self.args.fsdp_config.get(
+                    "activation_checkpointing", fsdp_plugin.activation_checkpointing
+                )
+                if (
+                    fsdp_plugin.activation_checkpointing
+                    and self.args.gradient_checkpointing
+                ):
+                    raise ValueError(
+                        "The activation_checkpointing in FSDP config and "
+                        "the gradient_checkpointing in training arg "
+                        "can't be set to True simultaneously. "
+                        "Please use FSDP's activation_checkpointing logic "
+                        "when using FSDP."
+                    )
+
+        if (
+            self.is_deepspeed_enabled
+            and getattr(self.args, "hf_deepspeed_config", None) is None
+        ):
+            self.propagate_args_to_deepspeed()
+
+        # `save_only_model` can't be used with DeepSpeed/FSDP along with `load_best_model_at_end`
+        if (
+            self.args.save_only_model
+            and (self.is_deepspeed_enabled or self.is_fsdp_enabled)
+            and self.args.load_best_model_at_end
+        ):
+            wrapper = "DeepSpeed" if self.is_deepspeed_enabled else "FSDP"
+            raise ValueError(
+                f"{wrapper} can't be used with `save_only_model` "
+                "along with `load_best_model_at_end`."
+            )
+
+        # `auto_find_batch_size` isn't yet supported with DeepSpeed/FSDP
+        if (
+            self.is_deepspeed_enabled or self.is_fsdp_enabled
+        ) and self.args.auto_find_batch_size:
+            wrapper = "DeepSpeed" if self.is_deepspeed_enabled else "FSDP"
+            raise NotImplementedError(
+                f"`{wrapper}` doesn't support `auto_find_batch_size`."
+            )
+
     def _build_slurm_eval_command(self, train_command, trial):
         checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
         run_dir = self._get_output_dir(trial=trial)
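
Note: the create_accelerator_and_postprocess override largely mirrors the upstream Trainer method while swapping in CustomAccelerator. A standalone sketch of the GradientAccumulationPlugin wiring it relies on, with a placeholder step count standing in for args.gradient_accumulation_steps:

```python
import torch
from accelerate import Accelerator
from accelerate.utils import GradientAccumulationPlugin

# Same plugin arguments as in the override above; num_steps=8 is a placeholder.
plugin = GradientAccumulationPlugin(num_steps=8, sync_with_dataloader=False)
accelerator = Accelerator(gradient_accumulation_plugin=plugin)

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
model, optimizer = accelerator.prepare(model, optimizer)

for step in range(16):
    # gradients are synchronized and the optimizer actually steps only every `num_steps` iterations
    with accelerator.accumulate(model):
        loss = model(torch.randn(2, 4, device=accelerator.device)).mean()
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
```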
2 changes: 1 addition & 1 deletion chemlactica/get_dataset.py
@@ -67,7 +67,7 @@ def get_dataset(
         "text", data_files={"validation": valid_data_files}, streaming=False
     )
     processed_eval_dataset = process_dataset(
-        dataset=eval_dataset,
+        dataset=eval_dataset["validation"],
         train_config=train_config,
        model_config=model_config,
         process_batch_sizes=(50, 50),
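
Note: the fix above indexes into the "validation" split because datasets.load_dataset returns a DatasetDict keyed by split name when data_files is a dict. A minimal illustration (the file name is a placeholder, and it is an assumption here that process_dataset expects a single split rather than the whole dict):

```python
from datasets import load_dataset

eval_dataset = load_dataset(
    "text", data_files={"validation": ["valid_data.jsonl"]}, streaming=False
)
print(type(eval_dataset).__name__)                # DatasetDict
print(type(eval_dataset["validation"]).__name__)  # Dataset -- the single split passed on downstream
```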
4 changes: 2 additions & 2 deletions chemlactica/get_trainer.py
@@ -15,7 +15,7 @@ def get_trainer(train_type, model, dataset, training_args, evaluate_only, slurm_
             args=training_args,
             # compute_metrics=compute_metrics,
             train_dataset=dataset["train"] if not evaluate_only else None,
-            eval_dataset=dataset["validation"]["validation"]
+            eval_dataset=dataset["validation"]
             if not evaluate_only or slurm_eval
             else None,
             # optimizers=[optimizer, lr_scheduler],
@@ -25,7 +25,7 @@ def get_trainer(train_type, model, dataset, training_args, evaluate_only, slurm_
     elif train_type == "sft":
         sft_config = SFTTrainConfig()
         tokenizer = get_tokenizer(training_args.tokenizer_path)
-        response_template = "[PROPERTY]activity "
+        response_template = tokenizer.encode("[PROPERTY]activity")
         collator = DataCollatorForCompletionOnlyLM(
             response_template, tokenizer=tokenizer
         )
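
Note: passing the response template through tokenizer.encode hands DataCollatorForCompletionOnlyLM a fixed list of token ids to match instead of a raw string; TRL accepts either form, and pre-encoding avoids cases where the template string tokenizes differently inside a full prompt. A hedged sketch of the same pattern outside the repository (the checkpoint name and the add_special_tokens handling are assumptions, not taken from this commit):

```python
from transformers import AutoTokenizer
from trl import DataCollatorForCompletionOnlyLM

tokenizer = AutoTokenizer.from_pretrained("facebook/galactica-125m")  # assumed checkpoint

# Encoding up front pins down the exact ids the collator searches for when masking
# the prompt portion of each example; add_special_tokens=False is a common precaution
# so no BOS/EOS ids end up inside the template.
response_template_ids = tokenizer.encode("[PROPERTY]activity", add_special_tokens=False)
collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer)
```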
1 change: 1 addition & 0 deletions chemlactica/jsonl_dataset.py
@@ -39,6 +39,7 @@ def samples_generator(
 ):
     file_states = setup_generator(shared_jsonl_files, files)

+    # TODO: there should be a more elegant way to do this without per line conditions
     returned = True
     while returned:
         returned = False