Merge branch 'main' into mol_opt
tigranfah committed Jul 25, 2024
2 parents 52692ec + a388d5f commit f46d33f
Showing 39 changed files with 4,128 additions and 881 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -183,3 +183,4 @@ src/live_prompt.ipynb
 src/*.ipynb
 *.sh
 submitit_logs/*
+local_submit_files/*
@@ -6,12 +6,26 @@ train_config:
   eval_step: 256
   global_gradient_norm: 1.0
   learning_rate_decay: 0.1
-  max_learning_rate: 6.0e-4
-  n_heads: 12
-  n_layers: 12
+  max_learning_rate: 1.4e-3
   warmup_steps: 500
   weight_decay: 0.1
+  bf16: true
+  bf16_full_eval: true
+  fp16: false
+  tf32: true
+  evaluation_strategy: "no"
+  save_total_limit: 8
+  grad_accumulation_scheduler: false
+  dynamic_grad_accumulation: false
+  grad_accumulation_patience: 4000
+  grad_accumulation_max: 256
+  grad_accumulation_delta_steps: 100
+  grad_accumulation_delta_percentage: 0.02
+model_config:
+  n_heads: 12
+  n_layers: 12
   block_size: 2048
   vocab_size: 50000
   separator_token: </s>
+  separator_token_id: 2
   tokenizer_path: "./chemlactica/tokenizer/ChemLacticaTokenizer66"
20 changes: 17 additions & 3 deletions chemlactica/config/config_yamls/galactica_125m_sft_config.yaml
@@ -6,12 +6,26 @@ train_config:
   eval_step: 256
   global_gradient_norm: 1.0
   learning_rate_decay: 0.1
-  max_learning_rate: 2.0e-5
-  n_heads: 12
-  n_layers: 12
+  max_learning_rate: 1.0e-4
   warmup_steps: 0
   weight_decay: 0.1
+  bf16: true
+  bf16_full_eval: true
+  fp16: false
+  tf32: true
+  evaluation_strategy: "steps"
+  save_total_limit: 4
+  grad_accumulation_scheduler: false
+  dynamic_grad_accumulation: false
+  grad_accumulation_patience: 4000
+  grad_accumulation_max: 256
+  grad_accumulation_delta_steps: 100
+  grad_accumulation_delta_percentage: 0.02
+model_config:
+  n_heads: 12
+  n_layers: 12
   block_size: 2048
   vocab_size: 50000
   separator_token: </s>
+  separator_token_id: 2
   tokenizer_path: "./chemlactica/tokenizer/ChemLacticaTokenizer66"
25 changes: 20 additions & 5 deletions chemlactica/config/config_yamls/gemma_2b_pretrain_config.yaml
@@ -2,17 +2,32 @@ train_config:
   adam_beta1: 0.9
   adam_beta2: 0.95
   batch_size: 500000
-  dropout_prob: 0.1
+  dropout_prob: 0
   eval_step: 256
   global_gradient_norm: 1.0
   learning_rate_decay: 0.1
-  max_learning_rate: 6.0e-4
-  n_heads: 12
-  n_layers: 18
+  max_learning_rate: 5.0e-4
   warmup_steps: 500
   weight_decay: 0.1
+  bf16: true
+  bf16_full_eval: true
+  fp16: false
+  tf32: true
+  evaluation_strategy: "no"
+  save_total_limit: 4
+  grad_accumulation_scheduler: false
+  dynamic_grad_accumulation: false
+  grad_accumulation_patience: 4000
+  grad_accumulation_max: 256
+  grad_accumulation_delta_steps: 100
+  grad_accumulation_delta_percentage: 0.02
 model_config:
+  n_heads: 12
+  n_layers: 18
   block_size: 2048
   vocab_size: 256000
   separator_token: <bos>
-  tokenizer_path: "chemlactica/tokenizer/GemmaTokenizer"
+  separator_token_id: 2
+  # tokenizer_path: "./chemlactica/tokenizer/GemmaTokenizer"
+  tokenizer_path: "/auto/home/menuab/code/ChemLactica/chemlactica/tokenizer/GemmaTokenizer"
+  # tokenizer_path: "google/gemma-2b"
31 changes: 31 additions & 0 deletions chemlactica/config/config_yamls/gemma_2b_sft_config.yaml
@@ -0,0 +1,31 @@
+train_config:
+  adam_beta1: 0.9
+  adam_beta2: 0.95
+  batch_size: 500000
+  dropout_prob: 0.1
+  eval_step: 256
+  global_gradient_norm: 1.0
+  learning_rate_decay: 0.1
+  max_learning_rate: 1.0e-4
+  warmup_steps: 305
+  weight_decay: 0.1
+  bf16: true
+  bf16_full_eval: true
+  fp16: false
+  tf32: true
+  evaluation_strategy: "steps"
+  save_total_limit: 8
+  grad_accumulation_scheduler: false
+  dynamic_grad_accumulation: false
+  grad_accumulation_patience: 4000
+  grad_accumulation_max: 256
+  grad_accumulation_delta_steps: 100
+  grad_accumulation_delta_percentage: 0.02
+model_config:
+  n_heads: 12
+  n_layers: 18
+  block_size: 2048
+  vocab_size: 256000
+  separator_token: <bos>
+  separator_token_id: 2
+  tokenizer_path: "/auto/home/menuab/code/ChemLactica/chemlactica/tokenizer/GemmaTokenizer"
20 changes: 17 additions & 3 deletions chemlactica/config/default_train_config.py
@@ -6,6 +6,7 @@ class ModelConfig:
     block_size: int = 2048
     vocab_size: int = 50000
     separator_token: str = "</s>"
+    separator_token_id: int = 2
     tokenizer_path: str = "chemlactica/tokenizer/ChemLacticaTokenizer66"


@@ -22,11 +23,24 @@ class TrainConfig:
     warmup_steps: int = 500
     weight_decay: float = 0.1
     optimizer: str = "adamw_torch"
-    lr_scheduler_type: str = "linear"
+    lr_scheduler_type: str = "linear"  # other options [linear, constant_with_warmup]
+    bf16: bool = True
+    bf16_full_eval: bool = True
+    fp16: bool = False
+    tf32: bool = True
+    evaluation_strategy: str = "steps"  # options are [no, steps, epoch]
+    # set manually to total number of checkpoints anticipated to minimize device OOM errors
+    save_total_limit: int = 4
+    grad_accumulation_scheduler: bool = False
+    dynamic_grad_accumulation: bool = False
+    grad_accumulation_patience: int = 4000
+    grad_accumulation_max: int = 256
+    grad_accumulation_delta_steps: int = 100
+    grad_accumulation_delta_percentage: float = 0.02


 @dataclass
 class SFTTrainConfig:
     packing: bool = False
-    max_seq_length: int = 512
-    neftune_noise_alpha: int = 0
+    max_seq_length: int = 64
+    neftune_noise_alpha: int = 10
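
Note: the train_config/model_config split in the YAML files above mirrors the TrainConfig and ModelConfig dataclasses in this file. Below is a minimal sketch (not code from this commit) of how such a YAML could be mapped onto the dataclasses, assuming they are plain dataclasses importable as chemlactica.config.default_train_config, that PyYAML is available, and that unknown YAML keys should simply be dropped; the repository may wire configs differently.

```python
from dataclasses import fields

import yaml

# assumed import path for the dataclasses defined in default_train_config.py
from chemlactica.config.default_train_config import ModelConfig, TrainConfig


def load_configs(path: str):
    """Map a train_config/model_config YAML onto the dataclasses, ignoring unknown keys."""
    with open(path) as f:
        raw = yaml.safe_load(f)

    def pick(cls, section):
        allowed = {f.name for f in fields(cls)}
        return cls(**{k: v for k, v in section.items() if k in allowed})

    return pick(TrainConfig, raw["train_config"]), pick(ModelConfig, raw["model_config"])


train_cfg, model_cfg = load_configs(
    "chemlactica/config/config_yamls/galactica_125m_sft_config.yaml"
)
print(train_cfg.max_learning_rate, model_cfg.separator_token_id)  # 0.0001 2
```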
2 changes: 1 addition & 1 deletion chemlactica/config/galactica_accelerate_config.yaml
@@ -9,7 +9,7 @@ fsdp_config:
   fsdp_sharding_strategy: 1
   fsdp_state_dict_type: FULL_STATE_DICT
   fsdp_transformer_layer_cls_to_wrap: OPTForCausalLM
-  fsdp_activation_checkpointing: true
+  fsdp_activation_checkpointing: false
   fsdp_use_orig_params: true
 machine_rank: 0
 main_training_function: main
2 changes: 1 addition & 1 deletion chemlactica/config/gemma_accelerate_config.yaml
@@ -9,7 +9,7 @@ fsdp_config:
   fsdp_sharding_strategy: 1
   fsdp_state_dict_type: FULL_STATE_DICT
   fsdp_transformer_layer_cls_to_wrap: GemmaForCausalLM
-  fsdp_activation_checkpointing: true
+  fsdp_activation_checkpointing: false
   fsdp_use_orig_params: true
 machine_rank: 0
 main_training_function: main
40 changes: 38 additions & 2 deletions chemlactica/custom_accelerator.py
@@ -1,7 +1,10 @@
-from accelerate import accelerator
+from accelerate.state import (
+    DistributedType,
+)
 import torch
-from accelerate import optimizer
+from accelerate import optimizer, accelerator
 import inspect
+from chemlactica.utils.distributed_utils import custom_prepare_data_loader


 class CustomAcceleratedOptimizer(optimizer.AcceleratedOptimizer):
@@ -39,3 +42,36 @@ def prepare_optimizer(
         )
         self._optimizers.append(optimizer)
         return optimizer
+
+    def prepare_data_loader(
+        self,
+        data_loader: torch.utils.data.DataLoader,
+        device_placement=None,
+        slice_fn_for_dispatch=None,
+    ):
+        # Ensure we can't double wrap a DataLoader due to `find_batch_size`
+        if getattr(data_loader, "_is_accelerate_prepared", False):
+            if data_loader not in self._dataloaders:
+                self._dataloaders.append(data_loader)
+            return data_loader
+        if device_placement is None:
+            device_placement = (
+                self.device_placement
+                if self.distributed_type != DistributedType.XLA
+                else False
+            )
+        prepared_data_loader = custom_prepare_data_loader(
+            data_loader,
+            self.device,
+            num_processes=self.num_processes,
+            process_index=self.process_index,
+            split_batches=self.split_batches,
+            put_on_device=device_placement,
+            rng_types=self.rng_types.copy(),
+            dispatch_batches=self.dispatch_batches,
+            even_batches=self.even_batches,
+            slice_fn_for_dispatch=slice_fn_for_dispatch,
+            use_seedable_sampler=self.use_seedable_sampler,
+        )
+        self._dataloaders.append(prepared_data_loader)
+        return prepared_data_loader
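
Note: the prepare_data_loader override keeps the public Accelerator call pattern and only swaps the internals for custom_prepare_data_loader. A rough usage sketch, assuming CustomAccelerator subclasses accelerate.Accelerator (so it takes the same constructor arguments) and is importable as shown; the dataset is a stand-in.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

# assumed import path for the class defined in custom_accelerator.py
from chemlactica.custom_accelerator import CustomAccelerator

accelerator = CustomAccelerator()  # same construction as a stock Accelerator, per the subclass assumption

dataset = TensorDataset(torch.arange(32, dtype=torch.float32).unsqueeze(1))
loader = DataLoader(dataset, batch_size=8)

# The call site is identical to accelerate's Accelerator; the override routes the work
# through custom_prepare_data_loader and caches the result in self._dataloaders.
prepared_loader = accelerator.prepare_data_loader(loader)
for batch in prepared_loader:
    pass  # batches arrive already placed on accelerator.device
```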
78 changes: 76 additions & 2 deletions chemlactica/custom_trainer.py
@@ -3,6 +3,9 @@
 import os
 from torch._tensor import Tensor
 from torch.nn.modules import Module
+from custom_accelerator import CustomAccelerator
+from transformers.utils import is_accelerate_available
+from accelerate.utils import GradientAccumulationPlugin

 # from torch.distributed.fsdp.fully_sharded_data_parallel import (
 #     FullyShardedDataParallel as FSDP,
@@ -15,7 +18,6 @@
 from chemlactica.utils.utils import get_tokenizer
 from dataclasses import dataclass, field

-
 # if is_torch_tpu_available(check_device=False):
 #     import torch_xla.core.xla_model as xm

@@ -36,7 +38,7 @@ class CustomArguments(TrainingArguments):
 class CustomTrainer(Trainer):
     def __init__(self, *args, **kwargs):
         # the number of samples to print when the training begins, for debugging purposes
-        self.num_samples_to_print = 5
+        self.num_samples_to_print = 10
         self.tokenizer_path = kwargs["args"].tokenizer_path
         super().__init__(*args, **kwargs)

@@ -48,6 +50,78 @@ def training_step(self, model: Module, inputs: Dict[str, Tensor | Any]) -> Tensor:
             self.num_samples_to_print = None
         return super().training_step(model, inputs)

+    def create_accelerator_and_postprocess(self):
+        grad_acc_kwargs = {"num_steps": self.args.gradient_accumulation_steps}
+        grad_acc_kwargs["sync_with_dataloader"] = False
+        gradient_accumulation_plugin = GradientAccumulationPlugin(**grad_acc_kwargs)
+
+        # create accelerator object
+        self.accelerator = CustomAccelerator(
+            deepspeed_plugin=self.args.deepspeed_plugin,
+            gradient_accumulation_plugin=gradient_accumulation_plugin,
+            **self.args.accelerator_config.to_dict(),
+        )
+        # some Trainer classes need to use `gather` instead of `gather_for_metrics`,
+        # thus we store a flag
+        self.gather_function = self.accelerator.gather_for_metrics
+
+        # deepspeed and accelerate flags covering both trainer args and accelerate launcher
+        self.is_deepspeed_enabled = (
+            getattr(self.accelerator.state, "deepspeed_plugin", None) is not None
+        )
+        self.is_fsdp_enabled = (
+            getattr(self.accelerator.state, "fsdp_plugin", None) is not None
+        )
+
+        # post accelerator creation setup
+        if self.is_fsdp_enabled:
+            fsdp_plugin = self.accelerator.state.fsdp_plugin
+            fsdp_plugin.limit_all_gathers = self.args.fsdp_config.get(
+                "limit_all_gathers", fsdp_plugin.limit_all_gathers
+            )
+            if is_accelerate_available("0.23.0"):
+                fsdp_plugin.activation_checkpointing = self.args.fsdp_config.get(
+                    "activation_checkpointing", fsdp_plugin.activation_checkpointing
+                )
+                if (
+                    fsdp_plugin.activation_checkpointing
+                    and self.args.gradient_checkpointing
+                ):
+                    raise ValueError(
+                        "The activation_checkpointing in FSDP config and "
+                        "the gradient_checkpointing in training arg "
+                        "can't be set to True simultaneously. "
+                        "Please use FSDP's activation_checkpointing logic "
+                        "when using FSDP."
+                    )
+
+        if (
+            self.is_deepspeed_enabled
+            and getattr(self.args, "hf_deepspeed_config", None) is None
+        ):
+            self.propagate_args_to_deepspeed()
+
+        # `save_only_model` can't be used with DeepSpeed/FSDP along with `load_best_model_at_end`
+        if (
+            self.args.save_only_model
+            and (self.is_deepspeed_enabled or self.is_fsdp_enabled)
+            and self.args.load_best_model_at_end
+        ):
+            wrapper = "DeepSpeed" if self.is_deepspeed_enabled else "FSDP"
+            raise ValueError(
+                f"{wrapper} can't be used with `save_only_model` "
+                "along with `load_best_model_at_end`."
+            )
+
+        # `auto_find_batch_size` isn't yet supported with DeepSpeed/FSDP
+        if (
+            self.is_deepspeed_enabled or self.is_fsdp_enabled
+        ) and self.args.auto_find_batch_size:
+            wrapper = "DeepSpeed" if self.is_deepspeed_enabled else "FSDP"
+            raise NotImplementedError(
+                f"`{wrapper}` doesn't support `auto_find_batch_size`."
+            )
+
     def _build_slurm_eval_command(self, train_command, trial):
         checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
         run_dir = self._get_output_dir(trial=trial)
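
Note: the create_accelerator_and_postprocess override largely mirrors the upstream Trainer method while swapping in CustomAccelerator. A standalone sketch of the GradientAccumulationPlugin wiring it relies on, with a placeholder step count standing in for args.gradient_accumulation_steps:

```python
import torch
from accelerate import Accelerator
from accelerate.utils import GradientAccumulationPlugin

# Same plugin arguments as in the override above; num_steps=8 is a placeholder.
plugin = GradientAccumulationPlugin(num_steps=8, sync_with_dataloader=False)
accelerator = Accelerator(gradient_accumulation_plugin=plugin)

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
model, optimizer = accelerator.prepare(model, optimizer)

for step in range(16):
    # gradients are synchronized and the optimizer actually steps only every `num_steps` iterations
    with accelerator.accumulate(model):
        loss = model(torch.randn(2, 4, device=accelerator.device)).mean()
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
```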
2 changes: 1 addition & 1 deletion chemlactica/get_dataset.py
@@ -67,7 +67,7 @@ def get_dataset(
         "text", data_files={"validation": valid_data_files}, streaming=False
     )
     processed_eval_dataset = process_dataset(
-        dataset=eval_dataset,
+        dataset=eval_dataset["validation"],
         train_config=train_config,
        model_config=model_config,
         process_batch_sizes=(50, 50),
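
Note: the fix above indexes into the "validation" split because datasets.load_dataset returns a DatasetDict keyed by split name when data_files is a dict. A minimal illustration (the file name is a placeholder, and it is an assumption here that process_dataset expects a single split rather than the whole dict):

```python
from datasets import load_dataset

eval_dataset = load_dataset(
    "text", data_files={"validation": ["valid_data.jsonl"]}, streaming=False
)
print(type(eval_dataset).__name__)                # DatasetDict
print(type(eval_dataset["validation"]).__name__)  # Dataset -- the single split passed on downstream
```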
4 changes: 2 additions & 2 deletions chemlactica/get_trainer.py
@@ -15,7 +15,7 @@ def get_trainer(train_type, model, dataset, training_args, evaluate_only, slurm_
             args=training_args,
             # compute_metrics=compute_metrics,
             train_dataset=dataset["train"] if not evaluate_only else None,
-            eval_dataset=dataset["validation"]["validation"]
+            eval_dataset=dataset["validation"]
             if not evaluate_only or slurm_eval
             else None,
             # optimizers=[optimizer, lr_scheduler],
@@ -25,7 +25,7 @@ def get_trainer(train_type, model, dataset, training_args, evaluate_only, slurm_
     elif train_type == "sft":
         sft_config = SFTTrainConfig()
         tokenizer = get_tokenizer(training_args.tokenizer_path)
-        response_template = "[PROPERTY]activity "
+        response_template = tokenizer.encode("[PROPERTY]activity")
         collator = DataCollatorForCompletionOnlyLM(
             response_template, tokenizer=tokenizer
         )
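
Note: passing the response template through tokenizer.encode hands DataCollatorForCompletionOnlyLM a fixed list of token ids to match instead of a raw string; TRL accepts either form, and pre-encoding avoids cases where the template string tokenizes differently inside a full prompt. A hedged sketch of the same pattern outside the repository (the checkpoint name and the add_special_tokens handling are assumptions, not taken from this commit):

```python
from transformers import AutoTokenizer
from trl import DataCollatorForCompletionOnlyLM

tokenizer = AutoTokenizer.from_pretrained("facebook/galactica-125m")  # assumed checkpoint

# Encoding up front pins down the exact ids the collator searches for when masking
# the prompt portion of each example; add_special_tokens=False is a common precaution
# so no BOS/EOS ids end up inside the template.
response_template_ids = tokenizer.encode("[PROPERTY]activity", add_special_tokens=False)
collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer)
```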
1 change: 1 addition & 0 deletions chemlactica/jsonl_dataset.py
@@ -39,6 +39,7 @@ def samples_generator(
 ):
     file_states = setup_generator(shared_jsonl_files, files)

+    # TODO: there should be a more elegant way to do this without per line conditions
     returned = True
     while returned:
         returned = False