From 395afbdbc28aac1fe467b1b3a93323adaa17f2dd Mon Sep 17 00:00:00 2001 From: julioc-p <14-10820@usb.ve> Date: Tue, 2 Jul 2024 12:37:20 +0200 Subject: [PATCH 01/16] working with quantized version for hyperparameter search Signed-off-by: julioc-p <14-10820@usb.ve> --- .../hyperparameter_optimization.py | 79 +++---------------- 1 file changed, 12 insertions(+), 67 deletions(-) diff --git a/src/hpc_scripts/hyperparameter_optimization.py b/src/hpc_scripts/hyperparameter_optimization.py index 8d61647..a7e6dac 100644 --- a/src/hpc_scripts/hyperparameter_optimization.py +++ b/src/hpc_scripts/hyperparameter_optimization.py @@ -1,18 +1,14 @@ # imports -import transformers from transformers import (AutoModelForCausalLM, AutoTokenizer, TrainingArguments, + BitsAndBytesConfig ) from trl import SFTTrainer from peft import LoraConfig from datasets import load_dataset -from transformers import AutoTokenizer, AutoModelForCausalLM from huggingface_hub import HfApi, login -from transformers.hyperparameter_search import HPSearchBackend -from transformers.trainer import * -import optuna -import gc +import torch import os HF_TOKEN = os.getenv('HF_TOKEN', 'add_hf_token') @@ -20,63 +16,6 @@ login(HF_TOKEN, add_to_git_credential=True) -gc.collect() -torch.cuda.empty_cache() - - -def run_hp_search_optuna(trainer, n_trials, direction, **kwargs): - - def _objective(trial, checkpoint_dir=None): - checkpoint = None - if checkpoint_dir: - for subdir in os.listdir(checkpoint_dir): - if subdir.startswith(PREFIX_CHECKPOINT_DIR): - checkpoint = os.path.join(checkpoint_dir, subdir) - ################# - # UPDATES START - ################# - if not checkpoint: - # free GPU memory - del trainer.model - gc.collect() - torch.cuda.empty_cache() - trainer.objective = None - trainer.train(resume_from_checkpoint=checkpoint, trial=trial) - # If there hasn't been any evaluation during the training loop. 
- if getattr(trainer, "objective", None) is None: - metrics = trainer.evaluate() - trainer.objective = trainer.compute_objective(metrics) - return trainer.objective - - timeout = kwargs.pop("timeout", None) - n_jobs = kwargs.pop("n_jobs", 1) - study = optuna.create_study(direction=direction, **kwargs) - study.optimize(_objective, n_trials=n_trials, - timeout=timeout, n_jobs=n_jobs) - best_trial = study.best_trial - return BestRun(str(best_trial.number), best_trial.value, best_trial.params) - - -def hyperparameter_search( - self, - hp_space, - n_trials, - direction, - compute_objective=default_compute_objective, -) -> Union[BestRun, List[BestRun]]: - - trainer.hp_search_backend = HPSearchBackend.OPTUNA - self.hp_space = hp_space - trainer.hp_name = None - trainer.compute_objective = compute_objective - best_run = run_hp_search_optuna(trainer, n_trials, direction) - self.hp_search_backend = None - return best_run - - -transformers.trainer.Trainer.hyperparameter_search = hyperparameter_search - - # defining hyperparameter search space for optuna @@ -86,7 +25,6 @@ def optuna_hp_space(trial): "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64]), "num_train_epochs": trial.suggest_int("num_train_epochs", 3, 15), "weight_decay": trial.suggest_loguniform("weight_decay", 1e-6, 1e-2), - "gradient_clipping": trial.suggest_float("gradient_clipping", 0.1, 0.5), } # Define a function to calculate BLEU score @@ -95,12 +33,16 @@ def optuna_hp_space(trial): # configuration arguments model_id = "google/gemma-2-27b-it" -# model init function for the trainer +# bits and bytes config +bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16 +) def model_init(trial): - - return AutoModelForCausalLM.from_pretrained(model_id, device_map="auto") + return AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto",) # tokenizer load @@ -178,6 +120,9 @@ def formatting_func(example): model_init=model_init, ) +# avoid placing model on device as it is already placed on device in model_init +trainer.place_model_on_device = False + best_trial = trainer.hyperparameter_search( direction="minimize", hp_space=optuna_hp_space, From e01f4f4024f661638c5068e099f42c8d873347b1 Mon Sep 17 00:00:00 2001 From: julioc-p <14-10820@usb.ve> Date: Tue, 2 Jul 2024 15:05:52 +0200 Subject: [PATCH 02/16] 3 epochs passing quantized model in the beginning Signed-off-by: julioc-p <14-10820@usb.ve> --- src/hpc_scripts/hyperparameter_optimization.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/hpc_scripts/hyperparameter_optimization.py b/src/hpc_scripts/hyperparameter_optimization.py index a7e6dac..e47c4bf 100644 --- a/src/hpc_scripts/hyperparameter_optimization.py +++ b/src/hpc_scripts/hyperparameter_optimization.py @@ -31,7 +31,7 @@ def optuna_hp_space(trial): # configuration arguments -model_id = "google/gemma-2-27b-it" +model_id = "google/gemma-2-9b-it" # bits and bytes config bnb_config = BitsAndBytesConfig( @@ -60,7 +60,7 @@ def model_init(trial): output_dir = "trained_model" training_arguments = TrainingArguments( output_dir=output_dir, - num_train_epochs=1, + num_train_epochs=3, gradient_checkpointing=True, per_device_train_batch_size=1, gradient_accumulation_steps=8, @@ -105,11 +105,14 @@ def formatting_func(example): output_texts.append(text) return output_texts -# instantiation of the trainer + +# Passing model +model = model_init(None) +# 
instantiation of the trainer trainer = SFTTrainer( - model=model_id, + model=model, train_dataset=training_dataset, eval_dataset=eval_dataset, args=training_arguments, From 871df3c5e804160572a7c4de3413fd1dc7ffff0e Mon Sep 17 00:00:00 2001 From: julioc-p <14-10820@usb.ve> Date: Tue, 2 Jul 2024 15:19:44 +0200 Subject: [PATCH 03/16] gpu friendly optimizer Signed-off-by: julioc-p <14-10820@usb.ve> --- src/hpc_scripts/hyperparameter_optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hpc_scripts/hyperparameter_optimization.py b/src/hpc_scripts/hyperparameter_optimization.py index e47c4bf..a750a38 100644 --- a/src/hpc_scripts/hyperparameter_optimization.py +++ b/src/hpc_scripts/hyperparameter_optimization.py @@ -64,7 +64,7 @@ def model_init(trial): gradient_checkpointing=True, per_device_train_batch_size=1, gradient_accumulation_steps=8, - optim="paged_adamw_32bit", + optim="adafactor", save_steps=0, logging_steps=10, learning_rate=5e-4, From 571afd8216ca210a995c73bcc9d0884eee18b27f Mon Sep 17 00:00:00 2001 From: julioc-p <14-10820@usb.ve> Date: Tue, 2 Jul 2024 15:46:18 +0200 Subject: [PATCH 04/16] taking out batch size from hyperparameters Signed-off-by: julioc-p <14-10820@usb.ve> --- src/hpc_scripts/hyperparameter_optimization.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/hpc_scripts/hyperparameter_optimization.py b/src/hpc_scripts/hyperparameter_optimization.py index a750a38..2abc0a4 100644 --- a/src/hpc_scripts/hyperparameter_optimization.py +++ b/src/hpc_scripts/hyperparameter_optimization.py @@ -22,7 +22,6 @@ def optuna_hp_space(trial): return { "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True), - "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64]), "num_train_epochs": trial.suggest_int("num_train_epochs", 3, 15), "weight_decay": trial.suggest_loguniform("weight_decay", 1e-6, 1e-2), } @@ -62,7 +61,7 @@ def model_init(trial): output_dir=output_dir, num_train_epochs=3, gradient_checkpointing=True, - per_device_train_batch_size=1, + per_device_train_batch_size=8, gradient_accumulation_steps=8, optim="adafactor", save_steps=0, From 32a3a21ebc247c4678cdd82477d8959eddefa7e8 Mon Sep 17 00:00:00 2001 From: julioc-p <14-10820@usb.ve> Date: Thu, 4 Jul 2024 16:26:43 +0200 Subject: [PATCH 05/16] shuffling data before running hyperparameter optimization Signed-off-by: julioc-p <14-10820@usb.ve> --- requirements.txt | 1 + .../hyperparameter_optimization.py | 39 ++++++++++++------- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/requirements.txt b/requirements.txt index 47727bd..8e41bc4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,4 +28,5 @@ datetime==5.5 optuna==3.6.1 python-dotenv==1.0.1 typing==3.7.4.3 +random==1.0.1 diff --git a/src/hpc_scripts/hyperparameter_optimization.py b/src/hpc_scripts/hyperparameter_optimization.py index 2abc0a4..0d93bed 100644 --- a/src/hpc_scripts/hyperparameter_optimization.py +++ b/src/hpc_scripts/hyperparameter_optimization.py @@ -1,21 +1,23 @@ # imports +import gc from transformers import (AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig ) -from trl import SFTTrainer -from peft import LoraConfig +from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model from datasets import load_dataset from huggingface_hub import HfApi, login import torch - +import CustomSFTTrainer +import random import os HF_TOKEN = os.getenv('HF_TOKEN', 'add_hf_token') api = HfApi() 
login(HF_TOKEN, add_to_git_credential=True) - +gc.collect() +torch.cuda.empty_cache() # defining hyperparameter search space for optuna @@ -41,17 +43,28 @@ def optuna_hp_space(trial): def model_init(trial): - return AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto",) + model = AutoModelForCausalLM.from_pretrained( + model_id, quantization_config=bnb_config, device_map="auto") + model = prepare_model_for_kbit_training(model) + model = get_peft_model(model, lora_config) + return model # tokenizer load tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='right') -# Loading training and evaluation data -training_dataset = load_dataset( - "Kubermatic/cncf-question-and-answer-dataset-for-llm-training", split="train[:7500]") -eval_dataset = load_dataset( - "Kubermatic/cncf-question-and-answer-dataset-for-llm-training", split="train[7500:8000]") +dataset = load_dataset( + "Kubermatic/cncf-question-and-answer-dataset-for-llm-training", split="train") + +random.seed(42) +random_indices = random.sample(range(len(dataset)), k=500) + +training_indices = random_indices[:400] +eval_indices = random_indices[400:500] +training_dataset = dataset.filter( + lambda _, idx: idx in training_indices, with_indices=True) +eval_dataset = dataset.filter( + lambda _, idx: idx in eval_indices, with_indices=True) max_seq_length = 1024 @@ -61,9 +74,9 @@ def model_init(trial): output_dir=output_dir, num_train_epochs=3, gradient_checkpointing=True, - per_device_train_batch_size=8, + per_device_train_batch_size=1, gradient_accumulation_steps=8, - optim="adafactor", + optim="paged_adamw_32bit", save_steps=0, logging_steps=10, learning_rate=5e-4, @@ -110,7 +123,7 @@ def formatting_func(example): # instantiation of the trainer -trainer = SFTTrainer( +trainer = CustomSFTTrainer( model=model, train_dataset=training_dataset, eval_dataset=eval_dataset, From cf5f1fb3669cb6d9622b654c1b2611bc6be9f197 Mon Sep 17 00:00:00 2001 From: julioc-p <14-10820@usb.ve> Date: Thu, 4 Jul 2024 17:00:28 +0200 Subject: [PATCH 06/16] Custom SFT Trainer and better use of gpu for model training Signed-off-by: julioc-p <14-10820@usb.ve> --- requirements.txt | 1 - src/hpc_scripts/CustomSFTTrainer.py | 60 +++++++++++++++++++++++++++++ src/hpc_scripts/model_training.py | 24 +++++++++--- 3 files changed, 78 insertions(+), 7 deletions(-) create mode 100644 src/hpc_scripts/CustomSFTTrainer.py diff --git a/requirements.txt b/requirements.txt index 8e41bc4..47727bd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,5 +28,4 @@ datetime==5.5 optuna==3.6.1 python-dotenv==1.0.1 typing==3.7.4.3 -random==1.0.1 diff --git a/src/hpc_scripts/CustomSFTTrainer.py b/src/hpc_scripts/CustomSFTTrainer.py new file mode 100644 index 0000000..66b1657 --- /dev/null +++ b/src/hpc_scripts/CustomSFTTrainer.py @@ -0,0 +1,60 @@ +from typing import List, Union +from trl import SFTTrainer +import optuna +from transformers.trainer_utils import HPSearchBackend, BestRun, PREFIX_CHECKPOINT_DIR, default_compute_objective +import os +import gc +import torch + + +class CustomSFTTrainer(SFTTrainer): + + @staticmethod + def run_hp_search_optuna(trainer, n_trials, direction, **kwargs): + + def _objective(trial, checkpoint_dir=None): + checkpoint = None + if checkpoint_dir: + for subdir in os.listdir(checkpoint_dir): + if subdir.startswith(PREFIX_CHECKPOINT_DIR): + checkpoint = os.path.join(checkpoint_dir, subdir) + ################# + # UPDATES START + ################# + if not checkpoint: + # free GPU memory + del trainer.model 
+ gc.collect() + torch.cuda.empty_cache() + trainer.objective = None + trainer.train(resume_from_checkpoint=checkpoint, trial=trial) + # If there hasn't been any evaluation during the training loop. + if getattr(trainer, "objective", None) is None: + metrics = trainer.evaluate() + trainer.objective = trainer.compute_objective(metrics) + return trainer.objective + + timeout = kwargs.pop("timeout", None) + n_jobs = kwargs.pop("n_jobs", 1) + study = optuna.create_study(direction=direction, **kwargs) + study.optimize(_objective, n_trials=n_trials, + timeout=timeout, n_jobs=n_jobs) + best_trial = study.best_trial + return BestRun(str(best_trial.number), best_trial.value, best_trial.params) + + def hyperparameter_search( + self, + hp_space, + n_trials, + direction, + compute_objective=default_compute_objective, + ) -> Union[BestRun, List[BestRun]]: + + self.hp_search_backend = HPSearchBackend.OPTUNA + self.hp_space = hp_space + self.hp_name = None + self.compute_objective = compute_objective + best_run = CustomSFTTrainer.run_hp_search_optuna( + self, n_trials, direction) + self.hp_search_backend = None + return best_run diff --git a/src/hpc_scripts/model_training.py b/src/hpc_scripts/model_training.py index 4175fcb..94d407d 100644 --- a/src/hpc_scripts/model_training.py +++ b/src/hpc_scripts/model_training.py @@ -1,3 +1,4 @@ +import random from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, @@ -17,12 +18,12 @@ # training pipeline taken from https://huggingface.co/blog/gemma-peft -model_id = "google/gemma-2-27b-it" +model_id = "google/gemma-2-9b-it" bnb_config = BitsAndBytesConfig( - load_in_8bit=True, - bnb_8bit_quant_type="nf4", - bnb_8bit_compute_dtype=torch.bfloat16 + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16 ) tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='right') @@ -34,6 +35,16 @@ dataset = load_dataset( "Kubermatic/cncf-question-and-answer-dataset-for-llm-training", split="train") +random.seed(42) +random_indices = random.sample(range(len(dataset)), k=len(dataset)) + +training_indices = random_indices[:-len(dataset)//5] +eval_indices = random_indices[-len(dataset)//5:] +training_dataset = dataset.filter( + lambda _, idx: idx in training_indices, with_indices=True) +eval_dataset = dataset.filter( + lambda _, idx: idx in eval_indices, with_indices=True) + # Training (hyper)parameters (initial config taken from: https://medium.com/@lucamassaron/sherlock-holmes-q-a-enhanced-with-gemma-2b-it-fine-tuning-2907b06d2645) max_seq_length = 1024 @@ -46,7 +57,7 @@ output_dir=output_dir, num_train_epochs=3, gradient_checkpointing=True, - per_device_train_batch_size=16, + per_device_train_batch_size=1, gradient_accumulation_steps=8, optim="paged_adamw_32bit", save_steps=0, @@ -102,7 +113,8 @@ def formatting_func(example): formatting_func=formatting_func, tokenizer=tokenizer, max_seq_length=max_seq_length, - callbacks=[EarlyStoppingCallback(early_stopping_patience=15)], + callbacks=[EarlyStoppingCallback(early_stopping_patience=5)], + eval_dataset=eval_dataset ) trainer.train() print("Model is trained") From 730eaf0b9868fa2ebbae8022d50028b636e53667 Mon Sep 17 00:00:00 2001 From: julioc-p <14-10820@usb.ve> Date: Thu, 4 Jul 2024 17:57:35 +0200 Subject: [PATCH 07/16] initial hyperparameter results Signed-off-by: julioc-p <14-10820@usb.ve> --- src/hpc_scripts/model_training.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hpc_scripts/model_training.py 
b/src/hpc_scripts/model_training.py index 94d407d..578c8e9 100644 --- a/src/hpc_scripts/model_training.py +++ b/src/hpc_scripts/model_training.py @@ -55,15 +55,15 @@ training_arguments = TrainingArguments( output_dir=output_dir, - num_train_epochs=3, + num_train_epochs=11, gradient_checkpointing=True, per_device_train_batch_size=1, gradient_accumulation_steps=8, optim="paged_adamw_32bit", save_steps=0, logging_steps=10, - learning_rate=5e-4, - weight_decay=0.001, + learning_rate=6.295127877143855e-06, + weight_decay=0.00021573996504024309, fp16=True, bf16=False, max_grad_norm=0.3, From 9343ce98e035a895f8ce397f6cf771a1cad196ca Mon Sep 17 00:00:00 2001 From: julioc-p <14-10820@usb.ve> Date: Thu, 4 Jul 2024 19:46:08 +0200 Subject: [PATCH 08/16] evaluation strategy match for the load best model Signed-off-by: julioc-p <14-10820@usb.ve> --- src/hpc_scripts/model_training.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/hpc_scripts/model_training.py b/src/hpc_scripts/model_training.py index 578c8e9..ae43a67 100644 --- a/src/hpc_scripts/model_training.py +++ b/src/hpc_scripts/model_training.py @@ -74,6 +74,7 @@ report_to="tensorboard", disable_tqdm=False, load_best_model_at_end=True, + evaluation_strategy='steps', # debug="underflow_overflow" ) From b5fa0b68d78fd52f1a87953705ceca89a13dfdeb Mon Sep 17 00:00:00 2001 From: julioc-p <14-10820@usb.ve> Date: Sat, 6 Jul 2024 17:11:34 +0200 Subject: [PATCH 09/16] new hyperparameters Signed-off-by: julioc-p <14-10820@usb.ve> --- src/hpc_scripts/hyperparameter_optimization.py | 4 ++-- src/hpc_scripts/model_training.py | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/hpc_scripts/hyperparameter_optimization.py b/src/hpc_scripts/hyperparameter_optimization.py index 0d93bed..c90a8e4 100644 --- a/src/hpc_scripts/hyperparameter_optimization.py +++ b/src/hpc_scripts/hyperparameter_optimization.py @@ -54,7 +54,7 @@ def model_init(trial): tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='right') dataset = load_dataset( - "Kubermatic/cncf-question-and-answer-dataset-for-llm-training", split="train") + "Kubermatic/Merged_QAs", split="train") random.seed(42) random_indices = random.sample(range(len(dataset)), k=500) @@ -141,7 +141,7 @@ def formatting_func(example): best_trial = trainer.hyperparameter_search( direction="minimize", hp_space=optuna_hp_space, - n_trials=20, + n_trials=5, ) print(best_trial) diff --git a/src/hpc_scripts/model_training.py b/src/hpc_scripts/model_training.py index ae43a67..f3b510e 100644 --- a/src/hpc_scripts/model_training.py +++ b/src/hpc_scripts/model_training.py @@ -33,13 +33,14 @@ # Training Data dataset = load_dataset( - "Kubermatic/cncf-question-and-answer-dataset-for-llm-training", split="train") + "Kubermatic/Merged_QAs", split="train") random.seed(42) -random_indices = random.sample(range(len(dataset)), k=len(dataset)) +l = len(dataset) // 25 +random_indices = random.sample(range(len(dataset)), k=l) -training_indices = random_indices[:-len(dataset)//5] -eval_indices = random_indices[-len(dataset)//5:] +training_indices = random_indices[:-l//25] +eval_indices = random_indices[-l//25:] training_dataset = dataset.filter( lambda _, idx: idx in training_indices, with_indices=True) eval_dataset = dataset.filter( @@ -55,15 +56,15 @@ training_arguments = TrainingArguments( output_dir=output_dir, - num_train_epochs=11, + num_train_epochs=5, gradient_checkpointing=True, - per_device_train_batch_size=1, + per_device_train_batch_size=4, gradient_accumulation_steps=8, 
optim="paged_adamw_32bit", save_steps=0, logging_steps=10, - learning_rate=6.295127877143855e-06, - weight_decay=0.00021573996504024309, + learning_rate=1.344609154868106e-05, + weight_decay=0.00019307024914471071, fp16=True, bf16=False, max_grad_norm=0.3, From c5d99bff7948998a268c03ba3e0282ff7a47f80b Mon Sep 17 00:00:00 2001 From: julioc-p <14-10820@usb.ve> Date: Mon, 8 Jul 2024 13:58:09 +0200 Subject: [PATCH 10/16] using 1 200 th of the dataset Signed-off-by: julioc-p <14-10820@usb.ve> --- src/hpc_scripts/model_training.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hpc_scripts/model_training.py b/src/hpc_scripts/model_training.py index f3b510e..535994e 100644 --- a/src/hpc_scripts/model_training.py +++ b/src/hpc_scripts/model_training.py @@ -36,11 +36,11 @@ "Kubermatic/Merged_QAs", split="train") random.seed(42) -l = len(dataset) // 25 +l = len(dataset) // 200 random_indices = random.sample(range(len(dataset)), k=l) -training_indices = random_indices[:-l//25] -eval_indices = random_indices[-l//25:] +training_indices = random_indices[:-l//160] +eval_indices = random_indices[-l//160:] training_dataset = dataset.filter( lambda _, idx: idx in training_indices, with_indices=True) eval_dataset = dataset.filter( From 53d28091a6331b3d1be9e7aa4f57bbae21cddfab Mon Sep 17 00:00:00 2001 From: julioc-p <14-10820@usb.ve> Date: Mon, 8 Jul 2024 18:45:32 +0200 Subject: [PATCH 11/16] division factor in model training and taking 80% for training and 20% for evaluation Signed-off-by: julioc-p <14-10820@usb.ve> --- src/hpc_scripts/model_training.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/hpc_scripts/model_training.py b/src/hpc_scripts/model_training.py index 535994e..6c0f0a4 100644 --- a/src/hpc_scripts/model_training.py +++ b/src/hpc_scripts/model_training.py @@ -36,11 +36,12 @@ "Kubermatic/Merged_QAs", split="train") random.seed(42) -l = len(dataset) // 200 +division_factor = 200 +l = len(dataset) // division_factor random_indices = random.sample(range(len(dataset)), k=l) -training_indices = random_indices[:-l//160] -eval_indices = random_indices[-l//160:] +training_indices = random_indices[:-l//5] +eval_indices = random_indices[-l//5:] training_dataset = dataset.filter( lambda _, idx: idx in training_indices, with_indices=True) eval_dataset = dataset.filter( From c41a76b3c2924a891e195d6980f2ccbd491d0173 Mon Sep 17 00:00:00 2001 From: julioc-p <14-10820@usb.ve> Date: Tue, 9 Jul 2024 11:15:16 +0200 Subject: [PATCH 12/16] eval accumulation steps and eval steps Signed-off-by: julioc-p <14-10820@usb.ve> --- src/hpc_scripts/model_training.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/hpc_scripts/model_training.py b/src/hpc_scripts/model_training.py index 6c0f0a4..f79f0b0 100644 --- a/src/hpc_scripts/model_training.py +++ b/src/hpc_scripts/model_training.py @@ -36,7 +36,7 @@ "Kubermatic/Merged_QAs", split="train") random.seed(42) -division_factor = 200 +division_factor = 50 l = len(dataset) // division_factor random_indices = random.sample(range(len(dataset)), k=l) @@ -76,7 +76,8 @@ report_to="tensorboard", disable_tqdm=False, load_best_model_at_end=True, - evaluation_strategy='steps', + eval_accumulation_steps=1, + eval_steps=500, # debug="underflow_overflow" ) @@ -116,7 +117,7 @@ def formatting_func(example): formatting_func=formatting_func, tokenizer=tokenizer, max_seq_length=max_seq_length, - callbacks=[EarlyStoppingCallback(early_stopping_patience=5)], + 
callbacks=[EarlyStoppingCallback(early_stopping_patience=10)], eval_dataset=eval_dataset ) trainer.train() From 78e26500f82a992bd527ab5dee556931769212dd Mon Sep 17 00:00:00 2001 From: julioc-p <14-10820@usb.ve> Date: Tue, 9 Jul 2024 11:22:43 +0200 Subject: [PATCH 13/16] evaluation strategy steps Signed-off-by: julioc-p <14-10820@usb.ve> --- src/hpc_scripts/model_training.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/hpc_scripts/model_training.py b/src/hpc_scripts/model_training.py index f79f0b0..b9d8aeb 100644 --- a/src/hpc_scripts/model_training.py +++ b/src/hpc_scripts/model_training.py @@ -77,6 +77,7 @@ disable_tqdm=False, load_best_model_at_end=True, eval_accumulation_steps=1, + evaluation_strategy='steps', eval_steps=500, # debug="underflow_overflow" ) From 4f8eedafa03e2e3a423b75f70789742ffe40f2b5 Mon Sep 17 00:00:00 2001 From: julioc-p <14-10820@usb.ve> Date: Tue, 9 Jul 2024 17:36:11 +0200 Subject: [PATCH 14/16] training dataset separated Signed-off-by: julioc-p <14-10820@usb.ve> --- src/hpc_scripts/model_training.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/hpc_scripts/model_training.py b/src/hpc_scripts/model_training.py index b9d8aeb..6d39a4d 100644 --- a/src/hpc_scripts/model_training.py +++ b/src/hpc_scripts/model_training.py @@ -79,6 +79,7 @@ eval_accumulation_steps=1, evaluation_strategy='steps', eval_steps=500, + per_device_eval_batch_size=4 # debug="underflow_overflow" ) @@ -112,7 +113,7 @@ def formatting_func(example): trainer = SFTTrainer( model=model, - train_dataset=dataset, + train_dataset=training_dataset, args=training_arguments, peft_config=lora_config, formatting_func=formatting_func, From 167ab6cc4adb9fd873b8f1a59ffb39af1d4da7d4 Mon Sep 17 00:00:00 2001 From: julioc-p <14-10820@usb.ve> Date: Tue, 9 Jul 2024 18:17:58 +0200 Subject: [PATCH 15/16] attention eager and using train_test_split Signed-off-by: julioc-p <14-10820@usb.ve> --- src/hpc_scripts/model_training.py | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/src/hpc_scripts/model_training.py b/src/hpc_scripts/model_training.py index 6d39a4d..d30aa5e 100644 --- a/src/hpc_scripts/model_training.py +++ b/src/hpc_scripts/model_training.py @@ -1,4 +1,3 @@ -import random from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, @@ -26,26 +25,18 @@ bnb_4bit_compute_dtype=torch.bfloat16 ) -tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='right') -# TODO: Check if this can be changed to AutoModelForQuestionAnswering with GEMMA -model = AutoModelForCausalLM.from_pretrained( - model_id, quantization_config=bnb_config, device_map="auto") - -# Training Data dataset = load_dataset( "Kubermatic/Merged_QAs", split="train") +dataset.shuffle(42) +dataset = dataset.train_test_split(test_size=0.2) -random.seed(42) -division_factor = 50 -l = len(dataset) // division_factor -random_indices = random.sample(range(len(dataset)), k=l) +print(dataset["train"]) +print(dataset["test"]) -training_indices = random_indices[:-l//5] -eval_indices = random_indices[-l//5:] -training_dataset = dataset.filter( - lambda _, idx: idx in training_indices, with_indices=True) -eval_dataset = dataset.filter( - lambda _, idx: idx in eval_indices, with_indices=True) +tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='right') +# TODO: Check if this can be changed to AutoModelForQuestionAnswering with GEMMA +model = AutoModelForCausalLM.from_pretrained( + model_id, quantization_config=bnb_config, 
device_map="auto", attn_implementation='eager') # Training (hyper)parameters (initial config taken from: https://medium.com/@lucamassaron/sherlock-holmes-q-a-enhanced-with-gemma-2b-it-fine-tuning-2907b06d2645) @@ -113,14 +104,14 @@ def formatting_func(example): trainer = SFTTrainer( model=model, - train_dataset=training_dataset, + train_dataset=dataset, args=training_arguments, peft_config=lora_config, formatting_func=formatting_func, tokenizer=tokenizer, max_seq_length=max_seq_length, - callbacks=[EarlyStoppingCallback(early_stopping_patience=10)], - eval_dataset=eval_dataset + callbacks=[EarlyStoppingCallback(early_stopping_patience=15)], + eval_dataset=dataset, ) trainer.train() print("Model is trained") From 8d8ebb4bcb21859fd644faf47af1b75a95300861 Mon Sep 17 00:00:00 2001 From: julioc-p <14-10820@usb.ve> Date: Tue, 9 Jul 2024 21:16:39 +0200 Subject: [PATCH 16/16] training and eval dataset much smaller Signed-off-by: julioc-p <14-10820@usb.ve> --- src/hpc_scripts/model_training.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/hpc_scripts/model_training.py b/src/hpc_scripts/model_training.py index d30aa5e..426bd47 100644 --- a/src/hpc_scripts/model_training.py +++ b/src/hpc_scripts/model_training.py @@ -28,10 +28,7 @@ dataset = load_dataset( "Kubermatic/Merged_QAs", split="train") dataset.shuffle(42) -dataset = dataset.train_test_split(test_size=0.2) - -print(dataset["train"]) -print(dataset["test"]) +dataset = dataset.train_test_split(train_size=0.20, test_size=0.04) tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='right') # TODO: Check if this can be changed to AutoModelForQuestionAnswering with GEMMA @@ -104,14 +101,14 @@ def formatting_func(example): trainer = SFTTrainer( model=model, - train_dataset=dataset, + train_dataset=dataset["train"], args=training_arguments, peft_config=lora_config, formatting_func=formatting_func, tokenizer=tokenizer, max_seq_length=max_seq_length, callbacks=[EarlyStoppingCallback(early_stopping_patience=15)], - eval_dataset=dataset, + eval_dataset=dataset["test"], ) trainer.train() print("Model is trained")