Skip to content

Commit

Permalink
add workflows
Browse files Browse the repository at this point in the history
  • Loading branch information
emattia committed Aug 30, 2024
1 parent 8914bcc commit 25a5204
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 23 deletions.
36 changes: 19 additions & 17 deletions finetune_hf_peft.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,21 @@
import os
import json
from metaflow import FlowSpec, step, IncludeFile, Parameter, secrets, resources, retry, pypi, huggingface_card, nvidia, S3
from metaflow import FlowSpec, step, IncludeFile, Parameter, secrets, resources, secrets, retry, pypi_base, huggingface_card, kubernetes, S3
from metaflow.profilers import gpu_profile
from exceptions import GatedRepoError, GATED_HF_ORGS

@pypi_base(packages={
'datasets': '',
'torch': '',
'transformers': '',
'peft': '',
'trl': '',
'accelerate': '',
'bitsandbytes': '',
'sentencepiece': '',
'safetensors': '',
'requests': ''
})
class FinetuneLlama3LoRA(FlowSpec):

script_args_file = IncludeFile(
Expand All @@ -19,7 +31,6 @@ class FinetuneLlama3LoRA(FlowSpec):
help="Flag for a smoke test"
)

@pypi(disabled=True)
@secrets(sources=["huggingface-token"])
@step
def start(self):
Expand All @@ -33,25 +44,17 @@ def start(self):
raise GatedRepoError(self.script_args.dataset_name)
self.next(self.sft)

@pypi(packages={
'datasets': '',
'torch': '',
'transformers': '',
'peft': '',
'trl': '',
'accelerate': '',
'bitsandbytes': '',
'sentencepiece': '',
'safetensors': ''
})
@gpu_profile(interval=1)
@huggingface_card
@nvidia
@secrets(sources=["huggingface-token"])
@kubernetes(gpu=1)
@step
def sft(self):
import os
from my_peft_tools import create_model, create_trainer, save_model, get_tar_bytes
import huggingface_hub
huggingface_hub.login('hf_***REDACTED***')  # NOTE(review): a real token was committed here and is visible in git history — revoke it on huggingface.co immediately, even though this commit removes it

huggingface_hub.login(os.environ['HF_TOKEN']) # contained in huggingface-token secret
model, tokenizer = create_model(self.script_args)
trainer = create_trainer(self.script_args, tokenizer, model, smoke=self.smoke, card=True)
trainer.train()
Expand All @@ -62,11 +65,10 @@ def sft(self):
s3.put('lora_merged.tar.gz', get_tar_bytes(merge_output_dirname))
self.next(self.end)

@pypi(disabled=True)
@step
def end(self):
print("Training completed successfully!")


if __name__ == '__main__':
FinetuneLlama3LoRA()
FinetuneLlama3LoRA()
6 changes: 3 additions & 3 deletions hf_peft_args.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"local_rank": -1,
"per_device_train_batch_size": 1,
"per_device_eval_batch_size": 4,
"per_device_train_batch_size": 16,
"per_device_eval_batch_size": 16,
"gradient_accumulation_steps": 17,
"learning_rate": 3e-4,
"max_grad_norm": 0.3,
Expand Down Expand Up @@ -30,4 +30,4 @@
"logging_steps": 5,
"merge": false,
"output_dir": "./lora_checkpoints"
}
}
1 change: 0 additions & 1 deletion huggingface_model_card.json

This file was deleted.

4 changes: 2 additions & 2 deletions my_peft_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,13 +186,13 @@ def save_model(args, trainer, dirname="final", merge_dirname="final_merged_check
if args.merge:
"""
This conditional block merges the LoRA adapter with the original model weights.
NOTE: For use with NIM, we do not need to do the merge, the adapter_config.json
NOTE: For use with NIM, we do not need to do the merge.
"""
model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16)
model = model.merge_and_unload()
output_merged_dir = os.path.join(args.output_dir, merge_dirname)
model.save_pretrained(output_merged_dir, safe_serialization=True)
return output_dir, merge_dirname
return output_dir, output_merged_dir
else:
return output_dir, None

Expand Down

0 comments on commit 25a5204

Please sign in to comment.