Commit 7033d24

yaml

zzhhjjj committed Apr 30, 2024
1 parent 8f01f82 commit 7033d24

Showing 5 changed files with 7 additions and 6 deletions.
1 change: 1 addition & 0 deletions .github/workflows/3d_parallelism_unit_tests.yaml
@@ -59,6 +59,7 @@ jobs:
 --durations=0 \
 --ignore tests/kernels \
 --ignore tests/fp8 \
+--ignore tests/test_train_llama.py \
 --verbose \
 tests/
 # NOTE: T4 can't run FA2, DoReMi's LLaMa needs FA2
2 changes: 1 addition & 1 deletion .github/workflows/llama_tests.yaml
@@ -19,7 +19,7 @@ on:
 jobs:
   tests:
     # NOTE: 8-a10 to run LLama
-    runs-on: [multi-gpu, nvidia-gpu, 8-a10, ci]
+    runs-on: [multi-gpu, nvidia-gpu, 4-a10, ci]
     container:
       image: runpod/pytorch:2.1.1-py3.10-cuda12.1.1-devel-ubuntu22.04
       ports:
4 changes: 2 additions & 2 deletions examples/config_train_llama.py
@@ -73,7 +73,7 @@
 )

 parallelism = ParallelismArgs(
-    dp=4,
+    dp=2,
     pp=1,
     tp=2,
     pp_engine="1f1b",
@@ -82,7 +82,7 @@
 )

 # a global batch-size of 1M tokens. micro_batch_size * dp * sequence_length * batch_accumulation_per_replica
-tokens = TokensArgs(sequence_length=512, train_steps=200, micro_batch_size=128, batch_accumulation_per_replica=4)
+tokens = TokensArgs(sequence_length=512, train_steps=200, micro_batch_size=128, batch_accumulation_per_replica=8)

 checkpoints_path = os.path.dirname(os.path.dirname(__file__)) + "/checkpoints"
 os.makedirs(checkpoints_path, exist_ok=True)
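
For reference, the global batch-size comment above can be checked against the new values. A minimal sketch (the variable names simply mirror the config fields; this snippet is illustrative and not part of the commit):

# Global batch size per the comment:
# micro_batch_size * dp * sequence_length * batch_accumulation_per_replica
micro_batch_size = 128
dp = 2
sequence_length = 512
batch_accumulation_per_replica = 8
global_batch_tokens = micro_batch_size * dp * sequence_length * batch_accumulation_per_replica
print(global_batch_tokens)  # 1048576 (~1M tokens), same as the previous dp=4 / accumulation=4 setting
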
4 changes: 2 additions & 2 deletions examples/config_train_llama.yaml
@@ -75,7 +75,7 @@ optimizer:
   weight_decay: 0.01
   zero_stage: 0
 parallelism:
-  dp: 4
+  dp: 2
   expert_parallel_size: 1
   pp: 1
   pp_engine: 1f1b
@@ -88,7 +88,7 @@ tokenizer:
   tokenizer_name_or_path: gpt2
   tokenizer_revision: null
 tokens:
-  batch_accumulation_per_replica: 4
+  batch_accumulation_per_replica: 8
   limit_test_batches: 0
   limit_val_batches: 0
   micro_batch_size: 128
2 changes: 1 addition & 1 deletion tests/test_train_llama.py
@@ -14,7 +14,7 @@
 CONFIG_FILE = "examples/config_train_llama.yaml"
 CREATE_CONFIG_FILE = "examples/config_train_llama.py"
 TRAIN_SCRIPT = "run_train.py"
-NUM_GPUS = 8
+NUM_GPUS = 4

 ## 100+ steps: lm_loss < 3.5
 ## 200 steps: lm_loss < 3
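
As a sanity check on the GPU-count changes (the 4-a10 runner and NUM_GPUS = 4), the world size implied by the updated parallelism settings can be worked out. A minimal sketch, assuming the usual world_size = dp * tp * pp decomposition (not part of the commit):

# GPUs required by the updated config: dp=2, tp=2, pp=1
dp, tp, pp = 2, 2, 1
world_size = dp * tp * pp
print(world_size)  # 4 -> matches NUM_GPUS = 4; the old dp=4, tp=2, pp=1 needed 8 GPUs (the 8-a10 runner)
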
