From a3efab8cd56fcf99ccbd468edf9819799017b3d6 Mon Sep 17 00:00:00 2001
From: zzhhjjj
Date: Thu, 2 May 2024 11:50:21 +0000
Subject: [PATCH] test llama example with 8-t4

---
 .github/workflows/fa2_unit_tests.yaml        |  3 --
 .github/workflows/llama_tests.yaml           |  7 +++--
 tests/{test_train_llama.py => test_llama.py} | 29 +++++++++++++++++++++++++-
 3 files changed, 33 insertions(+), 6 deletions(-)
 rename tests/{test_train_llama.py => test_llama.py} (79%)

diff --git a/.github/workflows/fa2_unit_tests.yaml b/.github/workflows/fa2_unit_tests.yaml
index e7eacc94..c5b3346f 100644
--- a/.github/workflows/fa2_unit_tests.yaml
+++ b/.github/workflows/fa2_unit_tests.yaml
@@ -57,6 +57,3 @@ jobs:
         # NOTE: -m fa2 will only run the unit tests that have the mark
         # "fa2" (these are FA2-related tests)
         run: pytest -m fa2 --color=yes --durations=0 --ignore tests/fp8 --verbose tests/
-
-      - name: Run tiny Llama example
-        run: ./examples/train_tiny_llama.sh
diff --git a/.github/workflows/llama_tests.yaml b/.github/workflows/llama_tests.yaml
index 61798d09..eee801c7 100644
--- a/.github/workflows/llama_tests.yaml
+++ b/.github/workflows/llama_tests.yaml
@@ -51,5 +51,8 @@ jobs:
       - name: Show installed libraries and their versions
         run: pip freeze | tee installed.txt
 
-      - name: Run Llama loss tests
-        run: pytest -sv tests/test_train_llama.py
+      - name: Run Llama example
+        run: pytest --verbose tests/test_llama.py::test_tiny_llama
+
+      - name: Run Llama loss test
+        run: pytest --verbose tests/test_llama.py::test_train_llama
diff --git a/tests/test_train_llama.py b/tests/test_llama.py
similarity index 79%
rename from tests/test_train_llama.py
rename to tests/test_llama.py
index 5e7030eb..942c45bc 100644
--- a/tests/test_train_llama.py
+++ b/tests/test_llama.py
@@ -12,6 +12,9 @@
 TRAIN_SCRIPT = "run_train.py"
 NUM_GPUS = 8
 
+TINY_LLAMA_CONFIG_FILE = "examples/config_tiny_llama.yaml"
+TINY_LLAMA_CREATE_CONFIG_FILE = "examples/config_tiny_llama.py"
+
 ## Experiment results:
 ## 100 steps: 3.28
 ## 160 steps: 2.83
@@ -42,7 +45,7 @@ def extract_loss(line):
     raise ValueError(f"Could not extract loss value from line: {line}")
 
 
-def test_tiny_llama():
+def test_train_llama():
     # create CONFIG_FILE
     cmd = f"python {CREATE_CONFIG_FILE}"
     subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
@@ -76,6 +79,30 @@
     assert process.returncode == 0
 
 
+# Also run the tiny llama example; we only want to assert that it runs successfully.
+def test_tiny_llama():
+    # create CONFIG_FILE
+    cmd = f"python {TINY_LLAMA_CREATE_CONFIG_FILE}"
+    subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+
+    # run training
+    # set DISABLE_FLASH_ATTENTION=1 to replace the flash attention implementations
+    cmd = f'DISABLE_FLASH_ATTENTION=1 FI_PROVIDER="efa" CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node={NUM_GPUS} --rdzv_endpoint=localhost:29800 {TRAIN_SCRIPT} --config-file {TINY_LLAMA_CONFIG_FILE}'
+    os.setpgrp()  # create a new process group and become its leader
+    atexit.register(exit_with_children)  # kill all child processes when this process exits
+
+    process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    while True:
+        line = process.stdout.readline()
+        if process.poll() is not None and line == b"":
+            break
+        if line:
+            print(line.decode("utf-8"), end="")
+
+    process.wait()  # Wait for the process to finish
+    assert process.returncode == 0
+
+
 if __name__ == "__main__":
     cmd = f"python {CREATE_CONFIG_FILE}"
     subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
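
A minimal sketch, not part of the patch: test_train_llama and test_tiny_llama duplicate the same launch-and-stream subprocess logic, and the two could share a helper like the one below. The helper name run_and_stream is hypothetical, and the exit_with_children body here is an assumption about what the existing helper in the test module does (kill the test's process group so stray torchrun workers do not outlive the test).

import atexit
import os
import signal
import subprocess


def exit_with_children():
    # Assumption: mirrors the existing exit_with_children helper by signalling
    # the whole process group (the test is the group leader after os.setpgrp),
    # so leftover torchrun workers are cleaned up when the test process exits.
    os.killpg(0, signal.SIGKILL)


def run_and_stream(cmd: str) -> int:
    # Run `cmd` in a shell, echo its combined stdout/stderr line by line,
    # and return the exit code so callers can assert run_and_stream(cmd) == 0.
    os.setpgrp()  # create a new process group and become its leader
    atexit.register(exit_with_children)  # kill child processes when this process exits

    process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    while True:
        line = process.stdout.readline()
        if process.poll() is not None and line == b"":
            break
        if line:
            print(line.decode("utf-8"), end="")
    process.wait()
    return process.returncode

With such a helper, each test would reduce to building its command string (config creation plus the torchrun invocation shown in the patch) and asserting that run_and_stream returns 0.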