From a3efab8cd56fcf99ccbd468edf9819799017b3d6 Mon Sep 17 00:00:00 2001
From: zzhhjjj
Date: Thu, 2 May 2024 11:50:21 +0000
Subject: [PATCH] test llama example with 8-t4

---
 .github/workflows/fa2_unit_tests.yaml        |  3 --
 .github/workflows/llama_tests.yaml           |  7 +++--
 tests/{test_train_llama.py => test_llama.py} | 29 +++++++++++++++++++++++++-
 3 files changed, 33 insertions(+), 6 deletions(-)
 rename tests/{test_train_llama.py => test_llama.py} (79%)

diff --git a/.github/workflows/fa2_unit_tests.yaml b/.github/workflows/fa2_unit_tests.yaml
index e7eacc94..c5b3346f 100644
--- a/.github/workflows/fa2_unit_tests.yaml
+++ b/.github/workflows/fa2_unit_tests.yaml
@@ -57,6 +57,3 @@ jobs:
         # NOTE: -m fa2 will only run the unit tests that have the mark
         # "fa2" (these are FA2-related tests)
         run: pytest -m fa2 --color=yes --durations=0 --ignore tests/fp8 --verbose tests/
-
-      - name: Run tiny Llama example
-        run: ./examples/train_tiny_llama.sh
diff --git a/.github/workflows/llama_tests.yaml b/.github/workflows/llama_tests.yaml
index 61798d09..eee801c7 100644
--- a/.github/workflows/llama_tests.yaml
+++ b/.github/workflows/llama_tests.yaml
@@ -51,5 +51,8 @@ jobs:
       - name: Show installed libraries and their versions
         run: pip freeze | tee installed.txt
 
-      - name: Run Llama loss tests
-        run: pytest -sv tests/test_train_llama.py
+      - name: Run Llama example
+        run: pytest --verbose tests/test_llama.py::test_tiny_llama
+
+      - name: Run Llama loss test
+        run: pytest --verbose tests/test_llama.py::test_train_llama
diff --git a/tests/test_train_llama.py b/tests/test_llama.py
similarity index 79%
rename from tests/test_train_llama.py
rename to tests/test_llama.py
index 5e7030eb..942c45bc 100644
--- a/tests/test_train_llama.py
+++ b/tests/test_llama.py
@@ -12,6 +12,9 @@
 TRAIN_SCRIPT = "run_train.py"
 NUM_GPUS = 8
 
+TINY_LLAMA_CONFIG_FILE = "examples/config_tiny_llama.yaml"
+TINY_LLAMA_CREATE_CONFIG_FILE = "examples/config_tiny_llama.py"
+
 ## Experiment results:
 ## 100 steps: 3.28
 ## 160 steps: 2.83
@@ -42,7 +45,7 @@ def extract_loss(line):
     raise ValueError(f"Could not extract loss value from line: {line}")
 
 
-def test_tiny_llama():
+def test_train_llama():
     # create CONFIG_FILE
     cmd = f"python {CREATE_CONFIG_FILE}"
     subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
@@ -76,6 +79,30 @@
     assert process.returncode == 0
 
 
+# Also run the tiny llama example; we only want to assert that it runs successfully.
+def test_tiny_llama():
+    # create CONFIG_FILE
+    cmd = f"python {TINY_LLAMA_CREATE_CONFIG_FILE}"
+    subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+
+    # run training
+    # set DISABLE_FLASH_ATTENTION=1 to replace the flash attention implementations
+    cmd = f'DISABLE_FLASH_ATTENTION=1 FI_PROVIDER="efa" CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node={NUM_GPUS} --rdzv_endpoint=localhost:29800 {TRAIN_SCRIPT} --config-file {TINY_LLAMA_CONFIG_FILE}'
+    os.setpgrp()  # create a new process group and become its leader
+    atexit.register(exit_with_children)  # kill all child processes when this process exits
+
+    process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    while True:
+        line = process.stdout.readline()
+        if process.poll() is not None and line == b"":
+            break
+        if line:
+            print(line.decode("utf-8"), end="")
+
+    process.wait()  # Wait for the process to finish
+    assert process.returncode == 0
+
+
 if __name__ == "__main__":
     cmd = f"python {CREATE_CONFIG_FILE}"
     subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
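
A minimal sketch, not part of the patch: test_train_llama and test_tiny_llama duplicate the same launch-and-stream subprocess logic, and the two could share a helper like the one below. The helper name run_and_stream is hypothetical, and the exit_with_children body here is an assumption about what the existing helper in the test module does (kill the test's process group so stray torchrun workers do not outlive the test).

import atexit
import os
import signal
import subprocess


def exit_with_children():
    # Assumption: mirrors the existing exit_with_children helper by signalling
    # the whole process group (the test is the group leader after os.setpgrp),
    # so leftover torchrun workers are cleaned up when the test process exits.
    os.killpg(0, signal.SIGKILL)


def run_and_stream(cmd: str) -> int:
    # Run `cmd` in a shell, echo its combined stdout/stderr line by line,
    # and return the exit code so callers can assert run_and_stream(cmd) == 0.
    os.setpgrp()  # create a new process group and become its leader
    atexit.register(exit_with_children)  # kill child processes when this process exits

    process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    while True:
        line = process.stdout.readline()
        if process.poll() is not None and line == b"":
            break
        if line:
            print(line.decode("utf-8"), end="")
    process.wait()
    return process.returncode

With such a helper, each test would reduce to building its command string (config creation plus the torchrun invocation shown in the patch) and asserting that run_and_stream returns 0.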