Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
eec75c6
add CI for torchtitan
wwwjn Jul 14, 2025
12086e0
lint
wwwjn Jul 14, 2025
33dff7d
refactor CI
wwwjn Jul 16, 2025
6a2f5b2
add integration test
wwwjn Jul 20, 2025
70f3338
refactor v1
wwwjn Jul 21, 2025
0019a97
remove use_for_integration_test
wwwjn Jul 21, 2025
8c5421d
rename
wwwjn Jul 21, 2025
5096c97
rebase v2
wwwjn Jul 25, 2025
89d7dea
refactor
wwwjn Jul 25, 2025
fe56c32
change commandline
wwwjn Jul 25, 2025
55ea2f2
change commandline
wwwjn Jul 25, 2025
96398d3
fix parameter name for configs
wwwjn Jul 25, 2025
be8ab9d
temporarily disable pp tests
wwwjn Jul 25, 2025
d19990c
delete filename
wwwjn Jul 28, 2025
4352b26
refactor v2
wwwjn Jul 29, 2025
7ff504b
fix readme
wwwjn Jul 29, 2025
c06bb6b
rebase
wwwjn Jul 29, 2025
17065bb
rebase to main
wwwjn Jul 29, 2025
08b845c
fix test failures
wwwjn Jul 29, 2025
e36cc57
lint
wwwjn Jul 29, 2025
ca83b11
fix CI error
wwwjn Jul 29, 2025
5065202
rebase
wwwjn Jul 30, 2025
4d52b40
rebase
wwwjn Jul 31, 2025
e0f6e3d
change badge
wwwjn Jul 31, 2025
2a3d719
rebase to main
wwwjn Jul 31, 2025
d44310f
fix readme
wwwjn Aug 6, 2025
bde53eb
refactor wip
wwwjn Aug 9, 2025
1441938
refactor logic
wwwjn Aug 11, 2025
fb0d808
rebase
wwwjn Aug 11, 2025
20849be
restore
wwwjn Aug 11, 2025
a5f4291
fix flux tests
wwwjn Aug 11, 2025
f906330
fix lint
wwwjn Aug 13, 2025
21cf580
rebase
wwwjn Aug 25, 2025
b315f19
fix
wwwjn Aug 27, 2025
847e101
fix README
wwwjn Aug 27, 2025
b80a42a
fix flux
wwwjn Aug 27, 2025
f4fe480
fix List -> list
wwwjn Aug 27, 2025
e46f7e8
fix list
wwwjn Aug 27, 2025
bb53357
fix lint
wwwjn Aug 27, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions .github/workflows/integration_test_8gpu_features.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Runs the `features` integration-test suite with 8 GPUs.
name: 8 GPU Feature Tests

# Triggers: pushes to main, pull requests, and a fixed schedule.
# Pushes and PRs that only touch torchtitan/experiments/ are ignored.
on:
  push:
    branches: [ main ]
    paths-ignore:
      - 'torchtitan/experiments/**'
  pull_request:
    paths-ignore:
      - 'torchtitan/experiments/**'
  schedule:
    # Runs every 6 hours
    - cron: '0 */6 * * *'

# On main the group key contains the run number, so every run gets its own
# group and is never cancelled. On any other ref the key is the ref itself,
# so a newer run cancels the in-progress run for the same ref.
concurrency:
  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

# Login shell (-l) with errexit and pipefail for every `run` step.
defaults:
  run:
    shell: bash -l -eo pipefail {0}

jobs:
  build-test:
    # Delegates to the reusable Linux job workflow from pytorch/test-infra.
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      runner: linux.g5.48xlarge.nvidia.gpu
      gpu-arch-type: cuda
      # Quoted so YAML keeps the version as the string "12.6", not a float.
      gpu-arch-version: "12.6"
      # This image is faster to clone than the default, but it lacks CC needed by triton
      # (1m25s vs 2m37s).
      docker-image: torchtitan-ubuntu-20.04-clang12
      repository: pytorch/torchtitan
      upload-artifact: outputs
      script: |
        set -eux

        # The generic Linux job chooses to use base env, not the one setup by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"

        # Log CUDA driver version for debugging.
        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
        echo "CUDA driver version: ${DRIVER_VERSION}"

        pip config --user set global.progress_bar off

        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126

        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126

        mkdir artifacts-to-be-uploaded
        python -m tests.integration_tests.run_tests --test_suite features artifacts-to-be-uploaded --ngpu 8
2 changes: 1 addition & 1 deletion .github/workflows/integration_test_8gpu_h100.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,4 @@ jobs:
mkdir artifacts-to-be-uploaded

# Enable CPP stacktraces for debugging symmetric memory initialization errors.
TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests_h100 artifacts-to-be-uploaded --ngpu 8
TORCH_SHOW_CPP_STACKTRACES=1 python -m tests.integration_tests.run_tests --test_suite h100 artifacts-to-be-uploaded --ngpu 8
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: 8 GPU Integration Test
name: 8 GPU Model Tests

on:
push:
Expand Down Expand Up @@ -50,4 +50,4 @@ jobs:
USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126

mkdir artifacts-to-be-uploaded
python -m tests.integration_tests artifacts-to-be-uploaded --ngpu 8
python -m tests.integration_tests.run_tests --test_suite models artifacts-to-be-uploaded --ngpu 8
2 changes: 1 addition & 1 deletion .github/workflows/integration_test_8gpu_torchft.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,5 +53,5 @@ jobs:
RUST_BACKTRACE=1 torchft_lighthouse --min_replicas 1 --quorum_tick_ms 100 --join_timeout_ms 10000 > /dev/null 2>&1 &
echo "ft_integration_test"
# Getting error - Cuda failure 217 'peer access is not supported between these two devices'
python -m tests.integration_tests_ft artifacts-to-be-uploaded --ngpu 8
python -m tests.integration_tests.ft artifacts-to-be-uploaded --ngpu 8
# pkill -9 torchft_lighthouse
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

#### A PyTorch native platform for training generative AI models

[![integration tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu.yaml?query=branch%3Amain)
[![8 GPU Feature Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_features.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_features.yaml?query=branch%3Amain)
[![8 GPU Model Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_models.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_models.yaml?query=branch%3Amain)
[![arXiv](https://img.shields.io/badge/arXiv-2410.06511-b31b1b.svg)](https://arxiv.org/abs/2410.06511)
[![ICLR](https://img.shields.io/badge/ICLR-2025-violet.svg)](https://iclr.cc/virtual/2025/poster/29620)
[![forum](https://img.shields.io/badge/pytorch-forum-DE3412.svg)](https://discuss.pytorch.org/c/distributed/torchtitan/44)
Expand Down
34 changes: 21 additions & 13 deletions tests/README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
# Tests

This directory contains tests for the TorchTitan project, including unit tests and integration tests.
This directory contains tests for the torchtitan project, including unit tests and integration tests.

## Test Structure

- `unit_tests/`: Contains unit tests for individual components
- `integration_tests.py`: Contains integration tests that test multiple components together
- `integration_tests_h100.py`: Contains integration tests specifically designed for H100 GPUs, which utilize symmetric memory and float8.
- `integration_tests/`: Contains integration tests that test multiple components together
- `base_config.toml`: Base configuration file for integration tests
- `features.py`: Tests for torchtitan features and composability
- `ft.py`: Fault-tolerance integration tests
- `h100.py`: Test cases for H100 GPUs
- `models.py`: Tests for specific model architectures and configurations
- `assets/`: Contains test assets and fixtures used by the tests
- `tokenizer/`: Tokenizer configuration and vocabulary files for testing
- `custom_schedule.csv`: Custom PP schedule for testing

## Running Tests

Expand All @@ -16,7 +22,7 @@ This directory contains tests for the TorchTitan project, including unit tests a
Ensure you have all development dependencies installed:

```bash
pip install -r dev-requirements.txt
pip install -r requirements-dev.txt
pip install -r requirements.txt
```

Expand All @@ -25,25 +31,27 @@ pip install -r requirements.txt
To run the integration tests:

```bash
python ./tests/integration_tests.py <output_dir> [--config_dir CONFIG_DIR] [--test TEST] [--ngpu NGPU]
python -m tests.integration_tests.run_tests <output_dir> [--config_path CONFIG_PATH] [--test_suite TEST_SUITE] [--test_name TEST_NAME] [--ngpu NGPU]
```

Arguments:
- `output_dir`: (Required) Directory where test outputs will be stored
- `--config_dir`: (Optional) Directory containing configuration files (default: "./torchtitan/models/llama3/train_configs")
- `--test`: (Optional) Specific test to run, use test names from the `build_test_list()` function (default: "all")
- `--test_suite`: (Optional) Specific test suite to run by name (default: "features")
- `--config_path`: (Optional) Path to the base config file (default: "./tests/integration_tests/base_config.toml")
- `--test_name`: (Optional) Specific test to run by name (default: "all")
- `--ngpu`: (Optional) Number of GPUs to use for testing (default: 8)

Examples:
```bash
# Run all integration tests with 8 GPUs
python ./tests/integration_tests.py ./test_output
# Run all feature integration tests (the default test suite) with 8 GPUs
python -m tests.integration_tests.run_tests test_output

# Run a specific test with 4 GPUs
python ./tests/integration_tests.py ./test_output --test default --ngpu 4
# Run only core functionality tests for features
python -m tests.integration_tests.run_tests test_output --test_suite features

# Run a specific test with 2 GPUs
python -m tests.integration_tests.run_tests test_output --test_suite features --test_name gradient_accumulation --ngpu 2

# Run all tests with a custom config directory
python ./tests/integration_tests.py ./test_output --config_dir ./my_configs
```

### Running Unit Tests
Expand Down
27 changes: 27 additions & 0 deletions tests/integration_tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from dataclasses import dataclass
from typing import Sequence

__all__ = [
"OverrideDefinitions",
]


@dataclass
class OverrideDefinitions:
    """
    Definition of one integration-test entry: its override arguments and labels.

    Attributes:
        override_args: Override argument strings, annotated as a sequence of
            sequences of strings.
        test_descr: Description of the test; this is what ``repr()`` returns.
        test_name: Name of the test.
        ngpu: Integer GPU count for the test (presumably the number of GPUs
            the test is launched with -- confirm against the runner).
    """

    # The original default was spelled `tuple(tuple(" "))`, which evaluates to
    # the flat tuple `(" ",)`; the literal below is the same value.
    # NOTE(review): the annotation suggests a nested default such as
    # `((" ",),)` may have been intended -- confirm before changing the value.
    override_args: Sequence[Sequence[str]] = (" ",)
    test_descr: str = "default"
    test_name: str = "default"
    ngpu: int = 4

    def __repr__(self) -> str:
        # Defined explicitly in the class body, so @dataclass does not generate
        # its own field-by-field __repr__ over this one.
        return self.test_descr
76 changes: 76 additions & 0 deletions tests/integration_tests/base_config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Base configuration for the torchtitan integration tests: a short debug
# training run with profiling, checkpointing, float8 and validation all
# switched off.
[job]
dump_folder = "./outputs"
description = "model debug training for integration test"
print_args = false

# Profiling and memory snapshots are both disabled in this base config.
[profiling]
enable_profiling = false
save_traces_folder = "profile_trace"
profile_freq = 10
enable_memory_snapshot = false
save_memory_snapshot_folder = "memory_snapshot"

# TensorBoard and Weights & Biases logging are both disabled.
# NOTE(review): log_freq = 1 presumably means "log every step" -- confirm.
[metrics]
log_freq = 1
disable_color_printing = false
enable_tensorboard = false
save_tb_folder = "tb"
enable_wandb = false

# The `converters` key is left commented out, so this file sets no converters.
[model]
name = "llama3"
flavor = "debugmodel"
# test folder with tokenizer.json, for debug purpose only
tokenizer_path = "./tests/assets/tokenizer"
# converters = ["float8"]

[optimizer]
name = "AdamW"
lr = 8e-4
eps = 1e-8

# With steps = 10 under [training], warmup_steps = 2 matches the 20% rule of
# thumb noted on that key.
[lr_scheduler]
warmup_steps = 2  # lr scheduler warm up, normally 20% of the train steps
decay_ratio = 0.8  # lr scheduler decay ratio, 80% of the train steps
decay_type = "linear"
lr_min = 0.0

# Only 10 steps, without compilation.
[training]
local_batch_size = 8
seq_len = 2048
max_norm = 1.0  # grad norm clipping
steps = 10
compile = false
dataset = "c4_test"  # supported datasets: c4_test (2K), c4 (177M)

# Every degree is 1 except data_parallel_shard_degree.
# NOTE(review): -1 is presumably a sentinel for "derive from the available
# devices" -- confirm against the config parser.
[parallelism]
data_parallel_replicate_degree = 1
data_parallel_shard_degree = -1
fsdp_reshard_after_forward = "default" # default / never / always
tensor_parallel_degree = 1
enable_async_tensor_parallel = false
pipeline_parallel_degree = 1
context_parallel_degree = 1

# Checkpointing is disabled here; `interval` equals the training step count (10).
[checkpoint]
enable_checkpoint = false
folder = "checkpoint"
interval = 10
last_save_model_only = false
export_dtype = "float32"
async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"]

# selective_ac_option is a string rather than an integer because, per the
# trailing comment, it takes either an integer-like value or 'op'.
[activation_checkpoint]
mode = "selective"  # ["none", "selective", "full"]
selective_ac_option = '2'  # 'int' = ac every positive int layer or 'op', ac based on ops policy

# Both float8 toggles are off.
# NOTE(review): these settings presumably only matter when the float8
# converter under [model] is enabled -- confirm.
[float8]
enable_fsdp_float8_all_gather = false
precompute_float8_dynamic_scale_for_fsdp = false
filter_fqns = ["output"]

# Validation is disabled in this base config.
[validation]
enabled = false
dataset = "c4_validation"
freq = 5
steps = 10
Loading
Loading