
Commit 4ece78c

Update on "[cp][flex_attention] integration test trial"
Pull-Request-resolved: #1160 [ghstack-poisoned]
2 parents: 68e76cf + 3b76b93

93 files changed, +2578 -592 lines changed


.github/CODEOWNERS

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+# This is a CODEOWNERS file.
+# Each line is a file pattern followed by one or more owners.
+
+# These owners will be the default owners for everything in
+# the repo. Unless a later match takes precedence,
+# they will be requested for review when someone opens a pull request.
+* @tianyu-l @fegin @wwwjn
+
+# Exclude the experiments directory by adding a pattern without owners
+/torchtitan/experiments/
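
The comments in the file above describe how CODEOWNERS works: the last pattern that matches a path wins, and a pattern with no owners clears ownership for matching files. As a rough illustration only (GitHub's real matcher supports more pattern syntax; the rule table below simply mirrors the two entries added here, with the leading "/" dropped), a minimal Python sketch of that last-match-wins behavior:

```python
from fnmatch import fnmatch

# Minimal sketch of the last-match-wins rule described in the CODEOWNERS comments;
# not GitHub's actual matcher, just the two rules added in this commit.
RULES = [
    ("*", ["@tianyu-l", "@fegin", "@wwwjn"]),   # default owners for everything
    ("torchtitan/experiments/*", []),           # later match with no owners -> excluded
]

def owners_for(path: str) -> list[str]:
    owners: list[str] = []
    for pattern, rule_owners in RULES:
        if fnmatch(path, pattern):              # fnmatch's "*" also spans "/" here
            owners = rule_owners                # a later matching rule overrides earlier ones
    return owners

print(owners_for("torchtitan/train.py"))                   # default owners requested
print(owners_for("torchtitan/experiments/flux/model.py"))  # [] -> no review requested
```

The practical effect is that changes under torchtitan/experiments/ no longer trigger automatic review requests to the default owners.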

.github/workflows/integration_test_8gpu.yaml

Lines changed: 8 additions & 1 deletion
@@ -3,10 +3,15 @@ name: 8 GPU Integration Test
 on:
   push:
     branches: [ main ]
+    paths-ignore:
+      - 'torchtitan/experiments/**'
   pull_request:
+    paths-ignore:
+      - 'torchtitan/experiments/**'
   schedule:
     # Runs every 6 hours
     - cron: '0 */6 * * *'
+
 concurrency:
   group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
   cancel-in-progress: true
@@ -17,7 +22,7 @@ defaults:
 
 jobs:
   build-test:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       runner: linux.g5.48xlarge.nvidia.gpu
       gpu-arch-type: cuda
@@ -38,5 +43,7 @@ jobs:
 
         python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
 
+        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+
         mkdir artifacts-to-be-uploaded
         python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8
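
One detail worth noting in the concurrency block above: the group expression `${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}` acts like a ternary. On main, every run gets a unique group (so runs triggered by pushes to main are never cancelled), while on any other ref the group repeats, so a newer push cancels the in-progress run for the same branch or PR. A rough Python sketch of the same selection logic, with hypothetical example values:

```python
# Rough Python equivalent of the GitHub Actions expression
#   ${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
# used in the concurrency group above (example values are hypothetical).
def concurrency_group(workflow: str, ref: str, run_number: int) -> str:
    # '&&' / '||' in Actions expressions short-circuit much like Python's and/or.
    suffix = (ref == "refs/heads/main" and run_number) or ref
    return f"unit-test{workflow}-{suffix}"

# On main: each push gets a distinct group, so runs are never cancelled.
print(concurrency_group("8 GPU Integration Test", "refs/heads/main", 42))
# On a PR ref: the group repeats, so a newer push cancels the older in-progress run.
print(concurrency_group("8 GPU Integration Test", "refs/pull/1160/merge", 43))
```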

.github/workflows/integration_test_8gpu_flux.yaml

Lines changed: 2 additions & 0 deletions
@@ -31,6 +31,7 @@ jobs:
       docker-image: torchtitan-ubuntu-20.04-clang12
       repository: pytorch/torchtitan
       upload-artifact: outputs
+      # delete the checkpoints in the artifacts to save CI uploading time
       script: |
         set -eux
 
@@ -44,3 +45,4 @@ jobs:
 
         mkdir artifacts-to-be-uploaded
         python -m torchtitan.experiments.flux.tests.integration_tests artifacts-to-be-uploaded --ngpu 8
+        rm -rf artifacts-to-be-uploaded/*/checkpoint
.github/workflows/integration_test_8gpu_h100.yaml

Lines changed: 49 additions & 0 deletions

@@ -0,0 +1,49 @@
+name: 8 GPU Integration Test on H100
+
+on:
+  push:
+    branches: [ main ]
+    paths-ignore:
+      - 'torchtitan/experiments/**'
+  pull_request:
+    paths-ignore:
+      - 'torchtitan/experiments/**'
+  schedule:
+    # Runs every 6 hours
+    - cron: '0 */6 * * *'
+
+concurrency:
+  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+defaults:
+  run:
+    shell: bash -l -eo pipefail {0}
+
+jobs:
+  build-test:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.aws.h100.8
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.6"
+      # This image is faster to clone than the default, but it lacks CC needed by triton
+      # (1m25s vs 2m37s).
+      docker-image: torchtitan-ubuntu-20.04-clang12
+      repository: pytorch/torchtitan
+      upload-artifact: outputs
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        pip config --user set global.progress_bar off
+
+        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
+
+        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+
+        mkdir artifacts-to-be-uploaded
+        python ./tests/integration_tests_h100.py artifacts-to-be-uploaded --ngpu 8
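
The script above works around the generic Linux job activating the base conda environment rather than the one set up in the Docker image: it asks conda for its environment list as JSON and activates the last entry. A minimal Python sketch of the same selection (assuming, as the shell one-liner does, that the image-provided environment is listed last):

```python
import json
import subprocess

# Rough Python equivalent of the shell line in the workflow script:
#   CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
def last_conda_env() -> str:
    out = subprocess.run(
        ["conda", "env", "list", "--json"], capture_output=True, text=True, check=True
    )
    envs = json.loads(out.stdout)["envs"]  # list of environment paths; base comes first
    return envs[-1]                        # assume the image-provided env is listed last

if __name__ == "__main__":
    print(last_conda_env())
```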
.github/workflows/integration_test_8gpu_simple_fsdp.yaml

Lines changed: 46 additions & 0 deletions

@@ -0,0 +1,46 @@
+name: SimpleFSDP 8 GPU Integration Test
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - 'torchtitan/experiments/simple_fsdp/**'
+  pull_request:
+    paths:
+      - 'torchtitan/experiments/simple_fsdp/**'
+  schedule:
+    # Runs every 6 hours
+    - cron: '0 */6 * * *'
+concurrency:
+  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+defaults:
+  run:
+    shell: bash -l -eo pipefail {0}
+
+jobs:
+  build-test:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.g5.48xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.6"
+      # This image is faster to clone than the default, but it lacks CC needed by triton
+      # (1m25s vs 2m37s).
+      docker-image: torchtitan-ubuntu-20.04-clang12
+      repository: pytorch/torchtitan
+      upload-artifact: outputs
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        pip config --user set global.progress_bar off
+
+        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
+
+        mkdir artifacts-to-be-uploaded
+        python -m torchtitan.experiments.simple_fsdp.tests.integration_tests artifacts-to-be-uploaded --ngpu 8

.github/workflows/release.yml

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
+# mostly borrowed from https://docs.github.com/en/actions/security-for-github-actions/security-hardening-your-deployments/configuring-openid-connect-in-pypi
+
+name: Publish a Release to PyPI
+
+on:
+  release:
+    types: [published]
+
+jobs:
+  release-build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.x"
+
+      - name: build release distributions
+        run: |
+          python -m pip install build
+          python -m build
+
+      - name: upload windows dists
+        uses: actions/upload-artifact@v4
+        with:
+          name: release-dists
+          path: dist/
+
+  pypi-publish:
+    runs-on: ubuntu-latest
+    needs:
+      - release-build
+    permissions:
+      id-token: write
+    environment:
+      name: release
+      url: https://pypi.org/p/torchtitan
+
+    steps:
+      - name: Retrieve release distributions
+        uses: actions/download-artifact@v4
+        with:
+          name: release-dists
+          path: dist/
+
+      - name: Publish release distributions to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1

.github/workflows/unit_test_cpu.yaml

Lines changed: 7 additions & 0 deletions
@@ -3,7 +3,11 @@ name: CPU Unit Test
 on:
   push:
     branches: [ main ]
+    paths-ignore:
+      - 'torchtitan/experiments/**'
   pull_request:
+    paths-ignore:
+      - 'torchtitan/experiments/**'
 
 concurrency:
   group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
@@ -25,4 +29,7 @@ jobs:
         pip config --user set global.progress_bar off
 
         pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
+
+        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cpu
+
         pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv

README.md

Lines changed: 12 additions & 11 deletions
@@ -17,7 +17,7 @@ To use the latest features of `torchtitan`, we recommend using the most recent PyTorch nightly.
 
 
 ## Latest News
-- [2025/04] Our paper has been accepted by [ICLR 2025](https://iclr.cc/virtual/2025/poster/29620). The poster will be presented on Friday April 25th.
+- [2025/04] Our paper was accepted by [ICLR 2025](https://iclr.cc/virtual/2025/poster/29620).
 - [2025/04] [Llama 4](torchtitan/experiments/llama4/) initial support is available as an experiment.
 - [2025/04] Training the diffusion model [FLUX](torchtitan/experiments/flux/) with FSDP/HSDP is available as an experiment.
 - [2025/04] The frontend implementation of [SimpleFSDP](torchtitan/experiments/simple_fsdp/), a compiler-based FSDP framework, is available as an experiment.
@@ -60,27 +60,28 @@ To accelerate contributions to and innovations around torchtitan, we are hosting
 7. DDP and HSDP
 8. [TorchFT](https://github.com/pytorch/torchft) integration
 9. Checkpointable data-loading, with the C4 dataset pre-configured (144M entries) and support for [custom datasets](docs/datasets.md)
-10. Flexible learning rate scheduler (warmup-stable-decay)
-11. Loss, GPU memory, throughput (tokens/sec), TFLOPs, and MFU displayed and logged via [Tensorboard or Weights & Biases](/docs/metrics.md)
-12. [Debugging tools](docs/debugging.md) including CPU/GPU profiling, memory profiling, Flight Recorder, etc.
-13. All options easily configured via [toml files](torchtitan/models/llama3/train_configs/)
-14. [Helper scripts](scripts/) to
+10. Gradient accumulation, enabled by giving an additional `--training.global_batch_size` argument in configuration
+11. Flexible learning rate scheduler (warmup-stable-decay)
+12. Loss, GPU memory, throughput (tokens/sec), TFLOPs, and MFU displayed and logged via [Tensorboard or Weights & Biases](/docs/metrics.md)
+13. [Debugging tools](docs/debugging.md) including CPU/GPU profiling, memory profiling, Flight Recorder, etc.
+14. All options easily configured via [toml files](torchtitan/models/llama3/train_configs/)
+15. [Helper scripts](scripts/) to
     - download tokenizers from Hugging Face
     - convert original Llama 3 checkpoints into the expected DCP format
     - estimate FSDP/HSDP memory usage without materializing the model
     - run distributed inference with Tensor Parallel
 
-We report [performance](docs/performance.md) on up to 512 GPUs, and verify [loss converging](docs/converging.md) correctness of various techniques.
+We report [performance](benchmarks/llama3_h100_202412_torchtitan.md) on up to 512 GPUs, and verify [loss converging](docs/converging.md) correctness of various techniques.
 
 ### Dive into the code
 
 You may want to see how the model is defined or how parallelism techniques are applied. For a guided tour, see these files first:
 * [torchtitan/train.py](torchtitan/train.py) - the main training loop and high-level setup code
-* [torchtitan/models/llama3/model.py](torchtitan/models/llama3/model.py) - the Llama 3.1 model definition
-* [torchtitan/models/llama3/parallelize_llama.py](torchtitan/models/llama3/parallelize_llama.py) - helpers for applying Data Parallel, Tensor Parallel, activation checkpointing, and `torch.compile` to the model
-* [torchtitan/models/llama3/pipeline_llama.py](torchtitan/models/llama3/pipeline_llama.py) - helpers for applying Pipeline Parallel to the model
+* [torchtitan/models/llama3/model/model.py](torchtitan/models/llama3/model/model.py) - the Llama 3.1 model definition
+* [torchtitan/models/llama3/infra/parallelize.py](torchtitan/models/llama3/infra/parallelize.py) - helpers for applying Data Parallel, Tensor Parallel, activation checkpointing, and `torch.compile` to the model
+* [torchtitan/models/llama3/infra/pipeline.py](torchtitan/models/llama3/infra/pipeline.py) - helpers for applying Pipeline Parallel to the model
 * [torchtitan/components/checkpoint.py](torchtitan/components/checkpoint.py) - utils for saving/loading distributed checkpoints
-* [torchtitan/components/float8.py](torchtitan/components/float8.py) - utils for applying Float8 techniques
+* [torchtitan/components/quantization/float8.py](torchtitan/components/quantization/float8.py) - utils for applying Float8 techniques
 
 
 ## Installation
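
Item 10 in the feature list above mentions gradient accumulation enabled via `--training.global_batch_size`. The usual convention for such a knob (and roughly how one would expect it to resolve here, though the exact behavior should be checked against torchtitan's source) is that the global batch size implies a number of accumulated micro-batches per optimizer step. A small Python sketch of that arithmetic, with hypothetical values:

```python
def grad_accum_steps(global_batch_size: int, local_batch_size: int, dp_degree: int) -> int:
    """Number of micro-batches accumulated per optimizer step.

    Follows the common convention global = local * dp_degree * accum_steps;
    an illustration, not torchtitan's exact implementation.
    """
    per_step = local_batch_size * dp_degree
    if global_batch_size % per_step != 0:
        raise ValueError("global_batch_size must be divisible by local_batch_size * dp_degree")
    return global_batch_size // per_step

# e.g. a global batch of 512 with a local batch of 8 on 16 data-parallel ranks
# means 4 forward/backward passes are accumulated before each optimizer step.
print(grad_accum_steps(512, 8, 16))
```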

assets/version.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-0.0.2
+0.1.0

benchmarks/README.md

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+We welcome the community to submit reproducible benchmarking results.
+
+## Submission Guidelines
+
+A submission should be a file / files including the following information
+
+1. Entity, which could be your name, GitHub username, company, university, team, etc.
+2. The model or theme of benchmarking, e.g. Llama 3.1, Async TP.
+3. The hardware setup, including the types of GPUs, interconnections, etc.
+4. The actual performance report with training configs, e.g. via
+    - `.toml` files / commandline arguments
+    - complete configs, which can be found in the log with [`--print_args`](https://github.com/pytorch/torchtitan/blob/e7c0cae934df78d6e9c2835f42ff1f757dc3fddc/torchtitan/config_manager.py#L47) turned on (preferred as the default value not shown in `.toml` or specified in commandline could change from time to time)
+5. The versions and date/time of `torchtitan`, `torch`, `torchao`, or any relevant dependencies.
+6. Other notes which could help reproduce the results.
+
+The name of the file should follow the format of
+```
+[model/theme]_[hardware]_[date/time]_[entity].md
+```
+For example, `llama3.1_h100_202412_pytorch.md`, `asynctp_256xh100_20250613_alice+bob.md`.
+
+An example can be found at [llama3_h100_202412_torchtitan.md](./llama3_h100_202412_torchtitan.md).
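
As a small, purely illustrative helper (not part of the repo), the naming convention above can be assembled like this:

```python
import datetime
from typing import Optional

def submission_filename(model_or_theme: str, hardware: str, entity: str,
                        when: Optional[datetime.date] = None) -> str:
    """Assemble [model/theme]_[hardware]_[date/time]_[entity].md (illustrative only)."""
    when = when or datetime.date.today()
    return f"{model_or_theme}_{hardware}_{when:%Y%m}_{entity}.md"

print(submission_filename("llama3.1", "h100", "pytorch", datetime.date(2024, 12, 1)))
# -> llama3.1_h100_202412_pytorch.md
```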
