Split training tests to separate test shard (#9281)

ghpvnist · web-flow · commit 3e556dc8a4ff · 2025-06-03T18:32:51.000-07:00
diff --git a/.github/workflows/_tpu_ci.yml b/.github/workflows/_tpu_ci.yml
@@ -24,6 +24,7 @@ jobs:
           - test/tpu/run_expensive_test_1.sh
           - test/tpu/run_expensive_test_2.sh
           - test/tpu/run_pallas_test.sh
+          - test/tpu/run_training_tests.sh
     steps:
       - name: Checkout actions
         if: inputs.has_code_changes == 'true'
diff --git a/examples/data_parallel/train_resnet_spmd_data_parallel.py b/examples/data_parallel/train_resnet_spmd_data_parallel.py
@@ -1,5 +1,6 @@
 import sys
 import os
+import time
 example_folder = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
 sys.path.append(example_folder)
 from train_resnet_base import TrainResNetBase
@@ -46,4 +47,8 @@ def __init__(self):
 
 if __name__ == '__main__':
   spmd_ddp = TrainResNetXLASpmdDDP()
+
+  start_time = time.time()
   spmd_ddp.start_training()
+  end_time = time.time()
+  print(f"Finished training in {end_time - start_time:.3f}s")
diff --git a/examples/train_decoder_only_base.py b/examples/train_decoder_only_base.py
@@ -156,7 +156,11 @@ def start_training(self):
   if decoder_cls is not None:
     params.append(decoder_cls)
   base = TrainDecoderOnlyBase(*params, num_steps=args.num_steps, config=config)
+
+  start_time = time.time()
   base.start_training()
+  end_time = time.time()
+  print(f"Finished training in {end_time - start_time:.3f}s")
 
   if args.print_metrics:
     print(torch_xla._XLAC._xla_metrics_report())
diff --git a/examples/train_resnet_amp.py b/examples/train_resnet_amp.py
@@ -1,6 +1,7 @@
 from train_resnet_base import TrainResNetBase
 
 import itertools
+import time
 
 import torch_xla
 import torch_xla.distributed.xla_multiprocessing as xmp
@@ -33,4 +34,8 @@ def train_loop_fn(self, loader, epoch):
 
 if __name__ == '__main__':
   xla_amp = TrainResNetXLAAMP()
+
+  start_time = time.time()
   xla_amp.start_training()
+  end_time = time.time()
+  print(f"Finished training in {end_time - start_time:.3f}s")
diff --git a/test/tpu/run_tests.sh b/test/tpu/run_tests.sh
@@ -95,31 +95,3 @@ run_test "$_TEST_DIR/quantized_ops/test_dot_general.py"
 run_xla_ir_hlo_debug run_test "$_TEST_DIR/test_user_computation_debug_cache.py"
 run_test "$_TEST_DIR/test_data_type.py"
 run_test "$_TEST_DIR/test_compilation_cache_utils.py"
-
-# run examples, each test should takes <2 minutes
-run_test "$_TEST_DIR/../examples/data_parallel/train_resnet_spmd_data_parallel.py"
-run_test "$_TEST_DIR/../examples/fsdp/train_decoder_only_fsdp_v2.py"
-run_test "$_TEST_DIR/../examples/train_resnet_amp.py"
-run_test "$_TEST_DIR/../examples/train_decoder_only_base.py"
-run_test "$_TEST_DIR/../examples/train_decoder_only_base.py" scan.decoder_with_scan.DecoderWithScan \
-    --num-steps 30 # TODO(https://github.com/pytorch/xla/issues/8632): Reduce scan tracing overhead
-
-# HACK: don't confuse local `torch_xla` folder with installed package
-# Python 3.11 has the permanent fix: https://stackoverflow.com/a/73636559
-# Egaer tests will take more HBM, only run them on TPU v4 CI
-TPU_VERSION=$(python -c "import sys; sys.path.remove(''); import torch_xla; print(torch_xla._internal.tpu.version())")
-if [[ -n "$TPU_VERSION" && "$TPU_VERSION" == "4" ]]; then
-    run_test "$_TEST_DIR/dynamo/test_traceable_collectives.py"
-    run_test "$_TEST_DIR/../examples/data_parallel/train_resnet_xla_ddp.py"
-    run_test "$_TEST_DIR/../examples/fsdp/train_resnet_fsdp_auto_wrap.py"
-    run_test "$_TEST_DIR/../examples/eager/train_decoder_only_eager.py"
-    run_test "$_TEST_DIR/../examples/eager/train_decoder_only_eager_spmd_data_parallel.py"
-    run_test "$_TEST_DIR/../examples/eager/train_decoder_only_eager_with_compile.py"
-    run_test "$_TEST_DIR/../examples/eager/train_decoder_only_eager_multi_process.py"
-    XLA_EXPERIMENTAL=nonzero:masked_select:nms run_test "$_TEST_DIR/ds/test_dynamic_shapes.py" -v
-fi
-
-if [[ -n "$TPU_VERSION" && "$TPU_VERSION" != "6" ]]; then
-    # Test `tpu-info` CLI compatibility
-    run_test "$_TPU_DIR/tpu_info/test_cli.py"
-fi
diff --git a/test/tpu/run_training_tests.sh b/test/tpu/run_training_tests.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+set -xue
+
+# Absolute path to the directory of this script.
+_TPU_DIR="$(
+    cd "$(dirname "$0")"
+    pwd -P
+)"
+
+# Absolute path to the test/ directory.
+_TEST_DIR="$(dirname "$_TPU_DIR")"
+
+# Run examples; each test should take <2 minutes
+python3 "$_TEST_DIR/../examples/data_parallel/train_resnet_spmd_data_parallel.py"
+python3 "$_TEST_DIR/../examples/fsdp/train_decoder_only_fsdp_v2.py"
+python3 "$_TEST_DIR/../examples/train_resnet_amp.py"
+python3 "$_TEST_DIR/../examples/train_decoder_only_base.py"
+python3 "$_TEST_DIR/../examples/train_decoder_only_base.py" scan.decoder_with_scan.DecoderWithScan \
+    --num-steps 30 # TODO(https://github.com/pytorch/xla/issues/8632): Reduce scan tracing overhead
+
+# HACK: don't confuse local `torch_xla` folder with installed package
+# Python 3.11 has the permanent fix: https://stackoverflow.com/a/73636559
+# Eager tests take more HBM, so only run them on TPU v4 CI
+TPU_VERSION=$(python -c "import sys; sys.path.remove(''); import torch_xla; print(torch_xla._internal.tpu.version())")
+if [[ -n "$TPU_VERSION" && "$TPU_VERSION" == "4" ]]; then
+    python3 "$_TEST_DIR/dynamo/test_traceable_collectives.py"
+    python3 "$_TEST_DIR/../examples/data_parallel/train_resnet_xla_ddp.py"
+    python3 "$_TEST_DIR/../examples/fsdp/train_resnet_fsdp_auto_wrap.py"
+    python3 "$_TEST_DIR/../examples/eager/train_decoder_only_eager.py"
+    python3 "$_TEST_DIR/../examples/eager/train_decoder_only_eager_spmd_data_parallel.py"
+    python3 "$_TEST_DIR/../examples/eager/train_decoder_only_eager_with_compile.py"
+    python3 "$_TEST_DIR/../examples/eager/train_decoder_only_eager_multi_process.py"
+    XLA_EXPERIMENTAL=nonzero:masked_select:nms python3 "$_TEST_DIR/ds/test_dynamic_shapes.py" -v
+fi
+
+if [[ -n "$TPU_VERSION" && "$TPU_VERSION" != "6" ]]; then
+    # Test `tpu-info` CLI compatibility
+    python3 "$_TPU_DIR/tpu_info/test_cli.py"
+fi