Skip to content

Commit 41d7552

Browse files
authored
Run hvd tests without xdist (#3347)
1 parent 0f365e7 commit 41d7552

File tree

3 files changed

18 additions & 12 deletions

3 files changed

18 additions & 12 deletions

.github/workflows/hvd-tests.yml

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ concurrency:
2525
jobs:
2626
horovod-tests:
2727
runs-on: ubuntu-latest
28-
timeout-minutes: 60
28+
timeout-minutes: 120
2929
strategy:
3030
matrix:
3131
python-version: ["3.11"]
@@ -64,15 +64,15 @@ jobs:
6464
#install other dependencies
6565
pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
6666
pip install -r requirements-dev.txt
67-
67+
6868
# Install Horovod from source and apply a patch to build with recent pytorch
69-
# We can't use pip install <whatever> as build-env can't find pytorch and
69+
# We can't use pip install <whatever> as build-env can't find pytorch and
7070
# `--no-build-isolation` does not work with horovod setup.py
7171
git clone --recursive https://github.com/horovod/horovod.git /tmp/horovod
7272
cd /tmp/horovod
7373
sed -i "s/CMAKE_CXX_STANDARD 14/CMAKE_CXX_STANDARD 17/g" CMakeLists.txt
74-
sed -i "s/CMAKE_CXX_STANDARD 14/CMAKE_CXX_STANDARD 17/g" horovod/torch/CMakeLists.txt
75-
HOROVOD_WITH_PYTORCH=1 python setup.py install
74+
sed -i "s/CMAKE_CXX_STANDARD 14/CMAKE_CXX_STANDARD 17/g" horovod/torch/CMakeLists.txt
75+
HOROVOD_WITH_PYTORCH=1 python setup.py install
7676
cd -
7777
# test the installation:
7878
python -c "import horovod.torch as hvd; hvd.mpi_ops.Sum"
@@ -90,11 +90,11 @@ jobs:
9090
- name: Run Tests
9191
uses: nick-fields/retry@v3
9292
with:
93-
max_attempts: 5
94-
timeout_minutes: 15
93+
max_attempts: 3
94+
timeout_minutes: 40
9595
shell: bash
96-
command: bash tests/run_cpu_tests.sh
97-
new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh
96+
command: USE_XDIST=0 bash tests/run_cpu_tests.sh
97+
new_command_on_retry: USE_LAST_FAILED=1 USE_XDIST=0 bash tests/run_cpu_tests.sh
9898

9999
- name: Upload coverage to Codecov
100100
uses: codecov/codecov-action@v3

tests/ignite/distributed/utils/test_horovod.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ def test_idist_methods_overhead_hvd(gloo_hvd_executor):
242242
sync_model = False
243243
gloo_hvd_executor(_test_idist_methods_overhead, (ok_factor, sync_model), np=np, do_init=True)
244244

245-
ok_factor = 3.0
245+
ok_factor = 3.5
246246
sync_model = True
247247
gloo_hvd_executor(_test_idist_methods_overhead, (ok_factor, sync_model), np=np, do_init=True)
248248

tests/run_cpu_tests.sh

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,22 @@ skip_distrib_tests=${SKIP_DISTRIB_TESTS:-0}
66
use_last_failed=${USE_LAST_FAILED:-0}
77
match_tests_expression=${1:-""}
88

9+
use_xdist=${USE_XDIST:-1}
10+
core_args="-vvv tests/ignite"
11+
if [ "${use_xdist}" -eq "1" ]; then
12+
core_args="${core_args} --tx 4*popen//python=python"
13+
fi
14+
915
CUDA_VISIBLE_DEVICES="" run_tests \
10-
--core_args "--tx 4*popen//python=python -vvv tests/ignite" \
16+
--core_args "${core_args}" \
1117
--cache_dir ".cpu-not-distrib" \
1218
--skip_distrib_tests "${skip_distrib_tests}" \
1319
--use_coverage 1 \
1420
--match_tests_expression "${match_tests_expression}" \
1521
--use_last_failed ${use_last_failed}
1622

1723
# https://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_06_02
18-
if [ "${skip_distrib_tests}" -eq "1" ]; then
24+
if [ "${skip_distrib_tests}" -eq "1" ] || [ "${use_xdist}" -eq "0" ]; then
1925
exit 0
2026
fi
2127

0 commit comments

Comments
 (0)