Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 0 additions & 76 deletions .github/workflows/iris-external-validation-test.yml

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Iris Tests with Apptainer
name: Iris Development Tests

on:
push:
Expand All @@ -15,6 +15,9 @@ jobs:
build-apptainer-image:
runs-on: [self-hosted, mi3008x]
timeout-minutes: 90
strategy:
matrix:
rocm_version: ["6.3.1", "7.0"]

steps:
- name: Checkout repository
Expand All @@ -31,22 +34,52 @@ jobs:
# Create persistent Apptainer directory
mkdir -p ~/apptainer

# Build Apptainer image from definition file (only if it doesn't exist)
if [ ! -f ~/apptainer/iris-dev.sif ]; then
echo "Building new Apptainer image..."
apptainer build ~/apptainer/iris-dev.sif apptainer/iris.def
# Compute hash of the definition file
DEF_FILE="apptainer/iris-rocm${{ matrix.rocm_version }}.def"
CURRENT_HASH=$(sha256sum "$DEF_FILE" | awk '{print $1}')
HASH_FILE=~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif.sha256

# Check if we need to rebuild
REBUILD=false
if [ ! -f ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif ]; then
echo "Apptainer image not found. Building new image for ROCm ${{ matrix.rocm_version }}..."
REBUILD=true
elif [ ! -f "$HASH_FILE" ]; then
echo "Hash file not found. Rebuilding image for ROCm ${{ matrix.rocm_version }}..."
REBUILD=true
else
echo "Using existing Apptainer image"
STORED_HASH=$(cat "$HASH_FILE")
if [ "$CURRENT_HASH" != "$STORED_HASH" ]; then
echo "Definition file has changed. Rebuilding image for ROCm ${{ matrix.rocm_version }}..."
echo " Previous hash: $STORED_HASH"
echo " Current hash: $CURRENT_HASH"
REBUILD=true
else
echo "Using existing Apptainer image for ROCm ${{ matrix.rocm_version }} (hash: $CURRENT_HASH)"
fi
fi

# Build if needed
if [ "$REBUILD" = true ]; then
apptainer build --force ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif "$DEF_FILE"
echo "$CURRENT_HASH" > "$HASH_FILE"
echo "Successfully built and stored hash: $CURRENT_HASH"
fi

test-1-2-4-ranks:
name: Test 1/2/4 Ranks (Parallel)
name: Test 1/2/4 Ranks (Parallel) - ROCm ${{ matrix.rocm_version }}
needs: build-apptainer-image
runs-on: [self-hosted, mi3008x]
timeout-minutes: 20
strategy:
matrix:
rocm_version: ["6.3.1", "7.0"]

steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Run 1, 2, 4 rank tests in parallel
run: |
Expand All @@ -55,22 +88,22 @@ jobs:
# This allows tests like test_empty_device_handling to verify that
# allocating on a different device correctly raises an error.

# Create unique overlay images for isolation
OVERLAY_1="/tmp/iris_overlay_$(whoami)_1rank_$(date +%s%N).img"
OVERLAY_2="/tmp/iris_overlay_$(whoami)_2rank_$(date +%s%N).img"
OVERLAY_4="/tmp/iris_overlay_$(whoami)_4rank_$(date +%s%N).img"
# Create unique overlay images in workspace (will be auto-cleaned)
OVERLAY_1="${PWD}/iris_overlay_1rank_$(date +%s%N).img"
OVERLAY_2="${PWD}/iris_overlay_2rank_$(date +%s%N).img"
OVERLAY_4="${PWD}/iris_overlay_4rank_$(date +%s%N).img"

echo "::group::Creating overlay images"
apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY_1}"
apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY_2}"
apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY_4}"
apptainer overlay create --size 4096 --create-dir /var/cache/iris "${OVERLAY_1}"
apptainer overlay create --size 4096 --create-dir /var/cache/iris "${OVERLAY_2}"
apptainer overlay create --size 4096 --create-dir /var/cache/iris "${OVERLAY_4}"
echo "::endgroup::"

echo "::group::Starting parallel tests"
echo "Starting 1-rank test on GPUs 0,1..."
apptainer exec --overlay "${OVERLAY_1}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1" \
--bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
~/apptainer/iris-dev.sif bash -c "
~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c "
pip install -e .
bash .github/scripts/run_tests.sh 1
" &
Expand All @@ -79,7 +112,7 @@ jobs:
echo "Starting 2-rank test on GPUs 2,3..."
apptainer exec --overlay "${OVERLAY_2}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="2,3" \
--bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
~/apptainer/iris-dev.sif bash -c "
~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c "
pip install -e .
bash .github/scripts/run_tests.sh 2
" &
Expand All @@ -88,7 +121,7 @@ jobs:
echo "Starting 4-rank test on GPUs 4,5,6,7..."
apptainer exec --overlay "${OVERLAY_4}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="4,5,6,7" \
--bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
~/apptainer/iris-dev.sif bash -c "
~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c "
pip install -e .
bash .github/scripts/run_tests.sh 4
" &
Expand All @@ -105,11 +138,6 @@ jobs:
wait $PID4 || { echo "::error::4-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 4-rank"; FAIL=1; }
echo "::endgroup::"

# Cleanup overlay images
echo "::group::Cleaning up overlay images"
rm -f "${OVERLAY_1}" "${OVERLAY_2}" "${OVERLAY_4}"
echo "::endgroup::"

if [ $FAIL -eq 1 ]; then
echo "::error::Parallel tests failed:$FAILED_TESTS"
exit 1
Expand All @@ -118,36 +146,35 @@ jobs:
echo "✅ All parallel tests (1, 2, 4 ranks) passed!"

test-8-ranks:
name: Test 8 Ranks
name: Test 8 Ranks - ROCm ${{ matrix.rocm_version }}
needs: build-apptainer-image
runs-on: [self-hosted, mi3008x]
timeout-minutes: 15
timeout-minutes: 30
strategy:
matrix:
rocm_version: ["6.3.1", "7.0"]

steps:
- name: Checkout repository
uses: actions/checkout@v4

with:
fetch-depth: 0
- name: Run 8-rank test
run: |
# Create unique overlay image for isolation
OVERLAY_8="/tmp/iris_overlay_$(whoami)_8rank_$(date +%s%N).img"
# Create unique overlay image in workspace (will be auto-cleaned)
OVERLAY_8="${PWD}/iris_overlay_8rank_$(date +%s%N).img"

echo "::group::Creating overlay image"
apptainer overlay create --size 1024 --create-dir /var/cache/iris "${OVERLAY_8}"
apptainer overlay create --size 4096 --create-dir /var/cache/iris "${OVERLAY_8}"
echo "::endgroup::"

echo "::group::Running 8-rank test on all GPUs"
apptainer exec --overlay "${OVERLAY_8}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
--bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
~/apptainer/iris-dev.sif bash -c "
~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c "
pip install -e .
bash .github/scripts/run_tests.sh 8
"
echo "::endgroup::"

# Cleanup overlay image
echo "::group::Cleaning up overlay image"
rm -f "${OVERLAY_8}"
echo "::endgroup::"

echo "✅ 8-rank test passed!"
101 changes: 101 additions & 0 deletions .github/workflows/iris-tests-external.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# Iris External Validation
#
# 1. build-apptainer-image: builds (or reuses) one Apptainer image per ROCm
#    version under ~/apptainer on the self-hosted runner. An image is rebuilt
#    only when it is missing or the SHA-256 of its definition file changed.
# 2. external-validation-test: inside that image, installs Iris from this
#    repository at the triggering commit and runs a pinned external test
#    script against the installed package.
name: Iris External Validation

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
  workflow_dispatch:

# The jobs only read repository contents; keep GITHUB_TOKEN read-only.
permissions:
  contents: read

# One active run per PR / branch. Superseded runs are cancelled everywhere
# except on main.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

jobs:
  build-apptainer-image:
    runs-on: [self-hosted, mi3008x]
    timeout-minutes: 90
    strategy:
      matrix:
        rocm_version: ["6.3.1", "7.0"]

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Apptainer
        run: |
          apt-get update && apt-get install -y software-properties-common
          add-apt-repository -y ppa:apptainer/ppa
          apt-get update && apt-get install -y apptainer

      - name: Build Iris Apptainer container
        env:
          # Pass workflow expressions through the environment instead of
          # splicing them into the script text.
          ROCM_VERSION: ${{ matrix.rocm_version }}
        run: |
          set -euo pipefail

          # Create persistent Apptainer directory
          mkdir -p "${HOME}/apptainer"

          DEF_FILE="apptainer/iris-rocm${ROCM_VERSION}.def"
          SIF_FILE="${HOME}/apptainer/iris-dev-rocm${ROCM_VERSION}.sif"
          HASH_FILE="${SIF_FILE}.sha256"

          # Compute hash of the definition file
          CURRENT_HASH=$(sha256sum "$DEF_FILE" | awk '{print $1}')

          # Check if we need to rebuild
          REBUILD=false
          if [ ! -f "$SIF_FILE" ]; then
            echo "Apptainer image not found. Building new image for ROCm ${ROCM_VERSION}..."
            REBUILD=true
          elif [ ! -f "$HASH_FILE" ]; then
            echo "Hash file not found. Rebuilding image for ROCm ${ROCM_VERSION}..."
            REBUILD=true
          else
            STORED_HASH=$(cat "$HASH_FILE")
            if [ "$CURRENT_HASH" != "$STORED_HASH" ]; then
              echo "Definition file has changed. Rebuilding image for ROCm ${ROCM_VERSION}..."
              echo " Previous hash: $STORED_HASH"
              echo " Current hash: $CURRENT_HASH"
              REBUILD=true
            else
              echo "Using existing Apptainer image for ROCm ${ROCM_VERSION} (hash: $CURRENT_HASH)"
            fi
          fi

          # Build if needed. The image is written to a temporary file and
          # renamed into place so that a failed or in-progress build never
          # leaves a truncated image at the path other jobs execute.
          if [ "$REBUILD" = true ]; then
            TMP_SIF="${SIF_FILE}.tmp.${GITHUB_RUN_ID:-$$}"
            trap 'rm -f "$TMP_SIF"' EXIT
            apptainer build --force "$TMP_SIF" "$DEF_FILE"
            mv -f "$TMP_SIF" "$SIF_FILE"
            # Record the hash only after the image is in place.
            echo "$CURRENT_HASH" > "$HASH_FILE"
            echo "Successfully built and stored hash: $CURRENT_HASH"
          fi

  external-validation-test:
    name: External Validation Test - ROCm ${{ matrix.rocm_version }}
    needs: build-apptainer-image
    runs-on: [self-hosted, mi3008x]
    timeout-minutes: 30
    strategy:
      matrix:
        rocm_version: ["6.3.1", "7.0"]

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Run External Validation Test with Apptainer
        env:
          ROCM_VERSION: ${{ matrix.rocm_version }}
          IRIS_REPOSITORY: ${{ github.repository }}
          IRIS_SHA: ${{ github.sha }}
          # Test script pinned to a specific gist revision.
          TEST_SCRIPT_URL: https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/a527c3192bee4615292769e340b1c73676f6945a/test_iris_distributed.py
        run: |
          set -euo pipefail

          # Create a unique overlay image in the workspace. This step does not
          # delete it.
          # NOTE(review): cleanup presumably relies on the next checkout
          # cleaning the workspace - confirm on the self-hosted runner.
          OVERLAY="${PWD}/iris_overlay_external_$(date +%s%N).img"

          echo "::group::Creating overlay image"
          apptainer overlay create --size 4096 --create-dir /var/cache/iris "${OVERLAY}"
          echo "::endgroup::"

          echo "::group::Running external validation test"
          # --cleanenv strips the host environment inside the container, so
          # the variables below are expanded by the host shell (the script is
          # double-quoted) before the container starts.
          # `set -euo pipefail` makes a failed install or download abort the
          # test; otherwise only the exit status of the final `python`
          # command would decide the result.
          apptainer exec --overlay "${OVERLAY}" --no-home --cleanenv \
            --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
            "${HOME}/apptainer/iris-dev-rocm${ROCM_VERSION}.sif" bash -c "
              set -euo pipefail
              pip install 'git+https://github.com/${IRIS_REPOSITORY}.git@${IRIS_SHA}'
              wget -O test_iris_distributed.py '${TEST_SCRIPT_URL}'
              python test_iris_distributed.py
            "
          echo "::endgroup::"

          echo "✅ External validation test passed!"
Loading
Loading