Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Iris Tests with Apptainer
name: Iris Development Tests

on:
push:
Expand All @@ -15,6 +15,9 @@ jobs:
build-apptainer-image:
runs-on: [self-hosted, mi3008x]
timeout-minutes: 90
strategy:
matrix:
rocm_version: ["6.3.1", "7.0"]

steps:
- name: Checkout repository
Expand All @@ -31,22 +34,52 @@ jobs:
# Create persistent Apptainer directory
mkdir -p ~/apptainer

# Build Apptainer image from definition file (only if it doesn't exist)
if [ ! -f ~/apptainer/iris-dev.sif ]; then
echo "Building new Apptainer image..."
apptainer build ~/apptainer/iris-dev.sif apptainer/iris.def
# Compute hash of the definition file
DEF_FILE="apptainer/iris-rocm${{ matrix.rocm_version }}.def"
CURRENT_HASH=$(sha256sum "$DEF_FILE" | awk '{print $1}')
HASH_FILE=~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif.sha256

# Check if we need to rebuild
REBUILD=false
if [ ! -f ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif ]; then
echo "Apptainer image not found. Building new image for ROCm ${{ matrix.rocm_version }}..."
REBUILD=true
elif [ ! -f "$HASH_FILE" ]; then
echo "Hash file not found. Rebuilding image for ROCm ${{ matrix.rocm_version }}..."
REBUILD=true
else
echo "Using existing Apptainer image"
STORED_HASH=$(cat "$HASH_FILE")
if [ "$CURRENT_HASH" != "$STORED_HASH" ]; then
echo "Definition file has changed. Rebuilding image for ROCm ${{ matrix.rocm_version }}..."
echo " Previous hash: $STORED_HASH"
echo " Current hash: $CURRENT_HASH"
REBUILD=true
else
echo "Using existing Apptainer image for ROCm ${{ matrix.rocm_version }} (hash: $CURRENT_HASH)"
fi
fi

# Build if needed
if [ "$REBUILD" = true ]; then
apptainer build --force ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif "$DEF_FILE"
echo "$CURRENT_HASH" > "$HASH_FILE"
echo "Successfully built and stored hash: $CURRENT_HASH"
fi

test-1-2-4-ranks:
name: Test 1/2/4 Ranks (Parallel)
name: Test 1/2/4 Ranks (Parallel) - ROCm ${{ matrix.rocm_version }}
needs: build-apptainer-image
runs-on: [self-hosted, mi3008x]
timeout-minutes: 20
strategy:
matrix:
rocm_version: ["6.3.1", "7.0"]

steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Run 1, 2, 4 rank tests in parallel
run: |
Expand All @@ -70,7 +103,7 @@ jobs:
echo "Starting 1-rank test on GPUs 0,1..."
apptainer exec --overlay "${OVERLAY_1}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1" \
--bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
~/apptainer/iris-dev.sif bash -c "
~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c "
pip install -e .
bash .github/scripts/run_tests.sh 1
" &
Expand All @@ -79,7 +112,7 @@ jobs:
echo "Starting 2-rank test on GPUs 2,3..."
apptainer exec --overlay "${OVERLAY_2}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="2,3" \
--bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
~/apptainer/iris-dev.sif bash -c "
~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c "
pip install -e .
bash .github/scripts/run_tests.sh 2
" &
Expand All @@ -88,7 +121,7 @@ jobs:
echo "Starting 4-rank test on GPUs 4,5,6,7..."
apptainer exec --overlay "${OVERLAY_4}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="4,5,6,7" \
--bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
~/apptainer/iris-dev.sif bash -c "
~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c "
pip install -e .
bash .github/scripts/run_tests.sh 4
" &
Expand Down Expand Up @@ -118,15 +151,19 @@ jobs:
echo "✅ All parallel tests (1, 2, 4 ranks) passed!"

test-8-ranks:
name: Test 8 Ranks
name: Test 8 Ranks - ROCm ${{ matrix.rocm_version }}
needs: build-apptainer-image
runs-on: [self-hosted, mi3008x]
timeout-minutes: 15
timeout-minutes: 30
strategy:
matrix:
rocm_version: ["6.3.1", "7.0"]

steps:
- name: Checkout repository
uses: actions/checkout@v4

with:
fetch-depth: 0
- name: Run 8-rank test
run: |
# Create unique overlay image for isolation
Expand All @@ -139,7 +176,7 @@ jobs:
echo "::group::Running 8-rank test on all GPUs"
apptainer exec --overlay "${OVERLAY_8}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
--bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
~/apptainer/iris-dev.sif bash -c "
~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c "
pip install -e .
bash .github/scripts/run_tests.sh 8
"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Iris External Validation Test
name: Iris External Validation

on:
push:
Expand All @@ -15,6 +15,9 @@ jobs:
build-apptainer-image:
runs-on: [self-hosted, mi3008x]
timeout-minutes: 90
strategy:
matrix:
rocm_version: ["6.3.1", "7.0"]

steps:
- name: Checkout repository
Expand All @@ -31,19 +34,46 @@ jobs:
# Create persistent Apptainer directory
mkdir -p ~/apptainer

# Build Apptainer image from definition file (only if it doesn't exist)
if [ ! -f ~/apptainer/iris-dev.sif ]; then
echo "Building new Apptainer image..."
apptainer build ~/apptainer/iris-dev.sif apptainer/iris.def
# Compute hash of the definition file
DEF_FILE="apptainer/iris-rocm${{ matrix.rocm_version }}.def"
CURRENT_HASH=$(sha256sum "$DEF_FILE" | awk '{print $1}')
HASH_FILE=~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif.sha256

# Check if we need to rebuild
REBUILD=false
if [ ! -f ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif ]; then
echo "Apptainer image not found. Building new image for ROCm ${{ matrix.rocm_version }}..."
REBUILD=true
elif [ ! -f "$HASH_FILE" ]; then
echo "Hash file not found. Rebuilding image for ROCm ${{ matrix.rocm_version }}..."
REBUILD=true
else
echo "Using existing Apptainer image"
STORED_HASH=$(cat "$HASH_FILE")
if [ "$CURRENT_HASH" != "$STORED_HASH" ]; then
echo "Definition file has changed. Rebuilding image for ROCm ${{ matrix.rocm_version }}..."
echo " Previous hash: $STORED_HASH"
echo " Current hash: $CURRENT_HASH"
REBUILD=true
else
echo "Using existing Apptainer image for ROCm ${{ matrix.rocm_version }} (hash: $CURRENT_HASH)"
fi
fi

# Build if needed
if [ "$REBUILD" = true ]; then
apptainer build --force ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif "$DEF_FILE"
echo "$CURRENT_HASH" > "$HASH_FILE"
echo "Successfully built and stored hash: $CURRENT_HASH"
fi

external-validation-test:
name: External Validation Test
name: External Validation Test - ROCm ${{ matrix.rocm_version }}
needs: build-apptainer-image
runs-on: [self-hosted, mi3008x]
timeout-minutes: 30
strategy:
matrix:
rocm_version: ["6.3.1", "7.0"]

steps:
- name: Checkout repository
Expand All @@ -61,7 +91,7 @@ jobs:
echo "::group::Running external validation test"
apptainer exec --overlay "${OVERLAY}" --no-home --cleanenv \
--bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
~/apptainer/iris-dev.sif bash -c "
~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c "
pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/a527c3192bee4615292769e340b1c73676f6945a/test_iris_distributed.py
python test_iris_distributed.py
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Iris Pip Install Test
name: Iris Package Tests

on:
push:
Expand All @@ -15,6 +15,9 @@ jobs:
build-apptainer-image:
runs-on: [self-hosted, mi3008x]
timeout-minutes: 90
strategy:
matrix:
rocm_version: ["6.3.1", "7.0"]

steps:
- name: Checkout repository
Expand All @@ -31,18 +34,45 @@ jobs:
# Create persistent Apptainer directory
mkdir -p ~/apptainer

# Build Apptainer image from definition file (only if it doesn't exist)
if [ ! -f ~/apptainer/iris-dev.sif ]; then
echo "Building new Apptainer image..."
apptainer build ~/apptainer/iris-dev.sif apptainer/iris.def
# Compute hash of the definition file
DEF_FILE="apptainer/iris-rocm${{ matrix.rocm_version }}.def"
CURRENT_HASH=$(sha256sum "$DEF_FILE" | awk '{print $1}')
HASH_FILE=~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif.sha256

# Check if we need to rebuild
REBUILD=false
if [ ! -f ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif ]; then
echo "Apptainer image not found. Building new image for ROCm ${{ matrix.rocm_version }}..."
REBUILD=true
elif [ ! -f "$HASH_FILE" ]; then
echo "Hash file not found. Rebuilding image for ROCm ${{ matrix.rocm_version }}..."
REBUILD=true
else
echo "Using existing Apptainer image"
STORED_HASH=$(cat "$HASH_FILE")
if [ "$CURRENT_HASH" != "$STORED_HASH" ]; then
echo "Definition file has changed. Rebuilding image for ROCm ${{ matrix.rocm_version }}..."
echo " Previous hash: $STORED_HASH"
echo " Current hash: $CURRENT_HASH"
REBUILD=true
else
echo "Using existing Apptainer image for ROCm ${{ matrix.rocm_version }} (hash: $CURRENT_HASH)"
fi
fi

# Build if needed
if [ "$REBUILD" = true ]; then
apptainer build --force ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif "$DEF_FILE"
echo "$CURRENT_HASH" > "$HASH_FILE"
echo "Successfully built and stored hash: $CURRENT_HASH"
fi
test-1-2-4-ranks:
name: Pip Install Test 1/2/4 Ranks (Parallel)
name: Pip Install Test 1/2/4 Ranks (Parallel) - ROCm ${{ matrix.rocm_version }}
needs: build-apptainer-image
runs-on: [self-hosted, mi3008x]
timeout-minutes: 30
strategy:
matrix:
rocm_version: ["6.3.1", "7.0"]

steps:
- name: Checkout repository
Expand Down Expand Up @@ -72,7 +102,7 @@ jobs:
echo "Starting 1-rank test on GPUs 0,1..."
apptainer exec --overlay "${OVERLAY_1}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1" \
--bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
~/apptainer/iris-dev.sif bash -c "
~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c "
pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
bash .github/scripts/run_tests.sh 1
" &
Expand All @@ -81,7 +111,7 @@ jobs:
echo "Starting 2-rank test on GPUs 2,3..."
apptainer exec --overlay "${OVERLAY_2}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="2,3" \
--bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
~/apptainer/iris-dev.sif bash -c "
~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c "
pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
bash .github/scripts/run_tests.sh 2
" &
Expand All @@ -90,7 +120,7 @@ jobs:
echo "Starting 4-rank test on GPUs 4,5,6,7..."
apptainer exec --overlay "${OVERLAY_4}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="4,5,6,7" \
--bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
~/apptainer/iris-dev.sif bash -c "
~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c "
pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
bash .github/scripts/run_tests.sh 4
" &
Expand Down Expand Up @@ -120,10 +150,13 @@ jobs:
echo "✅ All parallel tests (1, 2, 4 ranks) passed!"

test-8-ranks:
name: Pip Install Test 8 Ranks
name: Pip Install Test 8 Ranks - ROCm ${{ matrix.rocm_version }}
needs: build-apptainer-image
runs-on: [self-hosted, mi3008x]
timeout-minutes: 30
strategy:
matrix:
rocm_version: ["6.3.1", "7.0"]

steps:
- name: Checkout repository
Expand All @@ -143,7 +176,7 @@ jobs:
echo "::group::Running 8-rank test on all GPUs"
apptainer exec --overlay "${OVERLAY_8}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
--bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
~/apptainer/iris-dev.sif bash -c "
~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c "
pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
bash .github/scripts/run_tests.sh 8
"
Expand Down
35 changes: 35 additions & 0 deletions apptainer/iris-rocm6.3.1.def
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# SPDX-License-Identifier: MIT
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.

# Base image: bootstrap from the rocm/pytorch Docker image whose tag names
# ROCm 6.3.1, Ubuntu 22.04 and Python 3.10.
Bootstrap: docker
From: rocm/pytorch:rocm6.3.1_ubuntu22.04_py3.10_pytorch

%post
    # Build-time provisioning: install git, build Triton from a pinned commit
    # inside the py_3.10 conda environment, then install rocprofiler-systems
    # under /opt/rocprofiler-systems.
    #
    # The steps run in a nested bash (presumably so that `source` is
    # available). Every `$` inside the quoted script is escaped so that it is
    # expanded by that inner bash rather than by the outer build shell.
    /bin/bash -c "
        # Stop at the first failing command. Without this, only the exit
        # status of the last command is reported back to the build, so a
        # failed clone, checkout or pip install would go unnoticed.
        set -e

        apt-get update && apt-get install -y git
        export TRITON_PATH=/workspace/triton

        # Diagnostic listing of the conda environments shipped in the base image.
        conda env list
        source /opt/conda/bin/activate py_3.10
        conda install -y -n py_3.10 -c conda-forge jupyter ninja cmake wheel

        # Editable Triton install, pinned to an exact commit for reproducibility.
        git clone https://github.com/triton-lang/triton.git \$TRITON_PATH
        cd \$TRITON_PATH
        git checkout dd5823453bcc7973eabadb65f9d827c43281c434
        pip install -e .

        # rocprofiler-systems installer matching the ROCm 6.3 series.
        wget https://github.com/ROCm/rocprofiler-systems/releases/download/rocm-6.3.1/rocprofiler-systems-install.py
        python3 ./rocprofiler-systems-install.py --prefix /opt/rocprofiler-systems --rocm 6.3
    "

%environment
    # Environment variables exported when the container is started.

    # Location of the Triton checkout created in %post, and its Python sources.
    export TRITON_PATH=/workspace/triton
    export PYTHONPATH=$TRITON_PATH/python/

    # Put the ROCm libraries first. The inherited value is appended only when
    # it is non-empty: a trailing ':' would leave an empty entry, which the
    # dynamic loader interprets as the current working directory.
    export LD_LIBRARY_PATH=/opt/rocm/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
    export ROCM_PATH=/opt/rocm

    # Prefer the py_3.10 conda environment and the ROCm tools over the inherited PATH.
    export PATH=/opt/conda/envs/py_3.10/bin:/opt/rocm/bin:$PATH

    # Open MPI MCA component selection.
    # NOTE(review): presumably this avoids the OFI transport and forces the
    # ob1 PML; confirm against the Open MPI build in the base image.
    export OMPI_MCA_mtl="^ofi"
    export OMPI_MCA_pml="ob1"

%runscript
# Executed when the image is run as a command: print a banner, activate the
# py_3.10 conda environment, then replace this shell with the command passed
# as arguments.
echo "Welcome to the ROCm-aware Apptainer image!"
# NOTE(review): `source` is not a POSIX builtin; confirm that the shell which
# executes the runscript in this image accepts it.
source /opt/conda/bin/activate py_3.10
exec "$@"
Loading
Loading