Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CI move to ALPS (daint-gpu -> alps_gh200) #1225

Open
wants to merge 19 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions ci/.gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ include:
- local: 'ci/cpu/gcc12_release_cxx20.yml'
- local: 'ci/cpu/gcc13_codecov.yml'
- local: 'ci/cpu/gcc13_release.yml'
- local: 'ci/cuda/gcc11_release.yml'
- local: 'ci/cuda/gcc11_release_scalapack.yml'
- local: 'ci/cuda/gcc11_codecov.yml'
- local: 'ci/cuda/gcc11_debug_scalapack.yml'
- local: 'ci/cuda/gcc13_release.yml'
- local: 'ci/cuda/gcc13_release_scalapack.yml'
- local: 'ci/cuda/gcc13_debug.yml'
- local: 'ci/cuda/gcc13_debug_scalapack.yml'
- local: 'ci/cuda/gcc13_release_stdexec.yml'
- local: 'ci/rocm/clang14_release.yml'
- local: 'ci/rocm/clang14_release_stdexec.yml'
Expand Down
64 changes: 64 additions & 0 deletions ci/base-images/gh200-cray-mpich/HOWTO.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Modified base image to allow building cray-mpich

## Preparation steps

```
mkdir lib64
cp -a /usr/lib64/libcuda.* lib64/
cp -a /usr/lib64/libxpmem.* lib64/
git clone https://github.com/eth-cscs/alps-cluster-config.git
cp alps-cluster-config/daint/packages.yaml packages.yaml
```

## Edit cluster config files

Modify `packages.yaml`:
```
xpmem:
buildable: false
externals:
- spec: [email protected]
prefix: /usr
libfabric:
- buildable: false
- externals:
- - spec: libfabric@1.15.2.0
- prefix: /opt/cray/libfabric/1.15.2.0/
+ require: "@1.15.2.0"
slurm:
buildable: false
externals:
- spec: slurm@23-11-7
prefix: /usr
```
Note: The container engine (CE) replaces libfabric with the system one when running the container,
so make sure the version required here matches the version installed on the system.


Modify `alps-cluster-config/site/repo/packages/cray-gtl/package.py`
```
patchelf("--force-rpath", "--set-rpath", rpath, f, fail_on_error=False)
# The C compiler wrapper can fail because libmpi_gtl_cuda refers to the symbol
# __gxx_personality_v0 but wasn't linked against libstdc++.
- if "libmpi_gtl_cuda.so" in str(f):
- patchelf("--add-needed", "libstdc++.so", f, fail_on_error=False)
if "@8.1.27+cuda" in self.spec:
patchelf("--add-needed", "libcudart.so", f, fail_on_error=False)
patchelf("--add-needed", "libcuda.so", f, fail_on_error=False)
```
Note: starting from version 8.1.23 the library links `libstdc++.so` itself. All the available aarch64 libraries already link against it,
therefore the `patchelf --add-needed libstdc++.so` workaround can safely be removed for gh200.

## Build and push

```
export TAG="v1.3"
CSCS_REGISTRY="jfrog.svc.cscs.ch/docker-ci-ext/4700071344751697"
podman login jfrog.svc.cscs.ch
podman build -f build.Dockerfile -t $CSCS_REGISTRY/base-images/cuda_12.6.1-devel-ubuntu24.04:$TAG
podman push $CSCS_REGISTRY/base-images/cuda_12.6.1-devel-ubuntu24.04:$TAG
```
5 changes: 5 additions & 0 deletions ci/base-images/gh200-cray-mpich/build.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Modified CUDA base image that allows building cray-mpich.
# The copied inputs are prepared by the steps described in HOWTO.md.
FROM docker.io/nvidia/cuda:12.6.1-devel-ubuntu24.04

# Libraries collected into ./lib64 during the preparation steps.
COPY lib64 /usr/lib
# Spack packages configuration, placed in root's ~/.spack directory.
COPY packages.yaml /root/.spack/packages.yaml
# `site` directory of the alps-cluster-config checkout.
COPY alps-cluster-config/site /root/site
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env bash
# dlaf-no-license-check

CSCS_REGISTRY="jfrog.svc.cscs.ch/contbuild/testing/anfink/4700071344751697"
CSCS_REGISTRY="jfrog.svc.cscs.ch/docker-ci-ext/4700071344751697"
docker build -t $CSCS_REGISTRY/rocm-patched:5.3.3 -f build.Dockerfile .
docker push $CSCS_REGISTRY/rocm-patched:5.3.3
11 changes: 11 additions & 0 deletions ci/ci-ext-custom.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
include:
  - remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'

# Custom runner definition for gh200: the remote definition adds
# `SLURM_MPI_TYPE: pmi2`, so this job extends `.container-runner-daint`
# directly and sets only the variables listed below.
.dlaf-container-runner-daint-gh200:
  extends: .container-runner-daint
  variables:
    ARCH: 'aarch64'
    USE_CE: 'YES'
    NVIDIA_VISIBLE_DEVICES: 'all'
    NVIDIA_DRIVER_CAPABILITIES: 'compute,utility'
68 changes: 46 additions & 22 deletions ci/common-ci.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
include:
- remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'
- local: 'ci/ci-ext-custom.yml'

stages:
- build_deps
Expand All @@ -9,23 +10,20 @@ stages:
variables:
FF_TIMESTAMPS: true

##
## BUILDS
##
## BUILD DEPS

.build_deps_common:
extends: .container-builder
.build_deps_common_base:
stage: build_deps
timeout: 6 hours
before_script:
- echo $DOCKERHUB_TOKEN | podman login docker.io -u $DOCKERHUB_USERNAME --password-stdin
- TAG_IMAGE=`echo ${BASE_IMAGE##*/} | sed 's/[:]//g'`
- TAG_APTGET=`echo ${EXTRA_APTGET} | sha256sum - | head -c 6`
- TAG_COMPILER=`echo ${COMPILER}_CXX${CXXSTD} | sed 's/[@]//g'`
- TAG_DOCKERFILE=`sha256sum $DOCKERFILE | head -c 16`
- TAG_SPACK=`echo $SPACK_SHA`
- TAG_REPO=`find $SPACK_DLAF_REPO -type f -exec sha256sum {} \; | sha256sum - | head -c 16`
- TAG_ENVIRONMENT=`cat $SPACK_ENVIRONMENT $COMMON_SPACK_ENVIRONMENT | sha256sum | head -c 16`
- TAG_DOCKERFILE=`sha256sum $DOCKERFILE | head -c 12`
- TAG_SPACK=`echo $SPACK_SHA | sed "s/develop-//g" | head -c 16`
- TAG_REPO=`find $SPACK_DLAF_REPO -type f -exec sha256sum {} \; | sha256sum - | head -c 12`
- TAG_ENVIRONMENT=`cat $SPACK_ENVIRONMENT $COMMON_SPACK_ENVIRONMENT | sha256sum | head -c 12`
- TAG=${TAG_IMAGE}-${TAG_APTGET}-${TAG_COMPILER}-MKL${USE_MKL}-${TAG_DOCKERFILE}-${TAG_SPACK}-${TAG_REPO}-${TAG_ENVIRONMENT}
- export PERSIST_IMAGE_NAME=$DEPS_IMAGE:$TAG
- echo "DEPS_IMAGE=$PERSIST_IMAGE_NAME" > build.env
Expand Down Expand Up @@ -58,11 +56,25 @@ variables:
EXTRA_APTGET: ""
CXXSTD: 17
USE_MKL: "OFF"
COMMON_SPACK_ENVIRONMENT: ci/docker/common.yaml
USE_CODECOV: "false"

.build_common:
extends: .container-builder
.build_deps_common:
extends:
- .container-builder-cscs-zen2
- .build_deps_common_base
variables:
COMMON_SPACK_ENVIRONMENT: ci/docker/common.yaml

.build_deps_common_gh200:
extends:
- .container-builder-cscs-gh200
- .build_deps_common_base
variables:
COMMON_SPACK_ENVIRONMENT: ci/docker/common-gh200.yaml

## BUILD DLAF

.build_common_base:
stage: build
timeout: 2 hours
before_script:
Expand All @@ -74,6 +86,7 @@ variables:
PERSIST_IMAGE_NAME: $DLAF_IMAGE
DOCKER_BUILD_ARGS: '[
"DEPS_IMAGE",
"DLAF_LD_PRELOAD",
"PIP_OPTS",
"NUM_PROCS=$NUM_CORES_BUILD_DLAF"
]'
Expand All @@ -85,19 +98,20 @@ variables:
paths:
- pipeline.yml

.build_for_daint-mc:
.build_common:
extends:
- .container-builder-cscs-zen2
- .build_common_base
variables:
RUNNER: ".container-runner-daint"
SLURM_CONSTRAINT: mc
THREADS_MAX_PER_TASK: 72
THREADS_PER_NODE: 72
DLAF_LD_PRELOAD: "/lib/x86_64-linux-gnu/libSegFault.so"


.build_for_daint-gpu:
.build_common_gh200:
extends:
- .container-builder-cscs-gh200
- .build_common_base
variables:
RUNNER: ".container-runner-daint"
SLURM_CONSTRAINT: gpu
THREADS_MAX_PER_TASK: 24
THREADS_PER_NODE: 24
DLAF_LD_PRELOAD: "/lib/aarch64-linux-gnu/libSegFault.so"

.build_for_eiger:
variables:
Expand All @@ -106,6 +120,16 @@ variables:
THREADS_MAX_PER_TASK: 32
THREADS_PER_NODE: 256

.build_for_alps_gh200:
variables:
RUNNER: ".dlaf-container-runner-daint-gh200"
SLURM_CONSTRAINT: gpu
# 64 / 2 to avoid ranks on multiple sockets for RANK6
THREADS_MAX_PER_TASK: 32
THREADS_PER_NODE: 256

## RUN

.run_common:
stage: test
trigger:
Expand Down
2 changes: 1 addition & 1 deletion ci/cpu/asan_ubsan_lsan.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ cpu asan ubsan lsan test:
ASAN_OPTIONS: "fast_unwind_on_malloc=0:strict_string_checks=1:detect_leaks=1:detect_stack_use_after_return=1:check_initialization_order=1:strict_init_order=1"
UBSAN_OPTIONS: "halt_on_error=1:print_stacktrace=1"
# Override use of libSegFault, not necessary with sanitizers
LD_PRELOAD: ""
DLAF_LD_PRELOAD: ""
trigger:
include:
- artifact: pipeline.yml
Expand Down
6 changes: 3 additions & 3 deletions ci/ctest_to_gitlab.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,10 @@ ARTIFACTS="
"
fi

# CRAY_CUDA_MPS set to 0 to avoid test hanging on daint (See PR #1197)
BASE_TEMPLATE="
include:
- remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'
- local: 'ci/ci-ext-custom.yml'

image: $IMAGE

Expand All @@ -70,7 +70,7 @@ variables:
SLURM_EXCLUSIVE: ''
SLURM_EXACT: ''
SLURM_CONSTRAINT: $SLURM_CONSTRAINT
CRAY_CUDA_MPS: 0
CRAY_CUDA_MPS: 1
MPICH_MAX_THREAD_SAFETY: multiple

{{JOBS}}
Expand Down Expand Up @@ -104,7 +104,7 @@ for rank_label in `ctest --print-labels | egrep -o "RANK_[1-9][0-9]?"`; do
N=`echo "$rank_label" | sed "s/RANK_//"`
C=$(( THREADS_PER_NODE / N ))
if [ $C -gt $THREADS_MAX_PER_TASK ]; then
C=$THREADS_MAX_PER_TASK
C=$THREADS_MAX_PER_TASK
fi

# Skip label combinations that match no tests
Expand Down
32 changes: 0 additions & 32 deletions ci/cuda/gcc11_codecov.yml

This file was deleted.

30 changes: 0 additions & 30 deletions ci/cuda/gcc11_debug_scalapack.yml

This file was deleted.

31 changes: 0 additions & 31 deletions ci/cuda/gcc11_release.yml

This file was deleted.

31 changes: 0 additions & 31 deletions ci/cuda/gcc11_release_scalapack.yml

This file was deleted.

Loading
Loading