Skip to content

Commit

Permalink
gpu to gh200
Browse files Browse the repository at this point in the history
  • Loading branch information
rasolca committed Jan 29, 2025
1 parent c6913a3 commit 5e7de35
Show file tree
Hide file tree
Showing 11 changed files with 293 additions and 90 deletions.
38 changes: 19 additions & 19 deletions ci/.gitlab-ci.yml
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
include:
- local: 'ci/cpu/asan_ubsan_lsan.yml'
- local: 'ci/cpu/clang15_release_cxx20.yml'
- local: 'ci/cpu/clang15_release_stdexec.yml'
- local: 'ci/cpu/clang15_release.yml'
- local: 'ci/cpu/clang16_release.yml'
- local: 'ci/cpu/clang18_release.yml'
- local: 'ci/cpu/gcc11_release_stdexec.yml'
- local: 'ci/cpu/gcc11_debug_stdexec.yml'
- local: 'ci/cpu/gcc12_release_cxx20.yml'
- local: 'ci/cpu/gcc13_codecov.yml'
- local: 'ci/cpu/gcc13_release.yml'
- local: 'ci/cuda/gcc11_release.yml'
- local: 'ci/cuda/gcc11_release_scalapack.yml'
- local: 'ci/cuda/gcc11_codecov.yml'
- local: 'ci/cuda/gcc11_debug_scalapack.yml'
- local: 'ci/cuda/gcc13_release_stdexec.yml'
- local: 'ci/rocm/clang14_release.yml'
- local: 'ci/rocm/clang14_release_stdexec.yml'
- local: 'ci/rocm/clang15_release_stdexec.yml'
# - local: 'ci/cpu/asan_ubsan_lsan.yml'
# - local: 'ci/cpu/clang15_release_cxx20.yml'
# - local: 'ci/cpu/clang15_release_stdexec.yml'
# - local: 'ci/cpu/clang15_release.yml'
# - local: 'ci/cpu/clang16_release.yml'
# - local: 'ci/cpu/clang18_release.yml'
# - local: 'ci/cpu/gcc11_release_stdexec.yml'
# - local: 'ci/cpu/gcc11_debug_stdexec.yml'
# - local: 'ci/cpu/gcc12_release_cxx20.yml'
# - local: 'ci/cpu/gcc13_codecov.yml'
# - local: 'ci/cpu/gcc13_release.yml'
- local: 'ci/cuda/gcc13_release.yml'
# - local: 'ci/cuda/gcc11_release_scalapack.yml'
# - local: 'ci/cuda/gcc11_codecov.yml'
# - local: 'ci/cuda/gcc11_debug_scalapack.yml'
# - local: 'ci/cuda/gcc13_release_stdexec.yml'
# - local: 'ci/rocm/clang14_release.yml'
# - local: 'ci/rocm/clang14_release_stdexec.yml'
# - local: 'ci/rocm/clang15_release_stdexec.yml'
70 changes: 52 additions & 18 deletions ci/common-ci.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
include:
- remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'

# Need custom definition as remote adds SLURM_MPI_TYPE: pmi2
.dlaf-container-runner-daint-gh200:
  # Hidden job that extends `.container-runner-daint` and sets the variables
  # listed below.
  extends:
    - .container-runner-daint
  variables:
    ARCH: "aarch64"
    USE_CE: "YES"
    NVIDIA_VISIBLE_DEVICES: "all"
    NVIDIA_DRIVER_CAPABILITIES: "compute,utility"

stages:
- build_deps
- build
Expand All @@ -9,12 +18,9 @@ stages:
variables:
FF_TIMESTAMPS: true

##
## BUILDS
##
## BUILD DEPS

.build_deps_common:
extends: .container-builder
.build_deps_common_base:
stage: build_deps
timeout: 6 hours
before_script:
Expand Down Expand Up @@ -58,11 +64,26 @@ variables:
EXTRA_APTGET: ""
CXXSTD: 17
USE_MKL: "OFF"
COMMON_SPACK_ENVIRONMENT: ci/docker/common.yaml
USE_CODECOV: "false"

.build_common:
extends: .container-builder
.build_deps_common:
  # Combines the zen2 container builder template with the shared
  # `.build_deps_common_base` settings.
  extends: [.container-builder-cscs-zen2, .build_deps_common_base]
  variables:
    COMMON_SPACK_ENVIRONMENT: 'ci/docker/common.yaml'

.build_deps_common_gh200:
  # Combines the gh200 container builder template with the shared
  # `.build_deps_common_base` settings and selects the gh200-specific
  # Dockerfile and common spack environment.
  extends: [.container-builder-cscs-gh200, .build_deps_common_base]
  variables:
    DOCKERFILE: 'ci/docker/build-craympich.Dockerfile'
    COMMON_SPACK_ENVIRONMENT: 'ci/docker/common-gh200.yaml'

## BUILD DLAF

.build_common_base:
stage: build
timeout: 2 hours
before_script:
Expand All @@ -74,6 +95,7 @@ variables:
PERSIST_IMAGE_NAME: $DLAF_IMAGE
# Bracketed list of build arguments. Every entry must be followed by a comma
# (except the last one), otherwise two adjacent strings make the list malformed.
DOCKER_BUILD_ARGS: '[
    "DEPS_IMAGE",
    "DLAF_LD_PRELOAD",
    "PIP_OPTS",
    "NUM_PROCS=$NUM_CORES_BUILD_DLAF"
  ]'
Expand All @@ -85,19 +107,20 @@ variables:
paths:
- pipeline.yml

.build_for_daint-mc:
.build_common:
extends:
- .container-builder-cscs-zen2
- .build_common_base
variables:
RUNNER: ".container-runner-daint"
SLURM_CONSTRAINT: mc
THREADS_MAX_PER_TASK: 72
THREADS_PER_NODE: 72
DLAF_LD_PRELOAD: "/lib/x86_64-linux-gnu/libSegFault.so"


.build_for_daint-gpu:
.build_common_gh200:
extends:
- .container-builder-cscs-gh200
- .build_common_base
variables:
RUNNER: ".container-runner-daint"
SLURM_CONSTRAINT: gpu
THREADS_MAX_PER_TASK: 24
THREADS_PER_NODE: 24
DLAF_LD_PRELOAD: "/lib/aarch64-linux-gnu/libSegFault.so"

.build_for_eiger:
variables:
Expand All @@ -106,9 +129,20 @@ variables:
THREADS_MAX_PER_TASK: 32
THREADS_PER_NODE: 256

.build_for_alps_gh200:
  variables:
    RUNNER: '.dlaf-container-runner-daint-gh200'
    SLURM_CONSTRAINT: 'gpu'
    # 64 / 2 to avoid ranks on multiple sockets for RANK6
    # NOTE(review): both thread counts below equal the values used in
    # `.build_for_eiger` - confirm they match the GH200 node layout.
    THREADS_MAX_PER_TASK: 32
    THREADS_PER_NODE: 256

## RUN

.run_common:
stage: test
trigger:
strategy: depend
forward:
pipeline_variables: true

2 changes: 1 addition & 1 deletion ci/cpu/asan_ubsan_lsan.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ cpu asan ubsan lsan test:
ASAN_OPTIONS: "fast_unwind_on_malloc=0:strict_string_checks=1:detect_leaks=1:detect_stack_use_after_return=1:check_initialization_order=1:strict_init_order=1"
UBSAN_OPTIONS: "halt_on_error=1:print_stacktrace=1"
# Override use of libSegFault, not necessary with sanitizers
LD_PRELOAD: ""
DLAF_LD_PRELOAD: ""
trigger:
include:
- artifact: pipeline.yml
Expand Down
5 changes: 2 additions & 3 deletions ci/ctest_to_gitlab.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ ARTIFACTS="
"
fi

# CRAY_CUDA_MPS set to 0 to avoid test hanging on daint (See PR #1197)
BASE_TEMPLATE="
include:
- remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'
Expand All @@ -70,7 +69,7 @@ variables:
SLURM_EXCLUSIVE: ''
SLURM_EXACT: ''
SLURM_CONSTRAINT: $SLURM_CONSTRAINT
CRAY_CUDA_MPS: 0
CRAY_CUDA_MPS: 1
MPICH_MAX_THREAD_SAFETY: multiple
{{JOBS}}
Expand Down Expand Up @@ -104,7 +103,7 @@ for rank_label in `ctest --print-labels | egrep -o "RANK_[1-9][0-9]?"`; do
N=`echo "$rank_label" | sed "s/RANK_//"`
C=$(( THREADS_PER_NODE / N ))
if [ $C -gt $THREADS_MAX_PER_TASK ]; then
C=$THREADS_MAX_PER_TASK
C=$THREADS_MAX_PER_TASK
fi

# Skip label combinations that match no tests
Expand Down
31 changes: 0 additions & 31 deletions ci/cuda/gcc11_release.yml

This file was deleted.

29 changes: 29 additions & 0 deletions ci/cuda/gcc13_release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
include:
- local: 'ci/common-ci.yml'

cuda gcc13 release deps:
  # Dependency image job for the CUDA / gcc 13 release configuration,
  # based on the `.build_deps_common_gh200` template.
  extends:
    - .build_deps_common_gh200
  variables:
    DOCKERFILE: 'ci/docker/build-craympich.Dockerfile'
    BASE_IMAGE: '$CSCS_REGISTRY_PATH/base-images/cuda_12.6.1-devel-ubuntu24.04:v1'
    COMPILER: 'gcc@13'
    SPACK_ENVIRONMENT: 'ci/docker/release-cuda-gh200.yaml'
    DEPS_IMAGE: '$CSCS_REGISTRY_PATH/cuda-gh200-gcc13-release/deps'

cuda gcc13 release build:
  # Build job; runs after the dependency image job of the same configuration.
  extends: [.build_common_gh200, .build_for_alps_gh200]
  needs: ['cuda gcc13 release deps']
  variables:
    # Image name is tagged with the SHA of the commit being built.
    DLAF_IMAGE: '$CSCS_REGISTRY_PATH/cuda-gh200-gcc13-release/dlaf:$CI_COMMIT_SHA'

cuda gcc13 release test:
  # Triggers the child pipeline described by the `pipeline.yml` artifact of
  # the build job.
  extends:
    - .run_common
  needs: ['cuda gcc13 release build']
  trigger:
    include:
      - artifact: 'pipeline.yml'
        job: 'cuda gcc13 release build'
112 changes: 112 additions & 0 deletions ci/docker/build-craympich.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
ARG BASE_IMAGE=ubuntu:24.04

FROM $BASE_IMAGE

# set jfrog autoclean policy
LABEL com.jfrog.artifactory.retention.maxDays="21"

ENV DEBIAN_FRONTEND=noninteractive \
PATH="$PATH:/opt/spack/bin" \
SPACK_COLOR=always

# Overwrite entrypoint as NVIDIA images set a script that clogs the output.
ENTRYPOINT []
CMD [ "/bin/bash" ]
SHELL ["/bin/bash", "-c"]

ARG EXTRA_APTGET
# python is needed for spack and fastcov
# codecov upload needs curl + ca-certificates
# glibc-tools is needed for libSegFault on ubuntu > 22.04
# jq, strace are needed for check-threads
# tzdata is needed to print correct time
RUN apt-get -yqq update && \
apt-get -yqq install --no-install-recommends \
software-properties-common \
build-essential gfortran \
autoconf automake libssl-dev ninja-build pkg-config \
gawk git tar \
wget curl ca-certificates gpg-agent tzdata \
python3 python3-setuptools \
glibc-tools jq strace \
patchelf unzip file gnupg2 libncurses-dev \
${EXTRA_APTGET} && \
rm -rf /var/lib/apt/lists/*

# Install MKL and remove static libs (to keep image smaller)
ARG USE_MKL=ON
ARG MKL_VERSION=2024.0
ARG MKL_SPEC=2024.0.0
RUN if [ "$USE_MKL" = "ON" ]; then \
wget -qO - https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB 2>/dev/null > /etc/apt/trusted.gpg.d/intel.asc && \
apt-add-repository 'deb https://apt.repos.intel.com/oneapi all main' && \
apt-get install -y -qq --no-install-recommends intel-oneapi-mkl-devel-${MKL_VERSION} && \
rm -rf /var/lib/apt/lists/* && \
find "/opt/intel/oneapi" -name "*.a" -delete ; \
fi

# This is the spack version we want to have
ARG SPACK_SHA
ENV SPACK_SHA=$SPACK_SHA

# Install the specific ref of Spack provided by the user.
# Abort early when SPACK_SHA is empty (the tarball URL would then carry no ref)
# and when the download fails (-f plus pipefail), instead of handing an error
# response to tar.
RUN set -o pipefail && \
    : "${SPACK_SHA:?SPACK_SHA build argument must be set}" && \
    mkdir -p /opt/spack && \
    curl -fsSL "https://api.github.com/repos/spack/spack/tarball/$SPACK_SHA" | tar --strip-components=1 -xz -C /opt/spack

# Find compilers + Add gfortran to clang specs + Define which compiler we want to use
ARG COMPILER
RUN spack compiler find && \
gawk -i inplace '$0 ~ "compiler:" {flag=0} $0 ~ "spec:.*clang" {flag=1} flag == 1 && $1 ~ "^f[c7]" && $2 ~ "null" {gsub("null","/usr/bin/gfortran",$0)} {print $0}' /root/.spack/linux/compilers.yaml && \
spack config add "packages:all:require:[\"%${COMPILER}\"]"

RUN spack external find \
autoconf \
automake \
bzip2 \
cuda \
diffutils \
findutils \
git \
ninja \
m4 \
ncurses \
openssl \
perl \
pkg-config \
python \
xz && \
if [ "$USE_MKL" = "ON" ]; then \
echo -e " intel-oneapi-mkl:\n externals:\n - spec: \"intel-oneapi-mkl@$MKL_SPEC mpi_family=mpich\"\n prefix: /opt/intel/oneapi\n buildable: False" >> ~/.spack/packages.yaml ; \
fi

# Add our custom spack repo from here
ARG SPACK_DLAF_REPO
COPY $SPACK_DLAF_REPO /user_repo

RUN spack repo add --scope site /user_repo

### Workaround until CE provides full MPI substitution.
# NOTE(review): ~/site/repo is not created in the visible part of this
# Dockerfile - presumably it is provided by the base image; confirm.
RUN spack repo add --scope site ~/site/repo

# Set this to a spack.yaml file which contains a spec
# e.g. --build-arg SPACK_ENVIRONMENT=ci/spack/my-env.yaml
ARG SPACK_ENVIRONMENT
ARG COMMON_SPACK_ENVIRONMENT
ARG ENV_VIEW=/view

# Build dependencies
# 1. Create a spack environment named `ci` from the input spack.yaml file
COPY $SPACK_ENVIRONMENT /spack_environment/spack.yaml
COPY $COMMON_SPACK_ENVIRONMENT /spack_environment/
RUN spack env create --with-view ${ENV_VIEW} ci /spack_environment/spack.yaml
# 2. Set the C++ standard
ARG CXXSTD=17
RUN spack -e ci config add "packages:dla-future:variants:cxxstd=${CXXSTD}"
# 3. Install only the dependencies of this (top level is our package)
ARG NUM_PROCS
RUN spack -e ci install --jobs ${NUM_PROCS} --fail-fast --only=dependencies

# Expose the ctest binary of the cmake installed in the `ci` environment as
# /usr/bin/ctest. The command substitution is quoted so that the install prefix
# is passed to ln as a single word.
RUN ln -s "$(spack -e ci location -i cmake)/bin/ctest" /usr/bin/ctest

# Register the lib directory of the environment view with the dynamic linker
# configuration and refresh the linker cache.
RUN echo ${ENV_VIEW}/lib > /etc/ld.so.conf.d/dlaf.conf && ldconfig
Loading

0 comments on commit 5e7de35

Please sign in to comment.