Skip to content
This repository was archived by the owner on May 17, 2022. It is now read-only.

Commit 2f29428

Browse files
committed
Reworked building/packaging, added docker image preparation, added tests
1 parent 099ca8b commit 2f29428

File tree

7 files changed

+142
-56
lines changed

7 files changed

+142
-56
lines changed

.ci/Dockerfile.ubuntu20.04

Lines changed: 73 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,24 @@
1-
FROM nvidia/cuda:11.2.1-devel-ubuntu20.04
1+
ARG CUDA_VER='11.2.1'
2+
FROM nvidia/cuda:${CUDA_VER}-devel-ubuntu20.04
3+
#==============================================================================
4+
ARG TORCH_UCC_ROOT_DIR=/opt/nvidia/torch-ucc
5+
ENV TORCH_UCC_SRC_DIR=${TORCH_UCC_ROOT_DIR}/src
6+
ENV TORCH_UCC_PKG_DIR=${TORCH_UCC_ROOT_DIR}/pkg
7+
ENV TORCH_UCC_BIN_DIR=${TORCH_UCC_ROOT_DIR}/bin
8+
ENV CUDA_HOME=/usr/local/cuda
9+
ENV UCX_BRANCH=v1.10.x
10+
ENV UCX_BUILD_TYPE=release-mt
11+
ENV UCX_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/ucx/build-${UCX_BUILD_TYPE}
12+
ENV XCCL_BUILD_TYPE=debug
13+
ENV XCCL_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/xccl/build-${XCCL_BUILD_TYPE}
14+
#==============================================================================
15+
RUN mkdir -p ${TORCH_UCC_SRC_DIR} && \
16+
mkdir -p ${TORCH_UCC_PKG_DIR} && \
17+
mkdir -p ${TORCH_UCC_BIN_DIR}
218

19+
COPY . ${TORCH_UCC_SRC_DIR}
20+
#==============================================================================
321
ARG DEBIAN_FRONTEND=noninteractive
4-
522
# Use apt-get: the `apt` front end has no stable CLI for non-interactive use.
RUN apt-get update && \
623
apt-get install -y \
724
apt-utils \
@@ -14,16 +31,48 @@ RUN apt update && \
1431
ibverbs-utils \
1532
libnuma-dev \
1633
libtool-bin \
34+
ninja-build \
35+
openmpi-bin \
1736
vim \
1837
&& \
1938
rm -rf /var/lib/apt/lists/*
20-
39+
#==============================================================================
40+
# Build UCX
41+
RUN echo "INFO: Build UCX" && \
42+
cd ${TORCH_UCC_SRC_DIR}/ucx && \
43+
git checkout ${UCX_BRANCH} && \
44+
${TORCH_UCC_SRC_DIR}/ucx/autogen.sh && \
45+
mkdir -p ${TORCH_UCC_SRC_DIR}/ucx/build-${UCX_BUILD_TYPE} && \
46+
cd ${TORCH_UCC_SRC_DIR}/ucx/build-${UCX_BUILD_TYPE} && \
47+
${TORCH_UCC_SRC_DIR}/ucx/contrib/configure-${UCX_BUILD_TYPE} --with-cuda=${CUDA_HOME} --prefix=${UCX_INSTALL_DIR} && \
48+
make -j"$(nproc)" install && \
49+
echo "${UCX_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucx.conf && \
50+
ldconfig && \
51+
ldconfig -p | grep -i ucx && \
52+
cd ${UCX_INSTALL_DIR} && tar cfz ${TORCH_UCC_PKG_DIR}/ucx-${UCX_BUILD_TYPE}.tgz --owner=0 --group=0 .
53+
ENV PATH=${UCX_INSTALL_DIR}/bin:${PATH}
54+
#==============================================================================
55+
# Build XCCL
56+
RUN echo "INFO: Build XCCL" && \
57+
cd ${TORCH_UCC_SRC_DIR}/xccl && \
58+
${TORCH_UCC_SRC_DIR}/xccl/autogen.sh && \
59+
mkdir -p ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE} && \
60+
cd ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE} && \
61+
${TORCH_UCC_SRC_DIR}/xccl/configure --with-ucx=${UCX_INSTALL_DIR} \
62+
--prefix=${XCCL_INSTALL_DIR} --enable-debug && \
63+
make -j"$(nproc)" install && \
64+
echo "${XCCL_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/xccl.conf && \
65+
ldconfig && \
66+
ldconfig -p | grep -i xccl && \
67+
make -C test && \
68+
cd ${XCCL_INSTALL_DIR} && tar cfz ${TORCH_UCC_PKG_DIR}/xccl-${XCCL_BUILD_TYPE}.tgz --owner=0 --group=0 .
69+
#==============================================================================
2170
# Install conda
2271
# Fetch the installer over HTTPS from the canonical host; -f makes curl fail
# on an HTTP error instead of saving the error page as the installer script.
RUN curl -fLO https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
2372
bash Miniconda3-latest-Linux-x86_64.sh -p /opt/conda -b && \
2473
rm -f Miniconda3-latest-Linux-x86_64.sh
2574
# Put conda's bin directory first on PATH (key=value form; the
# space-separated ENV syntax is deprecated).
ENV PATH=/opt/conda/bin:${PATH}
26-
75+
#==============================================================================
2776
# Install conda python
2877
RUN conda update -y conda && \
2978
conda install -c anaconda -y \
@@ -32,11 +81,11 @@ RUN conda update -y conda && \
3281
pip install --no-cache-dir python-hostlist
3382

3483
RUN ln -s /opt/conda/bin/python /usr/bin/python
35-
36-
WORKDIR "/workspace"
37-
38-
# Install PyTorch
39-
RUN git clone https://github.com/pytorch/pytorch.git && \
84+
# Install the wheel build/check tooling into the conda environment itself, so
# the auditwheel and check-wheel-contents CLIs land in /opt/conda/bin (on PATH)
# rather than in root's per-user site; skip the pip cache to keep the layer small.
RUN python3 -m pip install --no-cache-dir --upgrade setuptools wheel auditwheel check-wheel-contents
85+
#==============================================================================
86+
# Build and Install PyTorch
87+
RUN cd /tmp && \
88+
git clone https://github.com/pytorch/pytorch.git && \
4089
cd pytorch && \
4190
git submodule sync --recursive && \
4291
git submodule update --init --recursive && \
@@ -54,5 +103,19 @@ RUN git clone https://github.com/pytorch/pytorch.git && \
54103
USE_QNNPACK=0 \
55104
USE_XNNPACK=0 \
56105
USE_KINETO=1 \
106+
MAX_JOBS=$(($(nproc)-1)) \
57107
python setup.py install && \
58-
rm -rf /workspace/pytorch
108+
rm -rf /tmp/pytorch
109+
#==============================================================================
110+
# Install torch_ucc python module and build a wheel package
111+
RUN echo "INFO: Install Torch-UCC" && \
112+
cd ${TORCH_UCC_SRC_DIR} && \
113+
env \
114+
UCX_HOME=${UCX_INSTALL_DIR} \
115+
XCCL_HOME=${XCCL_INSTALL_DIR} \
116+
WITH_CUDA=${CUDA_HOME} \
117+
python setup.py install bdist_wheel && \
118+
pip3 list | grep torch && \
119+
python -c 'import torch, torch_ucc' && \
120+
cp ${TORCH_UCC_SRC_DIR}/dist/*.whl ${TORCH_UCC_PKG_DIR}
121+
#==============================================================================

.ci/job_matrix.yaml

Lines changed: 25 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ job: 'torch-ucc'
44
registry_host: 'harbor.mellanox.com'
55
# TODO change
66
registry_path: '/swx-infra/torch-ucc'
7+
#registry_path: '/torch-ucc'
78
registry_auth: '1daaea28-800e-425f-a91f-3bd3e9136eea'
89

910
kubernetes:
@@ -16,63 +17,44 @@ volumes:
1617
- { mountPath: '/.autodirect/sw/release', hostPath: '/.autodirect/sw/release' }
1718

1819
env:
19-
CUDA_HOME: '/usr/local/cuda'
20-
UCX_BRANCH: 'v1.10.x'
21-
UCX_SRC_DIR: '${WORKSPACE}/ucx'
22-
UCX_BUILD_DIR: '${UCX_SRC_DIR}/build'
23-
UCX_INSTALL_DIR: '${UCX_BUILD_DIR}/_install'
24-
UCC_SRC_DIR: '${WORKSPACE}/ucc'
25-
XCCL_SRC_DIR: '${WORKSPACE}/xccl'
26-
XCCL_BUILD_DIR: '${XCCL_SRC_DIR}/build'
27-
XCCL_INSTALL_DIR: '${XCCL_BUILD_DIR}/_install'
20+
CUDA_VER: '11.2.1'
21+
TORCH_UCC_URI_SUFFIX: '${TORCH_UCC_VERSION}/x86_64/ubuntu20.04/cuda${CUDA_VER}'
22+
TORCH_UCC_ROOT_DIR: '/opt/nvidia/torch-ucc'
23+
TORCH_UCC_SRC_DIR: '${TORCH_UCC_ROOT_DIR}/src'
2824

2925
runs_on_dockers:
30-
- { file: '.ci/Dockerfile.ubuntu20.04', name: 'ubuntu20.04', tag: 'latest', arch: 'x86_64' }
26+
- {
27+
file: '.ci/Dockerfile.ubuntu20.04',
28+
name: 'ubuntu20.04',
29+
tag: 'latest',
30+
arch: 'x86_64',
31+
uri: '${TORCH_UCC_URI_SUFFIX}',
32+
build_args: '--rm --no-cache --build-arg CUDA_VER=${CUDA_VER} --build-arg TORCH_UCC_ROOT_DIR=${TORCH_UCC_ROOT_DIR}',
33+
}
34+
35+
# TODO debug
36+
timeout_minutes: '180'
3137

3238
steps:
3339
#============================================================================
3440
- name: Check Env
35-
shell: '#!/bin/bash -eEx'
3641
run: |
3742
echo "INFO: check environment"
38-
find .
3943
printenv
4044
cat /proc/1/cgroup
4145
cat /etc/*release*
4246
id
47+
find /opt/nvidia
4348
#============================================================================
44-
- name: Build UCX
45-
shell: '#!/bin/bash -eEx'
46-
run: |
47-
echo "INFO: Build UCX"
48-
cd ${UCX_SRC_DIR}
49-
git checkout ${UCX_BRANCH}
50-
${UCX_SRC_DIR}/autogen.sh
51-
mkdir -p ${UCX_BUILD_DIR}
52-
cd ${UCX_BUILD_DIR}
53-
${UCX_SRC_DIR}/configure --enable-mt --with-cuda=${CUDA_HOME} --prefix=${UCX_INSTALL_DIR}
54-
make -j install
55-
#============================================================================
56-
- name: Build XCCL
57-
shell: '#!/bin/bash -eEx'
49+
- name: Run XCCL tests
5850
run: |
59-
echo "INFO: Build XCCL"
60-
cd ${XCCL_SRC_DIR}
61-
# TODO tmp W/A
62-
sed -i 's|NVCCFLAGS = .*|NVCCFLAGS = "${UCS_CPPFLAGS} -I${XCCL_TOP_SRCDIR}/src -I${XCCL_TOP_SRCDIR}/src/core" --compiler-options -fno-rtti,-fno-exceptions|g' ${XCCL_SRC_DIR}/src/utils/cuda/kernels/Makefile.am
63-
${XCCL_SRC_DIR}/autogen.sh
64-
mkdir -p ${XCCL_BUILD_DIR}
65-
cd ${XCCL_BUILD_DIR}
66-
${XCCL_SRC_DIR}/configure --with-cuda=${CUDA_HOME} --with-ucx=${UCX_INSTALL_DIR} \
67-
--prefix=${XCCL_INSTALL_DIR}
68-
make -j install
51+
echo "INFO: Run XCCL tests"
52+
${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_xccl.sh
53+
#sleep 10000
6954
#============================================================================
70-
- name: Install Torch-UCC
55+
- name: Run Torch-UCC tests
7156
run: |
72-
echo "INFO: Install Torch-UCC"
73-
cd ${WORKSPACE}
74-
UCX_HOME=${UCX_INSTALL_DIR}
75-
XCCL_HOME=${XCCL_INSTALL_DIR}
76-
WITH_CUDA=${CUDA_HOME}
77-
python setup.py install
57+
echo "INFO: Run Torch-UCC tests"
58+
${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_torch_ucc.sh
59+
#sleep 20000
7860
#============================================================================

.ci/scripts/run_tests_torch_ucc.sh

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#!/bin/bash -eEx
2+
3+
command -v mpirun
4+
export TORCH_UCC_XCCL_TLS=ucx
5+
ucx_info -e -u t
6+
export UCX_LOG_LEVEL=info
7+
#echo "XCCL allreduce"
8+
#/bin/bash ${XCCL_SRC_DIR}/test/start_test.sh ${XCCL_SRC_DIR}/test/torch_allreduce_test.py --backend=gloo
9+
#echo "XCCL alltoall"
10+
#/bin/bash ${XCCL_SRC_DIR}/test/start_test.sh ${XCCL_SRC_DIR}/test/torch_alltoall_test.py --backend=gloo
11+
#echo "XCCL alltoallv"
12+
#/bin/bash ${XCCL_SRC_DIR}/test/start_test.sh ${XCCL_SRC_DIR}/test/torch_alltoallv_test.py --backend=gloo
13+
#echo "XCCL barrier"
14+
#/bin/bash ${XCCL_SRC_DIR}/test/start_test.sh ${XCCL_SRC_DIR}/test/torch_barrier_test.py --backend=gloo
15+
#echo "XCCL allgather"
16+
#/bin/bash ${XCCL_SRC_DIR}/test/start_test.sh ${XCCL_SRC_DIR}/test/torch_allgather_test.py --backend=gloo
17+
#echo "XCCL broadcast"
18+
#/bin/bash ${XCCL_SRC_DIR}/test/start_test.sh ${XCCL_SRC_DIR}/test/torch_bcast_test.py --backend=gloo

.ci/scripts/run_tests_xccl.sh

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/bin/bash -eEx
2+
3+
command -v mpirun
4+
export UCX_SOCKADDR_CM_ENABLE=n
5+
#MPI_ARGS_COMMON="--allow-run-as-root --oversubscribe -np 8 -H localhost:8 --bind-to none -mca coll ^hcoll"
6+
#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier ${XCCL_BUILD_DIR}/test/test_mpi_allreduce
7+
#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier ${XCCL_BUILD_DIR}/test/test_mpi_bcast
8+
#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier ${XCCL_BUILD_DIR}/test/test_mpi_barrier
9+
#
10+
#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_HIER_NODE_LEADER_RANK_ID=3 -x XCCL_TEST_TLS=hier ${XCCL_BUILD_DIR}/test/test_mpi_allreduce
11+
#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_HIER_NODE_LEADER_RANK_ID=4 -x XCCL_TEST_TLS=hier ${XCCL_BUILD_DIR}/test/test_mpi_bcast
12+
#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_HIER_NODE_LEADER_RANK_ID=5 -x XCCL_TEST_TLS=hier ${XCCL_BUILD_DIR}/test/test_mpi_barrier
13+
#
14+
#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_UCX_ALLREDUCE_ALG_ID=0 -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_allreduce
15+
#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_UCX_ALLREDUCE_ALG_ID=1 -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_allreduce
16+
#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_bcast
17+
#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_barrier
18+
#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_alltoall
19+
#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_alltoallv
20+
#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_allgather
21+
#mpirun -x XCCL_TEAM_UCX_ALLTOALL_PAIRWISE_CHUNK=0 ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_alltoall
22+
#mpirun -x XCCL_TEAM_UCX_ALLTOALL_PAIRWISE_CHUNK=0 ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_alltoallv
23+
#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier -x XCCL_TEST_ITERS=500 -x XCCL_TEST_NTHREADS=4 -x XCCL_TEST_CHECK=1 ${XCCL_BUILD_DIR}/test/test_mpi_mt

ucx

Submodule ucx updated 71 files

0 commit comments

Comments
 (0)