Skip to content
This repository was archived by the owner on May 17, 2022. It is now read-only.

Commit 32591b2

Browse files
committed
Reworked for CentOS 8
Signed-off-by: artemry-nv <[email protected]>
2 parents 2f29428 + dec54a3 commit 32591b2

18 files changed

+549
-149
lines changed

.ci/Dockerfile.centos8

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# CUDA toolkit version; it is substituted into the base image tag in the FROM line.
# Can be overridden at build time with --build-arg CUDA_VER.
ARG CUDA_VER='11.2.1'
2+
# Base image: NVIDIA CUDA "devel" flavour for CentOS 8, tag selected by CUDA_VER.
FROM nvidia/cuda:${CUDA_VER}-devel-centos8
3+
#==============================================================================
4+
# Root of everything torch-ucc related inside the image (build-time overridable).
ARG TORCH_UCC_ROOT_DIR=/opt/nvidia/torch-ucc
# Directory layout, toolchain location and build flavours.
# These values do not depend on one another, so they share one ENV instruction.
ENV TORCH_UCC_SRC_DIR=${TORCH_UCC_ROOT_DIR}/src \
    TORCH_UCC_PKG_DIR=${TORCH_UCC_ROOT_DIR}/pkg \
    TORCH_UCC_BIN_DIR=${TORCH_UCC_ROOT_DIR}/bin \
    TORCH_UCC_WORKLOADS_DIR=${TORCH_UCC_ROOT_DIR}/workloads \
    CUDA_HOME=/usr/local/cuda \
    UCX_BRANCH=v1.10.x \
    UCX_BUILD_TYPE=release-mt \
    XCCL_BUILD_TYPE=debug
# Install prefixes are derived from the values above. They need their own ENV
# instruction because substitution inside a single ENV only sees variables
# defined by earlier instructions.
ENV UCX_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/ucx/build-${UCX_BUILD_TYPE} \
    XCCL_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/xccl/build-${XCCL_BUILD_TYPE}
15+
#==============================================================================
16+
# Create the source, package, binary and workload directories in one call.
RUN mkdir -p \
        ${TORCH_UCC_SRC_DIR} \
        ${TORCH_UCC_PKG_DIR} \
        ${TORCH_UCC_BIN_DIR} \
        ${TORCH_UCC_WORKLOADS_DIR}
20+
21+
# Copy the whole build context into the source directory; the helper scripts
# under .ci/scripts invoked by later RUN steps are executed from this copy.
# NOTE(review): any change in the context invalidates every layer below, and
# what is copied depends on a .dockerignore that is not visible here.
COPY . ${TORCH_UCC_SRC_DIR}
22+
#==============================================================================
23+
# Install the 'Development Tools' and 'Infiniband Support' package groups.
# `yum clean all` runs in the same layer so the package cache and metadata
# are not baked into the image.
RUN yum groupinstall -y \
        'Development Tools' \
        'Infiniband Support' && \
    yum clean all
26+
# Enable the PowerTools repository and install build/runtime dependencies.
# NOTE(review): PowerTools is presumably required for some of the -devel
# packages below - confirm which ones.
# `yum clean all` runs in the same layer so the package cache is not kept.
RUN yum config-manager --set-enabled powertools && \
    yum install -y \
        cmake \
        numactl \
        numactl-devel \
        openmpi \
        openmpi-devel \
        openssh-server \
        protobuf-compiler \
        protobuf-devel \
        python36-devel \
        vim && \
    yum clean all
37+
# Remove old UCX
38+
# Erase the distro-provided ucx RPM; --nodeps skips the dependency check so
# packages that depend on it stay installed.
# NOTE(review): presumably done so the UCX built by build_ucx.sh below is the
# only one present - confirm.
RUN rpm -e --nodeps ucx
39+
# Prepend the OpenMPI bin directory to PATH.
# NOTE(review): assumes the openmpi RPMs install their binaries under
# /usr/lib64/openmpi/bin - confirm.
ENV PATH=/usr/lib64/openmpi/bin:${PATH}
40+
#==============================================================================
41+
# Configure SSH
42+
# Create /var/run/sshd and disable SSH host key checking for outgoing
# connections: drop every existing StrictHostKeyChecking line from the client
# config, then append a single "no" entry.
# sed edits the file in place, which avoids the `cat | grep -v` pipeline whose
# exit status (grep's) would break the && chain if no line survived the filter.
RUN mkdir -p /var/run/sshd && \
    sed -i '/StrictHostKeyChecking/d' /etc/ssh/ssh_config && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config
46+
#==============================================================================
47+
# Build UCX
48+
# Run the build_ucx.sh helper from the source tree copied into the image.
# NOTE(review): the script is not visible here; presumably it consumes
# UCX_BRANCH, UCX_BUILD_TYPE and UCX_INSTALL_DIR set above - confirm.
RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_ucx.sh
49+
# Put the bin directory under the UCX install prefix first on PATH.
ENV PATH=${UCX_INSTALL_DIR}/bin:${PATH}
50+
#==============================================================================
51+
# Configure Python
52+
# Run the configure_python.sh helper from the copied source tree.
# NOTE(review): script contents are not visible here; what it installs or
# links (python/pip aliases, build tooling) should be confirmed there.
RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/configure_python.sh
53+
#==============================================================================
54+
# Build XCCL
55+
# Run the build_xccl.sh helper from the copied source tree.
# NOTE(review): presumably builds XCCL against the UCX install above using
# XCCL_BUILD_TYPE and XCCL_INSTALL_DIR - confirm in the script.
RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_xccl.sh
56+
#==============================================================================
57+
# Install PyTorch
58+
# Run the install_torch.sh helper from the copied source tree.
# NOTE(review): whether PyTorch is built from source or installed from a
# prebuilt package is decided inside the script - confirm there.
RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch.sh
59+
#==============================================================================
60+
# Install torch_ucc python module and build a wheel package
61+
# Run the install_torch_ucc.sh helper from the copied source tree.
# NOTE(review): per the section comment this installs the torch_ucc module and
# builds a wheel; the output location (presumably TORCH_UCC_PKG_DIR) is set in
# the script - confirm.
RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch_ucc.sh
62+
#==============================================================================
63+
# Install workloads
64+
# Workload repositories are cloned relative to this directory.
WORKDIR ${TORCH_UCC_WORKLOADS_DIR}
# DLRM: clone upstream, apply the torch_ucc patch shipped in this repository,
# then install its Python requirements and tensorboard.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN git clone https://github.com/facebookresearch/dlrm.git && \
    cd ${TORCH_UCC_WORKLOADS_DIR}/dlrm && \
    git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch && \
    pip3 install --no-cache-dir -r ${TORCH_UCC_WORKLOADS_DIR}/dlrm/requirements.txt && \
    pip3 install --no-cache-dir tensorboard
70+
# PARAM benchmarks: clone into the current working directory and install the
# Python requirements. --no-cache-dir keeps pip's download cache out of the layer.
RUN git clone https://github.com/facebookresearch/param.git && \
    pip3 install --no-cache-dir -r ${TORCH_UCC_WORKLOADS_DIR}/param/requirements.txt

.ci/Dockerfile.ubuntu20.04

Lines changed: 46 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
1-
ARG CUDA_VER='11.2.1'
1+
#ARG CUDA_VER='11.2.1'
2+
ARG CUDA_VER='11.1.1'
23
FROM nvidia/cuda:${CUDA_VER}-devel-ubuntu20.04
34
#==============================================================================
45
ARG TORCH_UCC_ROOT_DIR=/opt/nvidia/torch-ucc
56
ENV TORCH_UCC_SRC_DIR=${TORCH_UCC_ROOT_DIR}/src
67
ENV TORCH_UCC_PKG_DIR=${TORCH_UCC_ROOT_DIR}/pkg
78
ENV TORCH_UCC_BIN_DIR=${TORCH_UCC_ROOT_DIR}/bin
9+
ENV TORCH_UCC_WORKLOADS_DIR=${TORCH_UCC_ROOT_DIR}/workloads
810
ENV CUDA_HOME=/usr/local/cuda
911
ENV UCX_BRANCH=v1.10.x
1012
ENV UCX_BUILD_TYPE=release-mt
@@ -13,109 +15,61 @@ ENV XCCL_BUILD_TYPE=debug
1315
ENV XCCL_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/xccl/build-${XCCL_BUILD_TYPE}
1416
#==============================================================================
1517
RUN mkdir -p ${TORCH_UCC_SRC_DIR} && \
16-
mkdir -p ${TORCH_UCC_PKG_DIR} && \
17-
mkdir -p ${TORCH_UCC_BIN_DIR}
18+
mkdir -p ${TORCH_UCC_PKG_DIR} && \
19+
mkdir -p ${TORCH_UCC_BIN_DIR} && \
20+
mkdir -p ${TORCH_UCC_WORKLOADS_DIR}
1821

1922
COPY . ${TORCH_UCC_SRC_DIR}
2023
#==============================================================================
2124
ARG DEBIAN_FRONTEND=noninteractive
2225
RUN apt update && \
23-
apt install -y \
24-
apt-utils \
25-
autoconf \
26-
build-essential \
27-
cmake \
28-
curl \
29-
git \
30-
ibverbs-providers \
31-
ibverbs-utils \
32-
libnuma-dev \
33-
libtool-bin \
34-
ninja-build \
35-
openmpi-bin \
36-
vim \
37-
&& \
38-
rm -rf /var/lib/apt/lists/*
26+
apt install -y \
27+
apt-utils \
28+
autoconf \
29+
build-essential \
30+
cmake \
31+
curl \
32+
git \
33+
ibverbs-providers \
34+
ibverbs-utils \
35+
libnuma-dev \
36+
libtool-bin \
37+
ninja-build \
38+
openmpi-bin \
39+
openssh-server \
40+
vim \
41+
&& \
42+
rm -rf /var/lib/apt/lists/*
43+
#==============================================================================
44+
# Configure SSH
45+
# Create /var/run/sshd and disable SSH host key checking for outgoing
# connections: drop every existing StrictHostKeyChecking line from the client
# config, then append a single "no" entry.
# sed edits the file in place, which avoids the `cat | grep -v` pipeline whose
# exit status (grep's) would break the && chain if no line survived the filter.
RUN mkdir -p /var/run/sshd && \
    sed -i '/StrictHostKeyChecking/d' /etc/ssh/ssh_config && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config
3949
#==============================================================================
4050
# Build UCX
41-
RUN echo "INFO: Build UCX" && \
42-
cd ${TORCH_UCC_SRC_DIR}/ucx && \
43-
git checkout ${UCX_BRANCH} && \
44-
${TORCH_UCC_SRC_DIR}/ucx/autogen.sh && \
45-
mkdir -p ${TORCH_UCC_SRC_DIR}/ucx/build-${UCX_BUILD_TYPE} && \
46-
cd ${TORCH_UCC_SRC_DIR}/ucx/build-${UCX_BUILD_TYPE} && \
47-
${TORCH_UCC_SRC_DIR}/ucx/contrib/configure-release-mt --with-cuda=${CUDA_HOME} --prefix=${UCX_INSTALL_DIR} && \
48-
make -j install && \
49-
echo "${UCX_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucx.conf && \
50-
ldconfig && \
51-
ldconfig -p | grep -i ucx && \
52-
cd ${UCX_INSTALL_DIR} && tar cfz ${TORCH_UCC_PKG_DIR}/ucx-${UCX_BUILD_TYPE}.tgz --owner=0 --group=0 .
51+
RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_ucx.sh
5352
ENV PATH=${UCX_INSTALL_DIR}/bin:${PATH}
5453
#==============================================================================
55-
# Build XCCL
56-
RUN echo "INFO: Build XCCL" && \
57-
cd ${TORCH_UCC_SRC_DIR}/xccl && \
58-
${TORCH_UCC_SRC_DIR}/xccl/autogen.sh && \
59-
mkdir -p ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE} && \
60-
cd ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE} && \
61-
${TORCH_UCC_SRC_DIR}/xccl/configure --with-ucx=${UCX_INSTALL_DIR} \
62-
--prefix=${XCCL_INSTALL_DIR} --enable-debug && \
63-
make -j install && \
64-
echo "${XCCL_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/xccl.conf && \
65-
ldconfig && \
66-
ldconfig -p | grep -i xccl && \
67-
make -C test && \
68-
cd ${XCCL_INSTALL_DIR} && tar cfz ${TORCH_UCC_PKG_DIR}/xccl-${XCCL_BUILD_TYPE}.tgz --owner=0 --group=0 .
69-
#==============================================================================
70-
# Install conda
71-
RUN curl -LO http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
72-
bash Miniconda3-latest-Linux-x86_64.sh -p /opt/conda -b && \
73-
rm -f Miniconda3-latest-Linux-x86_64.sh
54+
# Configure Python
55+
RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/configure_python.sh
7456
# Put the conda bin directory first on PATH. Uses the key=value form like every
# other ENV in this file; the space-separated form is the deprecated legacy syntax.
ENV PATH=/opt/conda/bin:${PATH}
7557
#==============================================================================
76-
# Install conda python
77-
RUN conda update -y conda && \
78-
conda install -c anaconda -y \
79-
python \
80-
pip && \
81-
pip install --no-cache-dir python-hostlist
82-
83-
RUN ln -s /opt/conda/bin/python /usr/bin/python
84-
RUN python3 -m pip install --user --upgrade setuptools wheel auditwheel check-wheel-contents
58+
# Build XCCL
59+
RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_xccl.sh
8560
#==============================================================================
86-
# Build and Install PyTorch
87-
RUN cd /tmp && \
88-
git clone https://github.com/pytorch/pytorch.git && \
89-
cd pytorch && \
90-
git submodule sync --recursive && \
91-
git submodule update --init --recursive && \
92-
pip install -r requirements.txt && \
93-
TORCH_CUDA_ARCH_LIST="7.0 8.0+PTX" \
94-
USE_GLOO=1 \
95-
USE_DISTRIBUTED=1 \
96-
USE_OPENCV=0 \
97-
USE_CUDA=1 \
98-
USE_NCCL=0 \
99-
USE_MKLDNN=0 \
100-
BUILD_TEST=0 \
101-
USE_FBGEMM=0 \
102-
USE_NNPACK=0 \
103-
USE_QNNPACK=0 \
104-
USE_XNNPACK=0 \
105-
USE_KINETO=1 \
106-
MAX_JOBS=$(($(nproc)-1)) \
107-
python setup.py install && \
108-
rm -rf /tmp/pytorch
61+
# Install PyTorch
62+
RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch.sh
10963
#==============================================================================
11064
# Install torch_ucc python module and build a wheel package
111-
RUN echo "INFO: Install Torch-UCC" && \
112-
cd ${TORCH_UCC_SRC_DIR} && \
113-
env \
114-
UCX_HOME=${UCX_INSTALL_DIR} \
115-
XCCL_HOME=${XCCL_INSTALL_DIR} \
116-
WITH_CUDA=${CUDA_HOME} \
117-
python setup.py install bdist_wheel && \
118-
pip3 list | grep torch && \
119-
python -c 'import torch, torch_ucc' && \
120-
cp ${TORCH_UCC_SRC_DIR}/dist/*.whl ${TORCH_UCC_PKG_DIR}
65+
RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch_ucc.sh
12166
#==============================================================================
67+
# Install workloads
68+
# Workload repositories are cloned relative to this directory.
WORKDIR ${TORCH_UCC_WORKLOADS_DIR}
# DLRM: clone upstream, apply the torch_ucc patch shipped in this repository,
# then install its Python requirements and tensorboard.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN git clone https://github.com/facebookresearch/dlrm.git && \
    cd ${TORCH_UCC_WORKLOADS_DIR}/dlrm && \
    git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch && \
    pip install --no-cache-dir -r ${TORCH_UCC_WORKLOADS_DIR}/dlrm/requirements.txt && \
    pip install --no-cache-dir tensorboard
74+
# PARAM benchmarks: clone into the current working directory and install the
# Python requirements. --no-cache-dir keeps pip's download cache out of the layer.
RUN git clone https://github.com/facebookresearch/param.git && \
    pip install --no-cache-dir -r ${TORCH_UCC_WORKLOADS_DIR}/param/requirements.txt

.ci/configs/swx-clx01/hostfile.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
swx-clx01
2+
swx-clx02

.ci/configs/swx-clx02/hostfile.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
swx-clx02
2+
swx-clx01

.ci/job_matrix.yaml

Lines changed: 57 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,11 @@
22
job: 'torch-ucc'
33

44
registry_host: 'harbor.mellanox.com'
5-
# TODO change
6-
registry_path: '/swx-infra/torch-ucc'
7-
#registry_path: '/torch-ucc'
8-
registry_auth: '1daaea28-800e-425f-a91f-3bd3e9136eea'
5+
registry_path: '/torch-ucc'
6+
registry_auth: '05d98651-e11c-4a57-9cc6-52df79014b89'
97

10-
kubernetes:
11-
cloud: 'swx-k8s'
8+
#kubernetes:
9+
# cloud: 'swx-k8s'
1210

1311
volumes:
1412
- { mountPath: '/hpc/local', hostPath: '/hpc/local' }
@@ -17,44 +15,85 @@ volumes:
1715
- { mountPath: '/.autodirect/sw/release', hostPath: '/.autodirect/sw/release' }
1816

1917
env:
20-
CUDA_VER: '11.2.1'
21-
TORCH_UCC_URI_SUFFIX: '${TORCH_UCC_VERSION}/x86_64/ubuntu20.04/cuda${CUDA_VER}'
22-
TORCH_UCC_ROOT_DIR: '/opt/nvidia/torch-ucc'
23-
TORCH_UCC_SRC_DIR: '${TORCH_UCC_ROOT_DIR}/src'
18+
CUDA_VER: '11.1.1'
19+
# CUDA_VER: '11.2.1'
20+
TORCH_UCC_URI_SUFFIX: '${TORCH_UCC_VERSION}/x86_64/centos8/cuda${CUDA_VER}'
21+
TORCH_UCC_DOCKER_IMAGE_NAME: '${registry_host}${registry_path}/${TORCH_UCC_URI_SUFFIX}'
22+
TORCH_UCC_ROOT_DIR: '/opt/nvidia/torch-ucc'
23+
TORCH_UCC_SRC_DIR: '${TORCH_UCC_ROOT_DIR}/src'
24+
XCCL_BUILD_TYPE: 'debug'
25+
26+
docker_opt: '--pull always --network=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --security-opt seccomp=unconfined --cap-add=SYS_ADMIN --device=/dev/infiniband/ --gpus all --user root'
2427

2528
runs_on_dockers:
2629
- {
27-
file: '.ci/Dockerfile.ubuntu20.04',
28-
name: 'ubuntu20.04',
29-
tag: 'latest',
30+
file: '.ci/Dockerfile.centos8',
31+
name: 'centos8',
32+
tag: '${BUILD_NUMBER}',
3033
arch: 'x86_64',
3134
uri: '${TORCH_UCC_URI_SUFFIX}',
3235
build_args: '--rm --no-cache --build-arg CUDA_VER=${CUDA_VER} --build-arg TORCH_UCC_ROOT_DIR=${TORCH_UCC_ROOT_DIR}',
36+
cloud: 'swx-k8s',
37+
nodeLabel: 'swx-clx01 || swx-clx02',
3338
}
3439

40+
# bare metal
41+
#runs_on_agents:
42+
# - nodeLabel: 'swx-clx01 || swx-clx02'
43+
3544
# TODO debug
36-
timeout_minutes: '180'
45+
timeout_minutes: '400'
3746

3847
steps:
3948
#============================================================================
4049
- name: Check Env
50+
#agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
4151
run: |
4252
echo "INFO: check environment"
53+
hostname
4354
printenv
4455
cat /proc/1/cgroup
4556
cat /etc/*release*
4657
id
4758
find /opt/nvidia
59+
ibv_devinfo
60+
nvidia-smi
61+
nvidia-smi topo -m
4862
#============================================================================
4963
- name: Run XCCL tests
64+
#agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
5065
run: |
5166
echo "INFO: Run XCCL tests"
67+
hostname
68+
cat /proc/1/cgroup
5269
${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_xccl.sh
53-
#sleep 10000
54-
#============================================================================
70+
#============================================================================
5571
- name: Run Torch-UCC tests
72+
#agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
5673
run: |
5774
echo "INFO: Run Torch-UCC tests"
75+
hostname
76+
cat /proc/1/cgroup
5877
${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_torch_ucc.sh
59-
#sleep 20000
60-
#============================================================================
78+
#============================================================================
79+
# - name: Run FB DLRM tests
80+
# #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
81+
# run: |
82+
# echo "INFO: Run FB DLRM tests"
83+
# printenv
84+
# cat /proc/1/cgroup
85+
# cat /etc/*release*
86+
# id
87+
# find /opt/nvidia
88+
# #ibv_devinfo
89+
# #nvidia-smi
90+
# #${WORKSPACE}/.ci/scripts/run_fb_dlrm_docker.sh
91+
#============================================================================
92+
# - name: Run PARAM benchmarks
93+
# agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
94+
# run: |
95+
# echo "INFO: Run PARAM benchmarks"
96+
# hostname
97+
# cat /proc/1/cgroup
98+
# #${TORCH_UCC_SRC_DIR}/.ci/scripts/run_param_benchmarks.sh
99+
#============================================================================
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
From bcd8fc065ef04a0ea8f06e61a5e2581a308719fd Mon Sep 17 00:00:00 2001
2+
From: artemry-nv <[email protected]>
3+
Date: Tue, 9 Mar 2021 00:41:16 +0300
4+
Subject: [PATCH] Added torch_ucc support
5+
6+
Signed-off-by: artemry-nv <[email protected]>
7+
---
8+
extend_distributed.py | 6 ++++++
9+
1 file changed, 6 insertions(+)
10+
11+
diff --git a/extend_distributed.py b/extend_distributed.py
12+
index adcb60b..1f2c8a5 100644
13+
--- a/extend_distributed.py
14+
+++ b/extend_distributed.py
15+
@@ -20,6 +20,12 @@ except ImportError as e:
16+
# print(e)
17+
torch_ccl = False
18+
19+
+try:
20+
+ import torch_ucc
21+
+except ImportError as e:
22+
+ torch_ucc = False
23+
+
24+
+
25+
my_rank = -1
26+
my_size = -1
27+
my_local_rank = -1
28+
--
29+
2.24.3 (Apple Git-128)
30+

0 commit comments

Comments
 (0)