1- ARG CUDA_VER='11.2.1'
1+ #ARG CUDA_VER='11.2.1'
2+ ARG CUDA_VER='11.1.1'
23FROM nvidia/cuda:${CUDA_VER}-devel-ubuntu20.04
34#==============================================================================
45ARG TORCH_UCC_ROOT_DIR=/opt/nvidia/torch-ucc
56ENV TORCH_UCC_SRC_DIR=${TORCH_UCC_ROOT_DIR}/src
67ENV TORCH_UCC_PKG_DIR=${TORCH_UCC_ROOT_DIR}/pkg
78ENV TORCH_UCC_BIN_DIR=${TORCH_UCC_ROOT_DIR}/bin
9+ ENV TORCH_UCC_WORKLOADS_DIR=${TORCH_UCC_ROOT_DIR}/workloads
810ENV CUDA_HOME=/usr/local/cuda
911ENV UCX_BRANCH=v1.10.x
1012ENV UCX_BUILD_TYPE=release-mt
@@ -13,109 +15,61 @@ ENV XCCL_BUILD_TYPE=debug
1315ENV XCCL_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/xccl/build-${XCCL_BUILD_TYPE}
1416#==============================================================================
1517RUN mkdir -p ${TORCH_UCC_SRC_DIR} && \
16- mkdir -p ${TORCH_UCC_PKG_DIR} && \
17- mkdir -p ${TORCH_UCC_BIN_DIR}
18+ mkdir -p ${TORCH_UCC_PKG_DIR} && \
19+ mkdir -p ${TORCH_UCC_BIN_DIR} && \
20+ mkdir -p ${TORCH_UCC_WORKLOADS_DIR}
1821
1922COPY . ${TORCH_UCC_SRC_DIR}
2023#==============================================================================
2124ARG DEBIAN_FRONTEND=noninteractive
2225RUN apt update && \
23- apt install -y \
24- apt-utils \
25- autoconf \
26- build-essential \
27- cmake \
28- curl \
29- git \
30- ibverbs-providers \
31- ibverbs-utils \
32- libnuma-dev \
33- libtool-bin \
34- ninja-build \
35- openmpi-bin \
36- vim \
37- && \
38- rm -rf /var/lib/apt/lists/*
26+ apt install -y \
27+ apt-utils \
28+ autoconf \
29+ build-essential \
30+ cmake \
31+ curl \
32+ git \
33+ ibverbs-providers \
34+ ibverbs-utils \
35+ libnuma-dev \
36+ libtool-bin \
37+ ninja-build \
38+ openmpi-bin \
39+ openssh-server \
40+ vim \
41+ && \
42+ rm -rf /var/lib/apt/lists/*
43+ #==============================================================================
44+ # Configure SSH
45+ RUN mkdir -p /var/run/sshd && \
46+ cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
47+ echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
48+ mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
3949#==============================================================================
4050# Build UCX
41- RUN echo "INFO: Build UCX" && \
42- cd ${TORCH_UCC_SRC_DIR}/ucx && \
43- git checkout ${UCX_BRANCH} && \
44- ${TORCH_UCC_SRC_DIR}/ucx/autogen.sh && \
45- mkdir -p ${TORCH_UCC_SRC_DIR}/ucx/build-${UCX_BUILD_TYPE} && \
46- cd ${TORCH_UCC_SRC_DIR}/ucx/build-${UCX_BUILD_TYPE} && \
47- ${TORCH_UCC_SRC_DIR}/ucx/contrib/configure-release-mt --with-cuda=${CUDA_HOME} --prefix=${UCX_INSTALL_DIR} && \
48- make -j install && \
49- echo "${UCX_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucx.conf && \
50- ldconfig && \
51- ldconfig -p | grep -i ucx && \
52- cd ${UCX_INSTALL_DIR} && tar cfz ${TORCH_UCC_PKG_DIR}/ucx-${UCX_BUILD_TYPE}.tgz --owner=0 --group=0 .
51+ RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_ucx.sh
5352ENV PATH=${UCX_INSTALL_DIR}/bin:${PATH}
5453#==============================================================================
55- # Build XCCL
56- RUN echo "INFO: Build XCCL" && \
57- cd ${TORCH_UCC_SRC_DIR}/xccl && \
58- ${TORCH_UCC_SRC_DIR}/xccl/autogen.sh && \
59- mkdir -p ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE} && \
60- cd ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE} && \
61- ${TORCH_UCC_SRC_DIR}/xccl/configure --with-ucx=${UCX_INSTALL_DIR} \
62- --prefix=${XCCL_INSTALL_DIR} --enable-debug && \
63- make -j install && \
64- echo "${XCCL_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/xccl.conf && \
65- ldconfig && \
66- ldconfig -p | grep -i xccl && \
67- make -C test && \
68- cd ${XCCL_INSTALL_DIR} && tar cfz ${TORCH_UCC_PKG_DIR}/xccl-${XCCL_BUILD_TYPE}.tgz --owner=0 --group=0 .
69- #==============================================================================
70- # Install conda
71- RUN curl -LO http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
72- bash Miniconda3-latest-Linux-x86_64.sh -p /opt/conda -b && \
73- rm -f Miniconda3-latest-Linux-x86_64.sh
54+ # Configure Python
55+ RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/configure_python.sh
7456ENV PATH /opt/conda/bin:${PATH}
7557#==============================================================================
76- # Install conda python
77- RUN conda update -y conda && \
78- conda install -c anaconda -y \
79- python \
80- pip && \
81- pip install --no-cache-dir python-hostlist
82-
83- RUN ln -s /opt/conda/bin/python /usr/bin/python
84- RUN python3 -m pip install --user --upgrade setuptools wheel auditwheel check-wheel-contents
58+ # Build XCCL
59+ RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_xccl.sh
8560#==============================================================================
86- # Build and Install PyTorch
87- RUN cd /tmp && \
88- git clone https://github.com/pytorch/pytorch.git && \
89- cd pytorch && \
90- git submodule sync --recursive && \
91- git submodule update --init --recursive && \
92- pip install -r requirements.txt && \
93- TORCH_CUDA_ARCH_LIST="7.0 8.0+PTX" \
94- USE_GLOO=1 \
95- USE_DISTRIBUTED=1 \
96- USE_OPENCV=0 \
97- USE_CUDA=1 \
98- USE_NCCL=0 \
99- USE_MKLDNN=0 \
100- BUILD_TEST=0 \
101- USE_FBGEMM=0 \
102- USE_NNPACK=0 \
103- USE_QNNPACK=0 \
104- USE_XNNPACK=0 \
105- USE_KINETO=1 \
106- MAX_JOBS=$(($(nproc)-1)) \
107- python setup.py install && \
108- rm -rf /tmp/pytorch
61+ # Install PyTorch
62+ RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch.sh
10963#==============================================================================
11064# Install torch_ucc python module and build a wheel package
111- RUN echo "INFO: Install Torch-UCC" && \
112- cd ${TORCH_UCC_SRC_DIR} && \
113- env \
114- UCX_HOME=${UCX_INSTALL_DIR} \
115- XCCL_HOME=${XCCL_INSTALL_DIR} \
116- WITH_CUDA=${CUDA_HOME} \
117- python setup.py install bdist_wheel && \
118- pip3 list | grep torch && \
119- python -c 'import torch, torch_ucc' && \
120- cp ${TORCH_UCC_SRC_DIR}/dist/*.whl ${TORCH_UCC_PKG_DIR}
65+ RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch_ucc.sh
12166#==============================================================================
67+ # Install workloads
68+ WORKDIR ${TORCH_UCC_WORKLOADS_DIR}
69+ RUN git clone https://github.com/facebookresearch/dlrm.git && \
70+ cd ${TORCH_UCC_WORKLOADS_DIR}/dlrm && \
71+ git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch && \
72+ pip install -r ${TORCH_UCC_WORKLOADS_DIR}/dlrm/requirements.txt && \
73+ pip install tensorboard
74+ RUN git clone https://github.com/facebookresearch/param.git && \
75+ pip install -r ${TORCH_UCC_WORKLOADS_DIR}/param/requirements.txt
0 commit comments