Skip to content

Commit

Permalink
Docker: fix NCCL building Horovod (#12318)
Browse files Browse the repository at this point in the history
* Horovod w. MPI
* nccl_built
* fix
  • Loading branch information
Borda committed Mar 18, 2022
1 parent 4277845 commit efa870e
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 19 deletions.
5 changes: 3 additions & 2 deletions dockers/base-conda/Dockerfile
Expand Up @@ -123,7 +123,7 @@ ENV \
HOROVOD_WITHOUT_TENSORFLOW=1 \
HOROVOD_WITHOUT_MXNET=1 \
HOROVOD_WITH_GLOO=1 \
HOROVOD_WITHOUT_MPI=1
HOROVOD_WITH_MPI=1

RUN \
HOROVOD_BUILD_CUDA_CC_LIST=${TORCH_CUDA_ARCH_LIST//";"/","} && \
Expand Down Expand Up @@ -154,4 +154,5 @@ RUN \
pip list && \
python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \
python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \
python -c "import horovod.torch"
python -c "import horovod.torch" && \
python -c "from horovod.torch import nccl_built; nccl_built()"
35 changes: 18 additions & 17 deletions dockers/base-cuda/Dockerfile
Expand Up @@ -83,21 +83,21 @@ RUN \
python ./requirements/adjust-versions.py requirements/extra.txt ${PYTORCH_VERSION} && \
python ./requirements/adjust-versions.py requirements/examples.txt ${PYTORCH_VERSION} && \
python -c "print(' '.join([ln for ln in open('requirements/extra.txt').readlines() if 'horovod' in ln]))" > ./requirements/horovod.txt && \
python assistant.py requirements_prune_pkgs requirements/examples.txt "horovod" && \
python assistant.py requirements_prune_pkgs requirements/extra.txt "horovod" && \
# Install all requirements \
pip install -r requirements/devel.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html && \
rm -rf requirements.* && \
rm assistant.py

# Replace the distro-packaged CMake with CMake 3.20.2 built from the upstream
# source tarball (bootstrapped with OpenSSL support turned off).
# The download, build, install and clean-up stay in a single RUN so the source
# tarball and the build tree are removed in the same layer that created them
# and never end up in the image.
RUN \
    apt-get purge -y cmake && \
    wget -q https://github.com/Kitware/CMake/releases/download/v3.20.2/cmake-3.20.2.tar.gz && \
    tar -zxvf cmake-3.20.2.tar.gz && \
    cd cmake-3.20.2 && \
    ./bootstrap -- -DCMAKE_USE_OPENSSL=OFF && \
    make && \
    make install && \
    cd .. && \
    rm -rf cmake-3.20.2 cmake-3.20.2.tar.gz && \
    cmake --version

ENV \
HOROVOD_CUDA_HOME=$CUDA_TOOLKIT_ROOT_DIR \
Expand All @@ -106,15 +106,15 @@ ENV \
HOROVOD_WITHOUT_TENSORFLOW=1 \
HOROVOD_WITHOUT_MXNET=1 \
HOROVOD_WITH_GLOO=1 \
HOROVOD_WITHOUT_MPI=1
HOROVOD_WITH_MPI=1

# Install the Horovod requirement(s) listed in ./requirements/horovod.txt.
# HOROVOD_BUILD_CUDA_CC_LIST is derived from TORCH_CUDA_ARCH_LIST in two steps:
# every ";" is replaced by ",", then every "." is dropped, and the result is
# exported so the pip build can see it.
# NOTE(review): the ${var//pattern/replacement} form is a bash expansion, so
# this needs bash as the RUN shell - presumably set by a SHELL instruction
# outside this hunk; confirm.
# `cat` and `cmake --version` only echo the inputs into the build log.
RUN \
    HOROVOD_BUILD_CUDA_CC_LIST=${TORCH_CUDA_ARCH_LIST//";"/","} && \
    export HOROVOD_BUILD_CUDA_CC_LIST=${HOROVOD_BUILD_CUDA_CC_LIST//"."/""} && \
    cat ./requirements/horovod.txt && \
    cmake --version && \
    pip install --no-cache-dir -r ./requirements/horovod.txt && \
    rm -rf requirements/

RUN \
CUDA_VERSION_MAJOR=$(python -c "import torch; print(torch.version.cuda.split('.')[0])") && \
Expand Down Expand Up @@ -147,4 +147,5 @@ RUN \
pip list && \
python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \
python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \
python -c "import horovod.torch"
python -c "import horovod.torch" && \
python -c "from horovod.torch import nccl_built; nccl_built()"

0 comments on commit efa870e

Please sign in to comment.