Skip to content

Commit

Permalink
Build Horovod with temporarily installed CMake if necessary (#3371)
Browse files Browse the repository at this point in the history
Signed-off-by: Max H. Gerlach <git@maxgerlach.de>
  • Loading branch information
maxhgerlach committed Mar 1, 2022
1 parent 7bf9b04 commit 2632c05
Show file tree
Hide file tree
Showing 8 changed files with 44 additions and 23 deletions.
4 changes: 1 addition & 3 deletions Dockerfile.test.cpu
Expand Up @@ -36,6 +36,7 @@ RUN add-apt-repository ppa:ubuntu-toolchain-r/test
RUN apt-get update -qq && apt-get install -y --no-install-recommends \
wget \
ca-certificates \
cmake \
openssh-client \
openssh-server \
git \
Expand All @@ -57,9 +58,6 @@ RUN wget --progress=dot:mega https://bootstrap.pypa.io/get-pip.py && python get-
# https://github.com/pytorch/pytorch/issues/72045
RUN pip install --no-cache-dir -U --force pip~=21.0.0 setuptools requests pytest mock pytest-forked parameterized

# Install recent CMake.
RUN pip install --no-cache-dir -U cmake~=3.13.0

# Add launch helper scripts
RUN echo "env SPARK_HOME=/spark SPARK_DRIVER_MEM=512m PYSPARK_PYTHON=/usr/bin/python${PYTHON_VERSION} PYSPARK_DRIVER_PYTHON=/usr/bin/python${PYTHON_VERSION} \"\$@\"" > /spark_env.sh
RUN echo /spark_env.sh pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.\$1.\${HOROVOD_RANK:-\${OMPI_COMM_WORLD_RANK:-\${PMI_RANK}}}.\$2.xml \${@:2} > /pytest.sh
Expand Down
4 changes: 1 addition & 3 deletions Dockerfile.test.gpu
Expand Up @@ -38,6 +38,7 @@ RUN CUDNN_MAJOR=$(cut -d '.' -f 1 <<< "${CUDNN_VERSION}"); \
apt-get update -qq && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
wget \
ca-certificates \
cmake \
openssh-client \
openssh-server \
git \
Expand All @@ -62,9 +63,6 @@ RUN wget --progress=dot:mega https://bootstrap.pypa.io/get-pip.py && python get-
# https://github.com/pytorch/pytorch/issues/72045
RUN pip install --no-cache-dir -U --force pip~=21.0.0 "setuptools<60.1.0" requests pytest mock pytest-forked parameterized

# Install recent CMake.
RUN pip install --no-cache-dir -U cmake~=3.13.0

# Add launch helper scripts
RUN echo "env SPARK_HOME=/spark SPARK_DRIVER_MEM=512m PYSPARK_PYTHON=/usr/bin/python${PYTHON_VERSION} PYSPARK_DRIVER_PYTHON=/usr/bin/python${PYTHON_VERSION} \"\$@\"" > /spark_env.sh
RUN echo /spark_env.sh pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.\$1.\${HOROVOD_RANK:-\${OMPI_COMM_WORLD_RANK:-\${PMI_RANK}}}.\$2.xml \${@:2} > /pytest.sh
Expand Down
1 change: 1 addition & 0 deletions docker-compose.test.yml
Expand Up @@ -59,6 +59,7 @@ services:
args:
# Tensorflow 1.15.5 is only available for Python 3.7
# Python 3.7 is only available on Ubuntu 18.04
# On Ubuntu 18.04 our setup.py will pull in a recent CMake and use that only to build Horovod
UBUNTU_VERSION: 18.04
PYTHON_VERSION: 3.7
# there is no tensorflow-cpu>1.15.0, so we use tensorflow==1.15.5
Expand Down
5 changes: 1 addition & 4 deletions docker/horovod-cpu/Dockerfile
Expand Up @@ -17,9 +17,9 @@ SHELL ["/bin/bash", "-euo", "pipefail", "-c"]

RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
build-essential \
cmake \
g++-7 \
git \
gpg \
curl \
vim \
wget \
Expand Down Expand Up @@ -56,9 +56,6 @@ RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
rm get-pip.py

# Install recent CMake.
RUN pip install --no-cache-dir -U cmake~=3.13.0

# Install PyTorch, TensorFlow, Keras and MXNet
RUN pip install --no-cache-dir torch==${PYTORCH_VERSION} torchvision==${TORCHVISION_VERSION}
RUN pip install --no-cache-dir pytorch_lightning==${PYTORCH_LIGHTNING_VERSION}
Expand Down
5 changes: 1 addition & 4 deletions docker/horovod-ray/Dockerfile
Expand Up @@ -15,17 +15,14 @@ SHELL ["/bin/bash", "-euo", "pipefail", "-c"]

RUN sudo apt-get update && DEBIAN_FRONTEND="noninteractive" sudo apt-get install -y \
build-essential \
cmake \
wget \
git \
gpg \
curl \
rsync \
vim \
&& sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/*

# Install recent CMake.
RUN pip install --no-cache-dir -U cmake~=3.13.0

# Install PyTorch
RUN pip install --no-cache-dir \
torch==${PYTORCH_VERSION} \
Expand Down
5 changes: 1 addition & 4 deletions docker/horovod/Dockerfile
Expand Up @@ -23,9 +23,9 @@ SHELL ["/bin/bash", "-euo", "pipefail", "-c"]

RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
build-essential \
cmake \
g++-7 \
git \
gpg \
curl \
vim \
wget \
Expand Down Expand Up @@ -68,9 +68,6 @@ RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
rm get-pip.py && \
pip install --no-cache-dir -U --force pip~=21.0.0

# Install recent CMake.
RUN pip install --no-cache-dir -U cmake~=3.13.0

# Install PyTorch, TensorFlow, Keras and MXNet
RUN pip install --no-cache-dir \
torch==${PYTORCH_VERSION} \
Expand Down
7 changes: 4 additions & 3 deletions docs/install.rst
Expand Up @@ -17,8 +17,9 @@ For best performance on GPU:

- `NCCL 2 <https://developer.nvidia.com/nccl>`__

If Horovod is unable to find the CMake binary, you may need to set ``HOROVOD_CMAKE`` in your environment before
installing.
If Horovod cannot find CMake 3.13 or newer, the build script will attempt to pull in a recent CMake binary and run it
from a temporary location. To select a specific binary you can also set ``HOROVOD_CMAKE`` in your environment before
installing Horovod.

Horovod does not support Windows.

Expand Down Expand Up @@ -246,7 +247,7 @@ Possible values are given in curly brackets: {}.
* ``HOROVOD_GPU_BROADCAST`` - {NCCL, MPI}. Framework to use for GPU tensor broadcast.
* ``HOROVOD_ALLOW_MIXED_GPU_IMPL`` - {1}. Allow Horovod to install with NCCL allreduce and MPI GPU allgather / broadcast. Not recommended due to a possible deadlock.
* ``HOROVOD_CPU_OPERATIONS`` - {MPI, GLOO, CCL}. Framework to use for CPU tensor allreduce, allgather, and broadcast.
* ``HOROVOD_CMAKE`` - path to the CMake binary used to build Gloo (not required when using MPI).
* ``HOROVOD_CMAKE`` - path to the CMake binary used to build Horovod.
* ``HOROVOD_WITH_TENSORFLOW`` - {1}. Require Horovod to install with TensorFlow support enabled.
* ``HOROVOD_WITHOUT_TENSORFLOW`` - {1}. Skip installing TensorFlow support.
* ``HOROVOD_WITH_PYTORCH`` - {1}. Require Horovod to install with PyTorch support enabled.
Expand Down
36 changes: 34 additions & 2 deletions setup.py
Expand Up @@ -15,14 +15,19 @@
# limitations under the License.
# ==============================================================================

import atexit
import io
import os
import re
import shutil
import subprocess
import sys
import tempfile
import textwrap

from setuptools import setup, Extension, find_packages
from setuptools.command.build_ext import build_ext
from distutils.version import LooseVersion

from horovod import __version__

Expand Down Expand Up @@ -60,7 +65,34 @@ def is_build_action():
return True

def get_cmake_bin():
    """Return the CMake executable that should be used to build Horovod.

    Resolution order:

    1. If ``HOROVOD_CMAKE`` is set in the environment, its value is returned
       unchanged and no version check is performed.
    2. Otherwise ``cmake`` from ``PATH`` is used, provided that
       ``cmake --version`` reports version 3.13 or newer.
    3. Otherwise ``cmake~=3.13.0`` is installed with pip into a temporary
       directory (cleaned up at interpreter exit) and the path of a small
       shell wrapper that runs this CMake is returned.

    Returns:
        str: name or path of the CMake executable to invoke.

    Raises:
        RuntimeError: if the temporary CMake installation via pip fails.
    """
    if 'HOROVOD_CMAKE' in os.environ:
        return os.environ['HOROVOD_CMAKE']

    # Local import: only needed for the fallback path below.
    import shlex

    # Compared as integer tuples so that "3.13" and "3.13.0" both qualify and
    # no deprecated distutils version class is required.
    min_version = (3, 13)
    cmake_bin = 'cmake'

    # Treat a missing, failing or unparsable CMake as "too old" so that we
    # fall through to the temporary installation instead of crashing.
    installed_version = (0, 0)
    try:
        out = subprocess.check_output([cmake_bin, '--version'])
    except (OSError, subprocess.CalledProcessError):
        pass
    else:
        match = re.search(r'version\s*(\d+(?:\.\d+)*)',
                          out.decode(errors='replace'))
        if match is not None:
            installed_version = tuple(
                int(part) for part in match.group(1).split('.'))

    if installed_version < min_version:
        print("Could not find a recent CMake to build Horovod. "
              "Attempting to install CMake 3.13 to a temporary location via pip.", flush=True)
        cmake_temp_dir = tempfile.TemporaryDirectory(prefix="horovod-cmake-tmp")
        # The directory has to outlive this function (the build runs later),
        # so clean it up when the interpreter exits.
        atexit.register(cmake_temp_dir.cleanup)
        try:
            subprocess.check_output(
                ["pip", "install", "--target", cmake_temp_dir.name, "cmake~=3.13.0"])
        except Exception as err:
            # Chain the original error so the pip failure stays visible.
            raise RuntimeError("Failed to install temporary CMake. "
                               "Please update your CMake to 3.13+ or set HOROVOD_CMAKE appropriately.") from err

        # The pip-installed "cmake" entry point is a Python script that needs
        # the target directory on PYTHONPATH, hence the wrapper script.
        real_cmake = os.path.join(cmake_temp_dir.name, "bin", "cmake")
        cmake_bin = os.path.join(cmake_temp_dir.name, "bin", "run_cmake")
        with open(cmake_bin, "w") as f_run_cmake:
            # Quote paths so a temp dir containing whitespace does not break
            # the generated shell command.
            f_run_cmake.write(
                f"#!/bin/sh\nPYTHONPATH={shlex.quote(cmake_temp_dir.name)} "
                f"{shlex.quote(real_cmake)} \"$@\"")
        os.chmod(cmake_bin, 0o755)

    return cmake_bin


class custom_build_ext(build_ext):
Expand Down Expand Up @@ -103,7 +135,7 @@ def build_extensions(self):
if self.verbose:
print(f"Running CMake in {cmake_build_dir}:")
for command in config_and_build_commands:
print(" ".join(command))
print(" ".join(command))
sys.stdout.flush()

# Config and build the extension
Expand Down

0 comments on commit 2632c05

Please sign in to comment.