Upgrade to ubuntu 20.04 (#3393)
* Move to Ubuntu 20.04
* Use Ubuntu 18.04 along with Python 3.7
* Use Ubuntu 18.04 along with CUDA 10.x
* Move to Python 3.8 in released images
* Run apt-get with DEBIAN_FRONTEND=noninteractive
* Set docker-compose timeout to 5m

Signed-off-by: Enrico Minack <github@enrico.minack.dev>
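
For reference: docker-compose reads this timeout, in seconds, from the COMPOSE_HTTP_TIMEOUT environment variable, so the 5-minute limit appears as 300 throughout the diffs below. A minimal sketch of the equivalence:

# COMPOSE_HTTP_TIMEOUT caps how long the docker-compose client waits on
# the Docker daemon, in seconds; 5 minutes = 300 seconds.
export COMPOSE_HTTP_TIMEOUT=300
docker-compose up -d   # any subsequent compose call now waits up to 5m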
EnricoMi committed Feb 3, 2022
1 parent 19040e9 commit c5af0e5
Showing 8 changed files with 204 additions and 15 deletions.
4 changes: 4 additions & 0 deletions .buildkite/gen-pipeline.sh
@@ -72,6 +72,8 @@ build_test() {
   local test=$1
   echo "- label: ':docker: Build ${test}'"
+  echo "  env:"
+  echo "    COMPOSE_HTTP_TIMEOUT: 300"
   echo "  plugins:"
   echo "  - docker-compose#v3.5.0:"
   echo "      build: ${test}"
@@ -98,6 +100,8 @@ run_test() {
   echo "- label: '${label}'"
   echo "  command: ${command}"
   echo "  artifact_paths: \"artifacts/**\""
+  echo "  env:"
+  echo "    COMPOSE_HTTP_TIMEOUT: 300"
   echo "  plugins:"
   echo "  - docker-compose#v3.5.0:"
   echo "      run: ${test}"
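
Both generator functions now emit an env block per step, so every build and run step carries the 5-minute compose timeout. A hedged sketch of how such a generator is typically consumed (the upload command is an assumption, not part of this commit):

# Hypothetical usage: pipe the generated steps into the Buildkite agent.
.buildkite/gen-pipeline.sh | buildkite-agent pipeline upload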
7 changes: 5 additions & 2 deletions Dockerfile.test.cpu
@@ -1,9 +1,9 @@
-ARG UBUNTU_VERSION=18.04
+ARG UBUNTU_VERSION=20.04
 FROM ubuntu:${UBUNTU_VERSION}
 
 # Arguments for the build. UBUNTU_VERSION needs to be repeated because
 # the first usage only applies to the FROM tag.
-ARG UBUNTU_VERSION=18.04
+ARG UBUNTU_VERSION=20.04
 ARG MPI_KIND=OpenMPI
 ARG PYTHON_VERSION=3.6
 ARG GPP_VERSION=7
@@ -22,6 +22,9 @@ ARG SPARK_PACKAGE=spark-2.4.7/spark-2.4.7-bin-hadoop2.7.tgz
 ARG CCL_PACKAGE=master
 ARG HOROVOD_BUILD_FLAGS=""
 
+# to avoid interaction with apt-get
+ENV DEBIAN_FRONTEND=noninteractive
+
 # Set default shell to /bin/bash
 SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
 
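
Without DEBIAN_FRONTEND=noninteractive, debconf can pause an image build to prompt on stdin (tzdata's time-zone question is the classic offender on Ubuntu 20.04); the noninteractive frontend makes it accept defaults instead. A minimal reproduction sketch, assuming a local Docker daemon:

# With the variable set, tzdata installs without blocking for input;
# drop the -e flag and the same install can hang waiting for a prompt.
docker run --rm -e DEBIAN_FRONTEND=noninteractive ubuntu:20.04 \
    bash -c 'apt-get update -qq && apt-get install -y -qq tzdata'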
7 changes: 5 additions & 2 deletions Dockerfile.test.gpu
@@ -1,9 +1,9 @@
-ARG CUDA_DOCKER_VERSION=10.0-devel-ubuntu18.04
+ARG CUDA_DOCKER_VERSION=10.0-devel-ubuntu20.04
 FROM nvidia/cuda:${CUDA_DOCKER_VERSION}
 
 # Arguments for the build. CUDA_DOCKER_VERSION needs to be repeated because
 # the first usage only applies to the FROM tag.
-ARG CUDA_DOCKER_VERSION=10.0-devel-ubuntu18.04
+ARG CUDA_DOCKER_VERSION=10.0-devel-ubuntu20.04
 ARG CUDNN_VERSION=7.6.0.64-1+cuda10.0
 ARG NCCL_VERSION_OVERRIDE=2.4.7-1+cuda10.0
 ARG MPI_KIND=OpenMPI
@@ -23,6 +23,9 @@ ARG SPARK_PACKAGE=spark-2.4.7/spark-2.4.7-bin-hadoop2.7.tgz
 ARG HOROVOD_BUILD_FLAGS="HOROVOD_GPU_OPERATIONS=NCCL"
 ARG HOROVOD_MIXED_INSTALL=0
 
+# to avoid interaction with apt-get
+ENV DEBIAN_FRONTEND=noninteractive
+
 # Set default shell to /bin/bash
 SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
 
21 changes: 16 additions & 5 deletions docker-compose.test.yml
@@ -5,7 +5,7 @@ services:
       context: .
       dockerfile: Dockerfile.test.cpu
       args:
-        UBUNTU_VERSION: 18.04
+        UBUNTU_VERSION: 20.04
         GPP_VERSION: 7
         MPI_KIND: None
         PYTHON_VERSION: 3.8
@@ -24,6 +24,8 @@ services:
   # our baseline first
   test-cpu-gloo-py3_8-tf2_7_0-keras2_7_0-torch1_10_1-mxnet1_9_0-pyspark3_2_0:
     extends: test-cpu-base
+
+  # permute MPI kinds
   test-cpu-mpich-py3_8-tf2_7_0-keras2_7_0-torch1_10_1-mxnet1_9_0-pyspark3_2_0:
     extends: test-cpu-base
     build:
@@ -55,6 +57,9 @@ services:
     extends: test-cpu-base
     build:
       args:
+        # Tensorflow 1.15.5 is only available for Python 3.7
+        # Python 3.7 is only available on Ubuntu 18.04
+        UBUNTU_VERSION: 18.04
         PYTHON_VERSION: 3.7
         # there is no tensorflow-cpu>1.15.0, so we use tensorflow==1.15.5
         TENSORFLOW_PACKAGE: tensorflow==1.15.5
@@ -97,6 +102,9 @@ services:
     extends: test-cpu-base
     build:
       args:
+        # Tensorflow 1.15.5 is only available for Python 3.7
+        # Python 3.7 is only available on Ubuntu 18.04
+        UBUNTU_VERSION: 18.04
         PYTHON_VERSION: 3.7
         PYSPARK_PACKAGE: pyspark==2.4.8
         SPARK_PACKAGE: spark-2.4.8/spark-2.4.8-bin-hadoop2.7.tgz
@@ -136,6 +144,8 @@ services:
     extends: test-gpu-base
     build:
       args:
+        # Tensorflow 1.15.5 is only available for Python 3.7
+        # Python 3.7 is only available on Ubuntu 18.04
         CUDA_DOCKER_VERSION: 10.0-devel-ubuntu18.04
         CUDNN_VERSION: 7.6.5.32-1+cuda10.1
         NCCL_VERSION_OVERRIDE: 2.7.8-1+cuda10.1
@@ -151,6 +161,7 @@ services:
     extends: test-gpu-base
     build:
       args:
+        # CUDA 10.x devel image is only available with Ubuntu 18.04
         CUDA_DOCKER_VERSION: 10.1-devel-ubuntu18.04
         CUDNN_VERSION: 7.6.5.32-1+cuda10.1
         NCCL_VERSION_OVERRIDE: 2.7.8-1+cuda10.1
@@ -164,7 +175,7 @@ services:
     extends: test-gpu-base
     build:
       args:
-        CUDA_DOCKER_VERSION: 11.2.2-devel-ubuntu18.04
+        CUDA_DOCKER_VERSION: 11.2.2-devel-ubuntu20.04
         CUDNN_VERSION: 8.1.1.33-1+cuda11.2
         NCCL_VERSION_OVERRIDE: 2.8.4-1+cuda11.2
         TENSORFLOW_PACKAGE: tensorflow-gpu==2.6.2
@@ -177,7 +188,7 @@ services:
     extends: test-gpu-base
     build:
       args:
-        CUDA_DOCKER_VERSION: 11.2.2-devel-ubuntu18.04
+        CUDA_DOCKER_VERSION: 11.2.2-devel-ubuntu20.04
         CUDNN_VERSION: 8.1.1.33-1+cuda11.2
         NCCL_VERSION_OVERRIDE: 2.8.4-1+cuda11.2
         MPI_KIND: OpenMPI
@@ -191,7 +202,7 @@ services:
     extends: test-gpu-base
     build:
      args:
-        CUDA_DOCKER_VERSION: 11.2.2-devel-ubuntu18.04
+        CUDA_DOCKER_VERSION: 11.2.2-devel-ubuntu20.04
         CUDNN_VERSION: 8.1.1.33-1+cuda11.2
         NCCL_VERSION_OVERRIDE: 2.8.4-1+cuda11.2
         TENSORFLOW_PACKAGE: tf-nightly-gpu
@@ -205,7 +216,7 @@ services:
     extends: test-gpu-base
     build:
       args:
-        CUDA_DOCKER_VERSION: 11.2.2-devel-ubuntu18.04
+        CUDA_DOCKER_VERSION: 11.2.2-devel-ubuntu20.04
         CUDNN_VERSION: 8.1.1.33-1+cuda11.2
         NCCL_VERSION_OVERRIDE: 2.8.4-1+cuda11.2
         MPI_KIND: OpenMPI
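
Every service above extends test-cpu-base or test-gpu-base, so a permutation only declares the build args it overrides; the TensorFlow 1.15.5 and CUDA 10.x permutations pin themselves back to Ubuntu 18.04 while all others inherit 20.04. A hedged example of building the baseline permutation from this file with the new timeout:

# Sketch only: build a single service from docker-compose.test.yml.
COMPOSE_HTTP_TIMEOUT=300 docker-compose -f docker-compose.test.yml \
    build test-cpu-gloo-py3_8-tf2_7_0-keras2_7_0-torch1_10_1-mxnet1_9_0-pyspark3_2_0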
5 changes: 2 additions & 3 deletions docker/horovod-cpu/Dockerfile
@@ -1,4 +1,4 @@
-FROM ubuntu:18.04
+FROM ubuntu:20.04
 
 # TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully
 ARG TENSORFLOW_VERSION=2.5.0
@@ -10,8 +10,7 @@ ARG MXNET_VERSION=1.8.0.post0
 ARG PYSPARK_PACKAGE=pyspark==3.1.1
 ARG SPARK_PACKAGE=spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
 
-# Python 3.7 is supported by Ubuntu Bionic out of the box
-ARG PYTHON_VERSION=3.7
+ARG PYTHON_VERSION=3.8
 
 # Set default shell to /bin/bash
 SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
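
Ubuntu 20.04 (Focal) ships Python 3.8 as its stock python3, so the old comment about Bionic's Python 3.7 no longer applies. A quick sanity check, assuming Docker is available:

# Focal's default python3 is 3.8, matching the new PYTHON_VERSION=3.8.
docker run --rm -e DEBIAN_FRONTEND=noninteractive ubuntu:20.04 \
    bash -c 'apt-get update -qq && apt-get install -y -qq python3 > /dev/null && python3 --version'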
5 changes: 2 additions & 3 deletions docker/horovod/Dockerfile
@@ -1,4 +1,4 @@
-ARG CUDA_DOCKER_VERSION=11.2.2-devel-ubuntu18.04
+ARG CUDA_DOCKER_VERSION=11.2.2-devel-ubuntu20.04
 FROM nvidia/cuda:${CUDA_DOCKER_VERSION}
 
 # Arguments for the build. CUDA_DOCKER_VERSION needs to be repeated because
@@ -16,8 +16,7 @@ ARG MXNET_VERSION=1.8.0.post0
 ARG PYSPARK_PACKAGE=pyspark==3.1.1
 ARG SPARK_PACKAGE=spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
 
-# Python 3.7 is supported by Ubuntu Bionic out of the box
-ARG PYTHON_VERSION=3.7
+ARG PYTHON_VERSION=3.8
 
 # Set default shell to /bin/bash
 SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
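
As with the CPU image, the released GPU image now builds on a 20.04 base by default. A hedged build sketch (the tag is illustrative, not taken from this page):

# Hypothetical tag; CUDA_DOCKER_VERSION defaults to 11.2.2-devel-ubuntu20.04.
docker build -t horovod-gpu:local -f docker/horovod/Dockerfile .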
22 changes: 22 additions & 0 deletions test/single/data/expected_buildkite_gpu_heads_pipeline.yaml
@@ -1,5 +1,7 @@
 steps:
 - label: ':docker: Build test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0'
+  env:
+    COMPOSE_HTTP_TIMEOUT: 300
   plugins:
   - docker-compose#v3.5.0:
       build: test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0
@@ -19,6 +21,8 @@ steps:
 - label: ':pytest: Gloo Parallel PyTests (test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0)'
   command: bash -c "HOROVOD_TEST_GPU=1 cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)"
   artifact_paths: "artifacts/**"
+  env:
+    COMPOSE_HTTP_TIMEOUT: 300
   plugins:
   - docker-compose#v3.5.0:
       run: test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0
@@ -35,6 +39,8 @@ steps:
 - label: ':pytest: Gloo Single PyTests (test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0)'
   command: bash -c "HOROVOD_TEST_GPU=1 cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)"
   artifact_paths: "artifacts/**"
+  env:
+    COMPOSE_HTTP_TIMEOUT: 300
   plugins:
   - docker-compose#v3.5.0:
       run: test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0
@@ -51,6 +57,8 @@ steps:
 - label: ':pytest: Gloo Cluster PyTests (test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0)'
   command: bash -c "HOROVOD_TEST_GPU=1 /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py"
   artifact_paths: "artifacts/**"
+  env:
+    COMPOSE_HTTP_TIMEOUT: 300
   plugins:
   - docker-compose#v3.5.0:
       run: test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0
@@ -68,6 +76,8 @@ steps:
 - label: ':tensorflow: Gloo TensorFlow 2.0 MNIST (test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0)'
   command: horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py
   artifact_paths: "artifacts/**"
+  env:
+    COMPOSE_HTTP_TIMEOUT: 300
   plugins:
   - docker-compose#v3.5.0:
       run: test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0
@@ -84,6 +94,8 @@ steps:
 - label: ':tensorflow: Gloo TensorFlow 2.0 Keras MNIST (test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0)'
   command: horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py
   artifact_paths: "artifacts/**"
+  env:
+    COMPOSE_HTTP_TIMEOUT: 300
   plugins:
   - docker-compose#v3.5.0:
       run: test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0
@@ -100,6 +112,8 @@ steps:
 - label: ':fire: Gloo PyTorch MNIST (test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0)'
   command: horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets
   artifact_paths: "artifacts/**"
+  env:
+    COMPOSE_HTTP_TIMEOUT: 300
   plugins:
   - docker-compose#v3.5.0:
       run: test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0
@@ -116,6 +130,8 @@ steps:
 - label: ':muscle: Gloo MXNet2 MNIST (test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0)'
   command: horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py
   artifact_paths: "artifacts/**"
+  env:
+    COMPOSE_HTTP_TIMEOUT: 300
   plugins:
   - docker-compose#v3.5.0:
       run: test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0
@@ -132,6 +148,8 @@ steps:
 - label: ':factory: Elastic Tests (test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0)'
   command: bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py"
   artifact_paths: "artifacts/**"
+  env:
+    COMPOSE_HTTP_TIMEOUT: 300
   plugins:
   - docker-compose#v3.5.0:
       run: test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0
@@ -148,6 +166,8 @@ steps:
 - label: ':spark: Spark Torch MNIST (test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0)'
   command: bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
   artifact_paths: "artifacts/**"
+  env:
+    COMPOSE_HTTP_TIMEOUT: 300
   plugins:
   - docker-compose#v3.5.0:
       run: test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0
@@ -164,6 +184,8 @@ steps:
 - label: ':spark: Spark Lightning MNIST (test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0)'
   command: bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
   artifact_paths: "artifacts/**"
+  env:
+    COMPOSE_HTTP_TIMEOUT: 300
   plugins:
   - docker-compose#v3.5.0:
       run: test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0
