Skip to content

Commit

Permalink
CI: fix running PT 1.11 (#12304)
Browse files Browse the repository at this point in the history
* fix fire
* horovod
* assistant
* cmake
* u20
* cuda
* -j2
* fix mypy

Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
  • Loading branch information
Borda and awaelchli committed Mar 12, 2022
1 parent 90a9da5 commit 7ee6907
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 40 deletions.
15 changes: 7 additions & 8 deletions .github/workflows/ci_dockers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -95,24 +95,23 @@ jobs:
strategy:
fail-fast: false
matrix:
# the config used in '.github/workflows/ci_test-conda.yml'
python_version: ["3.8"]
pytorch_version: ["1.8", "1.9", "1.10"]
include:
# see: https://pytorch.org/get-started/previous-versions/
- {python_version: "3.8", pytorch_version: "1.8", cuda_version: "11.1"}
- {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1"}
- {python_version: "3.8", pytorch_version: "1.10", cuda_version: "11.1"}
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
steps:
- name: Checkout
uses: actions/checkout@v2
- run: |
cuda=$(python -c "from distutils.version import LooseVersion as LVer ; print(11.1 if LVer('${{matrix.pytorch_version}}') > LVer('1.7') else 10.2)" 2>&1)
echo "::set-output name=CUDA::$cuda"
id: extend
- name: Build Conda Docker
# publish master/release
uses: docker/build-push-action@v2
with:
build-args: |
PYTHON_VERSION=${{ matrix.python_version }}
PYTORCH_VERSION=${{ matrix.pytorch_version }}
CUDA_VERSION=${{ steps.extend.outputs.CUDA }}
CUDA_VERSION=${{ matrix.cuda_version }}
file: dockers/base-conda/Dockerfile
push: false
timeout-minutes: 75
Expand Down
9 changes: 7 additions & 2 deletions .github/workflows/ci_test-conda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,19 @@ jobs:
- uses: actions/checkout@v2

- name: Update dependencies
env:
HOROVOD_BUILD_ARCH_FLAGS: "-mfma"
HOROVOD_WITHOUT_MXNET: 1
HOROVOD_WITHOUT_TENSORFLOW: 1
run: |
set -e
conda info
conda list
# adjust versions according to the installed Torch version
python ./requirements/adjust-versions.py requirements/extra.txt
python ./requirements/adjust-versions.py requirements/examples.txt
pip install --requirement requirements/devel.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html
# set a per-test timeout of 2.5 minutes to fail sooner. this aids with hanging tests
pip install -r requirements/devel.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html
# set a per-test timeout of 2.5 minutes to fail sooner; this aids with hanging tests
pip install pytest-timeout
pip list
# sanity check
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/events-nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ jobs:
# the config used in '.azure-pipelines/gpu-tests.yml'
- {python_version: "3.7", pytorch_version: "1.8"}
# latest (not used)
- {python_version: "3.9", pytorch_version: "1.10"}
- {python_version: "3.9", pytorch_version: "1.11"}

steps:
- name: Checkout
Expand Down
10 changes: 5 additions & 5 deletions dockers/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ or with specific arguments
```bash
git clone <git-repository>
docker image build \
-t pytorch-lightning:base-cuda-py3.9-pt1.8 \
-t pytorch-lightning:base-cuda-py3.7-pt1.8 \
-f dockers/base-cuda/Dockerfile \
--build-arg PYTHON_VERSION=3.9 \
--build-arg PYTHON_VERSION=3.7 \
--build-arg PYTORCH_VERSION=1.8 \
.
```
Expand All @@ -26,10 +26,10 @@ or nightly version from Conda
```bash
git clone <git-repository>
docker image build \
-t pytorch-lightning:base-conda-py3.8-pt1.9 \
-t pytorch-lightning:base-conda-py3.9-pt1.11 \
-f dockers/base-conda/Dockerfile \
--build-arg PYTHON_VERSION=3.8 \
--build-arg PYTORCH_VERSION=1.9 \
--build-arg PYTHON_VERSION=3.9 \
--build-arg PYTORCH_VERSION=1.11 \
.
```

Expand Down
21 changes: 16 additions & 5 deletions dockers/base-conda/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

ARG CUDA_VERSION=11.3.1

FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu18.04
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04

ARG PYTHON_VERSION=3.9
ARG PYTORCH_VERSION=1.8
Expand Down Expand Up @@ -59,8 +59,8 @@ ENV \
LD_LIBRARY_PATH="/root/miniconda3/lib:$LD_LIBRARY_PATH" \
CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" \
MKL_THREADING_LAYER=GNU \
MAKEFLAGS="-j$(nproc)" \
# MAKEFLAGS="-j1" \
# MAKEFLAGS="-j$(nproc)" \
MAKEFLAGS="-j2" \
TORCH_CUDA_ARCH_LIST="3.7;5.0;6.0;7.0;7.5;8.0" \
CONDA_ENV=lightning

Expand All @@ -84,9 +84,9 @@ ENV \
PATH=/root/miniconda3/envs/${CONDA_ENV}/bin:$PATH \
LD_LIBRARY_PATH="/root/miniconda3/envs/${CONDA_ENV}/lib:$LD_LIBRARY_PATH"

COPY ./requirements.txt requirements.txt
COPY ./requirements/extra.txt requirements-extra.txt
COPY ./requirements/examples.txt requirements-examples.txt
COPY ./requirements/test.txt requirements-test.txt
COPY ./requirements/adjust-versions.py requirements_adjust_versions.py
COPY ./.actions/assistant.py assistant.py

Expand All @@ -95,14 +95,25 @@ RUN \
python -c "import torch; print(torch.__version__)" && \
python requirements_adjust_versions.py requirements-extra.txt && \
python -c "print(' '.join([ln for ln in open('requirements-extra.txt').readlines() if 'horovod' in ln]))" > requirements_horovod.txt && \
pip install -q fire && \
python assistant.py requirements_prune_pkgs requirements-extra.txt "horovod" && \
python requirements_adjust_versions.py requirements-examples.txt && \
# Install remaining requirements
pip install -r requirements.txt --no-cache-dir --find-links https://download.pytorch.org/whl/test/torch_test.html && \
pip install -r requirements-extra.txt --no-cache-dir --find-links https://download.pytorch.org/whl/test/torch_test.html && \
pip install -r requirements-examples.txt --no-cache-dir --find-links https://download.pytorch.org/whl/test/torch_test.html && \
pip install -r requirements-test.txt --no-cache-dir && \
rm assistant.py

RUN \
apt-get purge -y cmake && \
wget -q https://github.com/Kitware/CMake/releases/download/v3.20.2/cmake-3.20.2.tar.gz && \
tar -zxvf cmake-3.20.2.tar.gz && \
cd cmake-3.20.2 && \
./bootstrap -- -DCMAKE_USE_OPENSSL=OFF && \
make && \
make install && \
cmake --version

ENV \
# if you want this environment to be the default one, uncomment the following line:
CONDA_DEFAULT_ENV=${CONDA_ENV} \
Expand Down
41 changes: 22 additions & 19 deletions dockers/base-cuda/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG CUDA_VERSION=10.2
ARG CUDA_VERSION=11.1

FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu18.04
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04

ARG PYTHON_VERSION=3.9
ARG PYTORCH_VERSION=1.8
Expand All @@ -28,8 +28,8 @@ ENV \
CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" \
TORCH_CUDA_ARCH_LIST="3.7;5.0;6.0;7.0;7.5;8.0" \
MKL_THREADING_LAYER=GNU \
MAKEFLAGS="-j$(nproc)"
# MAKEFLAGS="-j1"
# MAKEFLAGS="-j$(nproc)"
MAKEFLAGS="-j2"

RUN apt-get update -qq --fix-missing && \
apt-get install -y --no-install-recommends \
Expand Down Expand Up @@ -64,7 +64,7 @@ RUN apt-get update -qq --fix-missing && \

COPY ./requirements.txt requirements.txt
COPY ./requirements/ ./requirements/
COPY ./.github/prune-packages.py requirements/prune_packages.py
COPY ./.actions/assistant.py assistant.py

ENV PYTHONPATH=/usr/lib/python${PYTHON_VERSION}/site-packages

Expand All @@ -73,27 +73,21 @@ RUN \
python${PYTHON_VERSION} get-pip.py && \
rm get-pip.py && \

pip install -q fire && \
# Disable cache \
export BAGUA_CUDA_VERSION=${CUDA_VERSION//"."/""} && \
CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \
export BAGUA_CUDA_VERSION=$CUDA_VERSION_MM && \
pip config set global.cache-dir false && \
# set particular PyTorch version
python ./requirements/adjust-versions.py requirements.txt ${PYTORCH_VERSION} && \
python ./requirements/adjust-versions.py requirements/extra.txt ${PYTORCH_VERSION} && \
python ./requirements/adjust-versions.py requirements/examples.txt ${PYTORCH_VERSION} && \
python -c "print(' '.join([ln for ln in open('requirements/extra.txt').readlines() if 'horovod' in ln]))" > ./requirements/horovod.txt && \
python requirements/prune_packages.py requirements/extra.txt "horovod" && \
# Install all requirements
pip install -r requirements/devel.txt --no-cache-dir && \
rm -rf requirements.*

ENV \
HOROVOD_CUDA_HOME=$CUDA_TOOLKIT_ROOT_DIR \
HOROVOD_GPU_OPERATIONS=NCCL \
HOROVOD_WITH_PYTORCH=1 \
HOROVOD_WITHOUT_TENSORFLOW=1 \
HOROVOD_WITHOUT_MXNET=1 \
HOROVOD_WITH_GLOO=1 \
HOROVOD_WITHOUT_MPI=1
python assistant.py requirements_prune_pkgs requirements/examples.txt "horovod" && \
# Install all requirements \
pip install -r requirements/devel.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html && \
rm -rf requirements.* && \
rm assistant.py

RUN \
apt-get purge -y cmake && \
Expand All @@ -105,6 +99,15 @@ RUN \
make install && \
cmake --version

ENV \
HOROVOD_CUDA_HOME=$CUDA_TOOLKIT_ROOT_DIR \
HOROVOD_GPU_OPERATIONS=NCCL \
HOROVOD_WITH_PYTORCH=1 \
HOROVOD_WITHOUT_TENSORFLOW=1 \
HOROVOD_WITHOUT_MXNET=1 \
HOROVOD_WITH_GLOO=1 \
HOROVOD_WITHOUT_MPI=1

RUN \
HOROVOD_BUILD_CUDA_CC_LIST=${TORCH_CUDA_ARCH_LIST//";"/","} && \
export HOROVOD_BUILD_CUDA_CC_LIST=${HOROVOD_BUILD_CUDA_CC_LIST//"."/""} && \
Expand Down

0 comments on commit 7ee6907

Please sign in to comment.