Add PyTorch 2.3 to CI matrix (#19708)

Lightning-AI · Apr 29, 2024 · 49ed2b1 · 49ed2b1
1 parent 2913633
commit 49ed2b1
Show file tree

Hide file tree

Showing 19 changed files with 46 additions and 26 deletions.
diff --git a/.azure/gpu-benchmarks.yml b/.azure/gpu-benchmarks.yml
@@ -46,8 +46,7 @@ jobs:
     variables:
       DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
     container:
-      # TODO: Upgrade to Python 3.11
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0"
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0"
       options: "--gpus=all --shm-size=32g"
     strategy:
       matrix:

diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml
@@ -56,12 +56,11 @@ jobs:
       options: "--gpus=all --shm-size=2gb  -v /var/tmp:/var/tmp"
     strategy:
       matrix:
-        # TODO: Upgrade to Python 3.11
         "Fabric | latest":
-          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0"
+          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0"
           PACKAGE_NAME: "fabric"
         "Lightning | latest":
-          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0"
+          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0"
           PACKAGE_NAME: "lightning"
     workspace:
       clean: all

diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml
@@ -48,12 +48,11 @@ jobs:
     cancelTimeoutInMinutes: "2"
     strategy:
       matrix:
-        # TODO: Upgrade to Python 3.11
         "PyTorch | latest":
-          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0"
+          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0"
           PACKAGE_NAME: "pytorch"
         "Lightning | latest":
-          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0"
+          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0"
           PACKAGE_NAME: "lightning"
     pool: lit-rtx-3090
     variables:

diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml
@@ -23,14 +23,17 @@ subprojects:
       - "pl-cpu (macOS-11, lightning, 3.10, 2.0)"
       - "pl-cpu (macOS-11, lightning, 3.10, 2.1)"
       - "pl-cpu (macOS-11, lightning, 3.10, 2.2)"
+      - "pl-cpu (macOS-14, lightning, 3.10, 2.3)"
       - "pl-cpu (ubuntu-20.04, lightning, 3.8, 2.0, oldest)"
       - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.0)"
       - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.1)"
       - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.2)"
+      - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.3)"
       - "pl-cpu (windows-2022, lightning, 3.8, 2.0, oldest)"
       - "pl-cpu (windows-2022, lightning, 3.10, 2.0)"
       - "pl-cpu (windows-2022, lightning, 3.10, 2.1)"
       - "pl-cpu (windows-2022, lightning, 3.10, 2.2)"
+      - "pl-cpu (windows-2022, lightning, 3.10, 2.3)"
       - "pl-cpu (macOS-11, pytorch, 3.8, 2.0)"
       - "pl-cpu (ubuntu-20.04, pytorch, 3.8, 2.0)"
       - "pl-cpu (windows-2022, pytorch, 3.8, 2.0)"
@@ -171,14 +174,17 @@ subprojects:
       - "fabric-cpu (macOS-11, lightning, 3.10, 2.0)"
       - "fabric-cpu (macOS-11, lightning, 3.11, 2.1)"
       - "fabric-cpu (macOS-11, lightning, 3.11, 2.2)"
+      - "fabric-cpu (macOS-14, lightning, 3.10, 2.3)"
       - "fabric-cpu (ubuntu-20.04, lightning, 3.8, 2.0, oldest)"
       - "fabric-cpu (ubuntu-20.04, lightning, 3.10, 2.0)"
       - "fabric-cpu (ubuntu-20.04, lightning, 3.11, 2.1)"
       - "fabric-cpu (ubuntu-20.04, lightning, 3.11, 2.2)"
+      - "fabric-cpu (ubuntu-20.04, lightning, 3.11, 2.3)"
       - "fabric-cpu (windows-2022, lightning, 3.8, 2.0, oldest)"
       - "fabric-cpu (windows-2022, lightning, 3.10, 2.0)"
       - "fabric-cpu (windows-2022, lightning, 3.11, 2.1)"
       - "fabric-cpu (windows-2022, lightning, 3.11, 2.2)"
+      - "fabric-cpu (windows-2022, lightning, 3.11, 2.3)"
       - "fabric-cpu (macOS-11, fabric, 3.8, 2.0)"
       - "fabric-cpu (ubuntu-20.04, fabric, 3.8, 2.0)"
       - "fabric-cpu (windows-2022, fabric, 3.8, 2.0)"

diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml
@@ -49,6 +49,9 @@ jobs:
           - { os: "macOS-11", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" }
           - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" }
           - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" }
+          - { os: "macOS-14", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.3" }
+          - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" }
+          - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" }
           # only run PyTorch latest with Python latest, use Fabric scope to limit dependency issues
           - { os: "macOS-12", pkg-name: "fabric", python-version: "3.11", pytorch-version: "2.0" }
           - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.11", pytorch-version: "2.0" }
@@ -83,6 +86,8 @@ jobs:
       PYPI_CACHE_DIR: "_pip-wheels"
       TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu/torch_stable.html"
       TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/torch_test.html"
+      # TODO: Remove this override and enable running MPS tests on the macOS-14 runners
+      DISABLE_MPS: ${{ matrix.os == 'macOS-14' && '1' || '0' }}
     steps:
       - uses: actions/checkout@v4
 
@@ -119,7 +124,7 @@ jobs:
       - name: Env. variables
         run: |
           # Switch PyTorch URL
-          python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.2' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV
+          python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.3' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV
           # Switch coverage scope
           python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.pkg-name}}' == 'lightning' else 'lightning_fabric'))" >> $GITHUB_ENV
           # if you install mono-package set dependency only for this subpackage

diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml
@@ -53,6 +53,9 @@ jobs:
           - { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" }
           - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" }
           - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" }
+          - { os: "macOS-14", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.3" }
+          - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.3" }
+          - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.3" }
           # only run PyTorch latest with Python latest, use PyTorch scope to limit dependency issues
           - { os: "macOS-12", pkg-name: "pytorch", python-version: "3.11", pytorch-version: "2.0" }
           - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.11", pytorch-version: "2.0" }
@@ -88,6 +91,8 @@ jobs:
       TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/torch_test.html"
       FREEZE_REQUIREMENTS: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }}
       PYPI_CACHE_DIR: "_pip-wheels"
+      # TODO: Remove this override and enable running MPS tests on the macOS-14 runners
+      DISABLE_MPS: ${{ matrix.os == 'macOS-14' && '1' || '0' }}
     steps:
       - uses: actions/checkout@v4
 
@@ -125,7 +130,7 @@ jobs:
       - name: Env. variables
         run: |
           # Switch PyTorch URL
-          python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.2' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV
+          python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.3' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV
           # Switch coverage scope
           python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.pkg-name}}' == 'lightning' else 'pytorch_lightning'))" >> $GITHUB_ENV
           # if you install mono-package set dependency only for this subpackage

diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml
@@ -109,6 +109,7 @@ jobs:
           - { python_version: "3.10", pytorch_version: "2.2", cuda_version: "12.1.0" }
           - { python_version: "3.11", pytorch_version: "2.1", cuda_version: "12.1.0" }
           - { python_version: "3.11", pytorch_version: "2.2", cuda_version: "12.1.0" }
+          - { python_version: "3.11", pytorch_version: "2.3", cuda_version: "12.1.0" }
           # - { python_version: "3.12", pytorch_version: "2.2", cuda_version: "12.1.0" }  # todo: pending on `onnxruntime`
     steps:
       - uses: actions/checkout@v4

diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
@@ -34,13 +34,7 @@ ENV \
     MAKEFLAGS="-j2"
 
 RUN \
-    # TODO: Remove the manual key installation once the base image is updated.
-    # https://github.com/NVIDIA/nvidia-docker/issues/1631
-    # https://github.com/NVIDIA/nvidia-docker/issues/1631#issuecomment-1264715214
     apt-get update && apt-get install -y wget && \
-    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \
-    mkdir -p /etc/apt/keyrings/ && mv 3bf863cc.pub /etc/apt/keyrings/ && \
-    echo "deb [signed-by=/etc/apt/keyrings/3bf863cc.pub] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" /etc/apt/sources.list.d/cuda.list && \
     apt-get update -qq --fix-missing && \
     NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \
     CUDA_VERSION_MM=${CUDA_VERSION%.*} && \

diff --git a/requirements/fabric/base.txt b/requirements/fabric/base.txt
@@ -2,7 +2,7 @@
 #  in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
 
 numpy >=1.17.2, <1.27.0
-torch >=2.0.0, <2.3.0
+torch >=2.0.0, <2.4.0
 fsspec[http] >=2022.5.0, <2023.11.0
 packaging >=20.0, <=23.1
 typing-extensions >=4.4.0, <4.10.0

diff --git a/requirements/fabric/examples.txt b/requirements/fabric/examples.txt
@@ -1,6 +1,6 @@
 # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package
 #  in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
 
-torchvision >=0.15.0, <0.18.0
+torchvision >=0.15.0, <0.19.0
 torchmetrics >=0.10.0, <1.3.0
 lightning-utilities >=0.8.0, <0.12.0
diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt
@@ -2,7 +2,7 @@
 #  in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
 
 numpy >=1.17.2, <1.27.0
-torch >=2.0.0, <2.3.0
+torch >=2.0.0, <2.4.0
 tqdm >=4.57.0, <4.67.0
 PyYAML >=5.4, <6.1.0
 fsspec[http] >=2022.5.0, <2023.11.0

diff --git a/requirements/pytorch/examples.txt b/requirements/pytorch/examples.txt
@@ -2,8 +2,7 @@
 #  in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
 
 requests <2.32.0
-torchvision >=0.15.0, <0.18.0
-gym[classic_control] >=0.17.0, <0.27.0
+torchvision >=0.15.0, <0.19.0
 ipython[all] <8.15.0
 torchmetrics >=0.10.0, <1.3.0
 lightning-utilities >=0.8.0, <0.12.0
diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md
@@ -11,7 +11,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Enabled consolidating distributed checkpoints through `fabric consolidate` in the new CLI ([#19560](https://github.com/Lightning-AI/pytorch-lightning/pull/19560))
 
--
+- Added support for PyTorch 2.3 ([#19708](https://github.com/Lightning-AI/pytorch-lightning/pull/19708))
 
 -
 

diff --git a/src/lightning/fabric/accelerators/mps.py b/src/lightning/fabric/accelerators/mps.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
 import platform
 from functools import lru_cache
 from typing import List, Optional, Union
@@ -70,7 +71,8 @@ def auto_device_count() -> int:
     @lru_cache(1)
     def is_available() -> bool:
         """MPS is only available on a machine with the ARM-based Apple Silicon processors."""
-        return torch.backends.mps.is_available() and platform.processor() in ("arm", "arm64")
+        mps_disabled = os.getenv("DISABLE_MPS", "0") == "1"
+        return not mps_disabled and torch.backends.mps.is_available() and platform.processor() in ("arm", "arm64")
 
     @classmethod
     @override

diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md
@@ -16,7 +16,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Added `on_exception` hook to `LightningDataModule` ([#19601](https://github.com/Lightning-AI/pytorch-lightning/pull/19601))
 
--
+- Added support for PyTorch 2.3 ([#19708](https://github.com/Lightning-AI/pytorch-lightning/pull/19708))
+
 
 ### Changed
 

diff --git a/tests/tests_fabric/plugins/precision/test_fsdp.py b/tests/tests_fabric/plugins/precision/test_fsdp.py
@@ -58,8 +58,10 @@ def test_fsdp_precision_scaler_with_bf16():
 @RunIf(min_cuda_gpus=1)
 def test_fsdp_precision_forward_context():
     """Test to ensure that the context manager correctly is set to bfloat16."""
+    from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
+
     precision = FSDPPrecision(precision="16-mixed")
-    assert isinstance(precision.scaler, torch.cuda.amp.GradScaler)
+    assert isinstance(precision.scaler, ShardedGradScaler)
     assert torch.get_default_dtype() == torch.float32
     with precision.forward_context():
         assert torch.get_autocast_gpu_dtype() == torch.float16

diff --git a/tests/tests_pytorch/callbacks/test_finetuning_callback.py b/tests/tests_pytorch/callbacks/test_finetuning_callback.py
@@ -15,6 +15,7 @@
 
 import pytest
 import torch
+from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3
 from lightning.pytorch import LightningModule, Trainer, seed_everything
 from lightning.pytorch.callbacks import BackboneFinetuning, BaseFinetuning, ModelCheckpoint
 from lightning.pytorch.demos.boring_classes import BoringModel, RandomDataset
@@ -359,6 +360,8 @@ def test_callbacks_restore(tmp_path):
         "foreach": None,
         "differentiable": False,
     }
+    if _TORCH_GREATER_EQUAL_2_3:
+        expected["fused"] = None
 
     assert callback._internal_optimizer_metadata[0][0] == expected
 
@@ -374,6 +377,8 @@ def test_callbacks_restore(tmp_path):
         "foreach": None,
         "differentiable": False,
     }
+    if _TORCH_GREATER_EQUAL_2_3:
+        expected["fused"] = None
 
     assert callback._internal_optimizer_metadata[0][1] == expected
 

diff --git a/tests/tests_pytorch/conftest.py b/tests/tests_pytorch/conftest.py
@@ -88,6 +88,7 @@ def restore_env_variables():
         "KMP_DUPLICATE_LIB_OK",  # leaked by PyTorch
         "CRC32C_SW_MODE",  # leaked by tensorboardX
         "TRITON_CACHE_DIR",  # leaked by torch.compile
+        "_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR",  # leaked by torch.compile
         "OMP_NUM_THREADS",  # set by our launchers
         # leaked by XLA
         "ALLOW_MULTIPLE_LIBTPU_LOAD",

diff --git a/tests/tests_pytorch/plugins/precision/test_fsdp.py b/tests/tests_pytorch/plugins/precision/test_fsdp.py
@@ -58,8 +58,10 @@ def test_fsdp_precision_scaler_with_bf16():
 @RunIf(min_cuda_gpus=1)
 def test_fsdp_precision_forward_context():
     """Test to ensure that the context manager correctly is set to bfloat16."""
+    from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
+
     precision = FSDPPrecision(precision="16-mixed")
-    assert isinstance(precision.scaler, torch.cuda.amp.GradScaler)
+    assert isinstance(precision.scaler, ShardedGradScaler)
     assert torch.get_default_dtype() == torch.float32
     with precision.forward_context():
         assert torch.get_autocast_gpu_dtype() == torch.float16