diff --git a/.actions/assistant.py b/.actions/assistant.py
index 0c6b567885a2c..ce253a96c4c21 100644
--- a/.actions/assistant.py
+++ b/.actions/assistant.py
@@ -1,8 +1,9 @@
import os
import re
+import shutil
from itertools import chain
+from os.path import dirname, isfile
from pathlib import Path
-from pprint import pprint
from typing import Dict, List, Optional, Sequence, Tuple
import pkg_resources
@@ -65,6 +66,7 @@ def _replace_imports(lines: List[str], mapping: List[Tuple[str, str]]) -> List[s
def copy_replace_imports(
source_dir: str, source_imports: List[str], target_imports: List[str], target_dir: Optional[str] = None
) -> None:
+ """Copy package content with import adjustments."""
print(f"Replacing imports: {locals()}")
assert len(source_imports) == len(target_imports), (
"source and target imports must have the same length, "
@@ -75,19 +77,27 @@ def copy_replace_imports(
ls = _retrieve_files(source_dir)
for fp in ls:
- if fp.endswith(".py") or not fp.endswith(".pyc"):
- with open(fp, encoding="utf-8") as fo:
- try:
- lines = fo.readlines()
- except UnicodeDecodeError:
- # a binary file, skip
- print(f"Skipped replacing imports for {fp}")
- continue
- lines = _replace_imports(lines, list(zip(source_imports, target_imports)))
- fp_new = fp.replace(source_dir, target_dir)
- os.makedirs(os.path.dirname(fp_new), exist_ok=True)
- with open(fp_new, "w", encoding="utf-8") as fo:
- fo.writelines(lines)
+ fp_new = fp.replace(source_dir, target_dir)
+ _, ext = os.path.splitext(fp)
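+ # binary assets (images) hold no imports; copy them over verbatim, only once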
+ if ext in (".png", ".jpg", ".ico"):
+ os.makedirs(dirname(fp_new), exist_ok=True)
+ if not isfile(fp_new):
+ shutil.copy(fp, fp_new)
+ continue
+ elif ext in (".pyc",):
+ continue
+ # Try to parse everything else
+ with open(fp, encoding="utf-8") as fo:
+ try:
+ lines = fo.readlines()
+ except UnicodeDecodeError:
+ # a binary file, skip
+ print(f"Skipped replacing imports for {fp}")
+ continue
+ lines = _replace_imports(lines, list(zip(source_imports, target_imports)))
+ os.makedirs(os.path.dirname(fp_new), exist_ok=True)
+ with open(fp_new, "w", encoding="utf-8") as fo:
+ fo.writelines(lines)
def create_mirror_package(source_dir: str, package_mapping: Dict[str, str]) -> None:
@@ -129,7 +139,7 @@ def _prune_packages(req_file: str, packages: Sequence[str]) -> None:
req = list(pkg_resources.parse_requirements(ln_))[0]
if req.name not in packages:
final.append(line)
- pprint(final)
+ print(final)
path.write_text("\n".join(final))
@staticmethod
@@ -147,7 +157,7 @@ def replace_oldest_ver(requirement_fnames: Sequence[str] = REQUIREMENT_FILES_ALL
def copy_replace_imports(
source_dir: str, source_import: str, target_import: str, target_dir: Optional[str] = None
) -> None:
- """Recursively replace imports in given folder."""
+ """Copy package content with import adjustments."""
source_imports = source_import.strip().split(",")
target_imports = target_import.strip().split(",")
copy_replace_imports(source_dir, source_imports, target_imports, target_dir=target_dir)
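
For orientation, the control flow the `copy_replace_imports` hunk above introduces, condensed into a self-contained Python sketch — `rewrite` stands in for `_replace_imports` with the import mapping already bound, and the helper name is otherwise illustrative:

```python
import os
import shutil


def mirror_file(fp: str, source_dir: str, target_dir: str, rewrite) -> None:
    fp_new = fp.replace(source_dir, target_dir)
    ext = os.path.splitext(fp)[1]
    if ext in (".png", ".jpg", ".ico"):
        # binary assets are copied over verbatim, and only once
        os.makedirs(os.path.dirname(fp_new), exist_ok=True)
        if not os.path.isfile(fp_new):
            shutil.copy(fp, fp_new)
    elif ext == ".pyc":
        pass  # compiled caches are never mirrored
    else:
        # everything else is treated as text and gets its imports rewritten
        try:
            with open(fp, encoding="utf-8") as fo:
                lines = rewrite(fo.readlines())
        except UnicodeDecodeError:
            print(f"Skipped replacing imports for {fp}")  # some other binary file
            return
        os.makedirs(os.path.dirname(fp_new), exist_ok=True)
        with open(fp_new, "w", encoding="utf-8") as fo:
            fo.writelines(lines)
```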
diff --git a/.azure/app-cloud-e2e.yml b/.azure/app-cloud-e2e.yml
index b8c8305715300..24458c0daa385 100644
--- a/.azure/app-cloud-e2e.yml
+++ b/.azure/app-cloud-e2e.yml
@@ -27,12 +27,17 @@ pr:
- "release/*"
paths:
include:
- - ".azure/app-cloud-e2e.yml"
- - "requirements/app/**"
- - "src/lightning_app/**"
- - "examples/app_*"
- - "tests/tests_app_examples/**"
- - ".actions/**"
+ - ".azure/app-cloud-e2e.yml"
+ - "requirements/app/**"
+ - "src/lightning_app/**"
+ - "tests/tests_app/**"
+ - "examples/app_*/**" # some tests_app tests call examples files
+ - "tests/tests_app_examples/**"
+ - "setup.py"
+ - ".actions/**"
+ - "!requirements/app/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
# variables are automatically exported as environment variables so this will override pip's default cache dir
variables:
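
A recurring theme in this PR: every trigger list gains `!`-prefixed exclusions so that docs-only and markdown-only changes stop scheduling expensive jobs. As a rough model of the evaluation — last matching pattern wins — here is a sketch assuming plain-glob semantics, which only approximate the real Azure/GitHub matchers:

```python
from fnmatch import fnmatch
from typing import List


def path_triggers(path: str, patterns: List[str]) -> bool:
    """Return True if a changed `path` should trigger the job."""
    triggered = False
    for pattern in patterns:
        # the last pattern that matches decides; a leading "!" negates
        if fnmatch(path, pattern.lstrip("!")):
            triggered = not pattern.startswith("!")
    return triggered


filters = ["src/lightning_app/**", "!*.md", "!**/*.md"]
assert path_triggers("src/lightning_app/core/app.py", filters)
assert not path_triggers("src/lightning_app/README.md", filters)
```

Both `!*.md` and `!**/*.md` are needed because in the real matchers `*` does not cross directory separators: the first form covers root-level markdown files, the second covers nested ones.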
diff --git a/.azure/gpu-benchmark.yml b/.azure/gpu-benchmark.yml
index 8fad0d69c15d1..52ad4251d4300 100644
--- a/.azure/gpu-benchmark.yml
+++ b/.azure/gpu-benchmark.yml
@@ -21,6 +21,11 @@ pr:
paths:
include:
- ".azure/gpu-benchmark.yml"
+ - "tests/tests_pytorch/benchmarks/**"
+ - "requirements/pytorch/**"
+ - "!requirements/pytorch/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
schedules:
- cron: "0 0 * * *" # At the end of every day
@@ -37,7 +42,7 @@ jobs:
variables:
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
container:
- image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
+ image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.6.1"
options: "--gpus=all --shm-size=32g"
workspace:
clean: all
@@ -47,18 +52,41 @@ jobs:
- bash: |
echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
cuda_ver=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
- echo "##vso[task.setvariable variable=CUDA_VERSION_MM]$cuda_ver"
echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html"
displayName: 'set env. vars'
- bash: |
- pip install -e .[strategies] --find-links ${TORCH_URL}
+ echo $CUDA_VISIBLE_DEVICES
+ echo $TORCH_URL
+ lspci | egrep 'VGA|3D'
+ whereis nvidia
+ nvidia-smi
+ which python && which pip
+ python --version
+ pip --version
pip list
+ displayName: 'Image info & NVIDIA'
+
+ - bash: |
+ python .actions/assistant.py requirements_prune_pkgs --packages [horovod,bagua,colossalai] --req_files [requirements/pytorch/strategies.txt]
+
+ PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
+ python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${PYTORCH_VERSION}
+ displayName: 'Adjust dependencies'
+
+ - bash: pip install -e .[dev,strategies,examples] --find-links ${TORCH_URL}
env:
- PACKAGE_NAME: pytorch
- FREEZE_REQUIREMENTS: 1
+ PACKAGE_NAME: "pytorch"
+ FREEZE_REQUIREMENTS: "1"
displayName: 'Install package'
+ - bash: |
+ set -e
+ pip list
+ python requirements/collect_env_details.py
+ python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
+ displayName: 'Env details'
+
- bash: python -m pytest benchmarks -v --durations=0
env:
PL_RUNNING_BENCHMARKS: "1"
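
The new `set env. vars` step derives the PyTorch wheel index from the CUDA runtime baked into the image. The same computation in plain Python (illustrative; `torch.version.cuda` is `None` on CPU-only builds, so this assumes a CUDA build of torch):

```python
import torch

# "11.6" -> "116", matching PyTorch's per-CUDA wheel channels
cuda_ver = "".join(torch.version.cuda.split(".")[:2])
torch_url = f"https://download.pytorch.org/whl/cu{cuda_ver}/torch_stable.html"
print(torch_url)
```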
diff --git a/.azure/gpu-tests-lite.yml b/.azure/gpu-tests-lite.yml
index ceb4c671cfe22..98ff44f879a71 100644
--- a/.azure/gpu-tests-lite.yml
+++ b/.azure/gpu-tests-lite.yml
@@ -21,12 +21,18 @@ pr:
paths:
include:
- ".azure/gpu-tests-lite.yml"
+ - "examples/lite/**"
+ - "examples/run_lite_examples.sh"
+ - "tests/tests_lite/run_standalone_*.sh"
+ - "tests/tests_pytorch/run_standalone_tests.sh" # used by Lite through a symlink
- "requirements/lite/**"
- "src/lightning_lite/**"
- "tests/tests_lite/**"
- - "tests/tests_pytorch/run_standalone_tests.sh"
- - "tests/tests_lite/run_standalone_tests.sh" # a symlink to the one above
+ - "setup.cfg" # includes pytest config
- ".actions/**"
+ - "!requirements/lite/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
jobs:
- job: testing
@@ -38,7 +44,7 @@ jobs:
variables:
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
container:
- image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
+ image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.6.1"
# default shm size is 64m. Increase it to avoid:
# 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
options: "--gpus=all --shm-size=2gb"
@@ -48,6 +54,14 @@ jobs:
steps:
- bash: |
+ echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
+ cuda_ver=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
+ echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html"
+ displayName: 'set env. vars'
+
+ - bash: |
+ echo $CUDA_VISIBLE_DEVICES
+ echo $TORCH_URL
lspci | egrep 'VGA|3D'
whereis nvidia
nvidia-smi
@@ -58,14 +72,13 @@ jobs:
displayName: 'Image info & NVIDIA'
- bash: |
- echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
- displayName: 'set visible devices'
+ PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
+ python ./requirements/pytorch/adjust-versions.py requirements/lite/base.txt ${PYTORCH_VERSION}
+ python ./requirements/pytorch/adjust-versions.py requirements/lite/examples.txt ${PYTORCH_VERSION}
+ displayName: 'Adjust dependencies'
- bash: |
- set -e
- CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
- pip install -e .[dev,strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
- pip list
+ pip install -e .[dev,strategies,examples] --find-links ${TORCH_URL}
env:
PACKAGE_NAME: "lite"
FREEZE_REQUIREMENTS: "1"
@@ -73,7 +86,7 @@ jobs:
- bash: |
set -e
- echo $CUDA_VISIBLE_DEVICES
+ pip list
python requirements/collect_env_details.py
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
displayName: 'Env details'
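
Both GPU jobs derive their visible devices from the Azure agent name via `DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )`. The convention assumed here is that agents are named with a trailing list of the GPU ids they own:

```python
agent_name = "lightning-gpu_0,1"  # hypothetical agent name
devices = agent_name.split("_")[-1]
assert devices == "0,1"  # exported as CUDA_VISIBLE_DEVICES by the first step
```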
diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml
index 05571269a99a7..91fe0b6107bd1 100644
--- a/.azure/gpu-tests-pytorch.yml
+++ b/.azure/gpu-tests-pytorch.yml
@@ -37,14 +37,20 @@ pr:
- "requirements/lite/**"
- "src/lightning_lite/**"
- ".actions/**"
+ - "!requirements/**/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
jobs:
- job: testing
strategy:
matrix:
- # TODO: package parametrization
- 'PyTorch - stable':
+ 'PyTorch & strategies': # this uses torch 1.12 as not all strategies support 1.13 yet
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
+ scope: "strategies"
+ 'PyTorch - latest':
+ image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.6.1"
+ scope: ""
# how long to run the job before automatically cancelling
timeoutInMinutes: "80"
# how much time to give 'run always even if cancelled tasks' before stopping them
@@ -93,11 +99,11 @@ jobs:
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${PYTORCH_VERSION}
displayName: 'Adjust dependencies'
- - bash: pip install -e .[strategies] -r requirements/pytorch/devel.txt -r requirements/pytorch/examples.txt --find-links ${TORCH_URL}
+ - bash: pip install -e .[dev,examples] --find-links ${TORCH_URL}
env:
PACKAGE_NAME: "pytorch"
FREEZE_REQUIREMENTS: "1"
- displayName: 'Install package'
+ displayName: 'Install package & extras'
- bash: |
set -e
@@ -109,14 +115,17 @@ jobs:
CUDA_VERSION_COLOSSALAI=$(python -c "print([ver for ver in [11.3, 11.1] if $CUDA_VERSION_MM_COLOSSALAI >= ver][0])")
pip install "colossalai==0.1.10+torch${PYTORCH_VERSION_COLOSSALAI}cu${CUDA_VERSION_COLOSSALAI}" --find-links https://release.colossalai.org
- pip list
- displayName: 'Install dependencies'
+ pip install -r requirements/pytorch/strategies.txt --find-links ${TORCH_URL}
+
+ python requirements/pytorch/check-avail-strategies.py
+ condition: eq(variables['scope'], 'strategies')
+ displayName: 'Install strategies'
- bash: |
set -e
+ pip list
python requirements/collect_env_details.py
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
- python requirements/pytorch/check-avail-strategies.py
python requirements/pytorch/check-avail-extras.py
displayName: 'Env details'
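
The strategies-only step pins ColossalAI to a wheel built for the installed torch/CUDA pair. Recomputing the pin in straight-line Python (an illustrative restatement of the inlined `python -c` snippets above):

```python
import torch

pytorch_pin = torch.__version__.split("+")[0][:4]           # e.g. "1.12"
cuda = float(torch.version.cuda)                            # e.g. 11.6
cuda_pin = [ver for ver in [11.3, 11.1] if cuda >= ver][0]  # newest supported
wheel = f"colossalai==0.1.10+torch{pytorch_pin}cu{cuda_pin}"
# -> "colossalai==0.1.10+torch1.12cu11.3", resolved via https://release.colossalai.org
```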
diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml
index 8281f9e5c5fa6..0c6851754f2a0 100644
--- a/.azure/hpu-tests.yml
+++ b/.azure/hpu-tests.yml
@@ -26,6 +26,9 @@ pr:
- "tests/tests_pytorch/**"
- "setup.cfg" # includes pytest config
- ".actions/**"
+ - "!requirements/**/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
jobs:
- job: testing
diff --git a/.azure/ipu-tests.yml b/.azure/ipu-tests.yml
index 972bf1e95a06b..d96adabf4a1ff 100644
--- a/.azure/ipu-tests.yml
+++ b/.azure/ipu-tests.yml
@@ -23,6 +23,9 @@ pr:
- "tests/tests_pytorch/**"
- "setup.cfg" # includes pytest config
- ".actions/**"
+ - "!requirements/**/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
variables:
- name: poplar_sdk
diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml
index b3b0ac8e8a7e4..53e8348626c25 100644
--- a/.github/checkgroup.yml
+++ b/.github/checkgroup.yml
@@ -1,129 +1,142 @@
custom_service_name: "Lightning CI required checker"
-# For security reasons, configuration is only loaded from the repository's default branch,
-# changes made in pull requests from different branches or forks are ignored. This means that changes to this file
-# will only be used after they are merged.
subprojects:
# SECTION: pytorch_lightning
- - id: "pytorch_lightning"
+ - id: "pytorch_lightning: Tests workflow"
paths:
- # all examples don't need to be added because they aren't used in CI, but these are
- - "examples/run_ddp_examples.sh"
- - "examples/convert_from_pt_to_pl/**"
- - "examples/run_pl_examples.sh"
- - "examples/pl_basics/backbone_image_classifier.py"
- - "examples/pl_basics/autoencoder.py"
- - "examples/pl_loops/mnist_lite.py"
- - "examples/pl_fault_tolerant/automatic.py"
- - "examples/test_pl_examples.py"
- - "examples/pl_integrations/dali_image_classifier.py"
+ - ".github/workflows/ci-pytorch-tests.yml"
+ - "requirements/lite/**"
+ - "src/lightning_lite/**"
- "requirements/pytorch/**"
- "src/pytorch_lightning/**"
- "tests/tests_pytorch/**"
- "tests/legacy/back-compatible-versions.txt"
- "setup.cfg" # includes pytest config
- ".actions/**"
+ - "!requirements/**/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
checks:
- # Note: updates here should be applied to the lightning_lite group
- - "pl-cpu (macOS-11, pytorch, 3.8, 1.10)"
- - "pl-cpu (macOS-11, pytorch, 3.9, 1.11)"
- - "pl-cpu (macOS-11, pytorch, 3.10, 1.12)"
- - "pl-cpu (macOS-11, pytorch, 3.7, 1.9, oldest)"
- - "pl-cpu (macOS-11, pytorch, 3.10, 1.13, pre)"
- - "pl-cpu (ubuntu-20.04, pytorch, 3.8, 1.10)"
- - "pl-cpu (ubuntu-20.04, pytorch, 3.9, 1.11)"
- - "pl-cpu (ubuntu-20.04, pytorch, 3.10, 1.11)"
- - "pl-cpu (ubuntu-20.04, pytorch, 3.10, 1.12)"
- - "pl-cpu (ubuntu-20.04, pytorch, 3.7, 1.9, oldest)"
- - "pl-cpu (ubuntu-20.04, pytorch, 3.9, 1.13, pre)"
- - "pl-cpu (windows-2022, pytorch, 3.9, 1.11)"
- - "pl-cpu (windows-2022, pytorch, 3.10, 1.11)"
- - "pl-cpu (windows-2022, pytorch, 3.10, 1.12)"
- - "pl-cpu (windows-2022, pytorch, 3.7, 1.9, oldest)"
- - "pl-cpu (windows-2022, pytorch, 3.8, 1.13, pre)"
- - "pl-cpu (macOS-11, lightning, 3.10, 1.12)"
- #- "pl-cpu (macOS-11, lightning, 3.7, 1.9, oldest)"
- - "pl-cpu (ubuntu-20.04, lightning, 3.10, 1.12)"
- #- "pl-cpu (ubuntu-20.04, lightning, 3.7, 1.9, oldest)"
- - "pl-cpu (windows-2022, lightning, 3.10, 1.12)"
- #- "pl-cpu (windows-2022, lightning, 3.7, 1.9, oldest)"
- - "pytorch-lightning (GPUs)"
- - "pytorch-lightning (HPUs)"
- - "pytorch-lightning (IPUs)"
- - "pl-cpu (slow, macOS-11, pytorch, 3.7, 1.11)"
- - "pl-cpu (slow, ubuntu-20.04, pytorch, 3.7, 1.11)"
- - "pl-cpu (slow, windows-2022, pytorch, 3.7, 1.11)"
- # TODO: since this job has intermittent availability, it cannot be required or it will block all PL PRs from forks
- #- "test-on-tpus"
-
- - id: "pytorch_lightning: CPU workflow"
- paths:
- - ".github/workflows/ci-pytorch-tests.yml"
- checks:
- - "pl-cpu (macOS-11, pytorch, 3.8, 1.10)"
- - "pl-cpu (macOS-11, pytorch, 3.9, 1.11)"
- - "pl-cpu (macOS-11, pytorch, 3.10, 1.12)"
- - "pl-cpu (macOS-11, pytorch, 3.7, 1.9, oldest)"
- - "pl-cpu (macOS-11, pytorch, 3.10, 1.13, pre)"
+ - "pl-cpu (macOS-11, pytorch, 3.8, 1.11)"
+ - "pl-cpu (macOS-11, pytorch, 3.9, 1.12)"
+ - "pl-cpu (macOS-11, pytorch, 3.10, 1.13)"
+ - "pl-cpu (macOS-11, pytorch, 3.8, 1.9, oldest)"
- "pl-cpu (ubuntu-20.04, pytorch, 3.8, 1.10)"
- "pl-cpu (ubuntu-20.04, pytorch, 3.9, 1.11)"
- - "pl-cpu (ubuntu-20.04, pytorch, 3.10, 1.11)"
- "pl-cpu (ubuntu-20.04, pytorch, 3.10, 1.12)"
+ - "pl-cpu (ubuntu-20.04, pytorch, 3.10, 1.13)"
- "pl-cpu (ubuntu-20.04, pytorch, 3.7, 1.9, oldest)"
- - "pl-cpu (ubuntu-20.04, pytorch, 3.9, 1.13, pre)"
- "pl-cpu (windows-2022, pytorch, 3.9, 1.11)"
- - "pl-cpu (windows-2022, pytorch, 3.10, 1.11)"
- "pl-cpu (windows-2022, pytorch, 3.10, 1.12)"
+ - "pl-cpu (windows-2022, pytorch, 3.10, 1.13)"
- "pl-cpu (windows-2022, pytorch, 3.7, 1.9, oldest)"
- - "pl-cpu (windows-2022, pytorch, 3.8, 1.13, pre)"
- - "pl-cpu (macOS-11, lightning, 3.10, 1.12)"
- #- "pl-cpu (macOS-11, lightning, 3.7, 1.9, oldest)"
- - "pl-cpu (ubuntu-20.04, lightning, 3.10, 1.12)"
- #- "pl-cpu (ubuntu-20.04, lightning, 3.7, 1.9, oldest)"
- - "pl-cpu (windows-2022, lightning, 3.10, 1.12)"
- #- "pl-cpu (windows-2022, lightning, 3.7, 1.9, oldest)"
-
- - id: "pytorch_lightning: Slow workflow"
- paths:
- - ".github/workflows/ci-pytorch-tests-slow.yml"
- checks:
- "pl-cpu (slow, macOS-11, pytorch, 3.7, 1.11)"
- "pl-cpu (slow, ubuntu-20.04, pytorch, 3.7, 1.11)"
- "pl-cpu (slow, windows-2022, pytorch, 3.7, 1.11)"
+ - "pl-cpu (macOS-11, lightning, 3.8, 1.13)"
+ - "pl-cpu (ubuntu-20.04, lightning, 3.8, 1.13)"
+ - "pl-cpu (windows-2022, lightning, 3.8, 1.13)"
- id: "pytorch_lightning: Azure GPU"
paths:
- ".azure/gpu-tests-pytorch.yml"
- - "tests/tests_pytorch/run_standalone_*.sh"
+ # only the azure GPU workflow runs the examples
+      # not every example needs to be listed because most aren't used in CI, but these are
+ - "examples/run_pl_examples.sh"
+ - "examples/pl_basics/backbone_image_classifier.py"
+ - "examples/pl_basics/autoencoder.py"
+ - "examples/pl_fault_tolerant/automatic.py"
+ - "examples/test_pl_examples.py"
+ - "examples/pl_integrations/dali_image_classifier.py"
+ - "requirements/pytorch/**"
+ - "src/pytorch_lightning/**"
+ - "tests/tests_pytorch/**"
+ - "setup.cfg" # includes pytest config
+ - "requirements/lite/**"
+ - "src/lightning_lite/**"
+ - ".actions/**"
+ - "!requirements/**/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
checks:
- "pytorch-lightning (GPUs)"
+ - id: "pytorch_lightning: Benchmarks"
+ paths:
+ - ".azure/gpu-benchmark.yml"
+ - "tests/tests_pytorch/benchmarks/**"
+ - "requirements/pytorch/**"
+ - "!requirements/pytorch/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
+ checks:
+ - "pytorch-lightning.Benchmark"
+
- id: "pytorch_lightning: Azure HPU"
paths:
- ".azure/hpu-tests.yml"
+ - "examples/pl_hpu/mnist_sample.py"
+ - "requirements/lite/**"
+ - "src/lightning_lite/**"
+ - "requirements/pytorch/**"
+ - "src/pytorch_lightning/**"
+ - "tests/tests_pytorch/**"
+ - "setup.cfg" # includes pytest config
+ - ".actions/**"
+ - "!requirements/**/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
checks:
- "pytorch-lightning (HPUs)"
- id: "pytorch_lightning: Azure IPU"
paths:
- ".azure/ipu-tests.yml"
+ - "requirements/lite/**"
+ - "src/lightning_lite/**"
+ - "requirements/pytorch/**"
+ - "src/pytorch_lightning/**"
+ - "tests/tests_pytorch/**"
+ - "setup.cfg" # includes pytest config
+ - ".actions/**"
+ - "!requirements/**/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
checks:
- "pytorch-lightning (IPUs)"
- - id: "pytorch-lightning: TPU workflow"
- paths:
- - ".github/workflows/tpu-tests.yml"
- checks:
- - "test-on-tpus"
+ # TODO: since this job has intermittent availability, it cannot be required
+ #- id: "pytorch-lightning: TPU workflow"
+ # paths:
+ # - ".github/workflows/tpu-tests.yml"
+ # - "dockers/base-xla/*"
+ # - "requirements/lite/**"
+ # - "src/lightning_lite/**"
+ # - "tests/tests_lite/**"
+ # - "requirements/pytorch/**"
+ # - "src/pytorch_lightning/**"
+ # - "tests/tests_pytorch/**"
+ # - "setup.cfg" # includes pytest config
+ # - ".actions/**"
+ # - "!requirements/**/docs.txt"
+ # - "!*.md"
+ # - "!**/*.md"
+ # checks:
+ # - "test-on-tpus"
- id: "pytorch_lightning: Docs"
paths:
- "src/pytorch_lightning/**"
- "docs/source-pytorch/**"
- - ".github/workflows/docs-*.yml"
+ - ".github/workflows/docs-checks.yml"
- "requirements/docs.txt"
- "requirements/pytorch/**"
+ - "setup.py"
+ - "setup.cfg" # includes metadata used in the package creation
+ - ".actions/**"
+ - "!*.md"
+ - "!**/*.md"
checks:
- "make-doctest (pytorch)"
- "make-html (pytorch)"
@@ -132,141 +145,89 @@ subprojects:
paths:
- "dockers/**"
- ".github/workflows/ci-pytorch-dockers.yml"
- - "requirements.txt"
- - "requirements/*.txt"
- - "requirements/pytorch/*"
+ - "requirements/pytorch/**"
+ - "requirements/lite/**"
- "environment.yml"
- - ".github/workflows/*docker*.yml"
- "setup.py"
- ".actions/**"
+ - "!requirements/**/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
checks:
- "build-cuda (3.9, 1.10, 11.3.1)"
- "build-cuda (3.9, 1.11, 11.3.1)"
- "build-cuda (3.9, 1.12, 11.6.1)"
+ - "build-cuda (3.9, 1.13, 11.6.1)"
- "build-hpu (1.5.0, 1.11.0)"
- "build-ipu (3.9, 1.9)"
- "build-NGC"
- "build-pl (3.9, 1.10, 11.3.1)"
- "build-pl (3.9, 1.11, 11.3.1)"
- "build-pl (3.9, 1.12, 11.6.1)"
+ - "build-pl (3.9, 1.13, 11.6.1)"
- "build-xla (3.7, 1.12)"
# SECTION: lightning_lite
- - id: "lightning_lite"
+ - id: "lightning_lite: CPU workflow"
paths:
- "requirements/lite/**"
- "src/lightning_lite/**"
+ - "tests/tests_lite/**"
- "setup.cfg" # includes pytest config
+ - ".github/workflows/ci-lite-tests.yml"
- ".actions/**"
+ - "!requirements/lite/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
checks:
- - "lite-cpu (macOS-11, lite, 3.9, 1.11)"
- - "lite-cpu (macOS-11, lite, 3.8, 1.10)"
- - "lite-cpu (macOS-11, lite, 3.10, 1.12)"
- - "lite-cpu (macOS-11, lite, 3.10, 1.13, pre)"
- - "lite-cpu (macOS-11, lite, 3.7, 1.9, oldest)"
- - "lite-cpu (ubuntu-20.04, lite, 3.8, 1.11)"
- - "lite-cpu (ubuntu-20.04, lite, 3.10, 1.12)"
- - "lite-cpu (ubuntu-20.04, lite, 3.7, 1.9, oldest)"
- - "lite-cpu (ubuntu-20.04, lite, 3.9, 1.13, pre)"
- - "lite-cpu (windows-2022, lite, 3.8, 1.9)"
- - "lite-cpu (windows-2022, lite, 3.9, 1.10)"
- - "lite-cpu (windows-2022, lite, 3.10, 1.11)"
- - "lite-cpu (windows-2022, lite, 3.10, 1.12)"
- - "lite-cpu (windows-2022, lite, 3.7, 1.9, oldest)"
- - "lite-cpu (windows-2022, lite, 3.8, 1.13, pre)"
- - "lite-cpu (macOS-11, lightning, 3.8, 1.12)"
- - "lite-cpu (ubuntu-20.04, lightning, 3.8, 1.12)"
- - "lite-cpu (windows-2022, lightning, 3.8, 1.12)"
- - "lightning-lite (GPUs)"
- # Lite also requires PL checks as it depends on Lite
- - "pl-cpu (macOS-11, pytorch, 3.8, 1.10)"
- - "pl-cpu (macOS-11, pytorch, 3.9, 1.11)"
- - "pl-cpu (macOS-11, pytorch, 3.10, 1.12)"
- - "pl-cpu (macOS-11, pytorch, 3.7, 1.9, oldest)"
- - "pl-cpu (macOS-11, pytorch, 3.10, 1.13, pre)"
- - "pl-cpu (ubuntu-20.04, pytorch, 3.8, 1.10)"
- - "pl-cpu (ubuntu-20.04, pytorch, 3.9, 1.11)"
- - "pl-cpu (ubuntu-20.04, pytorch, 3.10, 1.11)"
- - "pl-cpu (ubuntu-20.04, pytorch, 3.10, 1.12)"
- - "pl-cpu (ubuntu-20.04, pytorch, 3.7, 1.9, oldest)"
- - "pl-cpu (ubuntu-20.04, pytorch, 3.9, 1.13, pre)"
- - "pl-cpu (windows-2022, pytorch, 3.9, 1.11)"
- - "pl-cpu (windows-2022, pytorch, 3.10, 1.11)"
- - "pl-cpu (windows-2022, pytorch, 3.10, 1.12)"
- - "pl-cpu (windows-2022, pytorch, 3.7, 1.9, oldest)"
- - "pl-cpu (windows-2022, pytorch, 3.8, 1.13, pre)"
- - "pl-cpu (macOS-11, lightning, 3.10, 1.12)"
- #- "pl-cpu (macOS-11, lightning, 3.7, 1.9, oldest)"
- - "pl-cpu (ubuntu-20.04, lightning, 3.10, 1.12)"
- #- "pl-cpu (ubuntu-20.04, lightning, 3.7, 1.9, oldest)"
- - "pl-cpu (windows-2022, lightning, 3.10, 1.12)"
- #- "pl-cpu (windows-2022, lightning, 3.7, 1.9, oldest)"
- - "pytorch-lightning (GPUs)"
- - "pytorch-lightning (HPUs)"
- - "pytorch-lightning (IPUs)"
- - "pl-cpu (slow, macOS-11, pytorch, 3.7, 1.11)"
- - "pl-cpu (slow, ubuntu-20.04, pytorch, 3.7, 1.11)"
- - "pl-cpu (slow, windows-2022, pytorch, 3.7, 1.11)"
- #- "test-on-tpus"
-
- - id: "lightning_lite: Tests"
- paths:
- - "tests/tests_lite/**"
- checks:
- - "lite-cpu (macOS-11, lite, 3.9, 1.11)"
- - "lite-cpu (macOS-11, lite, 3.8, 1.10)"
- - "lite-cpu (macOS-11, lite, 3.10, 1.12)"
- - "lite-cpu (macOS-11, lite, 3.10, 1.13, pre)"
+ - "lite-cpu (macOS-11, lite, 3.8, 1.11)"
+ - "lite-cpu (macOS-11, lite, 3.9, 1.12)"
+ - "lite-cpu (macOS-11, lite, 3.10, 1.13)"
- "lite-cpu (macOS-11, lite, 3.7, 1.9, oldest)"
- - "lite-cpu (ubuntu-20.04, lite, 3.8, 1.11)"
+ - "lite-cpu (ubuntu-20.04, lite, 3.8, 1.10)"
+ - "lite-cpu (ubuntu-20.04, lite, 3.9, 1.11)"
- "lite-cpu (ubuntu-20.04, lite, 3.10, 1.12)"
+ - "lite-cpu (ubuntu-20.04, lite, 3.10, 1.13)"
- "lite-cpu (ubuntu-20.04, lite, 3.7, 1.9, oldest)"
- - "lite-cpu (ubuntu-20.04, lite, 3.9, 1.13, pre)"
- - "lite-cpu (windows-2022, lite, 3.8, 1.9)"
- - "lite-cpu (windows-2022, lite, 3.9, 1.10)"
- - "lite-cpu (windows-2022, lite, 3.10, 1.11)"
+ - "lite-cpu (windows-2022, lite, 3.9, 1.11)"
- "lite-cpu (windows-2022, lite, 3.10, 1.12)"
+ - "lite-cpu (windows-2022, lite, 3.10, 1.13)"
- "lite-cpu (windows-2022, lite, 3.7, 1.9, oldest)"
- - "lite-cpu (windows-2022, lite, 3.8, 1.13, pre)"
- - "lite-cpu (macOS-11, lightning, 3.8, 1.12)"
- - "lite-cpu (ubuntu-20.04, lightning, 3.8, 1.12)"
- - "lite-cpu (windows-2022, lightning, 3.8, 1.12)"
- - "lightning-lite (GPUs)"
+ - "lite-cpu (macOS-11, lightning, 3.8, 1.13)"
+ - "lite-cpu (ubuntu-20.04, lightning, 3.8, 1.13)"
+ - "lite-cpu (windows-2022, lightning, 3.8, 1.13)"
- id: "lightning_lite: Azure GPU"
paths:
- ".azure/gpu-tests-lite.yml"
- "tests/tests_lite/run_standalone_*.sh"
- "tests/tests_pytorch/run_standalone_tests.sh" # used by Lite through a symlink
+ - "requirements/lite/**"
+ - "src/lightning_lite/**"
+ - "tests/tests_lite/**"
+ - "setup.cfg" # includes pytest config
+ - ".actions/**"
+ - "!requirements/lite/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
checks:
- "lightning-lite (GPUs)"
# SECTION: lightning_app
- - id: "lightning_app"
+ - id: "lightning_app: Tests workflow"
paths:
- - "requirements/app/**"
+ - ".github/workflows/ci-app-tests.yml"
- "src/lightning_app/**"
- "tests/tests_app/**"
- "examples/app_*/**" # some tests_app tests call examples files
+ - "requirements/app/**"
- "setup.py"
- ".actions/**"
- checks:
- - "App.cloud-e2e"
- - "app-pytest (macOS-11, app, 3.8, latest)"
- - "app-pytest (macOS-11, app, 3.8, oldest)"
- - "app-pytest (macOS-11, lightning, 3.9, latest)"
- - "app-pytest (ubuntu-20.04, app, 3.8, latest)"
- - "app-pytest (ubuntu-20.04, app, 3.8, oldest)"
- - "app-pytest (ubuntu-20.04, lightning, 3.9, latest)"
- - "app-pytest (windows-2022, app, 3.8, latest)"
- - "app-pytest (windows-2022, app, 3.8, oldest)"
- - "app-pytest (windows-2022, lightning, 3.8, latest)"
-
- - id: "lightning_app: Tests workflow"
- paths:
- - ".github/workflows/ci-app-tests.yml"
+ - "!requirements/app/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
checks:
- "app-pytest (macOS-11, app, 3.8, latest)"
- "app-pytest (macOS-11, app, 3.8, oldest)"
@@ -280,26 +241,16 @@ subprojects:
- id: "lightning_app: Examples"
paths:
- - "requirements/app/**"
+ - ".github/workflows/ci-app-examples.yml"
- "src/lightning_app/**"
- "tests/tests_app_examples/**"
- - "examples/app_*/**"
+ - "examples/app_*"
+ - "requirements/app/**"
- "setup.py"
- ".actions/**"
- checks:
- - "app-examples (macOS-11, app, 3.9, latest)"
- - "app-examples (macOS-11, app, 3.9, oldest)"
- - "app-examples (macOS-11, lightning, 3.9, latest)"
- - "app-examples (ubuntu-20.04, app, 3.9, latest)"
- - "app-examples (ubuntu-20.04, app, 3.9, oldest)"
- - "app-examples (ubuntu-20.04, lightning, 3.9, latest)"
- - "app-examples (windows-2022, app, 3.9, latest)"
- - "app-examples (windows-2022, app, 3.9, oldest)"
- - "app-examples (windows-2022, lightning, 3.9, latest)"
-
- - id: "lightning_app: Examples workflow"
- paths:
- - ".github/workflows/ci-app-examples.yml"
+ - "!requirements/app/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
checks:
- "app-examples (macOS-11, app, 3.9, latest)"
- "app-examples (macOS-11, app, 3.9, oldest)"
@@ -314,6 +265,16 @@ subprojects:
- id: "lightning_app: Azure"
paths:
- ".azure/app-cloud-e2e.yml"
+ - "requirements/app/**"
+ - "src/lightning_app/**"
+ - "tests/tests_app/**"
+ - "examples/app_*/**" # some tests_app tests call examples files
+ - "tests/tests_app_examples/**"
+ - "setup.py"
+ - ".actions/**"
+ - "!requirements/app/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
checks:
- "App.cloud-e2e"
@@ -321,9 +282,14 @@ subprojects:
paths:
- "src/lightning_app/**"
- "docs/source-app/**"
- - ".github/workflows/docs-*.yml"
+ - ".github/workflows/docs-checks.yml"
- "requirements/docs.txt"
- "requirements/app/**"
+ - "setup.py"
+ - "setup.cfg" # includes metadata used in the package creation
+ - ".actions/**"
+ - "!*.md"
+ - "!**/*.md"
checks:
- "make-doctest (app)"
- "make-html (app)"
@@ -337,6 +303,9 @@ subprojects:
- "src/**"
- "pyproject.toml" # includes mypy config
- ".actions/**"
+ - "!requirements/**/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
checks:
- "mypy"
@@ -347,6 +316,9 @@ subprojects:
- "setup.py"
- "src/**"
- "requirements/**"
+ - "!requirements/**/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
checks:
- "install-pkg (ubuntu-22.04, app, 3.7)"
- "install-pkg (ubuntu-22.04, app, 3.10)"
diff --git a/.github/workflows/ci-app-examples.yml b/.github/workflows/ci-app-examples.yml
index 457a01d643aab..88eadcfd920f8 100644
--- a/.github/workflows/ci-app-examples.yml
+++ b/.github/workflows/ci-app-examples.yml
@@ -12,10 +12,12 @@ on:
- "src/lightning_app/**"
- "tests/tests_app_examples/**"
- "examples/app_*"
- - "requirements/app/*"
- - "!requirements/app/docs.txt"
+ - "requirements/app/**"
- "setup.py"
- ".actions/**"
+ - "!requirements/app/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
diff --git a/.github/workflows/ci-app-tests.yml b/.github/workflows/ci-app-tests.yml
index 4bcf1850eff72..826dfc70b552f 100644
--- a/.github/workflows/ci-app-tests.yml
+++ b/.github/workflows/ci-app-tests.yml
@@ -12,10 +12,12 @@ on:
- "src/lightning_app/**"
- "tests/tests_app/**"
- "examples/app_*" # some tests_app tests call examples files
- - "requirements/app/*"
- - "!requirements/app/docs.txt"
+ - "requirements/app/**"
- "setup.py"
- ".actions/**"
+ - "!requirements/app/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
diff --git a/.github/workflows/ci-lite-tests.yml b/.github/workflows/ci-lite-tests.yml
index cfd9c9a0b4fb7..c6b835bba4672 100644
--- a/.github/workflows/ci-lite-tests.yml
+++ b/.github/workflows/ci-lite-tests.yml
@@ -8,13 +8,15 @@ on:
branches: [master, "release/*"]
types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped
paths:
- - "requirements/lite/*"
- - "!requirements/lite/docs.txt"
+ - "requirements/lite/**"
- "src/lightning_lite/**"
- "tests/tests_lite/**"
- "setup.cfg" # includes pytest config
- ".github/workflows/ci-lite-tests.yml"
- ".actions/**"
+ - "!requirements/lite/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
@@ -36,29 +38,26 @@ jobs:
matrix:
include:
# assign python and pytorch version combinations to operating systems (arbitrarily)
- # note: there's no distribution of Torch==1.9 for Python>=3.9 or torch==1.10 for Python>=3.10
- - {os: "macOS-11", pkg-name: "lite", python-version: "3.9", pytorch-version: "1.11"}
- - {os: "macOS-11", pkg-name: "lite", python-version: "3.8", pytorch-version: "1.10"}
- - {os: "windows-2022", pkg-name: "lite", python-version: "3.10", pytorch-version: "1.11"}
- - {os: "windows-2022", pkg-name: "lite", python-version: "3.9", pytorch-version: "1.10"}
- - {os: "windows-2022", pkg-name: "lite", python-version: "3.8", pytorch-version: "1.9"}
- - {os: "ubuntu-20.04", pkg-name: "lite", python-version: "3.8", pytorch-version: "1.11"}
- # only run PyTorch latest with Python latest
- - {os: "macOS-11", pkg-name: "lite", python-version: "3.10", pytorch-version: "1.12"}
+ # note: there's no distribution of torch==1.10 for Python>=3.10
+ - {os: "macOS-11", pkg-name: "lite", python-version: "3.8", pytorch-version: "1.11"}
+ - {os: "macOS-11", pkg-name: "lite", python-version: "3.9", pytorch-version: "1.12"}
+ - {os: "ubuntu-20.04", pkg-name: "lite", python-version: "3.8", pytorch-version: "1.10"}
+ - {os: "ubuntu-20.04", pkg-name: "lite", python-version: "3.9", pytorch-version: "1.11"}
- {os: "ubuntu-20.04", pkg-name: "lite", python-version: "3.10", pytorch-version: "1.12"}
+ - {os: "windows-2022", pkg-name: "lite", python-version: "3.9", pytorch-version: "1.11"}
- {os: "windows-2022", pkg-name: "lite", python-version: "3.10", pytorch-version: "1.12"}
+ # only run PyTorch latest with Python latest
+ - {os: "macOS-11", pkg-name: "lite", python-version: "3.10", pytorch-version: "1.13"}
+ - {os: "ubuntu-20.04", pkg-name: "lite", python-version: "3.10", pytorch-version: "1.13"}
+ - {os: "windows-2022", pkg-name: "lite", python-version: "3.10", pytorch-version: "1.13"}
# "oldest" versions tests, only on minimum Python
- {os: "macOS-11", pkg-name: "lite", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"}
- {os: "ubuntu-20.04", pkg-name: "lite", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"}
- {os: "windows-2022", pkg-name: "lite", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"}
- # release-candidate tests, mixed Python versions
- - {os: "macOS-11", pkg-name: "lite", python-version: "3.10", pytorch-version: "1.13", release: "pre"}
- - {os: "ubuntu-20.04", pkg-name: "lite", python-version: "3.9", pytorch-version: "1.13", release: "pre"}
- - {os: "windows-2022", pkg-name: "lite", python-version: "3.8", pytorch-version: "1.13", release: "pre"}
# "lightning" installs the monolithic package
- - {os: "macOS-11", pkg-name: "lightning", python-version: "3.8", pytorch-version: "1.12"}
- - {os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.8", pytorch-version: "1.12"}
- - {os: "windows-2022", pkg-name: "lightning", python-version: "3.8", pytorch-version: "1.12"}
+ - {os: "macOS-11", pkg-name: "lightning", python-version: "3.8", pytorch-version: "1.13"}
+ - {os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.8", pytorch-version: "1.13"}
+ - {os: "windows-2022", pkg-name: "lightning", python-version: "3.8", pytorch-version: "1.13"}
timeout-minutes: 15
diff --git a/.github/workflows/ci-pkg-install.yml b/.github/workflows/ci-pkg-install.yml
index b1fad271779fc..e749b4916357f 100644
--- a/.github/workflows/ci-pkg-install.yml
+++ b/.github/workflows/ci-pkg-install.yml
@@ -13,7 +13,9 @@ on:
- "setup.py"
- "src/**"
- "requirements/**"
- - "!requirements/*/docs.txt"
+ - "!requirements/**/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
@@ -63,6 +65,8 @@ jobs:
run: python -m lightning --version
- name: DocTest package
+ env:
+ PY_IGNORE_IMPORTMISMATCH: 1
run: |
PKG_NAME=$(python -c "print({'app': 'lightning_app', 'lite': 'lightning_lite', 'pytorch': 'pytorch_lightning', 'lightning': 'lightning'}['${{matrix.pkg-name}}'])")
python -m pytest src/${PKG_NAME} --ignore-glob="**/cli/*-template/**"
diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml
index 4682dd5b3b5ac..42c2998d295ee 100644
--- a/.github/workflows/ci-pytorch-dockers.yml
+++ b/.github/workflows/ci-pytorch-dockers.yml
@@ -8,14 +8,15 @@ on:
types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped
paths:
- "dockers/**"
- - "!dockers/README.md"
- - "requirements.txt"
- - "requirements/**"
- - "!requirements/*/docs.txt"
+ - ".github/workflows/ci-pytorch-dockers.yml"
+ - "requirements/pytorch/**"
+ - "requirements/lite/**"
- "environment.yml"
- - ".github/workflows/*docker*.yml"
- "setup.py"
- ".actions/**"
+ - "!requirements/**/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
schedule:
- cron: "0 0 * * *" # at the end of every day
@@ -39,6 +40,7 @@ jobs:
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"}
+ - {python_version: "3.9", pytorch_version: "1.13", cuda_version: "11.6.1"}
steps:
- uses: actions/checkout@v3
- uses: docker/setup-buildx-action@v2
@@ -100,6 +102,7 @@ jobs:
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"}
+ - {python_version: "3.9", pytorch_version: "1.13", cuda_version: "11.6.1"}
steps:
- uses: actions/checkout@v3
- uses: docker/setup-buildx-action@v2
diff --git a/.github/workflows/ci-pytorch-tests.yml b/.github/workflows/ci-pytorch-tests.yml
index 91d0a73452e1c..e45e40bb2db89 100644
--- a/.github/workflows/ci-pytorch-tests.yml
+++ b/.github/workflows/ci-pytorch-tests.yml
@@ -8,16 +8,18 @@ on:
branches: [master, "release/*"]
types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped
paths:
- - "requirements/pytorch/*"
+ - "requirements/pytorch/**"
- "src/pytorch_lightning/**"
- "tests/tests_pytorch/**"
- "tests/legacy/back-compatible-versions.txt"
- "setup.cfg" # includes pytest config
- ".github/workflows/ci-pytorch-tests.yml"
- - "requirements/lite/*"
+ - "requirements/lite/**"
- "src/lightning_lite/**"
- ".actions/**"
- - "!requirements/*/docs.txt"
+ - "!requirements/**/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
@@ -40,39 +42,32 @@ jobs:
matrix:
include:
# assign python and pytorch version combinations to operating systems (arbitrarily)
- # note: there's no distribution of Torch==1.9 for Python>=3.9 or torch==1.10 for Python>=3.10
- - {os: "macOS-11", pkg-name: "pytorch", python-version: "3.9", pytorch-version: "1.11"}
- - {os: "macOS-11", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "1.10"}
+ # note: there's no distribution of torch==1.10 for Python>=3.10
+ - {os: "macOS-11", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "1.11"}
+ - {os: "macOS-11", pkg-name: "pytorch", python-version: "3.9", pytorch-version: "1.12"}
- {os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "1.10"}
- {os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.9", pytorch-version: "1.11"}
- - {os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "1.11"}
- - {os: "windows-2022", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "1.10"}
- - {os: "windows-2022", pkg-name: "pytorch", python-version: "3.9", pytorch-version: "1.11"}
- - {os: "windows-2022", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "1.11"}
- # only run PyTorch latest with Python latest
- - {os: "macOS-11", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "1.12"}
- - {os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.12"}
- {os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "1.12"}
- - {os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.12"}
+ - {os: "windows-2022", pkg-name: "pytorch", python-version: "3.9", pytorch-version: "1.11"}
- {os: "windows-2022", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "1.12"}
- - {os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.12"}
+ # only run PyTorch latest with Python latest
+ - {os: "macOS-11", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "1.13"}
+ - {os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "1.13"}
+ - {os: "windows-2022", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "1.13"}
# "oldest" versions tests, only on minimum Python
- - {os: "macOS-11", pkg-name: "pytorch", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"}
- - {os: "macOS-11", pkg-name: "lightning", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"}
+ - {os: "macOS-11", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "1.9", requires: "oldest"} # 3.7 hangs
- {os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"}
- - {os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"}
- {os: "windows-2022", pkg-name: "pytorch", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"}
- - {os: "windows-2022", pkg-name: "lightning", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"}
- # release-candidate tests, mixed Python versions
- - {os: "macOS-11", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "1.13", release: "pre"}
- - {os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.9", pytorch-version: "1.13", release: "pre"}
- - {os: "windows-2022", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "1.13", release: "pre"}
# run test under SLOW label
- {type: "slow", os: "macOS-11", pkg-name: "pytorch", python-version: "3.7", pytorch-version: "1.11"}
- {type: "slow", os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.7", pytorch-version: "1.11"}
- {type: "slow", os: "windows-2022", pkg-name: "pytorch", python-version: "3.7", pytorch-version: "1.11"}
+ # "lightning" installs the monolithic package
+ - {os: "macOS-11", pkg-name: "lightning", python-version: "3.8", pytorch-version: "1.13"}
+ - {os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.8", pytorch-version: "1.13"}
+ - {os: "windows-2022", pkg-name: "lightning", python-version: "3.8", pytorch-version: "1.13"}
- timeout-minutes: 60
+    timeout-minutes: 70  # tests with macOS-11, py3.7 oldest take much longer than expected
steps:
- uses: actions/checkout@v3
diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml
index 40f14287c0656..2688b175063ed 100644
--- a/.github/workflows/code-checks.yml
+++ b/.github/workflows/code-checks.yml
@@ -11,6 +11,9 @@ on:
- "src/**"
- "pyproject.toml" # includes mypy config
- ".actions/**"
+ - "!requirements/**/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml
index 4169bc33e4632..1584fc5d9aa4e 100644
--- a/.github/workflows/docs-checks.yml
+++ b/.github/workflows/docs-checks.yml
@@ -14,8 +14,10 @@ on:
- "docs/**"
- "src/**"
- "setup.py"
- - "setup.cfg"
+ - "setup.cfg" # includes metadata used in the package creation
- ".github/workflows/docs-checks.yml"
+ - "!*.md"
+ - "!**/*.md"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
diff --git a/.github/workflows/docs-deploy.yml b/.github/workflows/docs-deploy.yml
index 38c5db8a8946c..415802e0bb476 100644
--- a/.github/workflows/docs-deploy.yml
+++ b/.github/workflows/docs-deploy.yml
@@ -6,6 +6,7 @@ on:
branches: ["master", "release/*"]
paths:
- ".github/workflows/docs-deploy.yml"
+      # TODO: this workflow is just for debugging; extend the paths that should trigger it
env:
FREEZE_REQUIREMENTS: 1
@@ -17,6 +18,7 @@ defaults:
jobs:
# https://github.com/marketplace/actions/deploy-to-github-pages
build-docs-deploy:
+ if: github.repository_owner == 'Lightning-AI'
runs-on: ubuntu-20.04
steps:
- name: Checkout 🛎️
@@ -75,12 +77,12 @@ jobs:
- id: 'auth'
name: 'Authenticate to Google Cloud'
- uses: google-github-actions/auth@v0
+ uses: google-github-actions/auth@v1
with:
credentials_json: ${{ secrets.GCS_SA_KEY }}
- name: Setup gcloud
- uses: google-github-actions/setup-gcloud@v0
+ uses: google-github-actions/setup-gcloud@v1
with:
project_id: ${{ secrets.GCS_PROJECT }}
diff --git a/.github/workflows/probot-check-group.yml b/.github/workflows/probot-check-group.yml
index 35b267819a0cc..15965ca7eba47 100644
--- a/.github/workflows/probot-check-group.yml
+++ b/.github/workflows/probot-check-group.yml
@@ -14,7 +14,7 @@ jobs:
if: github.event.pull_request.draft == false
timeout-minutes: 61 # in case something is wrong with the internal timeout
steps:
- - uses: Lightning-AI/probot@v2
+ - uses: Lightning-AI/probot@v4
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml
index eccbf608491ae..33102fd3e6705 100644
--- a/.github/workflows/release-docker.yml
+++ b/.github/workflows/release-docker.yml
@@ -19,6 +19,7 @@ jobs:
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"}
+ - {python_version: "3.9", pytorch_version: "1.13", cuda_version: "11.6.1"}
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -47,7 +48,7 @@ jobs:
- name: Publish Latest to Docker
uses: docker/build-push-action@v3
# Only latest Python and PyTorch
- if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.12'
+ if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.13'
with:
repository: pytorchlightning/pytorch_lightning
username: ${{ secrets.DOCKER_USERNAME }}
diff --git a/.github/workflows/tpu-tests.yml b/.github/workflows/tpu-tests.yml
index a7ffe3e10afe0..8f21eabee0ccf 100644
--- a/.github/workflows/tpu-tests.yml
+++ b/.github/workflows/tpu-tests.yml
@@ -8,12 +8,18 @@ on:
types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped
paths:
- ".github/workflows/tpu-tests.yml"
- - "requirements/pytorch/*"
- - "!requirements/pytorch/docs.txt"
+ - "dockers/base-xla/*"
+ - "requirements/lite/**"
+ - "src/lightning_lite/**"
+ - "tests/tests_lite/**"
+ - "requirements/pytorch/**"
- "src/pytorch_lightning/**"
- "tests/tests_pytorch/**"
- "setup.cfg" # includes pytest config
- ".actions/**"
+ - "!requirements/**/docs.txt"
+ - "!*.md"
+ - "!**/*.md"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
@@ -31,7 +37,7 @@ jobs:
if: github.event.pull_request.draft == false
env:
PYTHON_VER: 3.7
- timeout-minutes: 100 # should match the timeout in `tpu_test_cases.jsonnet`
+ timeout-minutes: 100 # should match the timeout in `tpu_workflow.jsonnet`
steps:
- uses: actions/checkout@v3
@@ -62,12 +68,12 @@ jobs:
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA: ${{ github.event.pull_request.head.sha }}
run: |
- python -c "fname = 'dockers/tpu-tests/tpu_test_cases.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER')
+ python -c "fname = 'dockers/base-xla/tpu_workflow.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER')
data = data.replace('{PYTHON_VERSION}', '$PYTHON_VER').replace('{PR_NUMBER}', '$PR_NUMBER').replace('{SHA}', '$SHA') ; open(fname, 'w').write(data)"
- cat dockers/tpu-tests/tpu_test_cases.jsonnet
+ cat dockers/base-xla/tpu_workflow.jsonnet
shell: bash
- - uses: google-github-actions/auth@v0
+ - uses: google-github-actions/auth@v1
with:
credentials_json: ${{ secrets.GKE_SA_KEY_BASE64 }}
@@ -80,7 +86,7 @@ jobs:
- name: Deploy cluster
run: |
export PATH=$PATH:$HOME/go/bin
- job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet | kubectl create -f -)
+ job_name=$(jsonnet -J ml-testing-accelerators/ dockers/base-xla/tpu_workflow.jsonnet | kubectl create -f -)
job_name=${job_name#job.batch/}
job_name=${job_name% created}
pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}')
diff --git a/README.md b/README.md
index 66f1f28be4275..28e588a52145c 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,7 @@ ______________________________________________________________________
Docs •
Examples •
Community •
+ Contribute •
License
@@ -406,7 +407,7 @@ The lightning community is maintained by
- [10+ core contributors](https://pytorch-lightning.readthedocs.io/en/latest/governance.html) who are all a mix of professional engineers, Research Scientists, and Ph.D. students from top AI labs.
- 590+ active community contributors.
-Want to help us build Lightning and reduce boilerplate for thousands of researchers? [Learn how to make your first contribution here](https://devblog.pytorchlightning.ai/quick-contribution-guide-86d977171b3a)
+Want to help us build Lightning and reduce boilerplate for thousands of researchers? [Learn how to make your first contribution here](https://pytorch-lightning.readthedocs.io/en/stable/generated/CONTRIBUTING.html)
Lightning is also part of the [PyTorch ecosystem](https://pytorch.org/ecosystem/) which requires projects to have solid testing, documentation and support.
diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index 9a2e0455ff40f..3aea1ca0a43b6 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -13,12 +13,13 @@
# limitations under the License.
ARG UBUNTU_VERSION=20.04
-ARG CUDA_VERSION=11.3.1
+ARG CUDA_VERSION=11.6.1
+
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
ARG PYTHON_VERSION=3.9
-ARG PYTORCH_VERSION=1.12
+ARG PYTORCH_VERSION=1.13
SHELL ["/bin/bash", "-c"]
# https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/
@@ -35,7 +36,12 @@ ENV \
RUN \
# TODO: Remove the manual key installation once the base image is updated.
# https://github.com/NVIDIA/nvidia-docker/issues/1631
- apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
+ # https://github.com/NVIDIA/nvidia-docker/issues/1631#issuecomment-1264715214
+ apt-get update && apt-get install -y wget && \
+ wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
+ mkdir -p /etc/apt/keyrings/ && mv 3bf863cc.pub /etc/apt/keyrings/ && \
+ echo "deb [signed-by=/etc/apt/keyrings/3bf863cc.pub] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" /etc/apt/sources.list.d/cuda.list && \
+ apt-get update && \
apt-get update -qq --fix-missing && \
NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \
CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \
@@ -132,16 +138,20 @@ RUN \
RUN \
# install Bagua
- CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \
- CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [116,113,111,102] if $CUDA_VERSION_MM >= ver][0])") && \
- pip install "bagua-cuda$CUDA_VERSION_BAGUA" && \
- if [[ "$CUDA_VERSION_MM" = "$CUDA_VERSION_BAGUA" ]]; then python -c "import bagua_core; bagua_core.install_deps()"; fi && \
- python -c "import bagua; print(bagua.__version__)"
+ if [[ $PYTORCH_VERSION != "1.13" ]]; then \
+ CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") ; \
+ CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [116,113,111,102] if $CUDA_VERSION_MM >= ver][0])") ; \
+ pip install "bagua-cuda$CUDA_VERSION_BAGUA" ; \
+ if [[ "$CUDA_VERSION_MM" = "$CUDA_VERSION_BAGUA" ]]; then \
+ python -c "import bagua_core; bagua_core.install_deps()"; \
+ fi ; \
+ python -c "import bagua; print(bagua.__version__)"; \
+ fi
RUN \
# install ColossalAI
- SHOULD_INSTALL_COLOSSAL=$(python -c "import torch; print(1 if int(torch.__version__.split('.')[1]) > 9 else 0)") && \
- if [[ "$SHOULD_INSTALL_COLOSSAL" = "1" ]]; then \
+ # TODO: 1.13 wheels are not released, remove skip once they are
+ if [[ $PYTORCH_VERSION != "1.13" ]]; then \
PYTORCH_VERSION_COLOSSALAI=$(python -c "import torch; print(torch.__version__.split('+')[0][:4])") ; \
CUDA_VERSION_MM_COLOSSALAI=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda)))") ; \
CUDA_VERSION_COLOSSALAI=$(python -c "print([ver for ver in [11.3, 11.1] if $CUDA_VERSION_MM_COLOSSALAI >= ver][0])") ; \
@@ -152,11 +162,8 @@ RUN \
RUN \
# install rest of strategies
# remove colossalai from requirements since they are installed separately
- SHOULD_INSTALL_COLOSSAL=$(python -c "import torch; print(1 if int(torch.__version__.split('.')[1]) > 9 else 0)") && \
- if [[ "$SHOULD_INSTALL_COLOSSAL" = "0" ]]; then \
- python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'colossalai' not in line] ; open(fname, 'w').writelines(lines)" ; \
- fi && \
- echo "$SHOULD_INSTALL_COLOSSAL" && \
+ python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'colossalai' not in line] ; open(fname, 'w').writelines(lines)" ; \
+ python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" ; \
cat requirements/pytorch/strategies.txt && \
pip install -r requirements/pytorch/devel.txt -r requirements/pytorch/strategies.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
@@ -170,5 +177,4 @@ RUN \
python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \
python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \
python requirements/pytorch/check-avail-extras.py && \
- python requirements/pytorch/check-avail-strategies.py && \
rm -rf requirements/
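
The two inlined `python -c` filters at the end of the strategies layer do the same job as `requirements_prune_pkgs` in `.actions/assistant.py`. Spelled out as a function (illustrative):

```python
def prune_requirement(fname: str, package: str) -> None:
    # drop every requirement line that mentions the package
    lines = [line for line in open(fname).readlines() if package not in line]
    open(fname, "w").writelines(lines)


prune_requirement("requirements/pytorch/strategies.txt", "colossalai")
prune_requirement("requirements/pytorch/strategies.txt", "horovod")
```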
diff --git a/dockers/tpu-tests/tpu_test_cases.jsonnet b/dockers/base-xla/tpu_workflow.jsonnet
similarity index 100%
rename from dockers/tpu-tests/tpu_test_cases.jsonnet
rename to dockers/base-xla/tpu_workflow.jsonnet
diff --git a/dockers/tpu-tests/Dockerfile b/dockers/tpu-tests/Dockerfile
deleted file mode 100644
index e23db55bb28e9..0000000000000
--- a/dockers/tpu-tests/Dockerfile
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright The PyTorch Lightning team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-ARG PYTHON_VERSION=3.9
-ARG PYTORCH_VERSION=1.9
-
-FROM pytorchlightning/pytorch_lightning:base-xla-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}
-
-LABEL maintainer="Lightning-AI "
-
-COPY ./ ./lightning/
-
-# Pull the legacy checkpoints
-RUN cd lightning && \
- bash .actions/pull_legacy_checkpoints.sh
-
-RUN \
- pip install -q fire && \
- # drop unnecessary packages
- pip install -r lightning/requirements/pytorch/devel.txt --no-cache-dir
-
-COPY ./dockers/tpu-tests/docker-entrypoint.sh /usr/local/bin/
-RUN chmod +x /usr/local/bin/docker-entrypoint.sh
-
-ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
-CMD ["bash"]
diff --git a/dockers/tpu-tests/docker-entrypoint.sh b/dockers/tpu-tests/docker-entrypoint.sh
deleted file mode 100644
index 57abc703c8ace..0000000000000
--- a/dockers/tpu-tests/docker-entrypoint.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash
-# source ~/.bashrc
-echo "running docker-entrypoint.sh"
-# conda activate container
-echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS
-echo "printed TPU info"
-export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"
-exec "$@"
diff --git a/docs/source-app/levels/basic/build_a_lightning_component.rst b/docs/source-app/levels/basic/build_a_lightning_component.rst
index 39522614fe03b..0d44f44442888 100644
--- a/docs/source-app/levels/basic/build_a_lightning_component.rst
+++ b/docs/source-app/levels/basic/build_a_lightning_component.rst
@@ -132,7 +132,7 @@ powerful Lightning app. Here are a few key features available to super-charge yo
:titles: 15+ accelerators; Auto-stop idle machines; Auto-timeout submitted work; Use spot machines (~70% discount); Work with massive datasets; Mount cloud storage; Use a custom container
:code_files: ./key_features/accelerators.py; ./key_features/idle_machine.py; ./key_features/auto_timeout.py; ./key_features/spot.py; ./key_features/massive_dataset.py; ./key_features/mount_data.py; ./key_features/custom_container.py;
:highlights: 11;11;11;11;11;2,7,10, 11; 11
- :app_id: abc123
+ :enable_run: true
:tab_rows: 3
:height: 430px
diff --git a/docs/source-app/levels/basic/hello_components/pl_multinode.py b/docs/source-app/levels/basic/hello_components/pl_multinode.py
index 0ba033e0d86c0..5feed8a8864c3 100644
--- a/docs/source-app/levels/basic/hello_components/pl_multinode.py
+++ b/docs/source-app/levels/basic/hello_components/pl_multinode.py
@@ -1,6 +1,6 @@
# app.py
import lightning as L
-from lightning.app.components import PyTorchLightningMultiNode
+from lightning.app.components import LightningTrainerMultiNode
from lightning.pytorch.demos.boring_classes import BoringModel
@@ -12,9 +12,9 @@ def run():
trainer.fit(model)
-# 8 GPU: (2 nodes of 4 x v100)
+# 16 GPUs: (4 nodes of 4 x v100)
-component = PyTorchLightningMultiNode(
+component = LightningTrainerMultiNode(
LightningTrainerDistributed,
- num_nodes=2,
+ num_nodes=4,
cloud_compute=L.CloudCompute("gpu-fast-multi"), # 4 x v100
)
app = L.LightningApp(component)
diff --git a/docs/source-app/levels/basic/hello_components/run_ptl_script.py b/docs/source-app/levels/basic/hello_components/run_ptl_script.py
index 84a86ec00d470..e9bcb16c92f6a 100644
--- a/docs/source-app/levels/basic/hello_components/run_ptl_script.py
+++ b/docs/source-app/levels/basic/hello_components/run_ptl_script.py
@@ -1,5 +1,5 @@
# app.py
-# !curl https://bit.ly/demoLightningScriptpy -o pl_boring_script.py
+# !curl https://raw.githubusercontent.com/Lightning-AI/lightning/master/examples/app_multi_node/pl_boring_script.py -o pl_boring_script.py
import lightning as L
from lightning.app.components.training import LightningTrainerScript
diff --git a/docs/source-app/levels/basic/hello_components/xgboost.py b/docs/source-app/levels/basic/hello_components/xgboost.py
index 0cedda2aa45b9..fae593a206790 100644
--- a/docs/source-app/levels/basic/hello_components/xgboost.py
+++ b/docs/source-app/levels/basic/hello_components/xgboost.py
@@ -1,5 +1,5 @@
# app.py
-# !pip install sklearn xgboost
+# !pip install scikit-learn xgboost
import lightning as L
from sklearn import datasets
from sklearn.model_selection import train_test_split
diff --git a/docs/source-app/levels/basic/hero_components.rst b/docs/source-app/levels/basic/hero_components.rst
index 6bb8947a1a9cd..81fecc9461403 100644
--- a/docs/source-app/levels/basic/hero_components.rst
+++ b/docs/source-app/levels/basic/hero_components.rst
@@ -1,7 +1,7 @@
.. lit_tabs::
:titles: Hello world; Hello GPU world; PyTorch & ⚡⚡⚡ Trainer (1+ cloud GPUs); Train PyTorch (cloud GPU); Train PyTorch (32 cloud GPUs); Deploy a model on cloud GPUs; Run a model script; XGBoost; Streamlit demo
:code_files: /levels/basic/hello_components/hello_world.py; /levels/basic/hello_components/hello_world_gpu.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/train_pytorch.py; /levels/basic/hello_components/pt_multinode.py; /levels/basic/hello_components/deploy_model.py; /levels/basic/hello_components/run_ptl_script.py; /levels/basic/hello_components/xgboost.py; /levels/basic/hello_components/streamlit_demo.py
- :highlights: 7; 10, 11; 10-12, 17, 18; 4, 8, 12, 18-19, 26; 5, 10, 22, 28, 32, 42, 58-60; 3, 11-13, 25, 30; 7, 10; 15, 21; 9, 15, 24
- :app_id: abc123
+ :highlights: 7; 10, 11; 10-12, 17, 18; 4, 8, 12, 18-19, 26; 5, 10, 22, 28, 32, 42, 58-60; 3, 11-12, 25, 29; 7, 10; 15, 21; 9, 15, 24
+ :enable_run: true
:tab_rows: 3
:height: 620px
diff --git a/docs/source-app/levels/basic/real_lightning_component_implementations.rst b/docs/source-app/levels/basic/real_lightning_component_implementations.rst
index c660c1679ac72..da413f459234a 100644
--- a/docs/source-app/levels/basic/real_lightning_component_implementations.rst
+++ b/docs/source-app/levels/basic/real_lightning_component_implementations.rst
@@ -27,7 +27,7 @@ or cloud GPUs without code changes.
:descriptions: import Lightning; We're using a demo LightningModule; Move your training code here (usually your main.py); Pass your component to the multi-node executor (it works on CPU or single GPUs also); Select the number of machines (nodes). Here we choose 2.; Choose from over 15+ machine types. This one has 4 v100 GPUs.; Initialize the App object that executes the component logic.
:code_files: /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py;
:highlights: 2; 4; 10-12; 15-18; 17; 18; 20
- :app_id: abc123
+ :enable_run: true
:tab_rows: 5
:height: 420px
@@ -48,7 +48,7 @@ This example shows how to deploy PyTorch and create an API
:descriptions: Shortcut to list dependencies without a requirements.txt file.; Import one of our serving components (high-performance ones are available on the enterprise tiers); Define the setup function to load your favorite pretrained models and do any kind of pre-processing.; Define the predict function which is called when the endpoint is hit.; Initialize the server and define the type of cloud machine to use.
:code_files: /levels/basic/hello_components/deploy_model.py; /levels/basic/hello_components/deploy_model.py; /levels/basic/hello_components/deploy_model.py; /levels/basic/hello_components/deploy_model.py; /levels/basic/hello_components/deploy_model.py;
:highlights: 1; 3; 10-12; 15-25; 28-30
- :app_id: abc123
+ :enable_run: true
:tab_rows: 4
:height: 620px
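
The deploy tab above describes the serving pattern: import a serving component, load the model once in a setup hook, and answer requests in a predict hook. A minimal sketch of that pattern, assuming the PythonServer component (whose module is also added to the mypy list in pyproject.toml below) accepts pydantic models for its input and output types; the class and field names are illustrative:

    import lightning as L
    from pydantic import BaseModel
    from lightning.app.components.serve import PythonServer


    class TextIn(BaseModel):
        text: str


    class TextOut(BaseModel):
        prediction: str


    class DemoServer(PythonServer):
        def setup(self):
            # load a pretrained model once, before any request is served
            self._model = lambda text: text[::-1]  # stand-in for a real model

        def predict(self, request: TextIn) -> TextOut:
            # called on every request to the endpoint
            return TextOut(prediction=self._model(request.text))


    app = L.LightningApp(DemoServer(input_type=TextIn, output_type=TextOut))
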
diff --git a/docs/source-app/levels/basic/save_money_on_cloud_costs.rst b/docs/source-app/levels/basic/save_money_on_cloud_costs.rst
index 2218dc382fbee..5e752954da644 100644
--- a/docs/source-app/levels/basic/save_money_on_cloud_costs.rst
+++ b/docs/source-app/levels/basic/save_money_on_cloud_costs.rst
@@ -18,7 +18,7 @@ Here are a few features that will enable you save a lot on your cloud costs:
:titles: 15+ accelerators; Auto-stop idle machines; Auto-timeout submitted work; Use spot machines (~70% discount); Work with massive datasets; Mount cloud storage; Use a custom container
:code_files: ./key_features/accelerators.py; ./key_features/idle_machine.py; ./key_features/auto_timeout.py; ./key_features/spot.py; ./key_features/massive_dataset.py; ./key_features/mount_data.py; ./key_features/custom_container.py;
:highlights: 11;11;11;11;11;1,7, 10, 11; 11
- :app_id: abc123
+ :enable_run: true
:tab_rows: 3
:height: 430px
diff --git a/docs/source-app/levels/intermediate/connect_lightning_components.rst b/docs/source-app/levels/intermediate/connect_lightning_components.rst
index 9e9a2f0667842..14c2e9d793ba2 100644
--- a/docs/source-app/levels/intermediate/connect_lightning_components.rst
+++ b/docs/source-app/levels/intermediate/connect_lightning_components.rst
@@ -37,7 +37,7 @@ on a separate CPU machine. We save money by stopping the GPU machine when the wo
:descriptions: First, import Lightning; This component trains a model on a GPU machine; This component analyzes a model on a CPU machine; Define the LightningFlow that orchestrates components; Component 1 will run on a CPU machine; Component 2 will run on an accelerated GPU machine; Describe the workflow in the run method; Training runs first and completes; Analyze runs after training completes; This allows the app to be runnable
:code_files: ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py
:highlights: 2; 5-7; 9-11; 13; 16; 17; 19; 20; 21; 23
- :app_id: abc123
+ :enable_run: true
:tab_rows: 4
:height: 460px
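
The tab descriptions above walk through a flow that orchestrates two works on different machines. A minimal sketch of that shape, with placeholder component bodies:

    import lightning as L


    class TrainComponent(L.LightningWork):
        def run(self):
            print("training a model on this machine")


    class AnalyzeComponent(L.LightningWork):
        def run(self):
            print("analyzing the trained model")


    class WorkflowOrchestrator(L.LightningFlow):
        def __init__(self):
            super().__init__()
            self.train = TrainComponent(cloud_compute=L.CloudCompute("gpu"))
            self.analyze = AnalyzeComponent(cloud_compute=L.CloudCompute("cpu"))

        def run(self):
            # training runs first and completes, then analyze runs
            self.train.run()
            self.analyze.run()


    app = L.LightningApp(WorkflowOrchestrator())
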
diff --git a/docs/source-app/levels/intermediate/debug_a_lightning_app.rst b/docs/source-app/levels/intermediate/debug_a_lightning_app.rst
index ae0e0496991ef..856be5a182c58 100644
--- a/docs/source-app/levels/intermediate/debug_a_lightning_app.rst
+++ b/docs/source-app/levels/intermediate/debug_a_lightning_app.rst
@@ -16,7 +16,7 @@ To enable a breakpoint, use `L.pdb.set_trace()` (note direct python pdb support
:descriptions: Toy app; Add a breakpoint. When the program runs, it will stop at this line.
:code_files: ./debug_app_scripts/toy_app_1_component.py; ./debug_app_scripts/toy_app_1_component_pdb.py
:highlights: ; 7
- :app_id: abc123
+ :enable_run: true
:tab_rows: 3
:height: 350px
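
A minimal sketch of the breakpoint usage described above; the toy component body is illustrative:

    import lightning as L


    class Component(L.LightningWork):
        def run(self, x):
            print(x)
            L.pdb.set_trace()  # execution stops here when the app runs


    class WorkflowOrchestrator(L.LightningFlow):
        def __init__(self):
            super().__init__()
            self.component = Component()

        def run(self):
            self.component.run("i love Lightning")


    app = L.LightningApp(WorkflowOrchestrator())
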
diff --git a/docs/source-app/levels/intermediate/share_variables_between_lightning_components.rst b/docs/source-app/levels/intermediate/share_variables_between_lightning_components.rst
index c4b8b168204d3..b0ce06dae2a41 100644
--- a/docs/source-app/levels/intermediate/share_variables_between_lightning_components.rst
+++ b/docs/source-app/levels/intermediate/share_variables_between_lightning_components.rst
@@ -18,7 +18,7 @@ Lightning sends the variables across the machines for you automatically.
:descriptions: Remember this component may live on its own machine; The flow may be on a separate machine as well; This variable is on the flow machine; When passed to the work component, it is actually sent across the network under the hood.; When it prints here, it prints on the work component machine (not the flow machine); The second string was directly created on machine 1
:code_files: ./scripts/comms_1.py; ./scripts/comms_1.py; ./scripts/comms_1.py; ./scripts/comms_1.py; ./scripts/comms_1.py; ./scripts/comms_1.py
:highlights: 4-7; 9-16; 15; 16; 6; 7;
- :app_id: abc123
+ :enable_run: true
:tab_rows: 3
:height: 380px
@@ -55,7 +55,7 @@ Example Continuous deployment: Every time a model saves a checkpoint, we redeplo
:descriptions: Define a component that simulates training; Define a component that simulates deployment; Training will happen in parallel over a long period; The deployment server also runs in parallel forever; Start training in parallel (could take months); Whenever the model has a checkpoint deploy; When the checkpoint is updated, model re-deploys
:code_files: ./scripts/two_work_comms.py; ./scripts/two_work_comms.py; ./scripts/two_work_comms.py; ./scripts/two_work_comms.py; ./scripts/two_work_comms.py; ./scripts/two_work_comms.py; ./scripts/two_work_comms.py
:highlights: 5-18; 20-22; 27; 28; 31; 32, 33; 33
- :app_id: abc123
+ :enable_run: true
:tab_rows: 3
:height: 690px
@@ -110,7 +110,7 @@ transfering them across components.
:descriptions: Let's define a component to simulate generating embeddings (from a DB, feature store, etc...); This component simulates a server that will use the embeddings.; Run the component to generate the embeddings; Simulate embeddings as an array. Here you would query a DB, load from a feature store or disk or even use a neural network to extract the embedding.; Allow the embeddings to be transferred efficiently by wrapping them in the Payload object.; Pass the variable to the EmbeddingServer (just the pointer).; The data gets transferred once you use the .value attribute in the other component.
:code_files: ./scripts/toy_payload.py; ./scripts/toy_payload.py; ./scripts/toy_payload.py; ./scripts/toy_payload.py; ./scripts/toy_payload.py; ./scripts/toy_payload.py; ./scripts/toy_payload.py;
:highlights: 5-13; 15-19; 28; 12; 13; 29; 18
- :app_id: abc123
+ :enable_run: true
:tab_rows: 3
:height: 600px
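
The last tab above explains the Payload pattern: wrap a large object so only a reference moves between components until its .value is accessed. A minimal sketch, assuming Payload is importable from lightning.app.storage; the embedding values are illustrative:

    import lightning as L
    from lightning.app.storage import Payload


    class EmbeddingProcessor(L.LightningWork):
        def __init__(self):
            super().__init__()
            self.embeddings = None

        def run(self):
            # here you would query a DB, a feature store, or run a model
            fake_embeddings = [[1.0, 2.0], [3.0, 4.0]]
            # wrap the array so only a pointer is shared for now
            self.embeddings = Payload(fake_embeddings)


    class EmbeddingServer(L.LightningWork):
        def run(self, payload):
            # the data is transferred when .value is accessed here
            print(payload.value)


    class Flow(L.LightningFlow):
        def __init__(self):
            super().__init__()
            self.processor = EmbeddingProcessor()
            self.server = EmbeddingServer()

        def run(self):
            self.processor.run()
            self.server.run(self.processor.embeddings)


    app = L.LightningApp(Flow())
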
diff --git a/docs/source-app/workflows/add_web_ui/index_content.rst b/docs/source-app/workflows/add_web_ui/index_content.rst
index 4e95e6c2a70c2..f3d516c5af546 100644
--- a/docs/source-app/workflows/add_web_ui/index_content.rst
+++ b/docs/source-app/workflows/add_web_ui/index_content.rst
@@ -13,7 +13,7 @@ Web UIs for non Javascript Developers
:header: Dash
:description: Learn how to add a web UI built in Python with Dash.
:col_css: col-md-4
- :button_link: /workflows/add_web_ui/dash/index.html
+ :button_link: ../../workflows/add_web_ui/dash/index.html
:height: 150
:tag: basic
@@ -21,7 +21,7 @@ Web UIs for non Javascript Developers
:header: Gradio
:description: Learn how to add a web UI built in Python with Gradio.
:col_css: col-md-4
- :button_link: /workflows/add_web_ui/gradio/index.html
+ :button_link: ../../workflows/add_web_ui/gradio/index.html
:height: 150
:tag: basic
@@ -29,7 +29,7 @@ Web UIs for non Javascript Developers
:header: Panel
:description: Learn how to add a web UI built in Python with Panel.
:col_css: col-md-4
- :button_link: /workflows/add_web_ui/panel/index.html
+ :button_link: ../../workflows/add_web_ui/panel/index.html
:height: 150
:tag: basic
@@ -37,7 +37,7 @@ Web UIs for non Javascript Developers
:header: Jupyter Notebook
:description: Learn how to enable a web UI that is a Jupyter Notebook.
:col_css: col-md-4
- :button_link: /workflows/add_web_ui/jupyter_basic.html
+ :button_link: ../../workflows/add_web_ui/jupyter_basic.html
:height: 150
:tag: [docs coming soon]
@@ -45,7 +45,7 @@ Web UIs for non Javascript Developers
:header: Streamlit
:description: Learn how to add a web UI built in Python with Streamlit.
:col_css: col-md-4
- :button_link: /workflows/add_web_ui/streamlit/index.html
+ :button_link: ../../workflows/add_web_ui/streamlit/index.html
:height: 150
:tag: basic
@@ -53,7 +53,7 @@ Web UIs for non Javascript Developers
:header: JustPy
:description: Learn how to add a web UI built in Python with JustPy.
:col_css: col-md-4
- :button_link: /workflows/add_web_ui/justpy/index.html
+ :button_link: ../../workflows/add_web_ui/justpy/index.html
:height: 150
:tag: basic
@@ -79,7 +79,7 @@ Web UIs for Javascript Developers
:header: Any javascript framework
:description: Learn how to link up any javascript framework to a Lightning app.
:col_css: col-md-4
- :button_link: /workflows/add_web_ui/integrate_any_javascript_framework.html
+ :button_link: ../../workflows/add_web_ui/integrate_any_javascript_framework.html
:height: 150
:tag: advanced
@@ -87,7 +87,7 @@ Web UIs for Javascript Developers
:header: Angular.js
:description: Learn how to add a web UI built in Javascript with Angular.js
:col_css: col-md-4
- :button_link: /workflows/add_web_ui/angular_js_intermediate.html
+ :button_link: ../../workflows/add_web_ui/angular_js_intermediate.html
:height: 150
:tag: [Docs coming soon]
@@ -95,7 +95,7 @@ Web UIs for Javascript Developers
:header: HTML
:description: Learn how to add a web UI built with html.
:col_css: col-md-4
- :button_link: /workflows/add_web_ui/html/index.html
+ :button_link: ../../workflows/add_web_ui/html/index.html
:height: 150
:tag: basic
@@ -103,7 +103,7 @@ Web UIs for Javascript Developers
:header: React.js
:description: Learn how to add a web UI built in Javascript with React.js
:col_css: col-md-4
- :button_link: /workflows/add_web_ui/react/index.html
+ :button_link: ../../workflows/add_web_ui/react/index.html
:height: 150
:tag: intermediate
@@ -111,7 +111,7 @@ Web UIs for Javascript Developers
:header: Vue.js
:description: Learn how to add a web UI built in Javascript with Vue.js
:col_css: col-md-4
- :button_link: /workflows/add_web_ui/vue_js_intermediate.html
+ :button_link: ../../workflows/add_web_ui/vue_js_intermediate.html
:height: 150
:tag: [Docs coming soon]
diff --git a/docs/source-app/workflows/run_work_in_parallel_content.rst b/docs/source-app/workflows/run_work_in_parallel_content.rst
index 467f64f165043..1c8d5b374dbb2 100644
--- a/docs/source-app/workflows/run_work_in_parallel_content.rst
+++ b/docs/source-app/workflows/run_work_in_parallel_content.rst
@@ -20,7 +20,7 @@ to wait for the first one to finish.
:descriptions: No parallel components; Allow the train component to run in parallel; When the component runs, it will run in parallel; The next component is unblocked and can now immediately run.
:code_files: /workflows/scripts/parallel/toy_app.py; /workflows/scripts/parallel/toy_parallel.py; /workflows/scripts/parallel/toy_parallel.py; /workflows/scripts/parallel/toy_parallel.py;
:highlights: ; 18; 23; 24;
- :app_id: abc123
+ :enable_run: true
:tab_rows: 3
:height: 540px
@@ -36,6 +36,6 @@ allows the third component to run without waiting for the others to finish.
:descriptions: No parallel components; Enable 2 components to run in parallel; Start both components together in parallel; Last component is not blocked and can start immediately.
:code_files: /workflows/scripts/parallel/toy_two_parallel_not_started.py; /workflows/scripts/parallel/toy_two_parallel.py; /workflows/scripts/parallel/toy_two_parallel.py; /workflows/scripts/parallel/toy_two_parallel.py
:highlights: ; 18, 19; 23, 24; 25
- :app_id: abc123
+ :enable_run: true
:tab_rows: 3
:height: 540px
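
Both tab groups above hinge on the parallel flag: a work marked parallel returns control to the flow immediately instead of blocking it. A minimal sketch, with a sleep standing in for a long-running job:

    import time

    import lightning as L


    class TrainComponent(L.LightningWork):
        def run(self):
            time.sleep(60)  # stand-in for a long-running job


    class Flow(L.LightningFlow):
        def __init__(self):
            super().__init__()
            # parallel=True lets the flow continue without waiting
            self.train = TrainComponent(parallel=True)
            self.baseline = TrainComponent(parallel=True)

        def run(self):
            self.train.run()
            self.baseline.run()
            print("not blocked by the two components above")


    app = L.LightningApp(Flow())
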
diff --git a/docs/source-pytorch/model/train_model_basic.rst b/docs/source-pytorch/model/train_model_basic.rst
index 92f4a0a40fa7b..e5bce7dfdf1c1 100644
--- a/docs/source-pytorch/model/train_model_basic.rst
+++ b/docs/source-pytorch/model/train_model_basic.rst
@@ -20,7 +20,7 @@ Add the relevant imports at the top of the file
import torch.nn.functional as F
from torchvision import transforms
from torchvision.datasets import MNIST
- from torch.utils.data import DataLoader, random_split
+ from torch.utils.data import DataLoader
import pytorch_lightning as pl
----
diff --git a/docs/source-pytorch/starter/lightning_lite.rst b/docs/source-pytorch/starter/lightning_lite.rst
index bab4581fa91a4..bc097a02571d5 100644
--- a/docs/source-pytorch/starter/lightning_lite.rst
+++ b/docs/source-pytorch/starter/lightning_lite.rst
@@ -1,6 +1,6 @@
-###########################################
-LightningLite (Stepping Stone to Lightning)
-###########################################
+##############
+Lightning Lite
+##############
:class:`~pytorch_lightning.lite.LightningLite` enables pure PyTorch users to scale their existing code
@@ -32,7 +32,7 @@ Learn by example
My Existing PyTorch Code
========================
-The ``run`` function contains custom training loop used to train ``MyModel`` on ``MyDataset`` for ``num_epochs`` epochs.
+The ``train`` function contains a standard training loop used to train ``MyModel`` on ``MyDataset`` for ``num_epochs`` epochs.
.. code-block:: python
@@ -49,7 +49,7 @@ The ``run`` function contains custom training loop used to train ``MyModel`` on
...
- def run(args):
+ def train(args):
device = "cuda" if torch.cuda.is_available() else "cpu"
model = MyModel(...).to(device)
@@ -67,7 +67,7 @@ The ``run`` function contains custom training loop used to train ``MyModel`` on
optimizer.step()
- run(args)
+ train(args)
----------
@@ -75,13 +75,12 @@ The ``run`` function contains custom training loop used to train ``MyModel`` on
Convert to LightningLite
========================
-Here are five required steps to convert to :class:`~pytorch_lightning.lite.LightningLite`.
+Here are four easy steps to let :class:`~pytorch_lightning.lite.LightningLite` scale your PyTorch models.
-1. Subclass :class:`~pytorch_lightning.lite.LightningLite` and override its :meth:`~pytorch_lightning.lite.LightningLite.run` method.
-2. Move the body of your existing ``run`` function into :class:`~pytorch_lightning.lite.LightningLite` ``run`` method.
-3. Remove all ``.to(...)``, ``.cuda()`` etc calls since :class:`~pytorch_lightning.lite.LightningLite` will take care of it.
-4. Apply :meth:`~pytorch_lightning.lite.LightningLite.setup` over each model and optimizers pair and :meth:`~pytorch_lightning.lite.LightningLite.setup_dataloaders` on all your dataloaders and replace ``loss.backward()`` by ``self.backward(loss)``.
-5. Instantiate your :class:`~pytorch_lightning.lite.LightningLite` subclass and call its :meth:`~pytorch_lightning.lite.LightningLite.run` method.
+1. Create the :class:`~pytorch_lightning.lite.LightningLite` object at the beginning of your training code.
+2. Remove all ``.to`` and ``.cuda`` calls since :class:`~pytorch_lightning.lite.LightningLite` will take care of them.
+3. Apply :meth:`~pytorch_lightning.lite.LightningLite.setup` over each model and optimizer pair, :meth:`~pytorch_lightning.lite.LightningLite.setup_dataloaders` on all your dataloaders, and replace ``loss.backward()`` with ``lite.backward(loss)``.
+4. Run the script from the terminal using ``lightning run model path/to/train.py`` or use the :meth:`~pytorch_lightning.lite.LightningLite.launch` method in a notebook.
|
@@ -90,7 +89,7 @@ Here are five required steps to convert to :class:`~pytorch_lightning.lite.Light
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
- from pytorch_lightning.lite import LightningLite
+ from lightning.lite import LightningLite
class MyModel(nn.Module):
@@ -101,108 +100,88 @@ Here are five required steps to convert to :class:`~pytorch_lightning.lite.Light
...
- class Lite(LightningLite):
- def run(self, args):
+ def train(args):
- model = MyModel(...)
- optimizer = torch.optim.SGD(model.parameters(), ...)
- model, optimizer = self.setup(model, optimizer) # Scale your model / optimizers
+ lite = LightningLite()
- dataloader = DataLoader(MyDataset(...), ...)
- dataloader = self.setup_dataloaders(dataloader) # Scale your dataloaders
+ model = MyModel(...)
+ optimizer = torch.optim.SGD(model.parameters(), ...)
+ model, optimizer = lite.setup(model, optimizer) # Scale your model / optimizers
- model.train()
- for epoch in range(args.num_epochs):
- for batch in dataloader:
- optimizer.zero_grad()
- loss = model(batch)
- self.backward(loss) # instead of loss.backward()
- optimizer.step()
+ dataloader = DataLoader(MyDataset(...), ...)
+ dataloader = lite.setup_dataloaders(dataloader) # Scale your dataloaders
+ model.train()
+ for epoch in range(args.num_epochs):
+ for batch in dataloader:
+ optimizer.zero_grad()
+ loss = model(batch)
+ lite.backward(loss) # instead of loss.backward()
+ optimizer.step()
- Lite(...).run(args)
+ train(args)
-That's all. You can now train on any kind of device and scale your training. Check out `this `_ full MNIST training example with LightningLite.
-:class:`~pytorch_lightning.lite.LightningLite` takes care of device management, so you don't have to.
-You should remove any device-specific logic within your code.
+That's all you need to do to your code. You can now train on any kind of device and scale your training.
+Check out `this `_ full MNIST training example with LightningLite.
Here is how to train on eight GPUs with `torch.bfloat16 `_ precision:
-.. code-block:: python
+.. code-block:: bash
- Lite(strategy="ddp", devices=8, accelerator="gpu", precision="bf16").run(10)
+ lightning run model ./path/to/train.py --strategy=ddp --devices=8 --accelerator=cuda --precision="bf16"
-Here is how to use `DeepSpeed Zero3 `_ with eight GPUs and precision 16:
+Here is how to use `DeepSpeed Zero3 `_ with eight GPUs and mixed precision:
-.. code-block:: python
+.. code-block:: bash
- Lite(strategy="deepspeed", devices=8, accelerator="gpu", precision=16).run(10)
+ lightning run model ./path/to/train.py --strategy=deepspeed --devices=8 --accelerator=cuda --precision=16
:class:`~pytorch_lightning.lite.LightningLite` can also figure it out automatically for you!
-.. code-block:: python
+.. code-block:: bash
+
+ lightning run model ./path/to/train.py --devices=auto --accelerator=auto --precision=16
- Lite(devices="auto", accelerator="auto", precision=16).run(10)
You can also easily use distributed collectives if required.
-Here is an example while running on 256 GPUs (eight GPUs times 32 nodes).
.. code-block:: python
- class Lite(LightningLite):
- def run(self):
-
- # Transfer and concatenate tensors across processes
- self.all_gather(...)
-
- # Transfer an object from one process to all the others
- self.broadcast(..., src=...)
-
- # The total number of processes running across all devices and nodes.
- self.world_size
-
- # The global index of the current process across all devices and nodes.
- self.global_rank
-
- # The index of the current process among the processes running on the local node.
- self.local_rank
+ lite = LightningLite()
- # The index of the current node.
- self.node_rank
+ # Transfer and concatenate tensors across processes
+ lite.all_gather(...)
- # Wether this global rank is rank zero.
- if self.is_global_zero:
- # do something on rank 0
- ...
+ # Transfer an object from one process to all the others
+ lite.broadcast(..., src=...)
- # Wait for all processes to enter this call.
- self.barrier()
+ # The total number of processes running across all devices and nodes.
+ lite.world_size
+ # The global index of the current process across all devices and nodes.
+ lite.global_rank
- Lite(strategy="ddp", devices=8, num_nodes=32, accelerator="gpu").run()
+ # The index of the current process among the processes running on the local node.
+ lite.local_rank
+ # The index of the current node.
+ lite.node_rank
-If you require custom data or model device placement, you can deactivate
-:class:`~pytorch_lightning.lite.LightningLite` automatic placement by doing
-``self.setup_dataloaders(..., move_to_device=False)`` for the data and
-``self.setup(..., move_to_device=False)`` for the model.
-Furthermore, you can access the current device from ``self.device`` or
-rely on :meth:`~pytorch_lightning.lite.LightningLite.to_device`
-utility to move an object to the current device.
+ # Whether this global rank is rank zero.
+ if lite.is_global_zero:
+ # do something on rank 0
+ ...
+ # Wait for all processes to enter this call.
+ lite.barrier()
-.. note:: We recommend instantiating the models within the :meth:`~pytorch_lightning.lite.LightningLite.run` method as large models would cause an out-of-memory error otherwise.
-.. tip::
+The code stays agnostic whether you are running on a CPU, on two GPUs, or on multiple machines with many GPUs.
- If you have hundreds or thousands of lines within your :meth:`~pytorch_lightning.lite.LightningLite.run` function
- and you are feeling unsure about them, then that is the correct feeling.
- In 2019, our :class:`~pytorch_lightning.core.module.LightningModule` was getting larger
- and we got the same feeling, so we started to organize our code for simplicity, interoperability and standardization.
- This is definitely a good sign that you should consider refactoring your code and / or switching to
- :class:`~pytorch_lightning.core.module.LightningModule` ultimately.
+If you require custom data or model device placement, you can deactivate :class:`~pytorch_lightning.lite.LightningLite`'s automatic placement by doing ``lite.setup_dataloaders(..., move_to_device=False)`` for the data and ``lite.setup(..., move_to_device=False)`` for the model.
+Furthermore, you can access the current device from ``lite.device`` or rely on the :meth:`~pytorch_lightning.lite.LightningLite.to_device` utility to move an object to the current device.
----------
@@ -211,8 +190,7 @@ utility to move an object to the current device.
Distributed Training Pitfalls
=============================
-The :class:`~pytorch_lightning.lite.LightningLite` provides you with the tools to scale your training,
-but there are several major challenges ahead of you now:
+The :class:`~pytorch_lightning.lite.LightningLite` provides you with the tools to scale your training, but there are several major challenges ahead of you now:
.. list-table::
@@ -236,103 +214,6 @@ but there are several major challenges ahead of you now:
If you are facing one of those challenges, then you are already meeting the limit of :class:`~pytorch_lightning.lite.LightningLite`.
We recommend converting to :doc:`Lightning <../starter/introduction>`, so you never have to worry about those.
-----------
-
-Convert to Lightning
-====================
-
-:class:`~pytorch_lightning.lite.LightningLite` is a stepping stone to transition fully to the Lightning API and benefit
-from its hundreds of features.
-
-You can see our :class:`~pytorch_lightning.lite.LightningLite` class as a
-future :class:`~pytorch_lightning.core.module.LightningModule`, and slowly refactor your code into its API.
-Below, the :meth:`~pytorch_lightning.core.module.LightningModule.training_step`, :meth:`~pytorch_lightning.core.module.LightningModule.forward`,
-:meth:`~pytorch_lightning.core.module.LightningModule.configure_optimizers`, :meth:`~pytorch_lightning.core.module.LightningModule.train_dataloader` methods
-are implemented.
-
-
-.. code-block:: python
-
- class Lite(LightningLite):
-
- # 1. This would become the LightningModule `__init__` function.
- def run(self, args):
- self.args = args
-
- self.model = MyModel(...)
-
- self.fit() # This would be automated by the Lightning Trainer.
-
- # 2. This can be fully removed as Lightning creates its own fitting loop,
- # and sets up the model, optimizer, dataloader, etc for you.
- def fit(self):
- # setup everything
- optimizer = self.configure_optimizers()
- self.model, optimizer = self.setup(self.model, optimizer)
- dataloader = self.setup_dataloaders(self.train_dataloader())
-
- # start fitting
- self.model.train()
- for epoch in range(num_epochs):
- for batch in enumerate(dataloader):
- optimizer.zero_grad()
- loss = self.training_step(batch, batch_idx)
- self.backward(loss)
- optimizer.step()
-
- # 3. This stays here as it belongs to the LightningModule.
- def forward(self, x):
- return self.model(x)
-
- def training_step(self, batch, batch_idx):
- return self.forward(batch)
-
- def configure_optimizers(self):
- return torch.optim.SGD(self.model.parameters(), ...)
-
- # 4. [Optionally] This can stay here or be extracted to the LightningDataModule to enable higher composability.
- def train_dataloader(self):
- return DataLoader(MyDataset(...), ...)
-
-
- Lite(...).run(args)
-
-
-Finally, change the :meth:`~pytorch_lightning.lite.LightningLite.run` into a
-:meth:`~pytorch_lightning.core.module.LightningModule.__init__` and drop the ``fit`` call from inside.
-
-.. code-block:: python
-
- from pytorch_lightning import LightningDataModule, LightningModule, Trainer
-
-
- class LightningModel(LightningModule):
- def __init__(self, args):
- super().__init__()
- self.model = MyModel(...)
-
- def forward(self, x):
- return self.model(x)
-
- def training_step(self, batch, batch_idx):
- loss = self(batch)
- self.log("train_loss", loss)
- return loss
-
- def configure_optimizers(self):
- return torch.optim.SGD(self.model.parameters(), lr=0.001)
-
-
- class BoringDataModule(LightningDataModule):
- def train_dataloader(self):
- return DataLoader(MyDataset(...), ...)
-
-
- trainer = Trainer(max_epochs=10)
- trainer.fit(LightningModel(), datamodule=BoringDataModule())
-
-
-You have successfully converted to PyTorch Lightning, and can now benefit from its hundred of features!
----------
@@ -538,33 +419,6 @@ Lightning Lite Methods
**********************
-run
-===
-
-The run method serves two purposes:
-
-1. Override this method from the :class:`~pytorch_lightning.lite.lite.LightningLite` class and put your
- training (or inference) code inside.
-2. Launch the training procedure by calling the run method. Lite will take care of setting up the distributed backend.
-
-You can optionally pass arguments to the run method. For example, the hyperparameters or a backbone for the model.
-
-.. code-block:: python
-
- from pytorch_lightning.lite import LightningLite
-
-
- class Lite(LightningLite):
-
- # Input arguments are optional; put whatever you need
- def run(self, learning_rate, num_layers):
- """Here goes your training loop"""
-
-
- lite = Lite(accelerator="gpu", devices=2)
- lite.run(learning_rate=0.01, num_layers=12)
-
-
setup
=====
@@ -577,10 +431,10 @@ Moves the model and optimizer to the correct device automatically.
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
# Set up model and optimizer for accelerated training
- model, optimizer = self.setup(model, optimizer)
+ model, optimizer = lite.setup(model, optimizer)
# If you don't want Lite to set the device
- model, optimizer = self.setup(model, optimizer, move_to_device=False)
+ model, optimizer = lite.setup(model, optimizer, move_to_device=False)
The setup method also prepares the model for the selected precision choice so that operations during ``forward()`` get
@@ -598,13 +452,13 @@ data tensors to the correct device automatically.
train_data = torch.utils.DataLoader(train_dataset, ...)
test_data = torch.utils.DataLoader(test_dataset, ...)
- train_data, test_data = self.setup_dataloaders(train_data, test_data)
+ train_data, test_data = lite.setup_dataloaders(train_data, test_data)
# If you don't want Lite to move the data to the device
- train_data, test_data = self.setup_dataloaders(train_data, test_data, move_to_device=False)
+ train_data, test_data = lite.setup_dataloaders(train_data, test_data, move_to_device=False)
# If you don't want Lite to replace the sampler in the context of distributed training
- train_data, test_data = self.setup_dataloaders(train_data, test_data, replace_sampler=False)
+ train_data, test_data = lite.setup_dataloaders(train_data, test_data, replace_sampler=False)
backward
@@ -618,7 +472,7 @@ This replaces any occurrences of ``loss.backward()`` and makes your code acceler
loss = loss_fn(output, target)
# loss.backward()
- self.backward(loss)
+ lite.backward(loss)
to_device
@@ -632,7 +486,7 @@ device, so calling this method is only necessary for manual operation when neede
.. code-block:: python
data = torch.load("dataset.pt")
- data = self.to_device(data)
+ data = lite.to_device(data)
seed_everything
@@ -643,7 +497,7 @@ Make your code reproducible by calling this method at the beginning of your run.
.. code-block:: python
# Instead of `torch.manual_seed(...)`, call:
- self.seed_everything(1234)
+ lite.seed_everything(1234)
This covers PyTorch, NumPy and Python random number generators. In addition, Lite takes care of properly initializing
@@ -659,15 +513,15 @@ You need this only if you wish to autocast more operations outside the ones in m
.. code-block:: python
- model, optimizer = self.setup(model, optimizer)
+ model, optimizer = lite.setup(model, optimizer)
# Lite handles precision automatically for the model
output = model(inputs)
- with self.autocast(): # optional
+ with lite.autocast(): # optional
loss = loss_function(output, target)
- self.backward(loss)
+ lite.backward(loss)
...
@@ -681,7 +535,7 @@ This avoids excessive printing and logs when running on multiple devices/nodes.
.. code-block:: python
# Print only on the main process
- self.print(f"{epoch}/{num_epochs}| Train Epoch Loss: {loss}")
+ lite.print(f"{epoch}/{num_epochs}| Train Epoch Loss: {loss}")
save
@@ -693,7 +547,7 @@ handling the saving part correctly, no matter if you are running a single device
.. code-block:: python
# Instead of `torch.save(...)`, call:
- self.save(model.state_dict(), "path/to/checkpoint.ckpt")
+ lite.save(model.state_dict(), "path/to/checkpoint.ckpt")
load
@@ -705,7 +559,7 @@ handling the loading part correctly, no matter if you are running a single devic
.. code-block:: python
# Instead of `torch.load(...)`, call:
- self.load("path/to/checkpoint.ckpt")
+ lite.load("path/to/checkpoint.ckpt")
barrier
@@ -718,11 +572,11 @@ the data is written to disk.
.. code-block:: python
# Download data only on one process
- if self.global_rank == 0:
+ if lite.global_rank == 0:
download_data("http://...")
# Wait until all processes meet up here
- self.barrier()
+ lite.barrier()
# All processes are allowed to read the data now
@@ -738,10 +592,10 @@ It will speed up your training loop by cutting redundant communication between p
# Accumulate gradient 8 batches at a time
is_accumulating = batch_idx % 8 != 0
- with self.no_backward_sync(model, enabled=is_accumulating):
+ with lite.no_backward_sync(model, enabled=is_accumulating):
output = model(input)
loss = ...
- self.backward(loss)
+ lite.backward(loss)
...
# Step the optimizer every 8 batches
@@ -749,7 +603,7 @@ It will speed up your training loop by cutting redundant communication between p
optimizer.step()
optimizer.zero_grad()
-Both the model's `.forward()` and the `self.backward()` call need to run under this context as shown in the example above.
+Both the model's `.forward()` and the `lite.backward()` call need to run under this context as shown in the example above.
For single-device strategies, it is a no-op. There are strategies that don't support this:
- deepspeed
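
Step 4 of the rewritten guide above mentions launching from a notebook. A minimal sketch of that path, assuming ``launch`` passes the LightningLite object as the first argument to the given function; the toy regression is illustrative:

    import torch
    from lightning.lite import LightningLite


    def train(lite):
        model = torch.nn.Linear(1, 1)
        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
        model, optimizer = lite.setup(model, optimizer)
        x = lite.to_device(torch.tensor([[0.8]]))
        target = lite.to_device(torch.tensor([[1.0]]))
        for _ in range(100):
            optimizer.zero_grad()
            loss = torch.nn.functional.mse_loss(model(x), target)
            lite.backward(loss)  # instead of loss.backward()
            optimizer.step()


    lite = LightningLite(accelerator="cpu", devices=1)
    lite.launch(train)
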
diff --git a/examples/app_multi_node/README.md b/examples/app_multi_node/README.md
index 23e7afa23d68e..0fd2f369bb786 100644
--- a/examples/app_multi_node/README.md
+++ b/examples/app_multi_node/README.md
@@ -28,9 +28,9 @@ lightning run app train_lite.py
Using Lite, you retain control over your loops while getting access to all Lightning distributed strategies with minimal code changes.
-## Multi Node with PyTorch Lightning
+## Multi Node with Lightning Trainer
-Lightning supports running PyTorch Lightning from a script or within a Lightning Work.
+Lightning supports running Lightning Trainer from a script or within a Lightning Work.
You can either run a script directly
diff --git a/examples/app_multi_node/train_lite.py b/examples/app_multi_node/train_lite.py
index feb8ac2226b77..8e546b270a693 100644
--- a/examples/app_multi_node/train_lite.py
+++ b/examples/app_multi_node/train_lite.py
@@ -6,23 +6,26 @@
class LitePyTorchDistributed(L.LightningWork):
- @staticmethod
- def run():
- # 1. Create LightningLite.
- lite = LightningLite(strategy="ddp", precision=16)
+ def run(self):
+ # 1. Prepare the model
+ model = torch.nn.Sequential(
+ torch.nn.Linear(1, 1),
+ torch.nn.ReLU(),
+ torch.nn.Linear(1, 1),
+ )
- # 2. Prepare distributed model and optimizer.
- model = torch.nn.Linear(32, 2)
- optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
- model, optimizer = lite.setup(model, optimizer)
+ # 2. Create LightningLite.
+ lite = LightningLite(strategy="ddp", precision=16)
+ model, optimizer = lite.setup(model, torch.optim.SGD(model.parameters(), lr=0.01))
criterion = torch.nn.MSELoss()
- # 3. Train the model for 50 steps.
- for step in range(50):
+ # 3. Train the model for 1000 steps.
+ for step in range(1000):
model.zero_grad()
- x = torch.randn(64, 32).to(lite.device)
+ x = torch.tensor([0.8]).to(lite.device)
+ target = torch.tensor([1.0]).to(lite.device)
output = model(x)
- loss = criterion(output, torch.ones_like(output))
+ loss = criterion(output, target)
print(f"global_rank: {lite.global_rank} step: {step} loss: {loss}")
lite.backward(loss)
optimizer.step()
diff --git a/examples/app_multi_node/train_lt.py b/examples/app_multi_node/train_lt.py
index 5cbee32dd8132..c9e2f62392a56 100644
--- a/examples/app_multi_node/train_lt.py
+++ b/examples/app_multi_node/train_lt.py
@@ -4,11 +4,10 @@
class LightningTrainerDistributed(L.LightningWork):
- @staticmethod
- def run():
+ def run(self):
model = BoringModel()
trainer = L.Trainer(
- max_epochs=10,
+ max_steps=1000,
strategy="ddp",
)
trainer.fit(model)
diff --git a/examples/app_multi_node/train_pytorch.py b/examples/app_multi_node/train_pytorch.py
index 825112a9c17f1..9ce662fa40009 100644
--- a/examples/app_multi_node/train_pytorch.py
+++ b/examples/app_multi_node/train_pytorch.py
@@ -18,29 +18,28 @@ def distributed_train(local_rank: int, main_address: str, main_port: int, num_no
init_method=f"tcp://{main_address}:{main_port}",
)
- # 2. Prepare distributed model
- model = torch.nn.Linear(32, 2)
+ # 2. Prepare the model
+ model = torch.nn.Sequential(
+ torch.nn.Linear(1, 1),
+ torch.nn.ReLU(),
+ torch.nn.Linear(1, 1),
+ )
# 3. Setup distributed training
- if torch.cuda.is_available():
- device = torch.device(f"cuda:{local_rank}")
- torch.cuda.set_device(device)
- else:
- device = torch.device("cpu")
-
- model = model.to(device)
- model = DistributedDataParallel(model, device_ids=[device.index] if torch.cuda.is_available() else None)
+ device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu")
+ model = DistributedDataParallel(model.to(device), device_ids=[local_rank] if torch.cuda.is_available() else None)
# 4. Prepare loss and optimizer
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
- # 5. Train the model for 50 steps.
- for step in range(50):
+ # 5. Train the model for 1000 steps.
+ for step in range(1000):
model.zero_grad()
- x = torch.randn(64, 32).to(device)
+ x = torch.tensor([0.8]).to(device)
+ target = torch.tensor([1.0]).to(device)
output = model(x)
- loss = criterion(output, torch.ones_like(output))
+ loss = criterion(output, target)
print(f"global_rank: {global_rank} step: {step} loss: {loss}")
loss.backward()
optimizer.step()
diff --git a/examples/app_multi_node/train_pytorch_spawn.py b/examples/app_multi_node/train_pytorch_spawn.py
index dd3f9442dd829..d29ec83562ffb 100644
--- a/examples/app_multi_node/train_pytorch_spawn.py
+++ b/examples/app_multi_node/train_pytorch_spawn.py
@@ -6,38 +6,37 @@
class PyTorchDistributed(L.LightningWork):
-
- # Note: Only staticmethod are support for now with `PyTorchSpawnMultiNode`
- @staticmethod
def run(
+ self,
world_size: int,
node_rank: int,
global_rank: str,
local_rank: int,
):
- # 1. Prepare distributed model
- model = torch.nn.Linear(32, 2)
+ # 1. Prepare the model
+ model = torch.nn.Sequential(
+ torch.nn.Linear(1, 1),
+ torch.nn.ReLU(),
+ torch.nn.Linear(1, 1),
+ )
# 2. Setup distributed training
- if torch.cuda.is_available():
- device = torch.device(f"cuda:{local_rank}")
- torch.cuda.set_device(device)
- else:
- device = torch.device("cpu")
-
- model = model.to(device)
- model = DistributedDataParallel(model, device_ids=[device.index] if torch.cuda.is_available() else None)
+ device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu")
+ model = DistributedDataParallel(
+ model.to(device), device_ids=[local_rank] if torch.cuda.is_available() else None
+ )
# 3. Prepare loss and optimizer
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
- # 4. Train the model for 50 steps.
- for step in range(50):
+ # 4. Train the model for 1000 steps.
+ for step in range(1000):
model.zero_grad()
- x = torch.randn(64, 32).to(device)
+ x = torch.tensor([0.8]).to(device)
+ target = torch.tensor([1.0]).to(device)
output = model(x)
- loss = criterion(output, torch.ones_like(output))
+ loss = criterion(output, target)
print(f"global_rank: {global_rank} step: {step} loss: {loss}")
loss.backward()
optimizer.step()
diff --git a/pyproject.toml b/pyproject.toml
index bc8d9c7658dcd..5e8a2bfa0e481 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,6 +78,7 @@ module = [
"lightning_app.components.serve.streamlit",
"lightning_app.components.serve.types.image",
"lightning_app.components.serve.types.type",
+ "lightning_app.components.serve.python_server",
"lightning_app.components.training",
"lightning_app.core.api",
"lightning_app.core.app",
diff --git a/requirements/app/test.txt b/requirements/app/test.txt
index 3c67611b2dfc5..4b50f1fff4285 100644
--- a/requirements/app/test.txt
+++ b/requirements/app/test.txt
@@ -1,4 +1,4 @@
-coverage==6.4.2
+coverage==6.5.0
codecov==2.1.12
pytest==7.1.3
pytest-timeout==2.1.0
diff --git a/requirements/lite/base.txt b/requirements/lite/base.txt
index b342ecacc0927..fa7182be0f9a3 100644
--- a/requirements/lite/base.txt
+++ b/requirements/lite/base.txt
@@ -2,7 +2,7 @@
# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
numpy>=1.17.2, <1.23.1
-torch>=1.9.*, <1.13.0
+torch>=1.9.*, <=1.13.0
fsspec[http]>2021.06.0, <2022.6.0
packaging>=17.0, <=21.3
typing-extensions>=4.0.0, <=4.4.0
diff --git a/requirements/lite/test.txt b/requirements/lite/test.txt
index fde73e54556f8..01759799ff133 100644
--- a/requirements/lite/test.txt
+++ b/requirements/lite/test.txt
@@ -1,4 +1,4 @@
-coverage==6.4.2
+coverage==6.5.0
codecov==2.1.12
pytest==7.1.3
pytest-cov==4.0.0
diff --git a/requirements/pytorch/adjust-versions.py b/requirements/pytorch/adjust-versions.py
index 9d9f4047e6fc4..69d61e130ca4b 100644
--- a/requirements/pytorch/adjust-versions.py
+++ b/requirements/pytorch/adjust-versions.py
@@ -5,8 +5,8 @@
# IMPORTANT: this list needs to be sorted in reverse
VERSIONS = [
- dict(torch="1.13.0", torchvision="0.14.0"), # RC
- dict(torch="1.12.1", torchvision="0.13.1"), # stable
+ dict(torch="1.13.0", torchvision="0.14.0"), # stable
+ dict(torch="1.12.1", torchvision="0.13.1"),
dict(torch="1.12.0", torchvision="0.13.0"),
dict(torch="1.11.0", torchvision="0.12.0"),
dict(torch="1.10.2", torchvision="0.11.3"),
diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt
index e3eae1cd66ce8..2f2b9306bd22a 100644
--- a/requirements/pytorch/base.txt
+++ b/requirements/pytorch/base.txt
@@ -2,7 +2,7 @@
# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
numpy>=1.17.2, <1.23.1
-torch>=1.9.*, <1.13.0
+torch>=1.9.*, <=1.13.0
tqdm>=4.57.0, <4.65.0
PyYAML>=5.4, <=6.0
fsspec[http]>2021.06.0, <2022.8.0
diff --git a/requirements/pytorch/check-avail-extras.py b/requirements/pytorch/check-avail-extras.py
index 9af53010b605b..3ab8d2848c3f0 100644
--- a/requirements/pytorch/check-avail-extras.py
+++ b/requirements/pytorch/check-avail-extras.py
@@ -1,5 +1,6 @@
-import hydra # noqa: F401
-import jsonargparse # noqa: F401
-import matplotlib # noqa: F401
-import omegaconf # noqa: F401
-import rich # noqa: F401
+if __name__ == "__main__":
+ import hydra # noqa: F401
+ import jsonargparse # noqa: F401
+ import matplotlib # noqa: F401
+ import omegaconf # noqa: F401
+ import rich # noqa: F401
diff --git a/requirements/pytorch/check-avail-strategies.py b/requirements/pytorch/check-avail-strategies.py
index db28a1a1fe051..ffe12d024199b 100644
--- a/requirements/pytorch/check-avail-strategies.py
+++ b/requirements/pytorch/check-avail-strategies.py
@@ -1,7 +1,8 @@
-import bagua # noqa: F401
-import deepspeed # noqa: F401
-import fairscale # noqa: F401
-import horovod.torch
+if __name__ == "__main__":
+ import bagua # noqa: F401
+ import deepspeed # noqa: F401
+ import fairscale # noqa: F401
+ import horovod.torch
-# returns an error code
-assert horovod.torch.nccl_built()
+ # returns an error code
+ assert horovod.torch.nccl_built()
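
Wrapping the imports in a __main__ guard keeps them from re-running when the module is merely imported, which matters for libraries that spawn worker processes. A minimal sketch of the failure mode the guard avoids, independent of this script:

    # Under the "spawn" start method each child re-imports the module, so
    # unguarded top-level code would execute again in every worker.
    import torch.multiprocessing as mp


    def worker(rank):
        print(f"worker {rank} started")


    if __name__ == "__main__":
        mp.spawn(worker, nprocs=2)
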
diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt
index 37a75ba9f45bd..2f0cce54f4158 100644
--- a/requirements/pytorch/strategies.txt
+++ b/requirements/pytorch/strategies.txt
@@ -5,5 +5,5 @@ colossalai>=0.1.10
fairscale>=0.4.5, <=0.4.6
deepspeed>=0.6.0, <=0.7.0
# no need to install with [pytorch] as pytorch is already installed
-horovod>=0.21.2, !=0.24.0, <0.25.1
+horovod>=0.21.2, !=0.24.0, <=0.26.1
hivemind>=1.0.1, <=1.0.1; sys_platform == 'linux'
diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt
index fbd65ff0ef729..5ba99b269e002 100644
--- a/requirements/pytorch/test.txt
+++ b/requirements/pytorch/test.txt
@@ -1,4 +1,4 @@
-coverage==6.4.2
+coverage==6.5.0
codecov==2.1.12
pytest==7.1.3
pytest-cov==4.0.0
@@ -9,7 +9,7 @@ pre-commit==2.20.0
# needed in tests
cloudpickle>=1.3, <=2.1.0
scikit-learn>0.22.1, <1.1.3
-onnxruntime<1.13.0
+onnxruntime<1.14.0
psutil<5.9.4 # for `DeviceStatsMonitor`
pandas>1.0, <1.5.2 # needed in benchmarks
fastapi<0.87.0
diff --git a/src/lightning/__init__.py b/src/lightning/__init__.py
index 30950d8c6bdbb..2755ce57e48b3 100644
--- a/src/lightning/__init__.py
+++ b/src/lightning/__init__.py
@@ -36,6 +36,7 @@ def _detail(self: Any, message: str, *args: Any, **kwargs: Any) -> None:
from lightning.app.perf import pdb # noqa: E402
from lightning.app.utilities.packaging.build_config import BuildConfig # noqa: E402
from lightning.app.utilities.packaging.cloud_compute import CloudCompute # noqa: E402
+from lightning.lite.lite import LightningLite # noqa: E402
from lightning.pytorch.callbacks import Callback # noqa: E402
from lightning.pytorch.core import LightningDataModule, LightningModule # noqa: E402
from lightning.pytorch.trainer import Trainer # noqa: E402
@@ -59,6 +60,7 @@ def _detail(self: Any, message: str, *args: Any, **kwargs: Any) -> None:
"LightningModule",
"Callback",
"seed_everything",
+ "LightningLite",
"storage",
"pdb",
]
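
With this change, LightningLite is importable from the top-level package alongside Trainer and LightningModule (a minimal sketch):

    import lightning as L

    lite = L.LightningLite(accelerator="cpu", devices=1)
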
diff --git a/src/lightning/__setup__.py b/src/lightning/__setup__.py
index 6f30e218ab6e5..6254445efea1b 100644
--- a/src/lightning/__setup__.py
+++ b/src/lightning/__setup__.py
@@ -106,5 +106,6 @@ def _setup_args(**kwargs: Any) -> Dict[str, Any]:
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
], # todo: consider aggregation/union of tags from particular packages
)
diff --git a/src/lightning/__version__.py b/src/lightning/__version__.py
index 72126ce16b766..ba22724db3594 100644
--- a/src/lightning/__version__.py
+++ b/src/lightning/__version__.py
@@ -1 +1 @@
-version = "1.8.1"
+version = "1.8.2"
diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md
index 28445aec7df7b..f1b6740a9a344 100644
--- a/src/lightning_app/CHANGELOG.md
+++ b/src/lightning_app/CHANGELOG.md
@@ -4,8 +4,31 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
-## [1.8.1] - 2022-11-10
+## [UnReleased] - 2022-11-DD
+
+### Added
+
+- Added title and description to ServeGradio ([#15639](https://github.com/Lightning-AI/lightning/pull/15639))
+- Added a friendly error message when attempting to run the default cloud compute with a custom base image configured ([#14929](https://github.com/Lightning-AI/lightning/pull/14929))
+
+### Changed
+
+- Improved support for running apps when dependencies aren't installed ([#15711](https://github.com/Lightning-AI/lightning/pull/15711))
+- Changed the root directory of the app (which gets uploaded) to be the folder containing the app file, rather than any parent folder containing a `.lightning` file ([#15654](https://github.com/Lightning-AI/lightning/pull/15654))
+- Enabled MultiNode Components to support state broadcasting ([#15607](https://github.com/Lightning-AI/lightning/pull/15607))
+- Prevented an artefactual "running from outside your current environment" error ([#15647](https://github.com/Lightning-AI/lightning/pull/15647))
+- Renamed failed -> error in tables ([#15608](https://github.com/Lightning-AI/lightning/pull/15608))
+
+### Fixed
+
+- Fixed a race condition that could over-write the frontend with app infos ([#15398](https://github.com/Lightning-AI/lightning/pull/15398))
+- Fixed bi-directional queues sending deltas with Drive Component name changes ([#15642](https://github.com/Lightning-AI/lightning/pull/15642))
+- Fixed the CloudRuntime works collection with structures and accelerated multi-node startup time ([#15650](https://github.com/Lightning-AI/lightning/pull/15650))
+- Fixed the catimage import ([#15712](https://github.com/Lightning-AI/lightning/pull/15712))
+- Parse all lines in the app file looking for shebangs to run commands ([#15714](https://github.com/Lightning-AI/lightning/pull/15714))
+
+
+## [1.8.1] - 2022-11-10
### Added
@@ -38,7 +61,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed an issue with the `lightning` CLI taking a long time to error out when the cloud is not reachable ([#15412](https://github.com/Lightning-AI/lightning/pull/15412))
-
## [1.8.0] - 2022-11-01
### Added
diff --git a/src/lightning_app/__version__.py b/src/lightning_app/__version__.py
index 72126ce16b766..ba22724db3594 100644
--- a/src/lightning_app/__version__.py
+++ b/src/lightning_app/__version__.py
@@ -1 +1 @@
-version = "1.8.1"
+version = "1.8.2"
diff --git a/src/lightning_app/cli/app-template/tests/requirements.txt b/src/lightning_app/cli/app-template/tests/requirements.txt
index 984d177fbe16c..3185d1c44f033 100644
--- a/src/lightning_app/cli/app-template/tests/requirements.txt
+++ b/src/lightning_app/cli/app-template/tests/requirements.txt
@@ -1,8 +1,8 @@
coverage
codecov>=2.1
-pytest>=3.0.5
+pytest>=5.0.0
pytest-cov
pytest-flake8
flake8
check-manifest
-twine==1.13.0
+twine==4.0.1
diff --git a/src/lightning_app/cli/cmd_clusters.py b/src/lightning_app/cli/cmd_clusters.py
index 4bb8b1fdb793f..e76b9c0695842 100644
--- a/src/lightning_app/cli/cmd_clusters.py
+++ b/src/lightning_app/cli/cmd_clusters.py
@@ -42,7 +42,7 @@ def as_table(self) -> Table:
V1ClusterState.QUEUED: Text("queued", style="bold yellow"),
V1ClusterState.PENDING: Text("pending", style="bold yellow"),
V1ClusterState.RUNNING: Text("running", style="bold green"),
- V1ClusterState.FAILED: Text("failed", style="bold red"),
+ V1ClusterState.FAILED: Text("error", style="bold red"),
V1ClusterState.DELETED: Text("deleted", style="bold red"),
}
diff --git a/src/lightning_app/cli/commands/logs.py b/src/lightning_app/cli/commands/logs.py
index 9d53601da0698..fb0746dd50fff 100644
--- a/src/lightning_app/cli/commands/logs.py
+++ b/src/lightning_app/cli/commands/logs.py
@@ -71,6 +71,7 @@ def _show_logs(app_name: str, components: List[str], follow: bool) -> None:
works = client.lightningwork_service_list_lightningwork(
project_id=project.project_id, app_id=apps[app_name].id
).lightningworks
+
app_component_names = ["flow"] + [f.name for f in apps[app_name].spec.flow_servers] + [w.name for w in works]
if not components:
diff --git a/src/lightning_app/cli/component-template/tests/requirements.txt b/src/lightning_app/cli/component-template/tests/requirements.txt
index 984d177fbe16c..3185d1c44f033 100644
--- a/src/lightning_app/cli/component-template/tests/requirements.txt
+++ b/src/lightning_app/cli/component-template/tests/requirements.txt
@@ -1,8 +1,8 @@
coverage
codecov>=2.1
-pytest>=3.0.5
+pytest>=5.0.0
pytest-cov
pytest-flake8
flake8
check-manifest
-twine==1.13.0
+twine==4.0.1
diff --git a/src/lightning_app/components/database/server.py b/src/lightning_app/components/database/server.py
index a5499aaae17b8..01bd8f3b12033 100644
--- a/src/lightning_app/components/database/server.py
+++ b/src/lightning_app/components/database/server.py
@@ -4,6 +4,7 @@
import sys
import tempfile
import threading
+import traceback
from typing import List, Optional, Type, Union
import uvicorn
@@ -36,6 +37,9 @@ def install_signal_handlers(self):
"""Ignore Uvicorn Signal Handlers."""
+_lock = threading.Lock()
+
+
class Database(LightningWork):
def __init__(
self,
@@ -146,25 +150,29 @@ class CounterModel(SQLModel, table=True):
self._exit_event = None
def store_database(self):
- with tempfile.TemporaryDirectory() as tmpdir:
- tmp_db_filename = os.path.join(tmpdir, os.path.basename(self.db_filename))
+ try:
+ with tempfile.TemporaryDirectory() as tmpdir:
+ tmp_db_filename = os.path.join(tmpdir, os.path.basename(self.db_filename))
- source = sqlite3.connect(self.db_filename)
- dest = sqlite3.connect(tmp_db_filename)
+ source = sqlite3.connect(self.db_filename)
+ dest = sqlite3.connect(tmp_db_filename)
- source.backup(dest)
+ source.backup(dest)
- source.close()
- dest.close()
+ source.close()
+ dest.close()
- drive = Drive("lit://database", component_name=self.name, root_folder=tmpdir)
- drive.put(os.path.basename(tmp_db_filename))
+ drive = Drive("lit://database", component_name=self.name, root_folder=tmpdir)
+ drive.put(os.path.basename(tmp_db_filename))
- print("Stored the database to the Drive.")
+ print("Stored the database to the Drive.")
+ except Exception:
+ traceback.print_exc()
def periodic_store_database(self, store_interval):
while not self._exit_event.is_set():
- self.store_database()
+ with _lock:
+ self.store_database()
self._exit_event.wait(store_interval)
def run(self, token: Optional[str] = None) -> None:
@@ -210,4 +218,5 @@ def db_url(self) -> Optional[str]:
def on_exit(self):
self._exit_event.set()
- self.store_database()
+ with _lock:
+ self.store_database()
diff --git a/src/lightning_app/components/multi_node/base.py b/src/lightning_app/components/multi_node/base.py
index 02adf218d3e36..4f2005771212a 100644
--- a/src/lightning_app/components/multi_node/base.py
+++ b/src/lightning_app/components/multi_node/base.py
@@ -3,7 +3,6 @@
from lightning_app import structures
from lightning_app.core.flow import LightningFlow
from lightning_app.core.work import LightningWork
-from lightning_app.utilities.enum import WorkStageStatus
from lightning_app.utilities.packaging.cloud_compute import CloudCompute
@@ -52,46 +51,31 @@ def run(
work_kwargs: Keywords arguments to be provided to the work on instantiation.
"""
super().__init__()
- self.ws = structures.List()
- self._work_cls = work_cls
- self.num_nodes = num_nodes
- self._cloud_compute = cloud_compute
- self._work_args = work_args
- self._work_kwargs = work_kwargs
- self.has_started = False
+ self.ws = structures.List(
+ *[
+ work_cls(
+ *work_args,
+ cloud_compute=cloud_compute,
+ **work_kwargs,
+ parallel=True,
+ )
+ for _ in range(num_nodes)
+ ]
+ )
def run(self) -> None:
- if not self.has_started:
-
- # 1. Create & start the works
- if not self.ws:
- for node_rank in range(self.num_nodes):
- self.ws.append(
- self._work_cls(
- *self._work_args,
- cloud_compute=self._cloud_compute,
- **self._work_kwargs,
- parallel=True,
- )
- )
-
- # Starting node `node_rank`` ...
- self.ws[-1].start()
-
- # 2. Wait for all machines to be started !
- if not all(w.status.stage == WorkStageStatus.STARTED for w in self.ws):
- return
-
- self.has_started = True
+ # 1. Wait for all works to be started!
+ if not all(w.internal_ip for w in self.ws):
+ return
- # Loop over all node machines
- for node_rank in range(self.num_nodes):
+ # 2. Loop over all node machines
+ for node_rank in range(len(self.ws)):
# 3. Run the user code in a distributed way !
self.ws[node_rank].run(
main_address=self.ws[0].internal_ip,
main_port=self.ws[0].port,
- num_nodes=self.num_nodes,
+ num_nodes=len(self.ws),
node_rank=node_rank,
)
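
After this refactor, the works are created eagerly in the constructor and the flow only dispatches once every work reports an internal_ip. A minimal usage sketch; the import path and the work body are assumptions for illustration:

    import lightning as L
    from lightning.app.components.multi_node import MultiNode


    class DistributedWork(L.LightningWork):
        def run(self, main_address, main_port, num_nodes, node_rank):
            # wire up a distributed backend with the coordinates provided
            print(main_address, main_port, num_nodes, node_rank)


    component = MultiNode(
        DistributedWork,
        num_nodes=2,
        cloud_compute=L.CloudCompute("gpu-fast-multi"),
    )
    app = L.LightningApp(component)
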
diff --git a/src/lightning_app/components/multi_node/lite.py b/src/lightning_app/components/multi_node/lite.py
index 5295d0beb869e..2a9b33b0880d1 100644
--- a/src/lightning_app/components/multi_node/lite.py
+++ b/src/lightning_app/components/multi_node/lite.py
@@ -7,7 +7,6 @@
from lightning_app.components.multi_node.base import MultiNode
from lightning_app.components.multi_node.pytorch_spawn import _PyTorchSpawnRunExecutor
from lightning_app.core.work import LightningWork
-from lightning_app.utilities.app_helpers import is_static_method
from lightning_app.utilities.packaging.cloud_compute import CloudCompute
from lightning_app.utilities.tracer import Tracer
@@ -82,11 +81,6 @@ def __init__(
**work_kwargs: Any,
) -> None:
assert issubclass(work_cls, _LiteWorkProtocol)
- if not is_static_method(work_cls, "run"):
- raise TypeError(
- f"The provided {work_cls} run method needs to be static for now."
- "HINT: Remove `self` and add staticmethod decorator."
- )
# Note: Private way to modify the work run executor
# Probably exposed to the users in the future if needed.
diff --git a/src/lightning_app/components/multi_node/pytorch_spawn.py b/src/lightning_app/components/multi_node/pytorch_spawn.py
index 62ccfb95174eb..3119ffc51e0b5 100644
--- a/src/lightning_app/components/multi_node/pytorch_spawn.py
+++ b/src/lightning_app/components/multi_node/pytorch_spawn.py
@@ -3,10 +3,10 @@
from typing_extensions import Protocol, runtime_checkable
from lightning_app.components.multi_node.base import MultiNode
+from lightning_app.core.queues import MultiProcessQueue
from lightning_app.core.work import LightningWork
-from lightning_app.utilities.app_helpers import is_static_method
from lightning_app.utilities.packaging.cloud_compute import CloudCompute
-from lightning_app.utilities.proxies import WorkRunExecutor
+from lightning_app.utilities.proxies import _proxy_setattr, unwrap, WorkRunExecutor, WorkStateObserver
@runtime_checkable
@@ -22,6 +22,9 @@ def run(
class _PyTorchSpawnRunExecutor(WorkRunExecutor):
+
+ enable_start_observer: bool = False
+
def __call__(
self,
main_address: str,
@@ -31,10 +34,31 @@ def __call__(
):
import torch
- nprocs = torch.cuda.device_count() if torch.cuda.is_available() else 1
- torch.multiprocessing.spawn(
- self.run, args=(self.work_run, main_address, main_port, num_nodes, node_rank, nprocs), nprocs=nprocs
- )
+ with self.enable_spawn():
+ nprocs = torch.cuda.device_count() if torch.cuda.is_available() else 1
+ queue = self.delta_queue if isinstance(self.delta_queue, MultiProcessQueue) else self.delta_queue.to_dict()
+ torch.multiprocessing.spawn(
+ self.dispatch_run,
+ args=(self.__class__, self.work, queue, main_address, main_port, num_nodes, node_rank, nprocs),
+ nprocs=nprocs,
+ )
+
+ @staticmethod
+ def dispatch_run(local_rank, cls, work, delta_queue, *args, **kwargs):
+ if local_rank == 0:
+ if isinstance(delta_queue, dict):
+ delta_queue = cls.process_queue(delta_queue)
+ work._request_queue = cls.process_queue(work._request_queue)
+ work._response_queue = cls.process_queue(work._response_queue)
+
+ state_observer = WorkStateObserver(work, delta_queue=delta_queue)
+ state_observer.start()
+ _proxy_setattr(work, delta_queue, state_observer)
+
+ cls.run(local_rank, unwrap(work.run), *args, **kwargs)
+
+ if local_rank == 0:
+ state_observer.join(0)
@staticmethod
def run(
@@ -46,6 +70,7 @@ def run(
node_rank: int,
nprocs: int,
):
+
import torch
# 1. Setting distributed environment
@@ -76,11 +101,6 @@ def __init__(
**work_kwargs: Any,
) -> None:
assert issubclass(work_cls, _PyTorchSpawnWorkProtocol)
- if not is_static_method(work_cls, "run"):
- raise TypeError(
- f"The provided {work_cls} run method needs to be static for now."
- "HINT: Remove `self` and add staticmethod decorator."
- )
# Note: Private way to modify the work run executor
# Probably exposed to the users in the future if needed.
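The core idea of `dispatch_run` above is that `torch.multiprocessing.spawn` calls its target with the local rank as the first argument, so rank-0-only bookkeeping (queue reconstruction, the `WorkStateObserver`) can be guarded with a simple check. A stripped-down sketch of that shape (requires `torch`; this is not the real executor):

import torch.multiprocessing as mp

def _worker(local_rank: int, nprocs: int) -> None:
    if local_rank == 0:
        # Only one process per node reports state back to the flow.
        print("rank 0: observer/queue setup would happen here")
    print(f"rank {local_rank}/{nprocs}: running user code")

if __name__ == "__main__":
    nprocs = 2
    mp.spawn(_worker, args=(nprocs,), nprocs=nprocs)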
diff --git a/src/lightning_app/components/multi_node/trainer.py b/src/lightning_app/components/multi_node/trainer.py
index ea33106a7ece9..222f71ce59557 100644
--- a/src/lightning_app/components/multi_node/trainer.py
+++ b/src/lightning_app/components/multi_node/trainer.py
@@ -7,7 +7,6 @@
from lightning_app.components.multi_node.base import MultiNode
from lightning_app.components.multi_node.pytorch_spawn import _PyTorchSpawnRunExecutor
from lightning_app.core.work import LightningWork
-from lightning_app.utilities.app_helpers import is_static_method
from lightning_app.utilities.packaging.cloud_compute import CloudCompute
from lightning_app.utilities.tracer import Tracer
@@ -81,11 +80,6 @@ def __init__(
**work_kwargs: Any,
) -> None:
assert issubclass(work_cls, _LightningTrainerWorkProtocol)
- if not is_static_method(work_cls, "run"):
- raise TypeError(
- f"The provided {work_cls} run method needs to be static for now."
- "HINT: Remove `self` and add staticmethod decorator."
- )
# Note: Private way to modify the work run executor
# Probably exposed to the users in the future if needed.
diff --git a/src/lightning_app/components/serve/gradio.py b/src/lightning_app/components/serve/gradio.py
index 7e7801925937f..328e70e743b43 100644
--- a/src/lightning_app/components/serve/gradio.py
+++ b/src/lightning_app/components/serve/gradio.py
@@ -31,6 +31,8 @@ class ServeGradio(LightningWork, abc.ABC):
outputs: Any
examples: Optional[List] = None
enable_queue: bool = False
+ title: Optional[str] = None
+ description: Optional[str] = None
def __init__(self, *args, **kwargs):
requires("gradio")(super().__init__(*args, **kwargs))
@@ -58,7 +60,14 @@ def run(self, *args, **kwargs):
self._model = self.build_model()
fn = partial(self.predict, *args, **kwargs)
fn.__name__ = self.predict.__name__
- gradio.Interface(fn=fn, inputs=self.inputs, outputs=self.outputs, examples=self.examples).launch(
+ gradio.Interface(
+ fn=fn,
+ inputs=self.inputs,
+ outputs=self.outputs,
+ examples=self.examples,
+ title=self.title,
+ description=self.description,
+ ).launch(
server_name=self.host,
server_port=self.port,
enable_queue=self.enable_queue,
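The two new class attributes are forwarded straight to `gradio.Interface`, so subclasses can set a page title and description declaratively. A hedged sketch (attribute names follow the hunk; the import path and toy model are illustrative):

import gradio as gr
from lightning_app.components.serve import ServeGradio

class EchoGradio(ServeGradio):
    inputs = gr.Textbox()
    outputs = gr.Textbox()
    title = "Echo demo"                # new: shown as the page title
    description = "Returns its input"  # new: shown under the title

    def build_model(self):
        return lambda text: text

    def predict(self, text):
        return self._model(text)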
diff --git a/src/lightning_app/components/serve/python_server.py b/src/lightning_app/components/serve/python_server.py
index 03b0ceb26058f..f0361f9db5046 100644
--- a/src/lightning_app/components/serve/python_server.py
+++ b/src/lightning_app/components/serve/python_server.py
@@ -14,12 +14,6 @@
logger = Logger(__name__)
-def image_to_base64(image_path):
- with open(image_path, "rb") as image_file:
- encoded_string = base64.b64encode(image_file.read())
- return encoded_string.decode("UTF-8")
-
-
class _DefaultInputData(BaseModel):
payload: str
@@ -33,7 +27,7 @@ class Image(BaseModel):
@staticmethod
def _get_sample_data() -> Dict[Any, Any]:
- imagepath = Path(__file__).absolute().parent / "catimage.png"
+ imagepath = Path(__file__).parent / "catimage.png"
with open(imagepath, "rb") as image_file:
encoded_string = base64.b64encode(image_file.read())
return {"image": encoded_string.decode("UTF-8")}
diff --git a/src/lightning_app/core/app.py b/src/lightning_app/core/app.py
index ef848cce54dba..255f498507f67 100644
--- a/src/lightning_app/core/app.py
+++ b/src/lightning_app/core/app.py
@@ -24,7 +24,7 @@
from lightning_app.core.queues import BaseQueue, SingleProcessQueue
from lightning_app.core.work import LightningWork
from lightning_app.frontend import Frontend
-from lightning_app.storage import Drive, Path
+from lightning_app.storage import Drive, Path, Payload
from lightning_app.storage.path import _storage_root_dir
from lightning_app.utilities import frontend
from lightning_app.utilities.app_helpers import (
@@ -100,6 +100,7 @@ def __init__(
"""
self.root_path = root_path # when running behind a proxy
+ self.info = info
from lightning_app.core.flow import _RootFlow
@@ -168,9 +169,10 @@ def __init__(
logger.debug(f"ENV: {os.environ}")
+ def _update_index_file(self):
# update index.html,
# this should happen once for all apps before the ui server starts running.
- frontend.update_index_file(FRONTEND_DIR, info=info, root_path=root_path)
+ frontend.update_index_file(FRONTEND_DIR, info=self.info, root_path=self.root_path)
if _should_dispatch_app():
os.environ["LIGHTNING_DISPATCHED"] = "1"
@@ -470,6 +472,8 @@ def _run(self) -> bool:
self._original_state = deepcopy(self.state)
done = False
+ self._start_with_flow_works()
+
if self.should_publish_changes_to_api and self.api_publish_state_queue:
logger.debug("Publishing the state with changes")
# Push two states to optimize start in the cloud.
@@ -628,8 +632,16 @@ def _extract_vars_from_component_name(component_name: str, state):
else:
return None
- # Note: Remove private keys
- return {k: v for k, v in child["vars"].items() if not k.startswith("_")}
+ # Filter private keys and drives
+ return {
+ k: v
+ for k, v in child["vars"].items()
+ if (
+ not k.startswith("_")
+ and not (isinstance(v, dict) and v.get("type", None) == "__drive__")
+ and not (isinstance(v, (Payload, Path)))
+ )
+ }
def _send_flow_to_work_deltas(self, state) -> None:
if not self.flow_to_work_delta_queues:
@@ -650,10 +662,6 @@ def _send_flow_to_work_deltas(self, state) -> None:
if state_work is None or last_state_work is None:
continue
- # Note: The flow shouldn't update path or drive manually.
- last_state_work = apply_to_collection(last_state_work, (Path, Drive), lambda x: None)
- state_work = apply_to_collection(state_work, (Path, Drive), lambda x: None)
-
deep_diff = DeepDiff(last_state_work, state_work, verbose_level=2).to_dict()
if "unprocessed" in deep_diff:
@@ -662,3 +670,11 @@ def _send_flow_to_work_deltas(self, state) -> None:
if deep_diff:
logger.debug(f"Sending deep_diff to {w.name} : {deep_diff}")
self.flow_to_work_delta_queues[w.name].put(deep_diff)
+
+ def _start_with_flow_works(self):
+ for w in self.works:
+ if w._start_with_flow:
+ parallel = w.parallel
+ w._parallel = True
+ w.start()
+ w._parallel = parallel
diff --git a/src/lightning_app/core/queues.py b/src/lightning_app/core/queues.py
index 5d8f4e06ad429..a7fee9a3b6e12 100644
--- a/src/lightning_app/core/queues.py
+++ b/src/lightning_app/core/queues.py
@@ -235,12 +235,12 @@ def __init__(
"""
if name is None:
raise ValueError("You must specify a name for the queue")
- host = host or REDIS_HOST
- port = port or REDIS_PORT
- password = password or REDIS_PASSWORD
+ self.host = host or REDIS_HOST
+ self.port = port or REDIS_PORT
+ self.password = password or REDIS_PASSWORD
self.name = name
self.default_timeout = default_timeout
- self.redis = redis.Redis(host=host, port=port, password=password)
+ self.redis = redis.Redis(host=self.host, port=self.port, password=self.password)
def put(self, item: Any) -> None:
from lightning_app import LightningWork
@@ -329,6 +329,20 @@ def is_running(self) -> bool:
except redis.exceptions.ConnectionError:
return False
+ def to_dict(self):
+ return {
+ "type": "redis",
+ "name": self.name,
+ "default_timeout": self.default_timeout,
+ "host": self.host,
+ "port": self.port,
+ "password": self.password,
+ }
+
+ @classmethod
+ def from_dict(cls, state):
+ return cls(**state)
+
class HTTPQueue(BaseQueue):
def __init__(self, name: str, default_timeout: float):
@@ -414,6 +428,17 @@ def _split_app_id_and_queue_name(queue_name):
app_id, queue_name = queue_name.split("_", 1)
return app_id, queue_name
+ def to_dict(self):
+ return {
+ "type": "http",
+ "name": self.name,
+ "default_timeout": self.default_timeout,
+ }
+
+ @classmethod
+ def from_dict(cls, state):
+ return cls(**state)
+
def debug_log_callback(message: str, *args: Any, **kwargs: Any) -> None:
if QUEUE_DEBUG_ENABLED or (Path(LIGHTNING_DIR) / "QUEUE_DEBUG_ENABLED").exists():
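`to_dict`/`from_dict` exist so a queue can cross a `spawn` boundary as a plain, picklable dict. Note that `to_dict` includes a `"type"` discriminator which is not a constructor argument, so it must be popped before calling `from_dict`, or the dict can be routed through `WorkRunExecutor.process_queue`, which dispatches on it. A small sketch, assuming the `redis` package is installed and the constructor arguments shown in the hunk:

from lightning_app.core.queues import RedisQueue
from lightning_app.utilities.proxies import WorkRunExecutor

queue = RedisQueue(name="delta_queue", default_timeout=5.0)

state = queue.to_dict()   # plain dict, includes "type": "redis"
state.pop("type")         # from_dict == cls(**state): drop the discriminator
rebuilt = RedisQueue.from_dict(state)

# Or let the executor dispatch on the "type" key:
rebuilt2 = WorkRunExecutor.process_queue(queue.to_dict())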
diff --git a/src/lightning_app/frontend/panel/app_state_watcher.py b/src/lightning_app/frontend/panel/app_state_watcher.py
index 2c886bae341f5..2253312a13565 100644
--- a/src/lightning_app/frontend/panel/app_state_watcher.py
+++ b/src/lightning_app/frontend/panel/app_state_watcher.py
@@ -1,9 +1,9 @@
-"""The AppStateWatcher enables a Frontend to.
+"""The ``AppStateWatcher`` enables a Frontend to:
- subscribe to App state changes
- to access and change the App state.
-This is particularly useful for the PanelFrontend but can be used by other Frontends too.
+This is particularly useful for the ``PanelFrontend`` but can be used by other frontends too.
"""
from __future__ import annotations
@@ -26,15 +26,16 @@
class AppStateWatcher(Parameterized):
- """The AppStateWatcher enables a Frontend to:
+ """The `AppStateWatcher` enables a Frontend to:
- Subscribe to any App state changes.
- To access and change the App state from the UI.
- This is particularly useful for the PanelFrontend, but can be used by
- other Frontend's too.
+    This is particularly useful for the ``PanelFrontend``, but can be used by
+    other frontends too.
- Example:
+ Example
+ -------
.. code-block:: python
@@ -54,10 +55,10 @@ def update(state):
This would print ``The counter was updated to 2``.
- The AppStateWatcher is built on top of Param which is a framework like dataclass, attrs and
+ The ``AppStateWatcher`` is built on top of Param, which is a framework like dataclass, attrs and
Pydantic which additionally provides powerful and unique features for building reactive apps.
- Please note the AppStateWatcher is a singleton, i.e. only one instance is instantiated
+ Please note the ``AppStateWatcher`` is a singleton, i.e., only one instance is instantiated
"""
state: AppState = ClassSelector(
@@ -75,7 +76,7 @@ def __new__(cls):
@requires("param")
def __init__(self):
- # It's critical to initialize only once
+ # It is critical to initialize only once
# See https://github.com/holoviz/param/issues/643
if not hasattr(self, "_initialized"):
super().__init__(name="singleton")
diff --git a/src/lightning_app/frontend/panel/panel_frontend.py b/src/lightning_app/frontend/panel/panel_frontend.py
index 359dca28b2766..48af9235fa796 100644
--- a/src/lightning_app/frontend/panel/panel_frontend.py
+++ b/src/lightning_app/frontend/panel/panel_frontend.py
@@ -27,17 +27,28 @@ def _has_panel_autoreload() -> bool:
class PanelFrontend(Frontend):
- """The PanelFrontend enables you to serve Panel code as a Frontend for your LightningFlow.
+ """The `PanelFrontend` enables you to serve Panel code as a Frontend for your LightningFlow.
- To use this frontend, you must first install the `panel` package:
+ Reference: https://lightning.ai/lightning-docs/workflows/add_web_ui/panel/
+
+ Args:
+ entry_point: The path to a .py or .ipynb file, or a pure function. The file or function must contain your Panel
+ code. The function can optionally accept an ``AppStateWatcher`` argument.
+
+ Raises:
+ TypeError: Raised if the ``entry_point`` provided is a class method
+
+ Example:
+
+ To use the `PanelFrontend`, you must first install the `panel` package:
.. code-block:: bash
pip install panel
- Example:
+ Create the files `panel_app_basic.py` and `app_basic.py` with the content below.
- `panel_app_basic.py`
+ **panel_app_basic.py**
.. code-block:: python
@@ -45,7 +56,7 @@ class PanelFrontend(Frontend):
pn.panel("Hello **Panel ⚡** World").servable()
- `app_basic.py`
+ **app_basic.py**
.. code-block:: python
@@ -69,20 +80,15 @@ def configure_layout(self):
app = L.LightningApp(LitApp())
- You can start the Lightning server with Panel autoreload by setting the `PANEL_AUTORELOAD`
- environment variable to 'yes': `PANEL_AUTORELOAD=yes lightning run app app_basic.py`.
+ Start the Lightning server with `lightning run app app_basic.py`.
- Args:
- entry_point: A pure function or the path to a .py or .ipynb file.
- The function must be a pure function that contains your Panel code.
- The function can optionally accept an `AppStateWatcher` argument.
-
- Raises:
- TypeError: Raised if the entry_point is a class method
+    For development you can enable Panel autoreload by setting the ``PANEL_AUTORELOAD``
+    environment variable to 'yes', i.e. run
+    ``PANEL_AUTORELOAD=yes lightning run app app_basic.py``.
"""
@requires("panel")
- def __init__(self, entry_point: Callable | str):
+ def __init__(self, entry_point: str | Callable):
super().__init__()
if inspect.ismethod(entry_point):
diff --git a/src/lightning_app/runners/cloud.py b/src/lightning_app/runners/cloud.py
index 81e9e10ebb14a..af3eca424282d 100644
--- a/src/lightning_app/runners/cloud.py
+++ b/src/lightning_app/runners/cloud.py
@@ -4,7 +4,6 @@
import string
import sys
import time
-import traceback
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, List, Optional, Union
@@ -43,6 +42,7 @@
)
from lightning_cloud.openapi.rest import ApiException
+from lightning_app import LightningWork
from lightning_app.core.app import LightningApp
from lightning_app.core.constants import (
CLOUD_QUEUE_TYPE,
@@ -62,8 +62,8 @@
from lightning_app.utilities.app_helpers import Logger
from lightning_app.utilities.cloud import _get_project
from lightning_app.utilities.dependency_caching import get_hash
-from lightning_app.utilities.load_app import _prettifiy_exception, load_app_from_file
-from lightning_app.utilities.packaging.app_config import AppConfig, find_config_file
+from lightning_app.utilities.load_app import load_app_from_file
+from lightning_app.utilities.packaging.app_config import _get_config_file, AppConfig
from lightning_app.utilities.packaging.lightning_utils import _prepare_lightning_wheels_and_requirements
from lightning_app.utilities.secrets import _names_to_ids
@@ -95,10 +95,11 @@ def dispatch(
# TODO: verify lightning version
# _verify_lightning_version()
- config_file = find_config_file(self.entrypoint_file)
- app_config = AppConfig.load_from_file(config_file) if config_file else AppConfig()
- root = config_file.parent if config_file else Path(self.entrypoint_file).absolute().parent
+ config_file = _get_config_file(self.entrypoint_file)
+ app_config = AppConfig.load_from_file(config_file) if config_file.exists() else AppConfig()
+ root = Path(self.entrypoint_file).absolute().parent
cleanup_handle = _prepare_lightning_wheels_and_requirements(root)
+ self.app._update_index_file()
repo = LocalSourceCodeDir(path=root)
self._check_uploaded_folder(root, repo)
requirements_file = root / "requirements.txt"
@@ -141,78 +142,79 @@ def dispatch(
v1_env_vars.append(V1EnvVar(name="ENABLE_PUSHING_STATE_ENDPOINT", value="0"))
works: List[V1Work] = []
- for flow in self.app.flows:
- for work in flow.works(recurse=False):
- if not work._start_with_flow:
- continue
-
- work_requirements = "\n".join(work.cloud_build_config.requirements)
- build_spec = V1BuildSpec(
- commands=work.cloud_build_config.build_commands(),
- python_dependencies=V1PythonDependencyInfo(
- package_manager=V1PackageManager.PIP, packages=work_requirements
- ),
- image=work.cloud_build_config.image,
- )
- user_compute_config = V1UserRequestedComputeConfig(
- name=work.cloud_compute.name,
- count=1,
- disk_size=work.cloud_compute.disk_size,
- preemptible=work.cloud_compute.preemptible,
- shm_size=work.cloud_compute.shm_size,
- )
+ for work in self.app.works:
+ _validate_build_spec_and_compute(work)
- drive_specs: List[V1LightningworkDrives] = []
- for drive_attr_name, drive in [
- (k, getattr(work, k)) for k in work._state if isinstance(getattr(work, k), Drive)
- ]:
- if drive.protocol == "lit://":
- drive_type = V1DriveType.NO_MOUNT_S3
- source_type = V1SourceType.S3
- else:
- raise RuntimeError(
- f"unknown drive protocol `{drive.protocol}`. Please verify this "
- f"drive type has been configured for use in the cloud dispatcher."
- )
+ if not work._start_with_flow:
+ continue
- drive_specs.append(
- V1LightningworkDrives(
- drive=V1Drive(
- metadata=V1Metadata(
- name=f"{work.name}.{drive_attr_name}",
- ),
- spec=V1DriveSpec(
- drive_type=drive_type,
- source_type=source_type,
- source=f"{drive.protocol}{drive.id}",
- ),
- status=V1DriveStatus(),
+ work_requirements = "\n".join(work.cloud_build_config.requirements)
+ build_spec = V1BuildSpec(
+ commands=work.cloud_build_config.build_commands(),
+ python_dependencies=V1PythonDependencyInfo(
+ package_manager=V1PackageManager.PIP, packages=work_requirements
+ ),
+ image=work.cloud_build_config.image,
+ )
+ user_compute_config = V1UserRequestedComputeConfig(
+ name=work.cloud_compute.name,
+ count=1,
+ disk_size=work.cloud_compute.disk_size,
+ preemptible=work.cloud_compute.preemptible,
+ shm_size=work.cloud_compute.shm_size,
+ )
+
+ drive_specs: List[V1LightningworkDrives] = []
+ for drive_attr_name, drive in [
+ (k, getattr(work, k)) for k in work._state if isinstance(getattr(work, k), Drive)
+ ]:
+ if drive.protocol == "lit://":
+ drive_type = V1DriveType.NO_MOUNT_S3
+ source_type = V1SourceType.S3
+ else:
+ raise RuntimeError(
+ f"unknown drive protocol `{drive.protocol}`. Please verify this "
+ f"drive type has been configured for use in the cloud dispatcher."
+ )
+
+ drive_specs.append(
+ V1LightningworkDrives(
+ drive=V1Drive(
+ metadata=V1Metadata(
+ name=f"{work.name}.{drive_attr_name}",
),
- mount_location=str(drive.root_folder),
+ spec=V1DriveSpec(
+ drive_type=drive_type,
+ source_type=source_type,
+ source=f"{drive.protocol}{drive.id}",
+ ),
+ status=V1DriveStatus(),
),
- )
+ mount_location=str(drive.root_folder),
+ ),
+ )
- # TODO: Move this to the CloudCompute class and update backend
- if work.cloud_compute.mounts is not None:
- mounts = work.cloud_compute.mounts
- if isinstance(mounts, Mount):
- mounts = [mounts]
- for mount in mounts:
- drive_specs.append(
- _create_mount_drive_spec(
- work_name=work.name,
- mount=mount,
- )
+ # TODO: Move this to the CloudCompute class and update backend
+ if work.cloud_compute.mounts is not None:
+ mounts = work.cloud_compute.mounts
+ if isinstance(mounts, Mount):
+ mounts = [mounts]
+ for mount in mounts:
+ drive_specs.append(
+ _create_mount_drive_spec(
+ work_name=work.name,
+ mount=mount,
)
+ )
- random_name = "".join(random.choice(string.ascii_lowercase) for _ in range(5))
- work_spec = V1LightningworkSpec(
- build_spec=build_spec,
- drives=drive_specs,
- user_requested_compute_config=user_compute_config,
- network_config=[V1NetworkConfig(name=random_name, port=work.port)],
- )
- works.append(V1Work(name=work.name, spec=work_spec))
+ random_name = "".join(random.choice(string.ascii_lowercase) for _ in range(5))
+ work_spec = V1LightningworkSpec(
+ build_spec=build_spec,
+ drives=drive_specs,
+ user_requested_compute_config=user_compute_config,
+ network_config=[V1NetworkConfig(name=random_name, port=work.port)],
+ )
+ works.append(V1Work(name=work.name, spec=work_spec))
# We need to collect a spec for each flow that contains a frontend so that the backend knows
# for which flows it needs to start servers by invoking the cli (see the serve_frontend() method below)
@@ -472,26 +474,17 @@ def _project_has_sufficient_credits(self, project: V1Membership, app: Optional[L
@classmethod
def load_app_from_file(cls, filepath: str) -> "LightningApp":
- """This is meant to use only locally for cloud runtime."""
+ """Load a LightningApp from a file, mocking the imports."""
try:
- app = load_app_from_file(filepath, raise_exception=True)
- except ModuleNotFoundError:
- # this is very generic exception.
- logger.info("Could not load the app locally. Starting the app directly on the cloud.")
- # we want to format the exception as if no frame was on top.
- exp, val, tb = sys.exc_info()
- listing = traceback.format_exception(exp, val, tb)
- # remove the entry for the first frame
- del listing[1]
- from lightning_app.testing.helpers import EmptyFlow
-
- # Create a mocking app.
- app = LightningApp(EmptyFlow())
-
+ app = load_app_from_file(filepath, raise_exception=True, mock_imports=True)
except FileNotFoundError as e:
raise e
except Exception:
- _prettifiy_exception(filepath)
+ from lightning_app.testing.helpers import EmptyFlow
+
+ # Create a generic app.
+ logger.info("Could not load the app locally. Starting the app directly on the cloud.")
+ app = LightningApp(EmptyFlow())
return app
@@ -519,3 +512,12 @@ def _create_mount_drive_spec(work_name: str, mount: Mount) -> V1LightningworkDri
),
mount_location=str(mount.mount_path),
)
+
+
+def _validate_build_spec_and_compute(work: LightningWork) -> None:
+ if work.cloud_build_config.image is not None and work.cloud_compute.name == "default":
+ raise ValueError(
+ f"You requested a custom base image for the Work with name '{work.name}', but custom images are currently"
+ " not supported on the default cloud compute instance. Please choose a different configuration, for example"
+ " `CloudCompute('cpu-medium')`."
+ )
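A hedged illustration of the new guard, using a tiny stand-in work (the `Noop` class is illustrative; `BuildConfig` is imported the same way the updated tests in this diff do, and its `image` field is assumed from the attribute access in the hunk):

from lightning_app import BuildConfig, LightningWork
from lightning_app.runners.cloud import _validate_build_spec_and_compute
from lightning_app.utilities.packaging.cloud_compute import CloudCompute

class Noop(LightningWork):
    def run(self):
        pass

bad = Noop(cloud_build_config=BuildConfig(image="my/base:latest"))  # default compute
# _validate_build_spec_and_compute(bad)  # would raise ValueError

good = Noop(
    cloud_build_config=BuildConfig(image="my/base:latest"),
    cloud_compute=CloudCompute("cpu-medium"),
)
_validate_build_spec_and_compute(good)  # passes silently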
diff --git a/src/lightning_app/runners/multiprocess.py b/src/lightning_app/runners/multiprocess.py
index 1bc8c7b5cf178..8abd0a443ac32 100644
--- a/src/lightning_app/runners/multiprocess.py
+++ b/src/lightning_app/runners/multiprocess.py
@@ -30,9 +30,11 @@ def dispatch(self, *args: Any, on_before_run: Optional[Callable] = None, **kwarg
"""Method to dispatch and run the LightningApp."""
try:
_set_flow_context()
+
self.app.backend = self.backend
self.backend._prepare_queues(self.app)
self.backend.resolve_url(self.app, "http://127.0.0.1")
+ self.app._update_index_file()
# set env variables
os.environ.update(self.env_vars)
diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py
index f4c8c001acad7..43aa7c55be728 100644
--- a/src/lightning_app/testing/testing.py
+++ b/src/lightning_app/testing/testing.py
@@ -361,7 +361,7 @@ def run_app_in_cloud(
except playwright._impl._api_types.TimeoutError:
print("'Create Project' dialog not visible, skipping.")
- admin_page.locator(f"text={name}").click()
+ admin_page.locator(f"role=link[name='{name}']").click()
sleep(5)
# Scroll to the bottom of the page. Used to capture all logs.
admin_page.evaluate(
@@ -431,7 +431,18 @@ def fetch_logs(component_names: Optional[List[str]] = None) -> Generator:
project_id=project.project_id,
app_id=app_id,
).lightningworks
+
component_names = ["flow"] + [w.name for w in works]
+ else:
+
+ def add_prefix(c: str) -> str:
+ if c == "flow":
+ return c
+ if not c.startswith("root."):
+ return "root." + c
+ return c
+
+ component_names = [add_prefix(c) for c in component_names]
gen = _app_logs_reader(
logs_api_client=logs_api_client,
diff --git a/src/lightning_app/utilities/app_commands.py b/src/lightning_app/utilities/app_commands.py
index 011cb071299a1..3ec4c6d67dc57 100644
--- a/src/lightning_app/utilities/app_commands.py
+++ b/src/lightning_app/utilities/app_commands.py
@@ -38,12 +38,13 @@ def _extract_commands_from_file(file_name: str) -> CommandLines:
file_lines = f.readlines()
for line_number, line in enumerate(file_lines):
- if line.strip() in APP_COMMAND_LINES_TO_IGNORE:
+ line = line.strip()
+ if line in APP_COMMAND_LINES_TO_IGNORE:
continue
-        # stop parsing at first non-comment line at top of file
+        # skip lines that are not comments (e.g. blank lines at the top of the file)
         if not line.startswith("#"):
-            break
+            continue
# remove comment marker and any leading / trailing whitespaces
line = line.lstrip("#").strip()
diff --git a/src/lightning_app/utilities/app_helpers.py b/src/lightning_app/utilities/app_helpers.py
index 3f2de886bcc64..d63e33db6addb 100644
--- a/src/lightning_app/utilities/app_helpers.py
+++ b/src/lightning_app/utilities/app_helpers.py
@@ -1,5 +1,6 @@
import abc
import asyncio
+import builtins
import enum
import functools
import inspect
@@ -10,9 +11,11 @@
import threading
import time
from abc import ABC, abstractmethod
+from contextlib import contextmanager
from copy import deepcopy
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, Generator, List, Mapping, Optional, Tuple, Type, TYPE_CHECKING
+from unittest.mock import MagicMock
import websockets
from deepdiff import Delta
@@ -486,6 +489,29 @@ def _load_state_dict(root_flow: "LightningFlow", state: Dict[str, Any], strict:
raise Exception(f"The component {component_name} was re-created during state reloading.")
+class _MagicMockJsonSerializable(MagicMock):
+ @staticmethod
+ def __json__():
+ return "{}"
+
+
+def _mock_import(*args, original_fn=None):
+ try:
+ return original_fn(*args)
+ except Exception:
+ return _MagicMockJsonSerializable()
+
+
+@contextmanager
+def _mock_missing_imports():
+ original_fn = builtins.__import__
+ builtins.__import__ = functools.partial(_mock_import, original_fn=original_fn)
+ try:
+ yield
+ finally:
+ builtins.__import__ = original_fn
+
+
def is_static_method(klass_or_instance, attr) -> bool:
return isinstance(inspect.getattr_static(klass_or_instance, attr), staticmethod)
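A quick demonstration of what the new context manager buys: inside the block, a failing import is swallowed and the bound name becomes a `_MagicMockJsonSerializable`, which the layout utilities later in this diff explicitly recognize and skip. Sketch only; the package name is deliberately fake:

from lightning_app.utilities.app_helpers import _mock_missing_imports

with _mock_missing_imports():
    import not_a_real_package  # no ImportError: resolves to a JSON-serializable MagicMock

print(not_a_real_package.some.attr)  # MagicMock attribute chains work as usual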
diff --git a/src/lightning_app/utilities/cli_helpers.py b/src/lightning_app/utilities/cli_helpers.py
index b3b59aa106074..293944ca82c50 100644
--- a/src/lightning_app/utilities/cli_helpers.py
+++ b/src/lightning_app/utilities/cli_helpers.py
@@ -299,10 +299,11 @@ def _check_environment_and_redirect():
If not, this utility tries to redirect the ``lightning`` call to the environment executable (prompting the user to
install lightning for them there if needed).
"""
- env_executable = shutil.which("python")
+ env_executable = os.path.realpath(shutil.which("python"))
+ sys_executable = os.path.realpath(sys.executable)
# on windows, the extension might be different, where one uses `.EXE` and the other `.exe`
- if env_executable.lower() != sys.executable.lower():
+ if env_executable.lower() != sys_executable.lower():
logger.info(
"Lightning is running from outside your current environment. Switching to your current environment."
)
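For context on the failure mode being fixed: in a virtualenv the `python` found on `PATH` is typically a symlink, so a raw string comparison against `sys.executable` can disagree even when both resolve to the same binary. A minimal reproduction of the check:

import os
import shutil
import sys

env_executable = os.path.realpath(shutil.which("python"))  # resolve venv symlinks
sys_executable = os.path.realpath(sys.executable)
# .lower() tolerates Windows `.EXE` vs `.exe` differences, as noted above.
print(env_executable.lower() == sys_executable.lower())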
diff --git a/src/lightning_app/utilities/layout.py b/src/lightning_app/utilities/layout.py
index ca12ab8b7a616..9235993ad31d1 100644
--- a/src/lightning_app/utilities/layout.py
+++ b/src/lightning_app/utilities/layout.py
@@ -4,6 +4,7 @@
import lightning_app
from lightning_app.frontend.frontend import Frontend
+from lightning_app.utilities.app_helpers import _MagicMockJsonSerializable
from lightning_app.utilities.cloud import is_running_in_cloud
@@ -39,6 +40,9 @@ def _collect_layout(app: "lightning_app.LightningApp", flow: "lightning_app.Ligh
# When running locally, the target will get overwritten by the dispatcher when launching the frontend servers
# When running in the cloud, the frontend code will construct the URL based on the flow name
return flow._layout
+ elif isinstance(layout, _MagicMockJsonSerializable):
+ # Do nothing
+ pass
elif isinstance(layout, dict):
layout = _collect_content_layout([layout], flow)
elif isinstance(layout, (list, tuple)) and all(isinstance(item, dict) for item in layout):
@@ -103,6 +107,9 @@ def _collect_content_layout(layout: List[Dict], flow: "lightning_app.LightningFl
else:
entry["content"] = ""
entry["target"] = ""
+ elif isinstance(entry["content"], _MagicMockJsonSerializable):
+ # Do nothing
+ pass
else:
m = f"""
A dictionary returned by `{flow.__class__.__name__}.configure_layout()` contains an unsupported entry.
diff --git a/src/lightning_app/utilities/load_app.py b/src/lightning_app/utilities/load_app.py
index 2182162f3e0c3..43a6776721cbb 100644
--- a/src/lightning_app/utilities/load_app.py
+++ b/src/lightning_app/utilities/load_app.py
@@ -4,6 +4,7 @@
import traceback
import types
from contextlib import contextmanager
+from copy import copy
from typing import Dict, List, TYPE_CHECKING, Union
from lightning_app.utilities.exceptions import MisconfigurationException
@@ -11,7 +12,7 @@
if TYPE_CHECKING:
from lightning_app import LightningApp, LightningFlow, LightningWork
-from lightning_app.utilities.app_helpers import Logger
+from lightning_app.utilities.app_helpers import _mock_missing_imports, Logger
logger = Logger(__name__)
@@ -30,7 +31,7 @@ def _prettifiy_exception(filepath: str):
sys.exit(1)
-def load_app_from_file(filepath: str, raise_exception: bool = False) -> "LightningApp":
+def load_app_from_file(filepath: str, raise_exception: bool = False, mock_imports: bool = False) -> "LightningApp":
"""Load a LightningApp from a file.
Arguments:
@@ -50,7 +51,11 @@ def load_app_from_file(filepath: str, raise_exception: bool = False) -> "Lightni
module = _create_fake_main_module(filepath)
try:
with _patch_sys_argv():
- exec(code, module.__dict__)
+ if mock_imports:
+ with _mock_missing_imports():
+ exec(code, module.__dict__)
+ else:
+ exec(code, module.__dict__)
except Exception as e:
if raise_exception:
raise e
@@ -140,7 +145,7 @@ def _patch_sys_argv():
"""
from lightning_app.cli.lightning_cli import run_app
- original_argv = sys.argv
+ original_argv = copy(sys.argv)
# 1: Remove the CLI command
if sys.argv[:3] == ["lightning", "run", "app"]:
sys.argv = sys.argv[3:]
diff --git a/src/lightning_app/utilities/packaging/app_config.py b/src/lightning_app/utilities/packaging/app_config.py
index 59d05debc088c..c3e44159ffb4e 100644
--- a/src/lightning_app/utilities/packaging/app_config.py
+++ b/src/lightning_app/utilities/packaging/app_config.py
@@ -28,7 +28,7 @@ def save_to_file(self, path: Union[str, pathlib.Path]) -> None:
def save_to_dir(self, directory: Union[str, pathlib.Path]) -> None:
"""Save the configuration to a file '.lightning' to the given folder in YAML format."""
- self.save_to_file(pathlib.Path(directory, _APP_CONFIG_FILENAME))
+ self.save_to_file(_get_config_file(directory))
@classmethod
def load_from_file(cls, path: Union[str, pathlib.Path]) -> "AppConfig":
@@ -47,22 +47,14 @@ def load_from_dir(cls, directory: Union[str, pathlib.Path]) -> "AppConfig":
return cls.load_from_file(pathlib.Path(directory, _APP_CONFIG_FILENAME))
-def find_config_file(source_path: pathlib.Path = pathlib.Path.cwd()) -> Optional[pathlib.Path]:
- """Search for the Lightning app config file '.lightning' at the given source path.
-
- Relative to the given path, it will search for the '.lightning' config file by going up the directory structure
- until found. Returns ``None`` if no config file is found in any of the parent directories.
+def _get_config_file(source_path: Union[str, pathlib.Path]) -> pathlib.Path:
+ """Get the Lightning app config file '.lightning' at the given source path.
Args:
- source_path: A path to a folder or a file. The search for the config file will start relative to this path.
+ source_path: A path to a folder or a file.
"""
source_path = pathlib.Path(source_path).absolute()
if source_path.is_file():
source_path = source_path.parent
- candidate = pathlib.Path(source_path / _APP_CONFIG_FILENAME)
- if candidate.is_file():
- return candidate
-
- if source_path.parents:
- return find_config_file(source_path.parent)
+ return pathlib.Path(source_path / _APP_CONFIG_FILENAME)
diff --git a/src/lightning_app/utilities/proxies.py b/src/lightning_app/utilities/proxies.py
index 16f7044f0c3cb..07b03da7d9201 100644
--- a/src/lightning_app/utilities/proxies.py
+++ b/src/lightning_app/utilities/proxies.py
@@ -7,11 +7,12 @@
import time
import traceback
import warnings
+from contextlib import contextmanager
from copy import deepcopy
from dataclasses import dataclass, field
from functools import partial
from threading import Event, Thread
-from typing import Any, Callable, Dict, Optional, Set, Tuple, Type, TYPE_CHECKING, Union
+from typing import Any, Callable, Dict, Generator, Optional, Set, Tuple, Type, TYPE_CHECKING, Union
from deepdiff import DeepDiff, Delta
from lightning_utilities.core.apply_func import apply_to_collection
@@ -102,8 +103,6 @@ class ProxyWorkRun:
caller_queue: "BaseQueue"
def __post_init__(self):
- self.cache_calls = self.work.cache_calls
- self.parallel = self.work.parallel
self.work_state = None
def __call__(self, *args, **kwargs):
@@ -122,7 +121,7 @@ def __call__(self, *args, **kwargs):
# The if/else conditions are left un-compressed to simplify readability
# for the readers.
- if self.cache_calls:
+ if self.work.cache_calls:
if not entered or stopped_on_sigterm:
_send_data_to_caller_queue(self, self.work, self.caller_queue, data, call_hash)
else:
@@ -136,7 +135,7 @@ def __call__(self, *args, **kwargs):
# the previous task has completed and we can re-queue the next one.
# overriding the return value for next loop iteration.
_send_data_to_caller_queue(self, self.work, self.caller_queue, data, call_hash)
- if not self.parallel:
+ if not self.work.parallel:
raise CacheMissException("Task never called before. Triggered now")
def _validate_call_args(self, args: Tuple[Any, ...], kwargs: Dict[str, Any]) -> None:
@@ -314,7 +313,7 @@ def run(self):
work_name: str
work: "LightningWork"
delta_queue: "BaseQueue"
- state_observer: "WorkStateObserver"
+ state_observer: Optional["WorkStateObserver"]
def __call__(self, name: str, value: Any) -> None:
logger.debug(f"Setting {name}: {value}")
@@ -329,7 +328,8 @@ def __call__(self, name: str, value: Any) -> None:
self.delta_queue.put(ComponentDelta(id=self.work_name, delta=delta))
# add the delta to the buffer to let WorkStateObserver know we already sent this one to the Flow
- self.state_observer._delta_memory.append(delta)
+ if self.state_observer:
+ self.state_observer._delta_memory.append(delta)
@dataclass
@@ -343,10 +343,36 @@ class WorkRunExecutor:
work: "LightningWork"
work_run: Callable
+ delta_queue: "BaseQueue"
+ enable_start_observer: bool = True
def __call__(self, *args, **kwargs):
return self.work_run(*args, **kwargs)
+ @contextmanager
+ def enable_spawn(self) -> Generator:
+ self.work._setattr_replacement = None
+ self.work._backend = None
+ self._clean_queues()
+ yield
+
+ def _clean_queues(self):
+ if "LIGHTNING_APP_STATE_URL" in os.environ:
+ self.work._request_queue = self.work._request_queue.to_dict()
+ self.work._response_queue = self.work._response_queue.to_dict()
+
+ @staticmethod
+ def process_queue(queue):
+ from lightning_app.core.queues import HTTPQueue, RedisQueue
+
+ if isinstance(queue, dict):
+ queue_type = queue.pop("type")
+ if queue_type == "redis":
+ return RedisQueue.from_dict(queue)
+ else:
+ return HTTPQueue.from_dict(queue)
+ return queue
+
@dataclass
class WorkRunner:
@@ -442,12 +468,13 @@ def run_once(self):
self._transfer_path_attributes()
# 6. Create the state observer thread.
- self.state_observer = WorkStateObserver(
- self.work,
- delta_queue=self.delta_queue,
- flow_to_work_delta_queue=self.flow_to_work_delta_queue,
- error_queue=self.error_queue,
- )
+ if self.run_executor_cls.enable_start_observer:
+ self.state_observer = WorkStateObserver(
+ self.work,
+ delta_queue=self.delta_queue,
+ flow_to_work_delta_queue=self.flow_to_work_delta_queue,
+ error_queue=self.error_queue,
+ )
# 7. Deepcopy the work state and send the first `RUNNING` status delta to the flow.
reference_state = deepcopy(self.work.state)
@@ -478,12 +505,13 @@ def run_once(self):
# 11. Start the state observer thread. It will look for state changes and send them back to the Flow
# The observer has to be initialized here, after the set_state call above so that the thread can start with
# the proper initial state of the work
- self.state_observer.start()
+ if self.run_executor_cls.enable_start_observer:
+ self.state_observer.start()
# 12. Run the `work_run` method.
# If an exception is raised, send a `FAILED` status delta to the flow and call the `on_exception` hook.
try:
- ret = self.run_executor_cls(self.work, work_run)(*args, **kwargs)
+ ret = self.run_executor_cls(self.work, work_run, self.delta_queue)(*args, **kwargs)
except LightningSigtermStateException as e:
raise e
except BaseException as e:
@@ -500,7 +528,7 @@ def run_once(self):
used_runpy = True
if user_exception:
trace.append(p)
- if "ret = self.run_executor_cls(self.work, work_run)(*args, **kwargs)" in p:
+ if "ret = self.run_executor_cls(" in p:
user_exception = True
if used_runpy:
@@ -525,7 +553,8 @@ def run_once(self):
return
# 13. Destroy the state observer.
- self.state_observer.join(0)
+ if self.run_executor_cls.enable_start_observer:
+ self.state_observer.join(0)
self.state_observer = None
# 14. Copy all artifacts to the shared storage so other Works can access them while this Work gets scaled down
@@ -574,14 +603,7 @@ def _sigterm_signal_handler(self, signum, frame, call_hash: str) -> None:
raise LightningSigtermStateException(0)
def _proxy_setattr(self, cleanup: bool = False):
- if cleanup:
- setattr_proxy = None
- else:
- assert self.state_observer
- setattr_proxy = LightningWorkSetAttrProxy(
- self.work_name, self.work, delta_queue=self.delta_queue, state_observer=self.state_observer
- )
- self.work._setattr_replacement = setattr_proxy
+ _proxy_setattr(self.work, self.delta_queue, self.state_observer, cleanup=cleanup)
def _process_call_args(
self, args: Tuple[Any, ...], kwargs: Dict[str, Any]
@@ -688,3 +710,16 @@ def persist_artifacts(work: "LightningWork") -> None:
f"All {destination_paths} artifacts from Work {work.name} successfully "
"stored at {artifacts_path(work.name)}."
)
+
+
+def _proxy_setattr(work, delta_queue, state_observer: Optional[WorkStateObserver], cleanup: bool = False):
+ if cleanup:
+ setattr_proxy = None
+ else:
+ setattr_proxy = LightningWorkSetAttrProxy(
+ work.name,
+ work,
+ delta_queue=delta_queue,
+ state_observer=state_observer,
+ )
+ work._setattr_replacement = setattr_proxy
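Moving `_proxy_setattr` to module level lets the spawn executor install the same attribute hook from inside a child process. A toy model of the hook mechanism it relies on (a minimal sketch, not the real `LightningWork` implementation):

class ToyWork:
    _setattr_replacement = None

    def __setattr__(self, name, value):
        # Route public attribute writes through the replacement, if one is installed.
        if self._setattr_replacement is not None and not name.startswith("_"):
            self._setattr_replacement(name, value)
        object.__setattr__(self, name, value)

w = ToyWork()
w._setattr_replacement = lambda name, value: print(f"delta: {name}={value}")
w.counter = 1  # prints "delta: counter=1", then stores the value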
diff --git a/src/lightning_lite/CHANGELOG.md b/src/lightning_lite/CHANGELOG.md
index 03d371681a95d..61a6bfe685c69 100644
--- a/src/lightning_lite/CHANGELOG.md
+++ b/src/lightning_lite/CHANGELOG.md
@@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
+## [1.8.2] - 2022-11-17
+
+### Fixed
+
+- Fixed the automatic fallback from `LightningLite(strategy="ddp_spawn", ...)` to `LightningLite(strategy="ddp", ...)` when on an LSF cluster ([#15103](https://github.com/PyTorchLightning/pytorch-lightning/issues/15103))
+
## [1.8.1] - 2022-11-10
diff --git a/src/lightning_lite/__version__.py b/src/lightning_lite/__version__.py
index 72126ce16b766..ba22724db3594 100644
--- a/src/lightning_lite/__version__.py
+++ b/src/lightning_lite/__version__.py
@@ -1 +1 @@
-version = "1.8.1"
+version = "1.8.2"
diff --git a/src/lightning_lite/connector.py b/src/lightning_lite/connector.py
index 788e4f9529115..738f7cc661b05 100644
--- a/src/lightning_lite/connector.py
+++ b/src/lightning_lite/connector.py
@@ -395,7 +395,10 @@ def _check_strategy_and_fallback(self) -> None:
strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag
if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
- TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or SLURMEnvironment.detect()
+ TorchElasticEnvironment.detect()
+ or KubeflowEnvironment.detect()
+ or SLURMEnvironment.detect()
+ or LSFEnvironment.detect()
):
strategy_flag = "ddp"
if strategy_flag == "dp" and self._accelerator_flag == "cpu":
diff --git a/src/lightning_lite/lite.py b/src/lightning_lite/lite.py
index a25655a5ba409..e6890742e42d9 100644
--- a/src/lightning_lite/lite.py
+++ b/src/lightning_lite/lite.py
@@ -123,7 +123,7 @@ def world_size(self) -> int:
@property
def is_global_zero(self) -> bool:
- """Wether this rank is rank zero."""
+ """Whether this rank is rank zero."""
return self._strategy.is_global_zero
@abstractmethod
diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index bd9a346931e2b..21d881f29e8bd 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
+## [1.8.2] - 2022-11-17
+
+### Fixed
+
+- Made sure `save_dir` can be an empty string ([#15638](https://github.com/PyTorchLightning/pytorch-lightning/issues/15638))
+- Fixed the automatic fallback from `Trainer(strategy="ddp_spawn", ...)` to `Trainer(strategy="ddp", ...)` when on an LSF cluster ([#15103](https://github.com/PyTorchLightning/pytorch-lightning/issues/15103))
+
## [1.8.1] - 2022-11-10
diff --git a/src/pytorch_lightning/__setup__.py b/src/pytorch_lightning/__setup__.py
index 442bda630b884..a7ecff67d0630 100644
--- a/src/pytorch_lightning/__setup__.py
+++ b/src/pytorch_lightning/__setup__.py
@@ -124,5 +124,6 @@ def _setup_args(**__: Any) -> Dict[str, Any]:
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
],
)
diff --git a/src/pytorch_lightning/__version__.py b/src/pytorch_lightning/__version__.py
index 72126ce16b766..ba22724db3594 100644
--- a/src/pytorch_lightning/__version__.py
+++ b/src/pytorch_lightning/__version__.py
@@ -1 +1 @@
-version = "1.8.1"
+version = "1.8.2"
diff --git a/src/pytorch_lightning/callbacks/model_checkpoint.py b/src/pytorch_lightning/callbacks/model_checkpoint.py
index 8d0596e3bdccd..2e7b9bbb27b29 100644
--- a/src/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/src/pytorch_lightning/callbacks/model_checkpoint.py
@@ -574,7 +574,10 @@ def __resolve_ckpt_dir(self, trainer: "pl.Trainer") -> _PATH:
return self.dirpath
if len(trainer.loggers) > 0:
- save_dir = trainer.loggers[0].save_dir or trainer.default_root_dir
+ if trainer.loggers[0].save_dir is not None:
+ save_dir = trainer.loggers[0].save_dir
+ else:
+ save_dir = trainer.default_root_dir
name = trainer.loggers[0].name
version = trainer.loggers[0].version
version = version if isinstance(version, str) else f"version_{version}"
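This checkpoint-dir change looks cosmetic but is not: `save_dir or default` treats the empty string as missing because `""` is falsy, while the explicit `is not None` check preserves it. In a few lines:

default_root_dir = "/tmp/lightning"

save_dir = ""  # a legitimate value meaning "current directory" for some loggers
assert (save_dir or default_root_dir) == default_root_dir               # old: "" discarded
assert (save_dir if save_dir is not None else default_root_dir) == ""  # new: "" kept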
diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py
index d00e13a3194a9..fd8d2d4f4aa76 100644
--- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py
+++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -614,7 +614,10 @@ def _check_strategy_and_fallback(self) -> None:
strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag
if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
- TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or SLURMEnvironment.detect()
+ TorchElasticEnvironment.detect()
+ or KubeflowEnvironment.detect()
+ or SLURMEnvironment.detect()
+ or LSFEnvironment.detect()
):
strategy_flag = "ddp"
if strategy_flag == "dp" and self._accelerator_flag == "cpu":
diff --git a/tests/tests_app/components/database/test_client_server.py b/tests/tests_app/components/database/test_client_server.py
index 6ebec90ff9b1e..7b193d8f74c20 100644
--- a/tests/tests_app/components/database/test_client_server.py
+++ b/tests/tests_app/components/database/test_client_server.py
@@ -2,6 +2,7 @@
import sys
import tempfile
import time
+import traceback
from pathlib import Path
from time import sleep
from typing import List, Optional
@@ -197,7 +198,9 @@ def run(self):
assert len(self._client.select_all()) == 1
self._exit()
- with tempfile.TemporaryDirectory() as tmpdir:
-
- app = LightningApp(Flow(tmpdir))
- MultiProcessRuntime(app).dispatch()
+ try:
+ with tempfile.TemporaryDirectory() as tmpdir:
+ app = LightningApp(Flow(tmpdir))
+ MultiProcessRuntime(app).dispatch()
+ except Exception:
+        traceback.print_exc()
diff --git a/tests/tests_app/components/serve/test_gradio.py b/tests/tests_app/components/serve/test_gradio.py
index 8dcdeec70a341..0b57656e6aa31 100644
--- a/tests/tests_app/components/serve/test_gradio.py
+++ b/tests/tests_app/components/serve/test_gradio.py
@@ -27,4 +27,6 @@ def predict(self, *args, **kwargs):
comp.run()
assert comp.model == "model"
assert comp.predict() == "prediction"
- gradio_mock.Interface.assert_called_once_with(fn=ANY, inputs=ANY, outputs=ANY, examples=ANY)
+ gradio_mock.Interface.assert_called_once_with(
+ fn=ANY, inputs=ANY, outputs=ANY, examples=ANY, title=None, description=None
+ )
diff --git a/tests/tests_app/core/test_lightning_api.py b/tests/tests_app/core/test_lightning_api.py
index a0069f1314841..d81c72c06f071 100644
--- a/tests/tests_app/core/test_lightning_api.py
+++ b/tests/tests_app/core/test_lightning_api.py
@@ -42,7 +42,7 @@
class WorkA(LightningWork):
def __init__(self):
- super().__init__(parallel=True)
+ super().__init__(parallel=True, start_with_flow=False)
self.var_a = 0
self.drive = Drive("lit://test_app_state_api")
diff --git a/tests/tests_app/core/test_lightning_app.py b/tests/tests_app/core/test_lightning_app.py
index d95cac9899761..1b438f14632bb 100644
--- a/tests/tests_app/core/test_lightning_app.py
+++ b/tests/tests_app/core/test_lightning_app.py
@@ -247,10 +247,9 @@ def test_get_component_by_name_raises():
app.get_component_by_name("root.b.w_b.c")
-@pytest.mark.parametrize("runtime_cls", [SingleProcessRuntime, MultiProcessRuntime])
-def test_nested_component(runtime_cls):
+def test_nested_component():
app = LightningApp(A(), log_level="debug")
- runtime_cls(app, start_server=False).dispatch()
+ MultiProcessRuntime(app, start_server=False).dispatch()
assert app.root.w_a.c == 1
assert app.root.b.w_b.c == 1
assert app.root.b.c.w_c.c == 1
@@ -601,9 +600,10 @@ def run(self):
class CheckpointFlow(LightningFlow):
- def __init__(self, work: LightningWork, depth=0):
+ def __init__(self, work: CheckpointCounter, depth=0):
super().__init__()
self.depth = depth
+
if depth == 0:
self.counter = 0
@@ -613,10 +613,9 @@ def __init__(self, work: LightningWork, depth=0):
self.flow = CheckpointFlow(work, depth + 1)
def run(self):
- if hasattr(self, "counter"):
- self.counter += 1
- if self.counter > 5:
- self._exit()
+ if self.works()[0].counter == 5:
+ self._exit()
+
if self.depth >= 10:
self.work.run()
else:
@@ -627,19 +626,16 @@ def test_lightning_app_checkpointing_with_nested_flows():
work = CheckpointCounter()
app = LightningApp(CheckpointFlow(work))
app.checkpointing = True
- SingleProcessRuntime(app, start_server=False).dispatch()
+ MultiProcessRuntime(app, start_server=False).dispatch()
- assert app.root.counter == 6
assert app.root.flow.flow.flow.flow.flow.flow.flow.flow.flow.flow.work.counter == 5
work = CheckpointCounter()
app = LightningApp(CheckpointFlow(work))
- assert app.root.counter == 0
assert app.root.flow.flow.flow.flow.flow.flow.flow.flow.flow.flow.work.counter == 0
app.load_state_dict_from_checkpoint_dir(app.checkpoint_dir)
    # The counter was incremented to 6 after the latest checkpoint was created.
- assert app.root.counter == 5
assert app.root.flow.flow.flow.flow.flow.flow.flow.flow.flow.flow.work.counter == 5
@@ -956,8 +952,8 @@ def run(self):
def test_state_size_constant_growth():
app = LightningApp(SizeFlow())
MultiProcessRuntime(app, start_server=False).dispatch()
- assert app.root._state_sizes[0] <= 6952
- assert app.root._state_sizes[20] <= 26080
+ assert app.root._state_sizes[0] <= 7824
+ assert app.root._state_sizes[20] <= 26500
class FlowUpdated(LightningFlow):
diff --git a/tests/tests_app/runners/test_cloud.py b/tests/tests_app/runners/test_cloud.py
index 23a465968efc8..25bc590893280 100644
--- a/tests/tests_app/runners/test_cloud.py
+++ b/tests/tests_app/runners/test_cloud.py
@@ -1,5 +1,6 @@
import logging
import os
+import sys
from copy import copy
from pathlib import Path
from unittest import mock
@@ -39,10 +40,11 @@
V1Work,
)
-from lightning_app import _PROJECT_ROOT, LightningApp, LightningWork
+from lightning_app import _PROJECT_ROOT, BuildConfig, LightningApp, LightningWork
from lightning_app.runners import backends, cloud, CloudRuntime
+from lightning_app.runners.cloud import _validate_build_spec_and_compute
from lightning_app.storage import Drive, Mount
-from lightning_app.testing.helpers import EmptyFlow
+from lightning_app.testing.helpers import EmptyFlow, EmptyWork
from lightning_app.utilities.cloud import _get_project
from lightning_app.utilities.dependency_caching import get_hash
from lightning_app.utilities.packaging.cloud_compute import CloudCompute
@@ -54,8 +56,8 @@ def run(self):
class WorkWithSingleDrive(LightningWork):
- def __init__(self):
- super().__init__()
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
self.drive = None
def run(self):
@@ -63,8 +65,8 @@ def run(self):
class WorkWithTwoDrives(LightningWork):
- def __init__(self):
- super().__init__()
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
self.lit_drive_1 = None
self.lit_drive_2 = None
@@ -402,18 +404,16 @@ def test_call_with_work_app(self, lightningapps, start_with_flow, monkeypatch, t
monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock())
monkeypatch.setattr(cloud, "_prepare_lightning_wheels_and_requirements", mock.MagicMock())
app = mock.MagicMock()
- flow = mock.MagicMock()
- work = MyWork(start_with_flow=start_with_flow)
- monkeypatch.setattr(work, "_name", "test-work")
- monkeypatch.setattr(work._cloud_build_config, "build_commands", lambda: ["echo 'start'"])
- monkeypatch.setattr(work._cloud_build_config, "requirements", ["torch==1.0.0", "numpy==1.0.0"])
- monkeypatch.setattr(work._cloud_build_config, "image", "random_base_public_image")
- monkeypatch.setattr(work._cloud_compute, "disk_size", 0)
- monkeypatch.setattr(work, "_port", 8080)
+ work = MyWork(start_with_flow=start_with_flow, cloud_compute=CloudCompute("custom"))
+ work._name = "test-work"
+ work._cloud_build_config.build_commands = lambda: ["echo 'start'"]
+ work._cloud_build_config.requirements = ["torch==1.0.0", "numpy==1.0.0"]
+ work._cloud_build_config.image = "random_base_public_image"
+ work._cloud_compute.disk_size = 0
+ work._port = 8080
- flow.works = lambda recurse: [work]
- app.flows = [flow]
+ app.works = [work]
cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file=(source_code_root_dir / "entrypoint.py"))
monkeypatch.setattr(
"lightning_app.runners.cloud._get_project",
@@ -452,7 +452,7 @@ def test_call_with_work_app(self, lightningapps, start_with_flow, monkeypatch, t
),
drives=[],
user_requested_compute_config=V1UserRequestedComputeConfig(
- name="default",
+ name="custom",
count=1,
disk_size=0,
shm_size=0,
@@ -575,7 +575,6 @@ def test_call_with_work_app_and_attached_drives(self, lightningapps, monkeypatch
monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock())
monkeypatch.setattr(cloud, "_prepare_lightning_wheels_and_requirements", mock.MagicMock())
app = mock.MagicMock()
- flow = mock.MagicMock()
mocked_drive = MagicMock(spec=Drive)
setattr(mocked_drive, "id", "foobar")
@@ -588,7 +587,7 @@ def test_call_with_work_app_and_attached_drives(self, lightningapps, monkeypatch
# should be the results of the deepcopy operation (an instance of the original class)
mocked_drive.__deepcopy__.return_value = copy(mocked_drive)
- work = WorkWithSingleDrive()
+ work = WorkWithSingleDrive(cloud_compute=CloudCompute("custom"))
monkeypatch.setattr(work, "drive", mocked_drive)
monkeypatch.setattr(work, "_state", {"_port", "drive"})
monkeypatch.setattr(work, "_name", "test-work")
@@ -598,8 +597,7 @@ def test_call_with_work_app_and_attached_drives(self, lightningapps, monkeypatch
monkeypatch.setattr(work._cloud_compute, "disk_size", 0)
monkeypatch.setattr(work, "_port", 8080)
- flow.works = lambda recurse: [work]
- app.flows = [flow]
+ app.works = [work]
cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file=(source_code_root_dir / "entrypoint.py"))
monkeypatch.setattr(
"lightning_app.runners.cloud._get_project",
@@ -650,7 +648,7 @@ def test_call_with_work_app_and_attached_drives(self, lightningapps, monkeypatch
),
],
user_requested_compute_config=V1UserRequestedComputeConfig(
- name="default",
+ name="custom",
count=1,
disk_size=0,
shm_size=0,
@@ -712,19 +710,17 @@ def test_call_with_work_app_and_app_comment_command_execution_set(self, lightnin
monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock())
monkeypatch.setattr(cloud, "_prepare_lightning_wheels_and_requirements", mock.MagicMock())
app = mock.MagicMock()
- flow = mock.MagicMock()
- work = MyWork()
- monkeypatch.setattr(work, "_state", {"_port"})
- monkeypatch.setattr(work, "_name", "test-work")
- monkeypatch.setattr(work._cloud_build_config, "build_commands", lambda: ["echo 'start'"])
- monkeypatch.setattr(work._cloud_build_config, "requirements", ["torch==1.0.0", "numpy==1.0.0"])
- monkeypatch.setattr(work._cloud_build_config, "image", "random_base_public_image")
- monkeypatch.setattr(work._cloud_compute, "disk_size", 0)
- monkeypatch.setattr(work, "_port", 8080)
+ work = MyWork(cloud_compute=CloudCompute("custom"))
+ work._state = {"_port"}
+ work._name = "test-work"
+ work._cloud_build_config.build_commands = lambda: ["echo 'start'"]
+ work._cloud_build_config.requirements = ["torch==1.0.0", "numpy==1.0.0"]
+ work._cloud_build_config.image = "random_base_public_image"
+ work._cloud_compute.disk_size = 0
+ work._port = 8080
- flow.works = lambda recurse: [work]
- app.flows = [flow]
+ app.works = [work]
cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file=(source_code_root_dir / "entrypoint.py"))
monkeypatch.setattr(
"lightning_app.runners.cloud._get_project",
@@ -761,7 +757,7 @@ def test_call_with_work_app_and_app_comment_command_execution_set(self, lightnin
),
drives=[],
user_requested_compute_config=V1UserRequestedComputeConfig(
- name="default", count=1, disk_size=0, shm_size=0, preemptible=mock.ANY
+ name="custom", count=1, disk_size=0, shm_size=0, preemptible=mock.ANY
),
network_config=[V1NetworkConfig(name=mock.ANY, host=None, port=8080)],
cluster_id=mock.ANY,
@@ -829,7 +825,6 @@ def test_call_with_work_app_and_multiple_attached_drives(self, lightningapps, mo
monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock())
monkeypatch.setattr(cloud, "_prepare_lightning_wheels_and_requirements", mock.MagicMock())
app = mock.MagicMock()
- flow = mock.MagicMock()
mocked_lit_drive = MagicMock(spec=Drive)
setattr(mocked_lit_drive, "id", "foobar")
@@ -842,19 +837,18 @@ def test_call_with_work_app_and_multiple_attached_drives(self, lightningapps, mo
# should be the results of the deepcopy operation (an instance of the original class)
mocked_lit_drive.__deepcopy__.return_value = copy(mocked_lit_drive)
- work = WorkWithTwoDrives()
- monkeypatch.setattr(work, "lit_drive_1", mocked_lit_drive)
- monkeypatch.setattr(work, "lit_drive_2", mocked_lit_drive)
- monkeypatch.setattr(work, "_state", {"_port", "_name", "lit_drive_1", "lit_drive_2"})
- monkeypatch.setattr(work, "_name", "test-work")
- monkeypatch.setattr(work._cloud_build_config, "build_commands", lambda: ["echo 'start'"])
- monkeypatch.setattr(work._cloud_build_config, "requirements", ["torch==1.0.0", "numpy==1.0.0"])
- monkeypatch.setattr(work._cloud_build_config, "image", "random_base_public_image")
- monkeypatch.setattr(work._cloud_compute, "disk_size", 0)
- monkeypatch.setattr(work, "_port", 8080)
-
- flow.works = lambda recurse: [work]
- app.flows = [flow]
+ work = WorkWithTwoDrives(cloud_compute=CloudCompute("custom"))
+ work.lit_drive_1 = mocked_lit_drive
+ work.lit_drive_2 = mocked_lit_drive
+ work._state = {"_port", "_name", "lit_drive_1", "lit_drive_2"}
+ work._name = "test-work"
+ work._cloud_build_config.build_commands = lambda: ["echo 'start'"]
+ work._cloud_build_config.requirements = ["torch==1.0.0", "numpy==1.0.0"]
+ work._cloud_build_config.image = "random_base_public_image"
+ work._cloud_compute.disk_size = 0
+ work._port = 8080
+
+ app.works = [work]
cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file=(source_code_root_dir / "entrypoint.py"))
monkeypatch.setattr(
"lightning_app.runners.cloud._get_project",
@@ -922,7 +916,7 @@ def test_call_with_work_app_and_multiple_attached_drives(self, lightningapps, mo
),
drives=[lit_drive_2_spec, lit_drive_1_spec],
user_requested_compute_config=V1UserRequestedComputeConfig(
- name="default",
+ name="custom",
count=1,
disk_size=0,
shm_size=0,
@@ -961,7 +955,7 @@ def test_call_with_work_app_and_multiple_attached_drives(self, lightningapps, mo
),
drives=[lit_drive_1_spec, lit_drive_2_spec],
user_requested_compute_config=V1UserRequestedComputeConfig(
- name="default",
+ name="custom",
count=1,
disk_size=0,
shm_size=0,
@@ -1034,7 +1028,6 @@ def test_call_with_work_app_and_attached_mount_and_drive(self, lightningapps, mo
monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock())
monkeypatch.setattr(cloud, "_prepare_lightning_wheels_and_requirements", mock.MagicMock())
app = mock.MagicMock()
- flow = mock.MagicMock()
mocked_drive = MagicMock(spec=Drive)
setattr(mocked_drive, "id", "foobar")
@@ -1052,7 +1045,7 @@ def test_call_with_work_app_and_attached_mount_and_drive(self, lightningapps, mo
setattr(mocked_mount, "mount_path", "/content/foo")
setattr(mocked_mount, "protocol", "s3://")
- work = WorkWithSingleDrive()
+ work = WorkWithSingleDrive(cloud_compute=CloudCompute("custom"))
monkeypatch.setattr(work, "drive", mocked_drive)
monkeypatch.setattr(work, "_state", {"_port", "drive"})
monkeypatch.setattr(work, "_name", "test-work")
@@ -1063,8 +1056,7 @@ def test_call_with_work_app_and_attached_mount_and_drive(self, lightningapps, mo
monkeypatch.setattr(work._cloud_compute, "mounts", mocked_mount)
monkeypatch.setattr(work, "_port", 8080)
- flow.works = lambda recurse: [work]
- app.flows = [flow]
+ app.works = [work]
cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file=(source_code_root_dir / "entrypoint.py"))
monkeypatch.setattr(
"lightning_app.runners.cloud._get_project",
@@ -1129,7 +1121,7 @@ def test_call_with_work_app_and_attached_mount_and_drive(self, lightningapps, mo
),
],
user_requested_compute_config=V1UserRequestedComputeConfig(
- name="default",
+ name="custom",
count=1,
disk_size=0,
shm_size=0,
@@ -1237,3 +1229,74 @@ def test_load_app_from_file_module_error():
empty_app = CloudRuntime.load_app_from_file(os.path.join(_PROJECT_ROOT, "examples", "app_v0", "app.py"))
assert isinstance(empty_app, LightningApp)
assert isinstance(empty_app.root, EmptyFlow)
+
+
+@pytest.mark.parametrize(
+ "lines",
+ [
+ [
+ "import this_package_is_not_real",
+ "from lightning_app import LightningApp",
+ "from lightning_app.testing.helpers import EmptyWork",
+ "app = LightningApp(EmptyWork())",
+ ],
+ [
+ "from this_package_is_not_real import this_module_is_not_real",
+ "from lightning_app import LightningApp",
+ "from lightning_app.testing.helpers import EmptyWork",
+ "app = LightningApp(EmptyWork())",
+ ],
+ [
+ "import this_package_is_not_real",
+ "from this_package_is_not_real import this_module_is_not_real",
+ "from lightning_app import LightningApp",
+ "from lightning_app.testing.helpers import EmptyWork",
+ "app = LightningApp(EmptyWork())",
+ ],
+ [
+ "import this_package_is_not_real",
+ "from lightning_app import LightningApp",
+ "from lightning_app.core.flow import _RootFlow",
+ "from lightning_app.testing.helpers import EmptyWork",
+ "class MyFlow(_RootFlow):",
+ " def configure_layout(self):",
+ " return [{'name': 'test', 'content': this_package_is_not_real()}]",
+ "app = LightningApp(MyFlow(EmptyWork()))",
+ ],
+ ],
+)
+@pytest.mark.skipif(sys.platform != "linux", reason="Causing conflicts on non-linux")
+def test_load_app_from_file_mock_imports(tmpdir, lines):
+ path = copy(sys.path)
+ app_file = os.path.join(tmpdir, "app.py")
+
+ with open(app_file, "w") as f:
+ f.write("\n".join(lines))
+
+ app = CloudRuntime.load_app_from_file(app_file)
+ assert isinstance(app, LightningApp)
+ assert isinstance(app.root.work, EmptyWork)
+
+    # Restore sys.path to prevent conflicts with other tests
+ sys.path = path
+ os.remove(app_file)
+
+
+def test_incompatible_cloud_compute_and_build_config():
+ """Test that an exception is raised when a build config has a custom image defined, but the cloud compute is
+ the default.
+
+ This combination is not supported by the platform.
+ """
+
+ class Work(LightningWork):
+ def __init__(self):
+ super().__init__()
+ self.cloud_compute = CloudCompute(name="default")
+ self.cloud_build_config = BuildConfig(image="custom")
+
+ def run(self):
+ pass
+
+ with pytest.raises(ValueError, match="You requested a custom base image for the Work with name"):
+ _validate_build_spec_and_compute(Work())
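
The new `test_incompatible_cloud_compute_and_build_config` pins down the platform constraint that a custom base image requires a non-default compute tier. Below is a minimal, self-contained sketch of the kind of check the private `_validate_build_spec_and_compute` helper performs; the stand-in classes and the tail of the error message are illustrative, not the real implementation:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class _Compute:  # stand-in for lightning_app.CloudCompute
    name: str = "default"


@dataclass
class _BuildConfig:  # stand-in for lightning_app.BuildConfig
    image: Optional[str] = None


def validate_build_spec_and_compute(name: str, compute: _Compute, build: _BuildConfig) -> None:
    # A custom base image is only honoured on non-default compute tiers.
    if build.image is not None and compute.name == "default":
        raise ValueError(
            f"You requested a custom base image for the Work with name '{name}', "
            "but custom images are not supported on the default cloud compute instance."
        )


validate_build_spec_and_compute("ok", _Compute("custom"), _BuildConfig(image="custom"))  # passes
# validate_build_spec_and_compute("bad", _Compute(), _BuildConfig(image="custom"))  # raises ValueError
```
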
diff --git a/tests/tests_app/storage/test_drive.py b/tests/tests_app/storage/test_drive.py
index bee8de5e093a8..d39623bd74296 100644
--- a/tests/tests_app/storage/test_drive.py
+++ b/tests/tests_app/storage/test_drive.py
@@ -50,7 +50,8 @@ def test_synchronization_lit_drive(tmpdir):
os.remove("a.txt")
app = LightningApp(SyncFlowLITDrives(tmpdir))
MultiProcessRuntime(app, start_server=False).dispatch()
- os.remove("a.txt")
+ if os.path.exists("a.txt"):
+ os.remove("a.txt")
class LITDriveWork(LightningWork):
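
The guarded `os.remove` above makes the cleanup idempotent when the multi-process run has already consumed the file. The same intent can be written with `contextlib.suppress`, shown here as an equivalent alternative rather than what the test uses:

```python
import contextlib
import os

# Remove the artifact if it exists; swallow the error if another
# process already deleted it.
with contextlib.suppress(FileNotFoundError):
    os.remove("a.txt")
```
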
diff --git a/tests/tests_app/utilities/packaging/test_app_config.py b/tests/tests_app/utilities/packaging/test_app_config.py
index 2666f0a769ace..60da494a47fb8 100644
--- a/tests/tests_app/utilities/packaging/test_app_config.py
+++ b/tests/tests_app/utilities/packaging/test_app_config.py
@@ -1,6 +1,6 @@
import pathlib
-from lightning_app.utilities.packaging.app_config import AppConfig, find_config_file
+from lightning_app.utilities.packaging.app_config import _get_config_file, AppConfig
def _make_empty_config_file(folder):
@@ -10,24 +10,12 @@ def _make_empty_config_file(folder):
return file
-def test_find_config_file(tmpdir, monkeypatch):
- monkeypatch.chdir(pathlib.Path("/"))
- assert find_config_file() is None
-
- monkeypatch.chdir(pathlib.Path.home())
- assert find_config_file() is None
-
+def test_get_config_file(tmpdir):
_ = _make_empty_config_file(tmpdir)
- config_file1 = _make_empty_config_file(tmpdir / "a" / "b")
-
- assert find_config_file(tmpdir) == pathlib.Path(tmpdir, ".lightning")
- assert find_config_file(config_file1) == pathlib.Path(tmpdir, "a", "b", ".lightning")
- assert find_config_file(pathlib.Path(tmpdir, "a")) == pathlib.Path(tmpdir, ".lightning")
+ config_file1 = _make_empty_config_file(tmpdir)
- # the config must be a file, a folder of the same name gets ignored
- fake_config_folder = pathlib.Path(tmpdir, "fake", ".lightning")
- fake_config_folder.mkdir(parents=True)
- assert find_config_file(tmpdir) == pathlib.Path(tmpdir, ".lightning")
+ assert _get_config_file(tmpdir) == pathlib.Path(tmpdir, ".lightning")
+ assert _get_config_file(config_file1) == pathlib.Path(tmpdir, ".lightning")
def test_app_config_save_load(tmpdir):
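
`_get_config_file` replaces the parent-walking `find_config_file`: judging from the updated assertions, it simply resolves `.lightning` next to the given path, whether that path is a directory or a file inside it. A hypothetical sketch inferred from the test, not the library source:

```python
import pathlib


def get_config_file(source_path) -> pathlib.Path:
    """Resolve the `.lightning` config file next to the given path."""
    source_path = pathlib.Path(source_path).absolute()
    if source_path.is_file():
        source_path = source_path.parent
    return source_path / ".lightning"
```
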
diff --git a/tests/tests_app/utilities/test_app_commands.py b/tests/tests_app/utilities/test_app_commands.py
index 35f08509dca62..7e3b9beed4104 100644
--- a/tests/tests_app/utilities/test_app_commands.py
+++ b/tests/tests_app/utilities/test_app_commands.py
@@ -14,7 +14,7 @@
("multiple_commands.txt", ['echo "foo"', 'echo "bar"'], [1, 2]),
("commands_with_mixed_comments_1.txt", ['echo "foo"', 'echo "bar"'], [1, 3]),
("commands_with_mixed_comments_2.txt", ['echo "foo"', 'echo "bar"'], [2, 4]),
- ("command_after_first_non_comment_line.txt", ['echo "foo"'], [1]),
+ ("command_after_first_non_comment_line.txt", ['echo "foo"', 'echo "bar"'], [2, 4]),
("bang_not_at_start_of_line.txt", ['echo "foo"'], [2]),
("space_between_bang_and_command.txt", ['echo "foo"'], [1]),
("multiple_spaces_between_band_and_command.txt", ['echo "foo"'], [1]),
diff --git a/tests/tests_app/utilities/test_proxies.py b/tests/tests_app/utilities/test_proxies.py
index fccbaaa671588..4b8a5f25f71e3 100644
--- a/tests/tests_app/utilities/test_proxies.py
+++ b/tests/tests_app/utilities/test_proxies.py
@@ -14,7 +14,7 @@
from lightning_app import LightningApp, LightningFlow, LightningWork
from lightning_app.runners import MultiProcessRuntime
-from lightning_app.storage import Path
+from lightning_app.storage import Drive, Path
from lightning_app.storage.path import _artifacts_path
from lightning_app.storage.requests import _GetRequest
from lightning_app.testing.helpers import _MockQueue, EmptyFlow
@@ -67,6 +67,7 @@ def proxy_setattr():
@pytest.mark.parametrize("parallel", [True, False])
@pytest.mark.parametrize("cache_calls", [False, True])
+@pytest.mark.skipif(sys.platform == "win32", reason="TODO (@ethanwharris): Fix this on Windows")
def test_work_runner(parallel, cache_calls):
"""This test validates the `WorkRunner` runs the work.run method and properly populates the `delta_queue`,
`error_queue` and `readiness_queue`."""
@@ -216,7 +217,7 @@ def __init__(self):
class WorkTimeout(LightningWork):
def __init__(self):
- super().__init__(parallel=True)
+ super().__init__(parallel=True, start_with_flow=False)
self.counter = 0
def run(self):
@@ -761,3 +762,31 @@ def test_bi_directional_proxy_forbidden(monkeypatch):
MultiProcessRuntime(app, start_server=False).dispatch()
assert app.stage == AppStage.FAILED
assert "A forbidden operation to update the work" in str(app.exception)
+
+
+class WorkDrive(LightningFlow):
+ def __init__(self, drive):
+ super().__init__()
+ self.drive = drive
+ self.path = Path("data")
+
+ def run(self):
+ pass
+
+
+class FlowDrive(LightningFlow):
+ def __init__(self):
+ super().__init__()
+ self.data = Drive("lit://data")
+ self.counter = 0
+
+ def run(self):
+ if not hasattr(self, "w"):
+ self.w = WorkDrive(self.data)
+ self.counter += 1
+
+
+def test_bi_directional_proxy_filtering():
+ app = LightningApp(FlowDrive())
+ app.root.run()
+ assert app._extract_vars_from_component_name(app.root.w.name, app.state) == {}
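
`test_bi_directional_proxy_filtering` asserts that a component whose only attributes come from the parent flow (here a `Drive`) contributes no proxied variables. The state walk involved might look roughly like the following; this is a hypothetical reconstruction, since `_extract_vars_from_component_name` is private, and the `"__drive__"` marker and overall state layout are assumptions about the serialized format:

```python
from typing import Any, Dict


def extract_vars_from_component_name(component_name: str, state: Dict[str, Any]) -> Dict[str, Any]:
    """Walk the nested app state along a dotted name like 'root.w' and return its proxyable vars."""
    child = state
    for name in component_name.split(".")[1:]:  # drop the leading "root"
        if name in child.get("flows", {}):
            child = child["flows"][name]
        elif name in child.get("works", {}):
            child = child["works"][name]
        else:
            return {}  # unknown component: nothing to proxy
    # drop private attributes and serialized storage objects (Drives, Paths, ...)
    # so they are not proxied back and forth between flow and work
    return {
        k: v
        for k, v in child.get("vars", {}).items()
        if not k.startswith("_") and not (isinstance(v, dict) and "__drive__" in str(v.get("type", "")))
    }
```
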
diff --git a/tests/tests_app/utilities/testdata/app_commands/command_after_first_non_comment_line.txt b/tests/tests_app/utilities/testdata/app_commands/command_after_first_non_comment_line.txt
index c9d90d8eff892..1cd80f15779df 100644
--- a/tests/tests_app/utilities/testdata/app_commands/command_after_first_non_comment_line.txt
+++ b/tests/tests_app/utilities/testdata/app_commands/command_after_first_non_comment_line.txt
@@ -1,3 +1,4 @@
+
# !echo "foo"
import lighting
# !echo "bar"
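
Together with the updated expectation in `test_app_commands.py` (`['echo "foo"', 'echo "bar"'], [2, 4]`), the extra leading blank line documents that bang-comments are now collected even after the first non-comment line. A rough sketch of that extraction; the real parser in `lightning_app.utilities.app_commands` handles more edge cases:

```python
import re
from typing import List, Tuple


def extract_commands(lines: List[str]) -> Tuple[List[str], List[int]]:
    """Collect `# !<command>` lines together with their 1-based line numbers."""
    commands, line_numbers = [], []
    for lineno, line in enumerate(lines, start=1):
        match = re.match(r"\s*#\s*!\s*(.+)", line)
        if match:
            commands.append(match.group(1).strip())
            line_numbers.append(lineno)
    return commands, line_numbers


# For the updated fixture above this yields (['echo "foo"', 'echo "bar"'], [2, 4]).
fixture = ["", '# !echo "foo"', "import lighting", '# !echo "bar"']
assert extract_commands(fixture) == (['echo "foo"', 'echo "bar"'], [2, 4])
```
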
diff --git a/tests/tests_lite/conftest.py b/tests/tests_lite/conftest.py
index 1455709e5c82f..af023504b5473 100644
--- a/tests/tests_lite/conftest.py
+++ b/tests/tests_lite/conftest.py
@@ -54,6 +54,7 @@ def restore_env_variables():
"HOROVOD_FUSION_THRESHOLD",
"RANK", # set by DeepSpeed
"POPLAR_ENGINE_OPTIONS", # set by IPUStrategy
+ "CUDA_MODULE_LOADING", # leaked since PyTorch 1.13
}
leaked_vars.difference_update(allowlist)
assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}"
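
`CUDA_MODULE_LOADING` joins the allowlist because importing PyTorch 1.13 sets it as a side effect, which would otherwise trip the leak guard. The guard itself follows this pattern; a condensed sketch of the autouse fixture, not its exact body:

```python
import os

import pytest


@pytest.fixture(autouse=True)
def restore_env_variables():
    """Snapshot os.environ, restore it after the test, and fail on unexpected leaks."""
    env_backup = os.environ.copy()
    yield
    leaked_vars = os.environ.keys() - env_backup.keys()
    os.environ.clear()
    os.environ.update(env_backup)
    # variables that third-party imports are known to set as a side effect
    allowlist = {"CUDA_MODULE_LOADING", "KMP_INIT_AT_FORK", "KMP_DUPLICATE_LIB_OK"}
    leaked_vars -= allowlist
    assert not leaked_vars, f"test is leaking environment variable(s): {leaked_vars}"
```
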
diff --git a/tests/tests_lite/plugins/precision/test_native_amp.py b/tests/tests_lite/plugins/precision/test_native_amp.py
index dbf2e1c9ec5c0..c64e0e8f19df0 100644
--- a/tests/tests_lite/plugins/precision/test_native_amp.py
+++ b/tests/tests_lite/plugins/precision/test_native_amp.py
@@ -43,19 +43,21 @@ def test_native_amp_precision_bf16_min_torch():
@RunIf(min_torch="1.10")
def test_native_amp_precision_forward_context():
- """Test to ensure that the context manager correctly is set to CPU + bfloat16."""
+ """Test to ensure that the context manager correctly is set to bfloat16 on CPU and CUDA."""
precision = NativeMixedPrecision(precision=16, device="cuda")
assert precision.device == "cuda"
assert isinstance(precision.scaler, torch.cuda.amp.GradScaler)
assert torch.get_default_dtype() == torch.float32
with precision.forward_context():
- assert torch.get_autocast_gpu_dtype() == torch.float16
+        # compare string forms due to an upstream bug: https://github.com/pytorch/pytorch/issues/65786
+ assert str(torch.get_autocast_gpu_dtype()) in ("torch.float16", "torch.half")
precision = NativeMixedPrecision(precision="bf16", device="cpu")
assert precision.device == "cpu"
assert precision.scaler is None
with precision.forward_context():
- assert torch.get_autocast_cpu_dtype() == torch.bfloat16
+        # compare string forms due to an upstream bug: https://github.com/pytorch/pytorch/issues/65786
+ assert str(torch.get_autocast_cpu_dtype()) == str(torch.bfloat16)
context_manager = precision._autocast_context_manager()
assert isinstance(context_manager, torch.autocast)
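
The string comparison sidesteps pytorch/pytorch#65786, where dtype objects obtained through the autocast getters can fail a direct equality check on some PyTorch builds. The pattern in isolation:

```python
import torch

# Direct `torch.get_autocast_cpu_dtype() == torch.bfloat16` can evaluate to
# False on affected builds, so compare the string representations instead.
with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
    assert str(torch.get_autocast_cpu_dtype()) == str(torch.bfloat16)
```
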
diff --git a/tests/tests_lite/test_connector.py b/tests/tests_lite/test_connector.py
index 683d3cfe23d9d..a254a37bc68b5 100644
--- a/tests/tests_lite/test_connector.py
+++ b/tests/tests_lite/test_connector.py
@@ -32,6 +32,7 @@
from lightning_lite.plugins.environments import (
KubeflowEnvironment,
LightningEnvironment,
+ LSFEnvironment,
SLURMEnvironment,
TorchElasticEnvironment,
)
@@ -200,24 +201,41 @@ class Strat(DDPStrategy):
assert connector.strategy is strategy
-@mock.patch.dict(
- os.environ,
- {
- "SLURM_NTASKS": "2",
- "SLURM_NTASKS_PER_NODE": "1",
- "SLURM_JOB_NAME": "SOME_NAME",
- "SLURM_NODEID": "0",
- "LOCAL_RANK": "0",
- "SLURM_PROCID": "0",
- "SLURM_LOCALID": "0",
- },
+@pytest.mark.parametrize(
+ "env_vars,expected_environment",
+ [
+ (
+ {
+ "SLURM_NTASKS": "2",
+ "SLURM_NTASKS_PER_NODE": "1",
+ "SLURM_JOB_NAME": "SOME_NAME",
+ "SLURM_NODEID": "0",
+ "LOCAL_RANK": "0",
+ "SLURM_PROCID": "0",
+ "SLURM_LOCALID": "0",
+ },
+ SLURMEnvironment,
+ ),
+ (
+ {
+ "LSB_JOBID": "1",
+ "LSB_DJOB_RANKFILE": "SOME_RANK_FILE",
+ "JSM_NAMESPACE_LOCAL_RANK": "1",
+ "JSM_NAMESPACE_SIZE": "20",
+ "JSM_NAMESPACE_RANK": "1",
+ },
+ LSFEnvironment,
+ ),
+ ],
)
-@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=0)
-def test_dist_backend_accelerator_mapping(*_):
- connector = _Connector(strategy="ddp_spawn", accelerator="cpu", devices=2)
- assert isinstance(connector.accelerator, CPUAccelerator)
- assert isinstance(connector.strategy, DDPStrategy)
- assert connector.strategy.local_rank == 0
+@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._read_hosts", return_value=["node0", "node1"])
+@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._get_node_rank", return_value=0)
+def test_fallback_from_ddp_spawn_to_ddp_on_cluster(_, __, env_vars, expected_environment):
+ with mock.patch.dict(os.environ, env_vars, clear=True):
+        connector = _Connector(strategy="ddp_spawn", accelerator="cpu", devices=2)
+    assert isinstance(connector.accelerator, CPUAccelerator)
+    assert isinstance(connector.strategy, DDPStrategy)
+    assert isinstance(connector.strategy.cluster_environment, expected_environment)
@RunIf(mps=False)
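
The rewritten test now covers both SLURM and LSF. Selection among environments plausibly reduces to each plugin's `detect()` hook reading the variables in the parametrization; a rough sketch of that dispatch, under the assumption that detection is purely environment-variable driven:

```python
from lightning_lite.plugins.environments import LightningEnvironment, LSFEnvironment, SLURMEnvironment


def detect_cluster_environment():
    """Return the first cluster environment whose `detect()` hook matches os.environ."""
    for env_type in (SLURMEnvironment, LSFEnvironment):
        if env_type.detect():
            # note: instantiating LSFEnvironment reads LSF job metadata,
            # which is why the test mocks `_read_hosts` and `_get_node_rank`
            return env_type()
    # no cluster detected: fall back to the standalone launcher environment
    return LightningEnvironment()
```
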
diff --git a/tests/tests_pytorch/conftest.py b/tests/tests_pytorch/conftest.py
index 5d8616ad00657..2f5607828a232 100644
--- a/tests/tests_pytorch/conftest.py
+++ b/tests/tests_pytorch/conftest.py
@@ -72,6 +72,9 @@ def restore_env_variables():
"HOROVOD_FUSION_THRESHOLD",
"RANK", # set by DeepSpeed
"POPLAR_ENGINE_OPTIONS", # set by IPUStrategy
+ "CUDA_MODULE_LOADING", # leaked since PyTorch 1.13
+ "KMP_INIT_AT_FORK", # leaked since PyTorch 1.13
+ "KMP_DUPLICATE_LIB_OK", # leaked since PyTorch 1.13
}
leaked_vars.difference_update(allowlist)
assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}"
diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py
index be8bced2cbf5f..97e3d27760ea8 100644
--- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py
+++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py
@@ -101,9 +101,6 @@ def on_predict_batch_end(self, outputs, batch, batch_idx, dataloader_idx) -> Non
def _assert_layer_fsdp_instance(self) -> None:
assert isinstance(self.layer, FullyShardedDataParallel)
assert isinstance(self.trainer.strategy.precision_plugin, FullyShardedNativeNativeMixedPrecisionPlugin)
- # root should not be resharding
- assert self.layer.reshard_after_forward is False
-
precision = torch.float16 if self.precision == 16 else torch.bfloat16
assert self.layer.mixed_precision.param_dtype == precision
assert self.layer.mixed_precision.reduce_dtype == precision
@@ -111,9 +108,6 @@ def _assert_layer_fsdp_instance(self) -> None:
for layer_num in [0, 2]:
assert isinstance(self.layer.module[layer_num], FullyShardedDataParallel)
- # Assert that the nested layers are set reshard_after_forward to True
- assert self.layer.module[layer_num].reshard_after_forward is True
-
assert self.layer[layer_num].mixed_precision.param_dtype == precision
assert self.layer[layer_num].mixed_precision.reduce_dtype == precision
assert self.layer[layer_num].mixed_precision.buffer_dtype == precision
@@ -146,9 +140,6 @@ def _assert_layer_fsdp_instance(self) -> None:
precision = torch.float16 if self.precision == 16 else torch.bfloat16
for layer_num in [0, 2]:
assert isinstance(self.layer[layer_num], FullyShardedDataParallel)
- # Assert that the nested layers are set reshard_after_forward to True
- assert self.layer[layer_num].reshard_after_forward
-
assert self.layer[layer_num].mixed_precision.param_dtype == precision
assert self.layer[layer_num].mixed_precision.reduce_dtype == precision
assert self.layer[layer_num].mixed_precision.buffer_dtype == precision
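
With the `reshard_after_forward` assertions gone (presumably because newer PyTorch releases changed how, or whether, the wrapped module exposes that attribute), the checks reduce to the mixed-precision dtypes. For reference, those dtypes correspond to a native FSDP configuration like the following; a sketch using the upstream `MixedPrecision` dataclass, not the plugin's own code:

```python
import torch
from torch.distributed.fsdp import MixedPrecision

# The dtypes asserted above map onto an FSDP mixed-precision config:
precision = torch.float16  # torch.bfloat16 for the "bf16" setting
mp_config = MixedPrecision(
    param_dtype=precision,
    reduce_dtype=precision,
    buffer_dtype=precision,
)
```
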
diff --git a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py
index 9ff650c2768e3..77a4888351cf2 100644
--- a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py
+++ b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py
@@ -25,6 +25,7 @@
from lightning_lite.plugins.environments import (
KubeflowEnvironment,
LightningEnvironment,
+ LSFEnvironment,
SLURMEnvironment,
TorchElasticEnvironment,
)
@@ -193,24 +194,41 @@ class Strat(DDPStrategy):
assert trainer._accelerator_connector.strategy is strategy
-@mock.patch.dict(
- os.environ,
- {
- "SLURM_NTASKS": "2",
- "SLURM_NTASKS_PER_NODE": "1",
- "SLURM_JOB_NAME": "SOME_NAME",
- "SLURM_NODEID": "0",
- "LOCAL_RANK": "0",
- "SLURM_PROCID": "0",
- "SLURM_LOCALID": "0",
- },
+@pytest.mark.parametrize(
+ "env_vars,expected_environment",
+ [
+ (
+ {
+ "SLURM_NTASKS": "2",
+ "SLURM_NTASKS_PER_NODE": "1",
+ "SLURM_JOB_NAME": "SOME_NAME",
+ "SLURM_NODEID": "0",
+ "LOCAL_RANK": "0",
+ "SLURM_PROCID": "0",
+ "SLURM_LOCALID": "0",
+ },
+ SLURMEnvironment,
+ ),
+ (
+ {
+ "LSB_JOBID": "1",
+ "LSB_DJOB_RANKFILE": "SOME_RANK_FILE",
+ "JSM_NAMESPACE_LOCAL_RANK": "1",
+ "JSM_NAMESPACE_SIZE": "20",
+ "JSM_NAMESPACE_RANK": "1",
+ },
+ LSFEnvironment,
+ ),
+ ],
)
-@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
-def test_dist_backend_accelerator_mapping(cuda_count_0):
- trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="cpu", devices=2)
+@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._read_hosts", return_value=["node0", "node1"])
+@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._get_node_rank", return_value=0)
+def test_fallback_from_ddp_spawn_to_ddp_on_cluster(_, __, env_vars, expected_environment):
+ with mock.patch.dict(os.environ, env_vars, clear=True):
+ trainer = Trainer(strategy="ddp_spawn", accelerator="cpu", devices=2)
assert isinstance(trainer.accelerator, CPUAccelerator)
assert isinstance(trainer.strategy, DDPStrategy)
- assert trainer.strategy.local_rank == 0
+ assert isinstance(trainer.strategy.cluster_environment, expected_environment)
def test_interactive_incompatible_backend_error(mps_count_2, cuda_count_2, monkeypatch):