diff --git a/.actions/assistant.py b/.actions/assistant.py index 0c6b567885a2c..ce253a96c4c21 100644 --- a/.actions/assistant.py +++ b/.actions/assistant.py @@ -1,8 +1,9 @@ import os import re +import shutil from itertools import chain +from os.path import dirname, isfile from pathlib import Path -from pprint import pprint from typing import Dict, List, Optional, Sequence, Tuple import pkg_resources @@ -65,6 +66,7 @@ def _replace_imports(lines: List[str], mapping: List[Tuple[str, str]]) -> List[s def copy_replace_imports( source_dir: str, source_imports: List[str], target_imports: List[str], target_dir: Optional[str] = None ) -> None: + """Copy package content with import adjustments.""" print(f"Replacing imports: {locals()}") assert len(source_imports) == len(target_imports), ( "source and target imports must have the same length, " @@ -75,19 +77,27 @@ def copy_replace_imports( ls = _retrieve_files(source_dir) for fp in ls: - if fp.endswith(".py") or not fp.endswith(".pyc"): - with open(fp, encoding="utf-8") as fo: - try: - lines = fo.readlines() - except UnicodeDecodeError: - # a binary file, skip - print(f"Skipped replacing imports for {fp}") - continue - lines = _replace_imports(lines, list(zip(source_imports, target_imports))) - fp_new = fp.replace(source_dir, target_dir) - os.makedirs(os.path.dirname(fp_new), exist_ok=True) - with open(fp_new, "w", encoding="utf-8") as fo: - fo.writelines(lines) + fp_new = fp.replace(source_dir, target_dir) + _, ext = os.path.splitext(fp) + if ext in (".png", ".jpg", ".ico"): + os.makedirs(dirname(fp_new), exist_ok=True) + if not isfile(fp_new): + shutil.copy(fp, fp_new) + continue + elif ext in (".pyc",): + continue + # Try to parse everything else + with open(fp, encoding="utf-8") as fo: + try: + lines = fo.readlines() + except UnicodeDecodeError: + # a binary file, skip + print(f"Skipped replacing imports for {fp}") + continue + lines = _replace_imports(lines, list(zip(source_imports, target_imports))) + os.makedirs(os.path.dirname(fp_new), exist_ok=True) + with open(fp_new, "w", encoding="utf-8") as fo: + fo.writelines(lines) def create_mirror_package(source_dir: str, package_mapping: Dict[str, str]) -> None: @@ -129,7 +139,7 @@ def _prune_packages(req_file: str, packages: Sequence[str]) -> None: req = list(pkg_resources.parse_requirements(ln_))[0] if req.name not in packages: final.append(line) - pprint(final) + print(final) path.write_text("\n".join(final)) @staticmethod @@ -147,7 +157,7 @@ def replace_oldest_ver(requirement_fnames: Sequence[str] = REQUIREMENT_FILES_ALL def copy_replace_imports( source_dir: str, source_import: str, target_import: str, target_dir: Optional[str] = None ) -> None: - """Recursively replace imports in given folder.""" + """Copy package content with import adjustments.""" source_imports = source_import.strip().split(",") target_imports = target_import.strip().split(",") copy_replace_imports(source_dir, source_imports, target_imports, target_dir=target_dir) diff --git a/.azure/app-cloud-e2e.yml b/.azure/app-cloud-e2e.yml index b8c8305715300..24458c0daa385 100644 --- a/.azure/app-cloud-e2e.yml +++ b/.azure/app-cloud-e2e.yml @@ -27,12 +27,17 @@ pr: - "release/*" paths: include: - - ".azure/app-cloud-e2e.yml" - - "requirements/app/**" - - "src/lightning_app/**" - - "examples/app_*" - - "tests/tests_app_examples/**" - - ".actions/**" + - ".azure/app-cloud-e2e.yml" + - "requirements/app/**" + - "src/lightning_app/**" + - "tests/tests_app/**" + - "examples/app_*/**" # some tests_app tests call examples files + - 
"tests/tests_app_examples/**" + - "setup.py" + - ".actions/**" + - "!requirements/app/docs.txt" + - "!*.md" + - "!**/*.md" # variables are automatically exported as environment variables so this will override pip's default cache dir variables: diff --git a/.azure/gpu-benchmark.yml b/.azure/gpu-benchmark.yml index 8fad0d69c15d1..52ad4251d4300 100644 --- a/.azure/gpu-benchmark.yml +++ b/.azure/gpu-benchmark.yml @@ -21,6 +21,11 @@ pr: paths: include: - ".azure/gpu-benchmark.yml" + - "tests/tests_pytorch/benchmarks/**" + - "requirements/pytorch/**" + - "!requirements/pytorch/docs.txt" + - "!*.md" + - "!**/*.md" schedules: - cron: "0 0 * * *" # At the end of every day @@ -37,7 +42,7 @@ jobs: variables: DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) container: - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.6.1" options: "--gpus=all --shm-size=32g" workspace: clean: all @@ -47,18 +52,41 @@ jobs: - bash: | echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" cuda_ver=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") - echo "##vso[task.setvariable variable=CUDA_VERSION_MM]$cuda_ver" echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html" displayName: 'set env. vars' - bash: | - pip install -e .[strategies] --find-links ${TORCH_URL} + echo $CUDA_VISIBLE_DEVICES + echo $TORCH_URL + lspci | egrep 'VGA|3D' + whereis nvidia + nvidia-smi + which python && which pip + python --version + pip --version pip list + displayName: 'Image info & NVIDIA' + + - bash: | + python .actions/assistant.py requirements_prune_pkgs --packages [horovod,bagua,colossalai] --req_files [requirements/pytorch/strategies.txt] + + PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") + python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${PYTORCH_VERSION} + displayName: 'Adjust dependencies' + + - bash: pip install -e .[dev,strategies,examples] --find-links ${TORCH_URL} env: - PACKAGE_NAME: pytorch - FREEZE_REQUIREMENTS: 1 + PACKAGE_NAME: "pytorch" + FREEZE_REQUIREMENTS: "1" displayName: 'Install package' + - bash: | + set -e + pip list + python requirements/collect_env_details.py + python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" + displayName: 'Env details' + - bash: python -m pytest benchmarks -v --durations=0 env: PL_RUNNING_BENCHMARKS: "1" diff --git a/.azure/gpu-tests-lite.yml b/.azure/gpu-tests-lite.yml index ceb4c671cfe22..98ff44f879a71 100644 --- a/.azure/gpu-tests-lite.yml +++ b/.azure/gpu-tests-lite.yml @@ -21,12 +21,18 @@ pr: paths: include: - ".azure/gpu-tests-lite.yml" + - "examples/lite/**" + - "examples/run_lite_examples.sh" + - "tests/tests_lite/run_standalone_*.sh" + - "tests/tests_pytorch/run_standalone_tests.sh" # used by Lite through a symlink - "requirements/lite/**" - "src/lightning_lite/**" - "tests/tests_lite/**" - - "tests/tests_pytorch/run_standalone_tests.sh" - - "tests/tests_lite/run_standalone_tests.sh" # a symlink to the one above + - "setup.cfg" # includes pytest config - ".actions/**" + - "!requirements/lite/docs.txt" + - "!*.md" + - "!**/*.md" jobs: - job: testing @@ -38,7 +44,7 @@ jobs: variables: DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) container: - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1" + 
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.6.1" # default shm size is 64m. Increase it to avoid: # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8' options: "--gpus=all --shm-size=2gb" @@ -48,6 +54,14 @@ jobs: steps: - bash: | + echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" + cuda_ver=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") + echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html" + displayName: 'set env. vars' + + - bash: | + echo $CUDA_VISIBLE_DEVICES + echo $TORCH_URL lspci | egrep 'VGA|3D' whereis nvidia nvidia-smi @@ -58,14 +72,13 @@ jobs: displayName: 'Image info & NVIDIA' - bash: | - echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" - displayName: 'set visible devices' + PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") + python ./requirements/pytorch/adjust-versions.py requirements/lite/base.txt ${PYTORCH_VERSION} + python ./requirements/pytorch/adjust-versions.py requirements/lite/examples.txt ${PYTORCH_VERSION} + displayName: 'Adjust dependencies' - bash: | - set -e - CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") - pip install -e .[dev,strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html - pip list + pip install -e .[dev,strategies,examples] --find-links ${TORCH_URL} env: PACKAGE_NAME: "lite" FREEZE_REQUIREMENTS: "1" @@ -73,7 +86,7 @@ jobs: - bash: | set -e - echo $CUDA_VISIBLE_DEVICES + pip list python requirements/collect_env_details.py python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" displayName: 'Env details' diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 05571269a99a7..91fe0b6107bd1 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -37,14 +37,20 @@ pr: - "requirements/lite/**" - "src/lightning_lite/**" - ".actions/**" + - "!requirements/**/docs.txt" + - "!*.md" + - "!**/*.md" jobs: - job: testing strategy: matrix: - # TODO: package parametrization - 'PyTorch - stable': + 'PyTorch & strategies': # this uses torch 1.12 as not all strategies support 1.13 yet image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1" + scope: "strategies" + 'PyTorch - latest': + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.6.1" + scope: "" # how long to run the job before automatically cancelling timeoutInMinutes: "80" # how much time to give 'run always even if cancelled tasks' before stopping them @@ -93,11 +99,11 @@ jobs: python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${PYTORCH_VERSION} displayName: 'Adjust dependencies' - - bash: pip install -e .[strategies] -r requirements/pytorch/devel.txt -r requirements/pytorch/examples.txt --find-links ${TORCH_URL} + - bash: pip install -e .[dev,examples] --find-links ${TORCH_URL} env: PACKAGE_NAME: "pytorch" FREEZE_REQUIREMENTS: "1" - displayName: 'Install package' + displayName: 'Install package & extras' - bash: | set -e @@ -109,14 +115,17 @@ jobs: CUDA_VERSION_COLOSSALAI=$(python -c "print([ver for ver in [11.3, 11.1] if $CUDA_VERSION_MM_COLOSSALAI >= ver][0])") pip install "colossalai==0.1.10+torch${PYTORCH_VERSION_COLOSSALAI}cu${CUDA_VERSION_COLOSSALAI}" --find-links https://release.colossalai.org - pip 
list - displayName: 'Install dependencies' + pip install -r requirements/pytorch/strategies.txt --find-links ${TORCH_URL} + + python requirements/pytorch/check-avail-strategies.py + condition: eq(variables['scope'], 'strategies') + displayName: 'Install strategies' - bash: | set -e + pip list python requirements/collect_env_details.py python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" - python requirements/pytorch/check-avail-strategies.py python requirements/pytorch/check-avail-extras.py displayName: 'Env details' diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 8281f9e5c5fa6..0c6851754f2a0 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -26,6 +26,9 @@ pr: - "tests/tests_pytorch/**" - "setup.cfg" # includes pytest config - ".actions/**" + - "!requirements/**/docs.txt" + - "!*.md" + - "!**/*.md" jobs: - job: testing diff --git a/.azure/ipu-tests.yml b/.azure/ipu-tests.yml index 972bf1e95a06b..d96adabf4a1ff 100644 --- a/.azure/ipu-tests.yml +++ b/.azure/ipu-tests.yml @@ -23,6 +23,9 @@ pr: - "tests/tests_pytorch/**" - "setup.cfg" # includes pytest config - ".actions/**" + - "!requirements/**/docs.txt" + - "!*.md" + - "!**/*.md" variables: - name: poplar_sdk diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index b3b0ac8e8a7e4..53e8348626c25 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -1,129 +1,142 @@ custom_service_name: "Lightning CI required checker" -# For security reasons, configuration is only loaded from the repository's default branch, -# changes made in pull requests from different branches or forks are ignored. This means that changes to this file -# will only be used after they are merged. subprojects: # SECTION: pytorch_lightning - - id: "pytorch_lightning" + - id: "pytorch_lightning: Tests workflow" paths: - # all examples don't need to be added because they aren't used in CI, but these are - - "examples/run_ddp_examples.sh" - - "examples/convert_from_pt_to_pl/**" - - "examples/run_pl_examples.sh" - - "examples/pl_basics/backbone_image_classifier.py" - - "examples/pl_basics/autoencoder.py" - - "examples/pl_loops/mnist_lite.py" - - "examples/pl_fault_tolerant/automatic.py" - - "examples/test_pl_examples.py" - - "examples/pl_integrations/dali_image_classifier.py" + - ".github/workflows/ci-pytorch-tests.yml" + - "requirements/lite/**" + - "src/lightning_lite/**" - "requirements/pytorch/**" - "src/pytorch_lightning/**" - "tests/tests_pytorch/**" - "tests/legacy/back-compatible-versions.txt" - "setup.cfg" # includes pytest config - ".actions/**" + - "!requirements/**/docs.txt" + - "!*.md" + - "!**/*.md" checks: - # Note: updates here should be applied to the lightning_lite group - - "pl-cpu (macOS-11, pytorch, 3.8, 1.10)" - - "pl-cpu (macOS-11, pytorch, 3.9, 1.11)" - - "pl-cpu (macOS-11, pytorch, 3.10, 1.12)" - - "pl-cpu (macOS-11, pytorch, 3.7, 1.9, oldest)" - - "pl-cpu (macOS-11, pytorch, 3.10, 1.13, pre)" - - "pl-cpu (ubuntu-20.04, pytorch, 3.8, 1.10)" - - "pl-cpu (ubuntu-20.04, pytorch, 3.9, 1.11)" - - "pl-cpu (ubuntu-20.04, pytorch, 3.10, 1.11)" - - "pl-cpu (ubuntu-20.04, pytorch, 3.10, 1.12)" - - "pl-cpu (ubuntu-20.04, pytorch, 3.7, 1.9, oldest)" - - "pl-cpu (ubuntu-20.04, pytorch, 3.9, 1.13, pre)" - - "pl-cpu (windows-2022, pytorch, 3.9, 1.11)" - - "pl-cpu (windows-2022, pytorch, 3.10, 1.11)" - - "pl-cpu (windows-2022, pytorch, 3.10, 1.12)" - - "pl-cpu (windows-2022, pytorch, 3.7, 1.9, oldest)" - - "pl-cpu (windows-2022, pytorch, 3.8, 1.13, pre)" - - "pl-cpu (macOS-11, 
lightning, 3.10, 1.12)" - #- "pl-cpu (macOS-11, lightning, 3.7, 1.9, oldest)" - - "pl-cpu (ubuntu-20.04, lightning, 3.10, 1.12)" - #- "pl-cpu (ubuntu-20.04, lightning, 3.7, 1.9, oldest)" - - "pl-cpu (windows-2022, lightning, 3.10, 1.12)" - #- "pl-cpu (windows-2022, lightning, 3.7, 1.9, oldest)" - - "pytorch-lightning (GPUs)" - - "pytorch-lightning (HPUs)" - - "pytorch-lightning (IPUs)" - - "pl-cpu (slow, macOS-11, pytorch, 3.7, 1.11)" - - "pl-cpu (slow, ubuntu-20.04, pytorch, 3.7, 1.11)" - - "pl-cpu (slow, windows-2022, pytorch, 3.7, 1.11)" - # TODO: since this job has intermittent availability, it cannot be required or it will block all PL PRs from forks - #- "test-on-tpus" - - - id: "pytorch_lightning: CPU workflow" - paths: - - ".github/workflows/ci-pytorch-tests.yml" - checks: - - "pl-cpu (macOS-11, pytorch, 3.8, 1.10)" - - "pl-cpu (macOS-11, pytorch, 3.9, 1.11)" - - "pl-cpu (macOS-11, pytorch, 3.10, 1.12)" - - "pl-cpu (macOS-11, pytorch, 3.7, 1.9, oldest)" - - "pl-cpu (macOS-11, pytorch, 3.10, 1.13, pre)" + - "pl-cpu (macOS-11, pytorch, 3.8, 1.11)" + - "pl-cpu (macOS-11, pytorch, 3.9, 1.12)" + - "pl-cpu (macOS-11, pytorch, 3.10, 1.13)" + - "pl-cpu (macOS-11, pytorch, 3.8, 1.9, oldest)" - "pl-cpu (ubuntu-20.04, pytorch, 3.8, 1.10)" - "pl-cpu (ubuntu-20.04, pytorch, 3.9, 1.11)" - - "pl-cpu (ubuntu-20.04, pytorch, 3.10, 1.11)" - "pl-cpu (ubuntu-20.04, pytorch, 3.10, 1.12)" + - "pl-cpu (ubuntu-20.04, pytorch, 3.10, 1.13)" - "pl-cpu (ubuntu-20.04, pytorch, 3.7, 1.9, oldest)" - - "pl-cpu (ubuntu-20.04, pytorch, 3.9, 1.13, pre)" - "pl-cpu (windows-2022, pytorch, 3.9, 1.11)" - - "pl-cpu (windows-2022, pytorch, 3.10, 1.11)" - "pl-cpu (windows-2022, pytorch, 3.10, 1.12)" + - "pl-cpu (windows-2022, pytorch, 3.10, 1.13)" - "pl-cpu (windows-2022, pytorch, 3.7, 1.9, oldest)" - - "pl-cpu (windows-2022, pytorch, 3.8, 1.13, pre)" - - "pl-cpu (macOS-11, lightning, 3.10, 1.12)" - #- "pl-cpu (macOS-11, lightning, 3.7, 1.9, oldest)" - - "pl-cpu (ubuntu-20.04, lightning, 3.10, 1.12)" - #- "pl-cpu (ubuntu-20.04, lightning, 3.7, 1.9, oldest)" - - "pl-cpu (windows-2022, lightning, 3.10, 1.12)" - #- "pl-cpu (windows-2022, lightning, 3.7, 1.9, oldest)" - - - id: "pytorch_lightning: Slow workflow" - paths: - - ".github/workflows/ci-pytorch-tests-slow.yml" - checks: - "pl-cpu (slow, macOS-11, pytorch, 3.7, 1.11)" - "pl-cpu (slow, ubuntu-20.04, pytorch, 3.7, 1.11)" - "pl-cpu (slow, windows-2022, pytorch, 3.7, 1.11)" + - "pl-cpu (macOS-11, lightning, 3.8, 1.13)" + - "pl-cpu (ubuntu-20.04, lightning, 3.8, 1.13)" + - "pl-cpu (windows-2022, lightning, 3.8, 1.13)" - id: "pytorch_lightning: Azure GPU" paths: - ".azure/gpu-tests-pytorch.yml" - - "tests/tests_pytorch/run_standalone_*.sh" + # only the azure GPU workflow runs the examples + # all examples don't need to be added because they aren't used in CI, but these are + - "examples/run_pl_examples.sh" + - "examples/pl_basics/backbone_image_classifier.py" + - "examples/pl_basics/autoencoder.py" + - "examples/pl_fault_tolerant/automatic.py" + - "examples/test_pl_examples.py" + - "examples/pl_integrations/dali_image_classifier.py" + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + - "setup.cfg" # includes pytest config + - "requirements/lite/**" + - "src/lightning_lite/**" + - ".actions/**" + - "!requirements/**/docs.txt" + - "!*.md" + - "!**/*.md" checks: - "pytorch-lightning (GPUs)" + - id: "pytorch_lightning: Benchmarks" + paths: + - ".azure/gpu-benchmark.yml" + - "tests/tests_pytorch/benchmarks/**" + - "requirements/pytorch/**" 
+ - "!requirements/pytorch/docs.txt" + - "!*.md" + - "!**/*.md" + checks: + - "pytorch-lightning.Benchmark" + - id: "pytorch_lightning: Azure HPU" paths: - ".azure/hpu-tests.yml" + - "examples/pl_hpu/mnist_sample.py" + - "requirements/lite/**" + - "src/lightning_lite/**" + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + - "setup.cfg" # includes pytest config + - ".actions/**" + - "!requirements/**/docs.txt" + - "!*.md" + - "!**/*.md" checks: - "pytorch-lightning (HPUs)" - id: "pytorch_lightning: Azure IPU" paths: - ".azure/ipu-tests.yml" + - "requirements/lite/**" + - "src/lightning_lite/**" + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + - "setup.cfg" # includes pytest config + - ".actions/**" + - "!requirements/**/docs.txt" + - "!*.md" + - "!**/*.md" checks: - "pytorch-lightning (IPUs)" - - id: "pytorch-lightning: TPU workflow" - paths: - - ".github/workflows/tpu-tests.yml" - checks: - - "test-on-tpus" + # TODO: since this job has intermittent availability, it cannot be required + #- id: "pytorch-lightning: TPU workflow" + # paths: + # - ".github/workflows/tpu-tests.yml" + # - "dockers/base-xla/*" + # - "requirements/lite/**" + # - "src/lightning_lite/**" + # - "tests/tests_lite/**" + # - "requirements/pytorch/**" + # - "src/pytorch_lightning/**" + # - "tests/tests_pytorch/**" + # - "setup.cfg" # includes pytest config + # - ".actions/**" + # - "!requirements/**/docs.txt" + # - "!*.md" + # - "!**/*.md" + # checks: + # - "test-on-tpus" - id: "pytorch_lightning: Docs" paths: - "src/pytorch_lightning/**" - "docs/source-pytorch/**" - - ".github/workflows/docs-*.yml" + - ".github/workflows/docs-checks.yml" - "requirements/docs.txt" - "requirements/pytorch/**" + - "setup.py" + - "setup.cfg" # includes metadata used in the package creation + - ".actions/**" + - "!*.md" + - "!**/*.md" checks: - "make-doctest (pytorch)" - "make-html (pytorch)" @@ -132,141 +145,89 @@ subprojects: paths: - "dockers/**" - ".github/workflows/ci-pytorch-dockers.yml" - - "requirements.txt" - - "requirements/*.txt" - - "requirements/pytorch/*" + - "requirements/pytorch/**" + - "requirements/lite/**" - "environment.yml" - - ".github/workflows/*docker*.yml" - "setup.py" - ".actions/**" + - "!requirements/**/docs.txt" + - "!*.md" + - "!**/*.md" checks: - "build-cuda (3.9, 1.10, 11.3.1)" - "build-cuda (3.9, 1.11, 11.3.1)" - "build-cuda (3.9, 1.12, 11.6.1)" + - "build-cuda (3.9, 1.13, 11.6.1)" - "build-hpu (1.5.0, 1.11.0)" - "build-ipu (3.9, 1.9)" - "build-NGC" - "build-pl (3.9, 1.10, 11.3.1)" - "build-pl (3.9, 1.11, 11.3.1)" - "build-pl (3.9, 1.12, 11.6.1)" + - "build-pl (3.9, 1.13, 11.6.1)" - "build-xla (3.7, 1.12)" # SECTION: lightning_lite - - id: "lightning_lite" + - id: "lightning_lite: CPU workflow" paths: - "requirements/lite/**" - "src/lightning_lite/**" + - "tests/tests_lite/**" - "setup.cfg" # includes pytest config + - ".github/workflows/ci-lite-tests.yml" - ".actions/**" + - "!requirements/lite/docs.txt" + - "!*.md" + - "!**/*.md" checks: - - "lite-cpu (macOS-11, lite, 3.9, 1.11)" - - "lite-cpu (macOS-11, lite, 3.8, 1.10)" - - "lite-cpu (macOS-11, lite, 3.10, 1.12)" - - "lite-cpu (macOS-11, lite, 3.10, 1.13, pre)" - - "lite-cpu (macOS-11, lite, 3.7, 1.9, oldest)" - - "lite-cpu (ubuntu-20.04, lite, 3.8, 1.11)" - - "lite-cpu (ubuntu-20.04, lite, 3.10, 1.12)" - - "lite-cpu (ubuntu-20.04, lite, 3.7, 1.9, oldest)" - - "lite-cpu (ubuntu-20.04, lite, 3.9, 1.13, pre)" - - "lite-cpu (windows-2022, lite, 3.8, 1.9)" - - "lite-cpu 
(windows-2022, lite, 3.9, 1.10)" - - "lite-cpu (windows-2022, lite, 3.10, 1.11)" - - "lite-cpu (windows-2022, lite, 3.10, 1.12)" - - "lite-cpu (windows-2022, lite, 3.7, 1.9, oldest)" - - "lite-cpu (windows-2022, lite, 3.8, 1.13, pre)" - - "lite-cpu (macOS-11, lightning, 3.8, 1.12)" - - "lite-cpu (ubuntu-20.04, lightning, 3.8, 1.12)" - - "lite-cpu (windows-2022, lightning, 3.8, 1.12)" - - "lightning-lite (GPUs)" - # Lite also requires PL checks as it depends on Lite - - "pl-cpu (macOS-11, pytorch, 3.8, 1.10)" - - "pl-cpu (macOS-11, pytorch, 3.9, 1.11)" - - "pl-cpu (macOS-11, pytorch, 3.10, 1.12)" - - "pl-cpu (macOS-11, pytorch, 3.7, 1.9, oldest)" - - "pl-cpu (macOS-11, pytorch, 3.10, 1.13, pre)" - - "pl-cpu (ubuntu-20.04, pytorch, 3.8, 1.10)" - - "pl-cpu (ubuntu-20.04, pytorch, 3.9, 1.11)" - - "pl-cpu (ubuntu-20.04, pytorch, 3.10, 1.11)" - - "pl-cpu (ubuntu-20.04, pytorch, 3.10, 1.12)" - - "pl-cpu (ubuntu-20.04, pytorch, 3.7, 1.9, oldest)" - - "pl-cpu (ubuntu-20.04, pytorch, 3.9, 1.13, pre)" - - "pl-cpu (windows-2022, pytorch, 3.9, 1.11)" - - "pl-cpu (windows-2022, pytorch, 3.10, 1.11)" - - "pl-cpu (windows-2022, pytorch, 3.10, 1.12)" - - "pl-cpu (windows-2022, pytorch, 3.7, 1.9, oldest)" - - "pl-cpu (windows-2022, pytorch, 3.8, 1.13, pre)" - - "pl-cpu (macOS-11, lightning, 3.10, 1.12)" - #- "pl-cpu (macOS-11, lightning, 3.7, 1.9, oldest)" - - "pl-cpu (ubuntu-20.04, lightning, 3.10, 1.12)" - #- "pl-cpu (ubuntu-20.04, lightning, 3.7, 1.9, oldest)" - - "pl-cpu (windows-2022, lightning, 3.10, 1.12)" - #- "pl-cpu (windows-2022, lightning, 3.7, 1.9, oldest)" - - "pytorch-lightning (GPUs)" - - "pytorch-lightning (HPUs)" - - "pytorch-lightning (IPUs)" - - "pl-cpu (slow, macOS-11, pytorch, 3.7, 1.11)" - - "pl-cpu (slow, ubuntu-20.04, pytorch, 3.7, 1.11)" - - "pl-cpu (slow, windows-2022, pytorch, 3.7, 1.11)" - #- "test-on-tpus" - - - id: "lightning_lite: Tests" - paths: - - "tests/tests_lite/**" - checks: - - "lite-cpu (macOS-11, lite, 3.9, 1.11)" - - "lite-cpu (macOS-11, lite, 3.8, 1.10)" - - "lite-cpu (macOS-11, lite, 3.10, 1.12)" - - "lite-cpu (macOS-11, lite, 3.10, 1.13, pre)" + - "lite-cpu (macOS-11, lite, 3.8, 1.11)" + - "lite-cpu (macOS-11, lite, 3.9, 1.12)" + - "lite-cpu (macOS-11, lite, 3.10, 1.13)" - "lite-cpu (macOS-11, lite, 3.7, 1.9, oldest)" - - "lite-cpu (ubuntu-20.04, lite, 3.8, 1.11)" + - "lite-cpu (ubuntu-20.04, lite, 3.8, 1.10)" + - "lite-cpu (ubuntu-20.04, lite, 3.9, 1.11)" - "lite-cpu (ubuntu-20.04, lite, 3.10, 1.12)" + - "lite-cpu (ubuntu-20.04, lite, 3.10, 1.13)" - "lite-cpu (ubuntu-20.04, lite, 3.7, 1.9, oldest)" - - "lite-cpu (ubuntu-20.04, lite, 3.9, 1.13, pre)" - - "lite-cpu (windows-2022, lite, 3.8, 1.9)" - - "lite-cpu (windows-2022, lite, 3.9, 1.10)" - - "lite-cpu (windows-2022, lite, 3.10, 1.11)" + - "lite-cpu (windows-2022, lite, 3.9, 1.11)" - "lite-cpu (windows-2022, lite, 3.10, 1.12)" + - "lite-cpu (windows-2022, lite, 3.10, 1.13)" - "lite-cpu (windows-2022, lite, 3.7, 1.9, oldest)" - - "lite-cpu (windows-2022, lite, 3.8, 1.13, pre)" - - "lite-cpu (macOS-11, lightning, 3.8, 1.12)" - - "lite-cpu (ubuntu-20.04, lightning, 3.8, 1.12)" - - "lite-cpu (windows-2022, lightning, 3.8, 1.12)" - - "lightning-lite (GPUs)" + - "lite-cpu (macOS-11, lightning, 3.8, 1.13)" + - "lite-cpu (ubuntu-20.04, lightning, 3.8, 1.13)" + - "lite-cpu (windows-2022, lightning, 3.8, 1.13)" - id: "lightning_lite: Azure GPU" paths: - ".azure/gpu-tests-lite.yml" - "tests/tests_lite/run_standalone_*.sh" - "tests/tests_pytorch/run_standalone_tests.sh" # used by Lite through a symlink + - 
"requirements/lite/**" + - "src/lightning_lite/**" + - "tests/tests_lite/**" + - "setup.cfg" # includes pytest config + - ".actions/**" + - "!requirements/lite/docs.txt" + - "!*.md" + - "!**/*.md" checks: - "lightning-lite (GPUs)" # SECTION: lightning_app - - id: "lightning_app" + - id: "lightning_app: Tests workflow" paths: - - "requirements/app/**" + - ".github/workflows/ci-app-tests.yml" - "src/lightning_app/**" - "tests/tests_app/**" - "examples/app_*/**" # some tests_app tests call examples files + - "requirements/app/**" - "setup.py" - ".actions/**" - checks: - - "App.cloud-e2e" - - "app-pytest (macOS-11, app, 3.8, latest)" - - "app-pytest (macOS-11, app, 3.8, oldest)" - - "app-pytest (macOS-11, lightning, 3.9, latest)" - - "app-pytest (ubuntu-20.04, app, 3.8, latest)" - - "app-pytest (ubuntu-20.04, app, 3.8, oldest)" - - "app-pytest (ubuntu-20.04, lightning, 3.9, latest)" - - "app-pytest (windows-2022, app, 3.8, latest)" - - "app-pytest (windows-2022, app, 3.8, oldest)" - - "app-pytest (windows-2022, lightning, 3.8, latest)" - - - id: "lightning_app: Tests workflow" - paths: - - ".github/workflows/ci-app-tests.yml" + - "!requirements/app/docs.txt" + - "!*.md" + - "!**/*.md" checks: - "app-pytest (macOS-11, app, 3.8, latest)" - "app-pytest (macOS-11, app, 3.8, oldest)" @@ -280,26 +241,16 @@ subprojects: - id: "lightning_app: Examples" paths: - - "requirements/app/**" + - ".github/workflows/ci-app-examples.yml" - "src/lightning_app/**" - "tests/tests_app_examples/**" - - "examples/app_*/**" + - "examples/app_*" + - "requirements/app/**" - "setup.py" - ".actions/**" - checks: - - "app-examples (macOS-11, app, 3.9, latest)" - - "app-examples (macOS-11, app, 3.9, oldest)" - - "app-examples (macOS-11, lightning, 3.9, latest)" - - "app-examples (ubuntu-20.04, app, 3.9, latest)" - - "app-examples (ubuntu-20.04, app, 3.9, oldest)" - - "app-examples (ubuntu-20.04, lightning, 3.9, latest)" - - "app-examples (windows-2022, app, 3.9, latest)" - - "app-examples (windows-2022, app, 3.9, oldest)" - - "app-examples (windows-2022, lightning, 3.9, latest)" - - - id: "lightning_app: Examples workflow" - paths: - - ".github/workflows/ci-app-examples.yml" + - "!requirements/app/docs.txt" + - "!*.md" + - "!**/*.md" checks: - "app-examples (macOS-11, app, 3.9, latest)" - "app-examples (macOS-11, app, 3.9, oldest)" @@ -314,6 +265,16 @@ subprojects: - id: "lightning_app: Azure" paths: - ".azure/app-cloud-e2e.yml" + - "requirements/app/**" + - "src/lightning_app/**" + - "tests/tests_app/**" + - "examples/app_*/**" # some tests_app tests call examples files + - "tests/tests_app_examples/**" + - "setup.py" + - ".actions/**" + - "!requirements/app/docs.txt" + - "!*.md" + - "!**/*.md" checks: - "App.cloud-e2e" @@ -321,9 +282,14 @@ subprojects: paths: - "src/lightning_app/**" - "docs/source-app/**" - - ".github/workflows/docs-*.yml" + - ".github/workflows/docs-checks.yml" - "requirements/docs.txt" - "requirements/app/**" + - "setup.py" + - "setup.cfg" # includes metadata used in the package creation + - ".actions/**" + - "!*.md" + - "!**/*.md" checks: - "make-doctest (app)" - "make-html (app)" @@ -337,6 +303,9 @@ subprojects: - "src/**" - "pyproject.toml" # includes mypy config - ".actions/**" + - "!requirements/**/docs.txt" + - "!*.md" + - "!**/*.md" checks: - "mypy" @@ -347,6 +316,9 @@ subprojects: - "setup.py" - "src/**" - "requirements/**" + - "!requirements/**/docs.txt" + - "!*.md" + - "!**/*.md" checks: - "install-pkg (ubuntu-22.04, app, 3.7)" - "install-pkg (ubuntu-22.04, app, 3.10)" diff --git 
a/.github/workflows/ci-app-examples.yml b/.github/workflows/ci-app-examples.yml index 457a01d643aab..88eadcfd920f8 100644 --- a/.github/workflows/ci-app-examples.yml +++ b/.github/workflows/ci-app-examples.yml @@ -12,10 +12,12 @@ on: - "src/lightning_app/**" - "tests/tests_app_examples/**" - "examples/app_*" - - "requirements/app/*" - - "!requirements/app/docs.txt" + - "requirements/app/**" - "setup.py" - ".actions/**" + - "!requirements/app/docs.txt" + - "!*.md" + - "!**/*.md" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} diff --git a/.github/workflows/ci-app-tests.yml b/.github/workflows/ci-app-tests.yml index 4bcf1850eff72..826dfc70b552f 100644 --- a/.github/workflows/ci-app-tests.yml +++ b/.github/workflows/ci-app-tests.yml @@ -12,10 +12,12 @@ on: - "src/lightning_app/**" - "tests/tests_app/**" - "examples/app_*" # some tests_app tests call examples files - - "requirements/app/*" - - "!requirements/app/docs.txt" + - "requirements/app/**" - "setup.py" - ".actions/**" + - "!requirements/app/docs.txt" + - "!*.md" + - "!**/*.md" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} diff --git a/.github/workflows/ci-lite-tests.yml b/.github/workflows/ci-lite-tests.yml index cfd9c9a0b4fb7..c6b835bba4672 100644 --- a/.github/workflows/ci-lite-tests.yml +++ b/.github/workflows/ci-lite-tests.yml @@ -8,13 +8,15 @@ on: branches: [master, "release/*"] types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped paths: - - "requirements/lite/*" - - "!requirements/lite/docs.txt" + - "requirements/lite/**" - "src/lightning_lite/**" - "tests/tests_lite/**" - "setup.cfg" # includes pytest config - ".github/workflows/ci-lite-tests.yml" - ".actions/**" + - "!requirements/lite/docs.txt" + - "!*.md" + - "!**/*.md" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} @@ -36,29 +38,26 @@ jobs: matrix: include: # assign python and pytorch version combinations to operating systems (arbitrarily) - # note: there's no distribution of Torch==1.9 for Python>=3.9 or torch==1.10 for Python>=3.10 - - {os: "macOS-11", pkg-name: "lite", python-version: "3.9", pytorch-version: "1.11"} - - {os: "macOS-11", pkg-name: "lite", python-version: "3.8", pytorch-version: "1.10"} - - {os: "windows-2022", pkg-name: "lite", python-version: "3.10", pytorch-version: "1.11"} - - {os: "windows-2022", pkg-name: "lite", python-version: "3.9", pytorch-version: "1.10"} - - {os: "windows-2022", pkg-name: "lite", python-version: "3.8", pytorch-version: "1.9"} - - {os: "ubuntu-20.04", pkg-name: "lite", python-version: "3.8", pytorch-version: "1.11"} - # only run PyTorch latest with Python latest - - {os: "macOS-11", pkg-name: "lite", python-version: "3.10", pytorch-version: "1.12"} + # note: there's no distribution of torch==1.10 for Python>=3.10 + - {os: "macOS-11", pkg-name: "lite", python-version: "3.8", pytorch-version: "1.11"} + - {os: "macOS-11", pkg-name: "lite", python-version: "3.9", pytorch-version: "1.12"} + - {os: "ubuntu-20.04", pkg-name: "lite", python-version: "3.8", pytorch-version: "1.10"} + - {os: "ubuntu-20.04", pkg-name: "lite", python-version: "3.9", pytorch-version: "1.11"} - {os: "ubuntu-20.04", pkg-name: "lite", python-version: "3.10", pytorch-version: "1.12"} + - {os: "windows-2022", pkg-name: "lite", python-version: "3.9", pytorch-version: "1.11"} - {os: "windows-2022", pkg-name: "lite", python-version: "3.10", pytorch-version: "1.12"} + # only run PyTorch latest with Python 
latest + - {os: "macOS-11", pkg-name: "lite", python-version: "3.10", pytorch-version: "1.13"} + - {os: "ubuntu-20.04", pkg-name: "lite", python-version: "3.10", pytorch-version: "1.13"} + - {os: "windows-2022", pkg-name: "lite", python-version: "3.10", pytorch-version: "1.13"} # "oldest" versions tests, only on minimum Python - {os: "macOS-11", pkg-name: "lite", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - {os: "ubuntu-20.04", pkg-name: "lite", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - {os: "windows-2022", pkg-name: "lite", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - # release-candidate tests, mixed Python versions - - {os: "macOS-11", pkg-name: "lite", python-version: "3.10", pytorch-version: "1.13", release: "pre"} - - {os: "ubuntu-20.04", pkg-name: "lite", python-version: "3.9", pytorch-version: "1.13", release: "pre"} - - {os: "windows-2022", pkg-name: "lite", python-version: "3.8", pytorch-version: "1.13", release: "pre"} # "lightning" installs the monolithic package - - {os: "macOS-11", pkg-name: "lightning", python-version: "3.8", pytorch-version: "1.12"} - - {os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.8", pytorch-version: "1.12"} - - {os: "windows-2022", pkg-name: "lightning", python-version: "3.8", pytorch-version: "1.12"} + - {os: "macOS-11", pkg-name: "lightning", python-version: "3.8", pytorch-version: "1.13"} + - {os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.8", pytorch-version: "1.13"} + - {os: "windows-2022", pkg-name: "lightning", python-version: "3.8", pytorch-version: "1.13"} timeout-minutes: 15 diff --git a/.github/workflows/ci-pkg-install.yml b/.github/workflows/ci-pkg-install.yml index b1fad271779fc..e749b4916357f 100644 --- a/.github/workflows/ci-pkg-install.yml +++ b/.github/workflows/ci-pkg-install.yml @@ -13,7 +13,9 @@ on: - "setup.py" - "src/**" - "requirements/**" - - "!requirements/*/docs.txt" + - "!requirements/**/docs.txt" + - "!*.md" + - "!**/*.md" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} @@ -63,6 +65,8 @@ jobs: run: python -m lightning --version - name: DocTest package + env: + PY_IGNORE_IMPORTMISMATCH: 1 run: | PKG_NAME=$(python -c "print({'app': 'lightning_app', 'lite': 'lightning_lite', 'pytorch': 'pytorch_lightning', 'lightning': 'lightning'}['${{matrix.pkg-name}}'])") python -m pytest src/${PKG_NAME} --ignore-glob="**/cli/*-template/**" diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml index 4682dd5b3b5ac..42c2998d295ee 100644 --- a/.github/workflows/ci-pytorch-dockers.yml +++ b/.github/workflows/ci-pytorch-dockers.yml @@ -8,14 +8,15 @@ on: types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped paths: - "dockers/**" - - "!dockers/README.md" - - "requirements.txt" - - "requirements/**" - - "!requirements/*/docs.txt" + - ".github/workflows/ci-pytorch-dockers.yml" + - "requirements/pytorch/**" + - "requirements/lite/**" - "environment.yml" - - ".github/workflows/*docker*.yml" - "setup.py" - ".actions/**" + - "!requirements/**/docs.txt" + - "!*.md" + - "!**/*.md" schedule: - cron: "0 0 * * *" # at the end of every day @@ -39,6 +40,7 @@ jobs: - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} + - {python_version: "3.9", pytorch_version: 
"1.13", cuda_version: "11.6.1"} steps: - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 @@ -100,6 +102,7 @@ jobs: - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} + - {python_version: "3.9", pytorch_version: "1.13", cuda_version: "11.6.1"} steps: - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 diff --git a/.github/workflows/ci-pytorch-tests.yml b/.github/workflows/ci-pytorch-tests.yml index 91d0a73452e1c..e45e40bb2db89 100644 --- a/.github/workflows/ci-pytorch-tests.yml +++ b/.github/workflows/ci-pytorch-tests.yml @@ -8,16 +8,18 @@ on: branches: [master, "release/*"] types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped paths: - - "requirements/pytorch/*" + - "requirements/pytorch/**" - "src/pytorch_lightning/**" - "tests/tests_pytorch/**" - "tests/legacy/back-compatible-versions.txt" - "setup.cfg" # includes pytest config - ".github/workflows/ci-pytorch-tests.yml" - - "requirements/lite/*" + - "requirements/lite/**" - "src/lightning_lite/**" - ".actions/**" - - "!requirements/*/docs.txt" + - "!requirements/**/docs.txt" + - "!*.md" + - "!**/*.md" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} @@ -40,39 +42,32 @@ jobs: matrix: include: # assign python and pytorch version combinations to operating systems (arbitrarily) - # note: there's no distribution of Torch==1.9 for Python>=3.9 or torch==1.10 for Python>=3.10 - - {os: "macOS-11", pkg-name: "pytorch", python-version: "3.9", pytorch-version: "1.11"} - - {os: "macOS-11", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "1.10"} + # note: there's no distribution of torch==1.10 for Python>=3.10 + - {os: "macOS-11", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "1.11"} + - {os: "macOS-11", pkg-name: "pytorch", python-version: "3.9", pytorch-version: "1.12"} - {os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "1.10"} - {os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.9", pytorch-version: "1.11"} - - {os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "1.11"} - - {os: "windows-2022", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "1.10"} - - {os: "windows-2022", pkg-name: "pytorch", python-version: "3.9", pytorch-version: "1.11"} - - {os: "windows-2022", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "1.11"} - # only run PyTorch latest with Python latest - - {os: "macOS-11", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "1.12"} - - {os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.12"} - {os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "1.12"} - - {os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.12"} + - {os: "windows-2022", pkg-name: "pytorch", python-version: "3.9", pytorch-version: "1.11"} - {os: "windows-2022", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "1.12"} - - {os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.12"} + # only run PyTorch latest with Python latest + - {os: "macOS-11", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "1.13"} + - {os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.10", pytorch-version: 
"1.13"} + - {os: "windows-2022", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "1.13"} # "oldest" versions tests, only on minimum Python - - {os: "macOS-11", pkg-name: "pytorch", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - - {os: "macOS-11", pkg-name: "lightning", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} + - {os: "macOS-11", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "1.9", requires: "oldest"} # 3.7 hangs - {os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - - {os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - {os: "windows-2022", pkg-name: "pytorch", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - - {os: "windows-2022", pkg-name: "lightning", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} - # release-candidate tests, mixed Python versions - - {os: "macOS-11", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "1.13", release: "pre"} - - {os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.9", pytorch-version: "1.13", release: "pre"} - - {os: "windows-2022", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "1.13", release: "pre"} # run test under SLOW label - {type: "slow", os: "macOS-11", pkg-name: "pytorch", python-version: "3.7", pytorch-version: "1.11"} - {type: "slow", os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.7", pytorch-version: "1.11"} - {type: "slow", os: "windows-2022", pkg-name: "pytorch", python-version: "3.7", pytorch-version: "1.11"} + # "lightning" installs the monolithic package + - {os: "macOS-11", pkg-name: "lightning", python-version: "3.8", pytorch-version: "1.13"} + - {os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.8", pytorch-version: "1.13"} + - {os: "windows-2022", pkg-name: "lightning", python-version: "3.8", pytorch-version: "1.13"} - timeout-minutes: 60 + timeout-minutes: 70 # tests with macOS-11, py3.7 oldest takes much longer then expected steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 40f14287c0656..2688b175063ed 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -11,6 +11,9 @@ on: - "src/**" - "pyproject.toml" # includes mypy config - ".actions/**" + - "!requirements/**/docs.txt" + - "!*.md" + - "!**/*.md" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml index 4169bc33e4632..1584fc5d9aa4e 100644 --- a/.github/workflows/docs-checks.yml +++ b/.github/workflows/docs-checks.yml @@ -14,8 +14,10 @@ on: - "docs/**" - "src/**" - "setup.py" - - "setup.cfg" + - "setup.cfg" # includes metadata used in the package creation - ".github/workflows/docs-checks.yml" + - "!*.md" + - "!**/*.md" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} diff --git a/.github/workflows/docs-deploy.yml b/.github/workflows/docs-deploy.yml index 38c5db8a8946c..415802e0bb476 100644 --- a/.github/workflows/docs-deploy.yml +++ b/.github/workflows/docs-deploy.yml @@ -6,6 +6,7 @@ on: branches: ["master", "release/*"] paths: - ".github/workflows/docs-deploy.yml" + # TODO: this workflow is just for debugging. 
extend the paths that should trigger it env: FREEZE_REQUIREMENTS: 1 @@ -17,6 +18,7 @@ defaults: jobs: # https://github.com/marketplace/actions/deploy-to-github-pages build-docs-deploy: + if: github.repository_owner == 'Lightning-AI' runs-on: ubuntu-20.04 steps: - name: Checkout 🛎️ @@ -75,12 +77,12 @@ jobs: - id: 'auth' name: 'Authenticate to Google Cloud' - uses: google-github-actions/auth@v0 + uses: google-github-actions/auth@v1 with: credentials_json: ${{ secrets.GCS_SA_KEY }} - name: Setup gcloud - uses: google-github-actions/setup-gcloud@v0 + uses: google-github-actions/setup-gcloud@v1 with: project_id: ${{ secrets.GCS_PROJECT }} diff --git a/.github/workflows/probot-check-group.yml b/.github/workflows/probot-check-group.yml index 35b267819a0cc..15965ca7eba47 100644 --- a/.github/workflows/probot-check-group.yml +++ b/.github/workflows/probot-check-group.yml @@ -14,7 +14,7 @@ jobs: if: github.event.pull_request.draft == false timeout-minutes: 61 # in case something is wrong with the internal timeout steps: - - uses: Lightning-AI/probot@v2 + - uses: Lightning-AI/probot@v4 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index eccbf608491ae..33102fd3e6705 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -19,6 +19,7 @@ jobs: - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} + - {python_version: "3.9", pytorch_version: "1.13", cuda_version: "11.6.1"} steps: - name: Checkout uses: actions/checkout@v3 @@ -47,7 +48,7 @@ jobs: - name: Publish Latest to Docker uses: docker/build-push-action@v3 # Only latest Python and PyTorch - if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.12' + if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.13' with: repository: pytorchlightning/pytorch_lightning username: ${{ secrets.DOCKER_USERNAME }} diff --git a/.github/workflows/tpu-tests.yml b/.github/workflows/tpu-tests.yml index a7ffe3e10afe0..8f21eabee0ccf 100644 --- a/.github/workflows/tpu-tests.yml +++ b/.github/workflows/tpu-tests.yml @@ -8,12 +8,18 @@ on: types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped paths: - ".github/workflows/tpu-tests.yml" - - "requirements/pytorch/*" - - "!requirements/pytorch/docs.txt" + - "dockers/base-xla/*" + - "requirements/lite/**" + - "src/lightning_lite/**" + - "tests/tests_lite/**" + - "requirements/pytorch/**" - "src/pytorch_lightning/**" - "tests/tests_pytorch/**" - "setup.cfg" # includes pytest config - ".actions/**" + - "!requirements/**/docs.txt" + - "!*.md" + - "!**/*.md" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} @@ -31,7 +37,7 @@ jobs: if: github.event.pull_request.draft == false env: PYTHON_VER: 3.7 - timeout-minutes: 100 # should match the timeout in `tpu_test_cases.jsonnet` + timeout-minutes: 100 # should match the timeout in `tpu_workflow.jsonnet` steps: - uses: actions/checkout@v3 @@ -62,12 +68,12 @@ jobs: PR_NUMBER: ${{ github.event.pull_request.number }} SHA: ${{ github.event.pull_request.head.sha }} run: | - python -c "fname = 'dockers/tpu-tests/tpu_test_cases.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER') + python -c "fname = 'dockers/base-xla/tpu_workflow.jsonnet' ; data = 
open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER') data = data.replace('{PYTHON_VERSION}', '$PYTHON_VER').replace('{PR_NUMBER}', '$PR_NUMBER').replace('{SHA}', '$SHA') ; open(fname, 'w').write(data)" - cat dockers/tpu-tests/tpu_test_cases.jsonnet + cat dockers/base-xla/tpu_workflow.jsonnet shell: bash - - uses: google-github-actions/auth@v0 + - uses: google-github-actions/auth@v1 with: credentials_json: ${{ secrets.GKE_SA_KEY_BASE64 }} @@ -80,7 +86,7 @@ jobs: - name: Deploy cluster run: | export PATH=$PATH:$HOME/go/bin - job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet | kubectl create -f -) + job_name=$(jsonnet -J ml-testing-accelerators/ dockers/base-xla/tpu_workflow.jsonnet | kubectl create -f -) job_name=${job_name#job.batch/} job_name=${job_name% created} pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}') diff --git a/README.md b/README.md index 66f1f28be4275..28e588a52145c 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ ______________________________________________________________________ Docs • Examples • Community • + Contribute • License

@@ -406,7 +407,7 @@ The lightning community is maintained by - [10+ core contributors](https://pytorch-lightning.readthedocs.io/en/latest/governance.html) who are all a mix of professional engineers, Research Scientists, and Ph.D. students from top AI labs. - 590+ active community contributors. -Want to help us build Lightning and reduce boilerplate for thousands of researchers? [Learn how to make your first contribution here](https://devblog.pytorchlightning.ai/quick-contribution-guide-86d977171b3a) +Want to help us build Lightning and reduce boilerplate for thousands of researchers? [Learn how to make your first contribution here](https://pytorch-lightning.readthedocs.io/en/stable/generated/CONTRIBUTING.html) Lightning is also part of the [PyTorch ecosystem](https://pytorch.org/ecosystem/) which requires projects to have solid testing, documentation and support. diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 9a2e0455ff40f..3aea1ca0a43b6 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -13,12 +13,13 @@ # limitations under the License. ARG UBUNTU_VERSION=20.04 -ARG CUDA_VERSION=11.3.1 +ARG CUDA_VERSION=11.6.1 + FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} ARG PYTHON_VERSION=3.9 -ARG PYTORCH_VERSION=1.12 +ARG PYTORCH_VERSION=1.13 SHELL ["/bin/bash", "-c"] # https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/ @@ -35,7 +36,12 @@ ENV \ RUN \ # TODO: Remove the manual key installation once the base image is updated. # https://github.com/NVIDIA/nvidia-docker/issues/1631 - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ + # https://github.com/NVIDIA/nvidia-docker/issues/1631#issuecomment-1264715214 + apt-get update && apt-get install -y wget && \ + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ + mkdir -p /etc/apt/keyrings/ && mv 3bf863cc.pub /etc/apt/keyrings/ && \ + echo "deb [signed-by=/etc/apt/keyrings/3bf863cc.pub] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" > /etc/apt/sources.list.d/cuda.list && \ + apt-get update && \ apt-get update -qq --fix-missing && \ NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \ CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \ @@ -132,16 +138,20 @@ RUN \ RUN \ # install Bagua - CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \ - CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [116,113,111,102] if $CUDA_VERSION_MM >= ver][0])") && \ - pip install "bagua-cuda$CUDA_VERSION_BAGUA" && \ - if [[ "$CUDA_VERSION_MM" = "$CUDA_VERSION_BAGUA" ]]; then python -c "import bagua_core; bagua_core.install_deps()"; fi && \ - python -c "import bagua; print(bagua.__version__)" + if [[ $PYTORCH_VERSION != "1.13" ]]; then \ + CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") ; \ + CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [116,113,111,102] if $CUDA_VERSION_MM >= ver][0])") ; \ + pip install "bagua-cuda$CUDA_VERSION_BAGUA" ; \ + if [[ "$CUDA_VERSION_MM" = "$CUDA_VERSION_BAGUA" ]]; then \ + python -c "import bagua_core; bagua_core.install_deps()"; \ + fi ; \ + python -c "import bagua; print(bagua.__version__)"; \ + fi RUN \ # install ColossalAI - SHOULD_INSTALL_COLOSSAL=$(python -c "import torch; print(1 if int(torch.__version__.split('.')[1]) > 9 else
0)") && \ - if [[ "$SHOULD_INSTALL_COLOSSAL" = "1" ]]; then \ + # TODO: 1.13 wheels are not released, remove skip once they are + if [[ $PYTORCH_VERSION != "1.13" ]]; then \ PYTORCH_VERSION_COLOSSALAI=$(python -c "import torch; print(torch.__version__.split('+')[0][:4])") ; \ CUDA_VERSION_MM_COLOSSALAI=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda)))") ; \ CUDA_VERSION_COLOSSALAI=$(python -c "print([ver for ver in [11.3, 11.1] if $CUDA_VERSION_MM_COLOSSALAI >= ver][0])") ; \ @@ -152,11 +162,8 @@ RUN \ RUN \ # install rest of strategies # remove colossalai from requirements since they are installed separately - SHOULD_INSTALL_COLOSSAL=$(python -c "import torch; print(1 if int(torch.__version__.split('.')[1]) > 9 else 0)") && \ - if [[ "$SHOULD_INSTALL_COLOSSAL" = "0" ]]; then \ - python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'colossalai' not in line] ; open(fname, 'w').writelines(lines)" ; \ - fi && \ - echo "$SHOULD_INSTALL_COLOSSAL" && \ + python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'colossalai' not in line] ; open(fname, 'w').writelines(lines)" ; \ + python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" ; \ cat requirements/pytorch/strategies.txt && \ pip install -r requirements/pytorch/devel.txt -r requirements/pytorch/strategies.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html @@ -170,5 +177,4 @@ RUN \ python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \ python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \ python requirements/pytorch/check-avail-extras.py && \ - python requirements/pytorch/check-avail-strategies.py && \ rm -rf requirements/ diff --git a/dockers/tpu-tests/tpu_test_cases.jsonnet b/dockers/base-xla/tpu_workflow.jsonnet similarity index 100% rename from dockers/tpu-tests/tpu_test_cases.jsonnet rename to dockers/base-xla/tpu_workflow.jsonnet diff --git a/dockers/tpu-tests/Dockerfile b/dockers/tpu-tests/Dockerfile deleted file mode 100644 index e23db55bb28e9..0000000000000 --- a/dockers/tpu-tests/Dockerfile +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -ARG PYTHON_VERSION=3.9 -ARG PYTORCH_VERSION=1.9 - -FROM pytorchlightning/pytorch_lightning:base-xla-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} - -LABEL maintainer="Lightning-AI " - -COPY ./ ./lightning/ - -# Pull the legacy checkpoints -RUN cd lightning && \ - bash .actions/pull_legacy_checkpoints.sh - -RUN \ - pip install -q fire && \ - # drop unnecessary packages - pip install -r lightning/requirements/pytorch/devel.txt --no-cache-dir - -COPY ./dockers/tpu-tests/docker-entrypoint.sh /usr/local/bin/ -RUN chmod +x /usr/local/bin/docker-entrypoint.sh - -ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"] -CMD ["bash"] diff --git a/dockers/tpu-tests/docker-entrypoint.sh b/dockers/tpu-tests/docker-entrypoint.sh deleted file mode 100644 index 57abc703c8ace..0000000000000 --- a/dockers/tpu-tests/docker-entrypoint.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -# source ~/.bashrc -echo "running docker-entrypoint.sh" -# conda activate container -echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS -echo "printed TPU info" -export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}" -exec "$@" diff --git a/docs/source-app/levels/basic/build_a_lightning_component.rst b/docs/source-app/levels/basic/build_a_lightning_component.rst index 39522614fe03b..0d44f44442888 100644 --- a/docs/source-app/levels/basic/build_a_lightning_component.rst +++ b/docs/source-app/levels/basic/build_a_lightning_component.rst @@ -132,7 +132,7 @@ powerful Lightning app. Here are a few key features available to super-charge yo :titles: 15+ accelerators; Auto-stop idle machines; Auto-timeout submitted work; Use spot machines (~70% discount); Work with massive datasets; Mount cloud storage; Use a custom container :code_files: ./key_features/accelerators.py; ./key_features/idle_machine.py; ./key_features/auto_timeout.py; ./key_features/spot.py; ./key_features/massive_dataset.py; ./key_features/mount_data.py; ./key_features/custom_container.py; :highlights: 11;11;11;11;11;2,7,10, 11; 11 - :app_id: abc123 + :enable_run: true :tab_rows: 3 :height: 430px diff --git a/docs/source-app/levels/basic/hello_components/pl_multinode.py b/docs/source-app/levels/basic/hello_components/pl_multinode.py index 0ba033e0d86c0..5feed8a8864c3 100644 --- a/docs/source-app/levels/basic/hello_components/pl_multinode.py +++ b/docs/source-app/levels/basic/hello_components/pl_multinode.py @@ -1,6 +1,6 @@ # app.py import lightning as L -from lightning.app.components import PyTorchLightningMultiNode +from lightning.app.components import LightningTrainerMultiNode from lightning.pytorch.demos.boring_classes import BoringModel @@ -12,9 +12,9 @@ def run(): trainer.fit(model) # 8 GPU: (2 nodes of 4 x v100) -component = PyTorchLightningMultiNode( +component = LightningTrainerMultiNode( LightningTrainerDistributed, - num_nodes=2, + num_nodes=4, cloud_compute=L.CloudCompute("gpu-fast-multi"), # 4 x v100 ) app = L.LightningApp(component) diff --git a/docs/source-app/levels/basic/hello_components/run_ptl_script.py b/docs/source-app/levels/basic/hello_components/run_ptl_script.py index 84a86ec00d470..e9bcb16c92f6a 100644 --- a/docs/source-app/levels/basic/hello_components/run_ptl_script.py +++ b/docs/source-app/levels/basic/hello_components/run_ptl_script.py @@ -1,5 +1,5 @@ # app.py -# !curl https://bit.ly/demoLightningScriptpy -o pl_boring_script.py +# !curl https://raw.githubusercontent.com/Lightning-AI/lightning/master/examples/app_multi_node/pl_boring_script.py -o pl_boring_script.py import lightning as L from lightning.app.components.training import 
LightningTrainerScript diff --git a/docs/source-app/levels/basic/hello_components/xgboost.py b/docs/source-app/levels/basic/hello_components/xgboost.py index 0cedda2aa45b9..fae593a206790 100644 --- a/docs/source-app/levels/basic/hello_components/xgboost.py +++ b/docs/source-app/levels/basic/hello_components/xgboost.py @@ -1,5 +1,5 @@ # app.py -# !pip install sklearn xgboost +# !pip install scikit-learn xgboost import lightning as L from sklearn import datasets from sklearn.model_selection import train_test_split diff --git a/docs/source-app/levels/basic/hero_components.rst b/docs/source-app/levels/basic/hero_components.rst index 6bb8947a1a9cd..81fecc9461403 100644 --- a/docs/source-app/levels/basic/hero_components.rst +++ b/docs/source-app/levels/basic/hero_components.rst @@ -1,7 +1,7 @@ .. lit_tabs:: :titles: Hello world; Hello GPU world; PyTorch & ⚡⚡⚡ Trainer (1+ cloud GPUs); Train PyTorch (cloud GPU); Train PyTorch (32 cloud GPUs); Deploy a model on cloud GPUs; Run a model script; XGBoost; Streamlit demo :code_files: /levels/basic/hello_components/hello_world.py; /levels/basic/hello_components/hello_world_gpu.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/train_pytorch.py; /levels/basic/hello_components/pt_multinode.py; /levels/basic/hello_components/deploy_model.py; /levels/basic/hello_components/run_ptl_script.py; /levels/basic/hello_components/xgboost.py; /levels/basic/hello_components/streamlit_demo.py - :highlights: 7; 10, 11; 10-12, 17, 18; 4, 8, 12, 18-19, 26; 5, 10, 22, 28, 32, 42, 58-60; 3, 11-13, 25, 30; 7, 10; 15, 21; 9, 15, 24 - :app_id: abc123 + :highlights: 7; 10, 11; 10-12, 17, 18; 4, 8, 12, 18-19, 26; 5, 10, 22, 28, 32, 42, 58-60; 3, 11-12, 25, 29; 7, 10; 15, 21; 9, 15, 24 + :enable_run: true :tab_rows: 3 :height: 620px diff --git a/docs/source-app/levels/basic/real_lightning_component_implementations.rst b/docs/source-app/levels/basic/real_lightning_component_implementations.rst index c660c1679ac72..da413f459234a 100644 --- a/docs/source-app/levels/basic/real_lightning_component_implementations.rst +++ b/docs/source-app/levels/basic/real_lightning_component_implementations.rst @@ -27,7 +27,7 @@ or cloud GPUs without code changes. :descriptions: import Lightning; We're using a demo LightningModule; Move your training code here (usually your main.py); Pass your component to the multi-node executor (it works on CPU or single GPUs also); Select the number of machines (nodes). Here we choose 2.; Choose from over 15+ machine types. This one has 4 v100 GPUs.; Initialize the App object that executes the component logic. 
:code_files: /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; :highlights: 2; 4; 10-12; 15-18; 17; 18; 20 - :app_id: abc123 + :enable_run: true :tab_rows: 5 :height: 420px @@ -48,7 +48,7 @@ This example shows how to deploy PyTorch and create an API :descriptions: Shortcut to list dependencies without a requirements.txt file.; Import one of our serving components (high-performance ones are available on the enterprise tiers); Define the setup function to load your favorite pretrained models and do any kind of pre-processing.; Define the predict function which is called when the endpoint is hit.; Initialize the server and define the type of cloud machine to use. :code_files: /levels/basic/hello_components/deploy_model.py; /levels/basic/hello_components/deploy_model.py; /levels/basic/hello_components/deploy_model.py; /levels/basic/hello_components/deploy_model.py; /levels/basic/hello_components/deploy_model.py; :highlights: 1; 3; 10-12; 15-25; 28-30 - :app_id: abc123 + :enable_run: true :tab_rows: 4 :height: 620px diff --git a/docs/source-app/levels/basic/save_money_on_cloud_costs.rst b/docs/source-app/levels/basic/save_money_on_cloud_costs.rst index 2218dc382fbee..5e752954da644 100644 --- a/docs/source-app/levels/basic/save_money_on_cloud_costs.rst +++ b/docs/source-app/levels/basic/save_money_on_cloud_costs.rst @@ -18,7 +18,7 @@ Here are a few features that will enable you save a lot on your cloud costs: :titles: 15+ accelerators; Auto-stop idle machines; Auto-timeout submitted work; Use spot machines (~70% discount); Work with massive datasets; Mount cloud storage; Use a custom container :code_files: ./key_features/accelerators.py; ./key_features/idle_machine.py; ./key_features/auto_timeout.py; ./key_features/spot.py; ./key_features/massive_dataset.py; ./key_features/mount_data.py; ./key_features/custom_container.py; :highlights: 11;11;11;11;11;1,7, 10, 11; 11 - :app_id: abc123 + :enable_run: true :tab_rows: 3 :height: 430px diff --git a/docs/source-app/levels/intermediate/connect_lightning_components.rst b/docs/source-app/levels/intermediate/connect_lightning_components.rst index 9e9a2f0667842..14c2e9d793ba2 100644 --- a/docs/source-app/levels/intermediate/connect_lightning_components.rst +++ b/docs/source-app/levels/intermediate/connect_lightning_components.rst @@ -37,7 +37,7 @@ on a separate CPU machine. 
We save money by stopping the GPU machine when the wo :descriptions: First, import Lightning; This component trains a model on a GPU machine; This component analyzes a model on a CPU machine; Define the LightningFlow that orchestrates components; Component 1 will run on a CPU machine; Component 2 will run on an accelerated GPU machine; Describe the workflow in the run method; Training runs first and completes; Analyze runs after training completes; This allows the app to be runnable :code_files: ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py :highlights: 2; 5-7; 9-11; 13; 16; 17; 19; 20; 21; 23 - :app_id: abc123 + :enable_run: true :tab_rows: 4 :height: 460px diff --git a/docs/source-app/levels/intermediate/debug_a_lightning_app.rst b/docs/source-app/levels/intermediate/debug_a_lightning_app.rst index ae0e0496991ef..856be5a182c58 100644 --- a/docs/source-app/levels/intermediate/debug_a_lightning_app.rst +++ b/docs/source-app/levels/intermediate/debug_a_lightning_app.rst @@ -16,7 +16,7 @@ To enable a breakpoint, use `L.pdb.set_trace()` (note direct python pdb support :descriptions: Toy app; Add a breakpoint. When the program runs, it will stop at this line. :code_files: ./debug_app_scripts/toy_app_1_component.py; ./debug_app_scripts/toy_app_1_component_pdb.py :highlights: ; 7 - :app_id: abc123 + :enable_run: true :tab_rows: 3 :height: 350px diff --git a/docs/source-app/levels/intermediate/share_variables_between_lightning_components.rst b/docs/source-app/levels/intermediate/share_variables_between_lightning_components.rst index c4b8b168204d3..b0ce06dae2a41 100644 --- a/docs/source-app/levels/intermediate/share_variables_between_lightning_components.rst +++ b/docs/source-app/levels/intermediate/share_variables_between_lightning_components.rst @@ -18,7 +18,7 @@ Lightning sends the variables across the machines for you automatically. 
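A minimal, self-contained sketch of the pattern this page describes, a variable created on the flow machine being sent to a work machine when passed to ``run``; the toy names ``EchoWork`` and ``Flow`` are ours, not from the example scripts:

.. code-block:: python

    import lightning as L


    class EchoWork(L.LightningWork):
        def run(self, message: str):
            # runs on the work machine; `message` arrived over the network
            print(message)
            print("and this string was created on the work machine")


    class Flow(L.LightningFlow):
        def __init__(self):
            super().__init__()
            self.echo = EchoWork()

        def run(self):
            # created on the flow machine, transferred automatically on run()
            note = "this string was created on the flow machine"
            self.echo.run(note)


    app = L.LightningApp(Flow())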
:descriptions: Remember this component may live on its own machine; The flow may be on a separate machine as well; This variable is on the flow machine; When passed to the work component, it is actually sent across the network under the hood.; When it prints here, it prints on the work component machine (not the flow machine); The second string was directly created on machine 1 :code_files: ./scripts/comms_1.py; ./scripts/comms_1.py; ./scripts/comms_1.py; ./scripts/comms_1.py; ./scripts/comms_1.py; ./scripts/comms_1.py :highlights: 4-7; 9-16; 15; 16; 6; 7; - :app_id: abc123 + :enable_run: true :tab_rows: 3 :height: 380px @@ -55,7 +55,7 @@ Example Continuous deployment: Every time a model saves a checkpoint, we redeplo :descriptions: Define a component that simulates training; Define a component that simulates deployment; Training will happen in parallel over a long period; The deployment server also runs in parallel forever; Start training in parallel (could take months); Whenever the model has a checkpoint deploy; When the checkpoint is updated, model re-deploys :code_files: ./scripts/two_work_comms.py; ./scripts/two_work_comms.py; ./scripts/two_work_comms.py; ./scripts/two_work_comms.py; ./scripts/two_work_comms.py; ./scripts/two_work_comms.py; ./scripts/two_work_comms.py :highlights: 5-18; 20-22; 27; 28; 31; 32, 33; 33 - :app_id: abc123 + :enable_run: true :tab_rows: 3 :height: 690px @@ -110,7 +110,7 @@ transfering them across components. :descriptions: Let's define a component to simulate generating embeddings (from a DB, feature store, etc...); This component simulates a server that will use the embeddings.; Run the component to generate the embeddings; Simulate embeddings as an array. Here you would query a DB, load from a feature store or disk or even use a neural network to extract the embedding.; Allow the embeddings to be transfered efficiently by wrapping them in the Payload object.; Pass the variable to the EmbeddingServer (just the pointer).; The data gets transfered once you use the .value attribute in the other component. :code_files: ./scripts/toy_payload.py; ./scripts/toy_payload.py; ./scripts/toy_payload.py; ./scripts/toy_payload.py; ./scripts/toy_payload.py; ./scripts/toy_payload.py; ./scripts/toy_payload.py; :highlights: 5-13; 15-19; 28; 12; 13; 29; 18 - :app_id: abc123 + :enable_run: true :tab_rows: 3 :height: 600px diff --git a/docs/source-app/workflows/add_web_ui/index_content.rst b/docs/source-app/workflows/add_web_ui/index_content.rst index 4e95e6c2a70c2..f3d516c5af546 100644 --- a/docs/source-app/workflows/add_web_ui/index_content.rst +++ b/docs/source-app/workflows/add_web_ui/index_content.rst @@ -13,7 +13,7 @@ Web UIs for non Javascript Developers :header: Dash :description: Learn how to add a web UI built in Python with Dash. :col_css: col-md-4 - :button_link: /workflows/add_web_ui/dash/index.html + :button_link: ../../workflows/add_web_ui/dash/index.html :height: 150 :tag: basic @@ -21,7 +21,7 @@ Web UIs for non Javascript Developers :header: Gradio :description: Learn how to add a web UI built in Python with Gradio. :col_css: col-md-4 - :button_link: /workflows/add_web_ui/gradio/index.html + :button_link: ../../workflows/add_web_ui/gradio/index.html :height: 150 :tag: basic @@ -29,7 +29,7 @@ Web UIs for non Javascript Developers :header: Panel :description: Learn how to add a web UI built in Python with Panel. 
:col_css: col-md-4 - :button_link: /workflows/add_web_ui/panel/index.html + :button_link: ../../workflows/add_web_ui/panel/index.html :height: 150 :tag: basic @@ -37,7 +37,7 @@ Web UIs for non Javascript Developers :header: Jupyter Notebook :description: Learn how to enable a web UI that is a Jupyter Notebook. :col_css: col-md-4 - :button_link: /workflows/add_web_ui/jupyter_basic.html + :button_link: ../../workflows/add_web_ui/jupyter_basic.html :height: 150 :tag: [docs coming soon] @@ -45,7 +45,7 @@ Web UIs for non Javascript Developers :header: Streamlit :description: Learn how to add a web UI built in Python with Streamlit. :col_css: col-md-4 - :button_link: /workflows/add_web_ui/streamlit/index.html + :button_link: ../../workflows/add_web_ui/streamlit/index.html :height: 150 :tag: basic @@ -53,7 +53,7 @@ Web UIs for non Javascript Developers :header: JustPy :description: Learn how to add a web UI built in Python with JustPy. :col_css: col-md-4 - :button_link: /workflows/add_web_ui/justpy/index.html + :button_link: ../../workflows/add_web_ui/justpy/index.html :height: 150 :tag: basic @@ -79,7 +79,7 @@ Web UIs for Javascript Developers :header: Any javascript framework :description: Learn how to link up any javascript framework to a Lightning app. :col_css: col-md-4 - :button_link: /workflows/add_web_ui/integrate_any_javascript_framework.html + :button_link: ../../workflows/add_web_ui/integrate_any_javascript_framework.html :height: 150 :tag: advanced @@ -87,7 +87,7 @@ Web UIs for Javascript Developers :header: Angular.js :description: Learn how to add a web UI built in Javascript with Angular.js :col_css: col-md-4 - :button_link: /workflows/add_web_ui/angular_js_intermediate.html + :button_link: ../../workflows/add_web_ui/angular_js_intermediate.html :height: 150 :tag: [Docs coming soon] @@ -95,7 +95,7 @@ Web UIs for Javascript Developers :header: HTML :description: Learn how to add a web UI built with html. :col_css: col-md-4 - :button_link: /workflows/add_web_ui/html/index.html + :button_link: ../../workflows/add_web_ui/html/index.html :height: 150 :tag: basic @@ -103,7 +103,7 @@ Web UIs for Javascript Developers :header: React.js :description: Learn how to add a web UI built in Javascript with React.js :col_css: col-md-4 - :button_link: /workflows/add_web_ui/react/index.html + :button_link: ../../workflows/add_web_ui/react/index.html :height: 150 :tag: intermediate @@ -111,7 +111,7 @@ Web UIs for Javascript Developers :header: Vue.js :description: Learn how to add a web UI built in Javascript with Vue.js :col_css: col-md-4 - :button_link: /workflows/add_web_ui/vue_js_intermediate.html + :button_link: ../../workflows/add_web_ui/vue_js_intermediate.html :height: 150 :tag: [Docs coming soon] diff --git a/docs/source-app/workflows/run_work_in_parallel_content.rst b/docs/source-app/workflows/run_work_in_parallel_content.rst index 467f64f165043..1c8d5b374dbb2 100644 --- a/docs/source-app/workflows/run_work_in_parallel_content.rst +++ b/docs/source-app/workflows/run_work_in_parallel_content.rst @@ -20,7 +20,7 @@ to wait for the first one to finish. :descriptions: No parallel components; Allow the train component to run in parallel; When the component runs, it will run in parallel; The next component is unblocked and can now immediately run. 
   :code_files: /workflows/scripts/parallel/toy_app.py; /workflows/scripts/parallel/toy_parallel.py; /workflows/scripts/parallel/toy_parallel.py; /workflows/scripts/parallel/toy_parallel.py;
   :highlights: ; 18; 23; 24;
-   :app_id: abc123
+   :enable_run: true
   :tab_rows: 3
   :height: 540px

@@ -36,6 +36,6 @@ allows the third component to run without waiting for the others to finish.
   :descriptions: No parallel components; Enable 2 components to run in parallel; Start both components together in parallel; Last component is not blocked and can start immediately.
   :code_files: /workflows/scripts/parallel/toy_two_parallel_not_started.py; /workflows/scripts/parallel/toy_two_parallel.py; /workflows/scripts/parallel/toy_two_parallel.py; /workflows/scripts/parallel/toy_two_parallel.py
   :highlights: ; 18, 19; 23, 24; 25
-   :app_id: abc123
+   :enable_run: true
   :tab_rows: 3
   :height: 540px
diff --git a/docs/source-pytorch/model/train_model_basic.rst b/docs/source-pytorch/model/train_model_basic.rst
index 92f4a0a40fa7b..e5bce7dfdf1c1 100644
--- a/docs/source-pytorch/model/train_model_basic.rst
+++ b/docs/source-pytorch/model/train_model_basic.rst
@@ -20,7 +20,7 @@ Add the relevant imports at the top of the file
     import torch.nn.functional as F
     from torchvision import transforms
     from torchvision.datasets import MNIST
-    from torch.utils.data import DataLoader, random_split
+    from torch.utils.data import DataLoader
     import pytorch_lightning as pl

 ----
diff --git a/docs/source-pytorch/starter/lightning_lite.rst b/docs/source-pytorch/starter/lightning_lite.rst
index bab4581fa91a4..bc097a02571d5 100644
--- a/docs/source-pytorch/starter/lightning_lite.rst
+++ b/docs/source-pytorch/starter/lightning_lite.rst
@@ -1,6 +1,6 @@
-###########################################
-LightningLite (Stepping Stone to Lightning)
-###########################################
+##############
+Lightning Lite
+##############

 :class:`~pytorch_lightning.lite.LightningLite` enables pure PyTorch users to scale their existing code
@@ -32,7 +32,7 @@ Learn by example
 My Existing PyTorch Code
 ========================

-The ``run`` function contains custom training loop used to train ``MyModel`` on ``MyDataset`` for ``num_epochs`` epochs.
+The ``train`` function contains a standard training loop used to train ``MyModel`` on ``MyDataset`` for ``num_epochs`` epochs.

 .. code-block:: python

@@ -49,7 +49,7 @@ The ``run`` function contains custom training loop used to train ``MyModel`` on
         ...


-    def run(args):
+    def train(args):
         device = "cuda" if torch.cuda.is_available() else "cpu"

         model = MyModel(...).to(device)
@@ -67,7 +67,7 @@ The ``run`` function contains custom training loop used to train ``MyModel`` on
                 optimizer.step()


-    run(args)
+    train(args)

 ----------

@@ -75,13 +75,12 @@ The ``run`` function contains custom training loop used to train ``MyModel`` on
 Convert to LightningLite
 ========================

-Here are five required steps to convert to :class:`~pytorch_lightning.lite.LightningLite`.
+Here are four easy steps to let :class:`~pytorch_lightning.lite.LightningLite` scale your PyTorch models.

-1. Subclass :class:`~pytorch_lightning.lite.LightningLite` and override its :meth:`~pytorch_lightning.lite.LightningLite.run` method.
-2. Move the body of your existing ``run`` function into :class:`~pytorch_lightning.lite.LightningLite` ``run`` method.
-3. Remove all ``.to(...)``, ``.cuda()`` etc calls since :class:`~pytorch_lightning.lite.LightningLite` will take care of it.
-4. Apply :meth:`~pytorch_lightning.lite.LightningLite.setup` over each model and optimizers pair and :meth:`~pytorch_lightning.lite.LightningLite.setup_dataloaders` on all your dataloaders and replace ``loss.backward()`` by ``self.backward(loss)``.
-5. Instantiate your :class:`~pytorch_lightning.lite.LightningLite` subclass and call its :meth:`~pytorch_lightning.lite.LightningLite.run` method.
+1. Create the :class:`~pytorch_lightning.lite.LightningLite` object at the beginning of your training code.
+2. Remove all ``.to`` and ``.cuda`` calls since :class:`~pytorch_lightning.lite.LightningLite` will take care of it.
+3. Apply :meth:`~pytorch_lightning.lite.LightningLite.setup` over each model and optimizer pair and :meth:`~pytorch_lightning.lite.LightningLite.setup_dataloaders` on all your dataloaders and replace ``loss.backward()`` by ``lite.backward(loss)``.
+4. Run the script from the terminal using ``lightning run model path/to/train.py`` or use the :meth:`~pytorch_lightning.lite.LightningLite.launch` method in a notebook.

 |

@@ -90,7 +89,7 @@ Here are five required steps to convert to :class:`~pytorch_lightning.lite.Light
     import torch
     from torch import nn
     from torch.utils.data import DataLoader, Dataset
-    from pytorch_lightning.lite import LightningLite
+    from lightning.lite import LightningLite


     class MyModel(nn.Module):
@@ -101,108 +100,88 @@ Here are five required steps to convert to :class:`~pytorch_lightning.lite.Light
         ...


-    class Lite(LightningLite):
-        def run(self, args):
+    def train(args):

-            model = MyModel(...)
-            optimizer = torch.optim.SGD(model.parameters(), ...)
-            model, optimizer = self.setup(model, optimizer)  # Scale your model / optimizers
+        lite = LightningLite()

-            dataloader = DataLoader(MyDataset(...), ...)
-            dataloader = self.setup_dataloaders(dataloader)  # Scale your dataloaders
+        model = MyModel(...)
+        optimizer = torch.optim.SGD(model.parameters(), ...)
+        model, optimizer = lite.setup(model, optimizer)  # Scale your model / optimizers

-            model.train()
-            for epoch in range(args.num_epochs):
-                for batch in dataloader:
-                    optimizer.zero_grad()
-                    loss = model(batch)
-                    self.backward(loss)  # instead of loss.backward()
-                    optimizer.step()
+        dataloader = DataLoader(MyDataset(...), ...)
+        dataloader = lite.setup_dataloaders(dataloader)  # Scale your dataloaders

+        model.train()
+        for epoch in range(args.num_epochs):
+            for batch in dataloader:
+                optimizer.zero_grad()
+                loss = model(batch)
+                lite.backward(loss)  # instead of loss.backward()
+                optimizer.step()

-    Lite(...).run(args)
+    train(args)

-That's all. You can now train on any kind of device and scale your training. Check out `this `_ full MNIST training example with LightningLite.
-:class:`~pytorch_lightning.lite.LightningLite` takes care of device management, so you don't have to.
-You should remove any device-specific logic within your code.
+That's all you need to change in your code. You can now train on any kind of device and scale your training.
+Check out `this `_ full MNIST training example with LightningLite.

 Here is how to train on eight GPUs with `torch.bfloat16 `_ precision:

-.. code-block:: python
+.. code-block:: bash

-    Lite(strategy="ddp", devices=8, accelerator="gpu", precision="bf16").run(10)
+    lightning run model ./path/to/train.py --strategy=ddp --devices=8 --accelerator=cuda --precision="bf16"

-Here is how to use `DeepSpeed Zero3 `_ with eight GPUs and precision 16:
+Here is how to use `DeepSpeed Zero3 `_ with eight GPUs and mixed precision:

-.. code-block:: python
+.. code-block:: bash

-    Lite(strategy="deepspeed", devices=8, accelerator="gpu", precision=16).run(10)
+    lightning run model ./path/to/train.py --strategy=deepspeed --devices=8 --accelerator=cuda --precision=16

 :class:`~pytorch_lightning.lite.LightningLite` can also figure it out automatically for you!

-.. code-block:: python
+.. code-block:: bash
+
+    lightning run model ./path/to/train.py --devices=auto --accelerator=auto --precision=16

-    Lite(devices="auto", accelerator="auto", precision=16).run(10)

 You can also easily use distributed collectives if required.
-Here is an example while running on 256 GPUs (eight GPUs times 32 nodes).

 .. code-block:: python

-    class Lite(LightningLite):
-        def run(self):
-
-            # Transfer and concatenate tensors across processes
-            self.all_gather(...)
-
-            # Transfer an object from one process to all the others
-            self.broadcast(..., src=...)
-
-            # The total number of processes running across all devices and nodes.
-            self.world_size
-
-            # The global index of the current process across all devices and nodes.
-            self.global_rank
-
-            # The index of the current process among the processes running on the local node.
-            self.local_rank
+    lite = LightningLite()

-            # The index of the current node.
-            self.node_rank
+    # Transfer and concatenate tensors across processes
+    lite.all_gather(...)

-            # Wether this global rank is rank zero.
-            if self.is_global_zero:
-                # do something on rank 0
-                ...
+    # Transfer an object from one process to all the others
+    lite.broadcast(..., src=...)

-            # Wait for all processes to enter this call.
-            self.barrier()
+    # The total number of processes running across all devices and nodes.
+    lite.world_size

+    # The global index of the current process across all devices and nodes.
+    lite.global_rank

-    Lite(strategy="ddp", devices=8, num_nodes=32, accelerator="gpu").run()
+    # The index of the current process among the processes running on the local node.
+    lite.local_rank

+    # The index of the current node.
+    lite.node_rank

-If you require custom data or model device placement, you can deactivate
-:class:`~pytorch_lightning.lite.LightningLite` automatic placement by doing
-``self.setup_dataloaders(..., move_to_device=False)`` for the data and
-``self.setup(..., move_to_device=False)`` for the model.
-Furthermore, you can access the current device from ``self.device`` or
-rely on :meth:`~pytorch_lightning.lite.LightningLite.to_device`
-utility to move an object to the current device.
+    # Whether this global rank is rank zero.
+    if lite.is_global_zero:
+        # do something on rank 0
+        ...

+    # Wait for all processes to enter this call.
+    lite.barrier()

-.. note:: We recommend instantiating the models within the :meth:`~pytorch_lightning.lite.LightningLite.run` method as large models would cause an out-of-memory error otherwise.

-.. tip::
+The code stays agnostic, whether you are running on CPU, on two GPUs, or on multiple machines with many GPUs.

-    If you have hundreds or thousands of lines within your :meth:`~pytorch_lightning.lite.LightningLite.run` function
-    and you are feeling unsure about them, then that is the correct feeling.
-    In 2019, our :class:`~pytorch_lightning.core.module.LightningModule` was getting larger
-    and we got the same feeling, so we started to organize our code for simplicity, interoperability and standardization.
-    This is definitely a good sign that you should consider refactoring your code and / or switching to
-    :class:`~pytorch_lightning.core.module.LightningModule` ultimately.
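The conversion steps above also mention a notebook entry point as an alternative to the ``lightning run model`` CLI; a minimal sketch of that variant, where the accelerator and device arguments are illustrative choices of ours:

.. code-block:: python

    from lightning.lite import LightningLite

    lite = LightningLite(accelerator="auto", devices="auto")
    # in an interactive session, launch() takes the place of `lightning run model`
    lite.launch()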
+If you require custom data or model device placement, you can deactivate :class:`~pytorch_lightning.lite.LightningLite`'s automatic placement by doing ``lite.setup_dataloaders(..., move_to_device=False)`` for the data and ``lite.setup(..., move_to_device=False)`` for the model. +Furthermore, you can access the current device from ``lite.device`` or rely on :meth:`~pytorch_lightning.lite.LightningLite.to_device` utility to move an object to the current device. ---------- @@ -211,8 +190,7 @@ utility to move an object to the current device. Distributed Training Pitfalls ============================= -The :class:`~pytorch_lightning.lite.LightningLite` provides you with the tools to scale your training, -but there are several major challenges ahead of you now: +The :class:`~pytorch_lightning.lite.LightningLite` provides you with the tools to scale your training, but there are several major challenges ahead of you now: .. list-table:: @@ -236,103 +214,6 @@ but there are several major challenges ahead of you now: If you are facing one of those challenges, then you are already meeting the limit of :class:`~pytorch_lightning.lite.LightningLite`. We recommend you to convert to :doc:`Lightning <../starter/introduction>`, so you never have to worry about those. ----------- - -Convert to Lightning -==================== - -:class:`~pytorch_lightning.lite.LightningLite` is a stepping stone to transition fully to the Lightning API and benefit -from its hundreds of features. - -You can see our :class:`~pytorch_lightning.lite.LightningLite` class as a -future :class:`~pytorch_lightning.core.module.LightningModule`, and slowly refactor your code into its API. -Below, the :meth:`~pytorch_lightning.core.module.LightningModule.training_step`, :meth:`~pytorch_lightning.core.module.LightningModule.forward`, -:meth:`~pytorch_lightning.core.module.LightningModule.configure_optimizers`, :meth:`~pytorch_lightning.core.module.LightningModule.train_dataloader` methods -are implemented. - - -.. code-block:: python - - class Lite(LightningLite): - - # 1. This would become the LightningModule `__init__` function. - def run(self, args): - self.args = args - - self.model = MyModel(...) - - self.fit() # This would be automated by the Lightning Trainer. - - # 2. This can be fully removed as Lightning creates its own fitting loop, - # and sets up the model, optimizer, dataloader, etc for you. - def fit(self): - # setup everything - optimizer = self.configure_optimizers() - self.model, optimizer = self.setup(self.model, optimizer) - dataloader = self.setup_dataloaders(self.train_dataloader()) - - # start fitting - self.model.train() - for epoch in range(num_epochs): - for batch in enumerate(dataloader): - optimizer.zero_grad() - loss = self.training_step(batch, batch_idx) - self.backward(loss) - optimizer.step() - - # 3. This stays here as it belongs to the LightningModule. - def forward(self, x): - return self.model(x) - - def training_step(self, batch, batch_idx): - return self.forward(batch) - - def configure_optimizers(self): - return torch.optim.SGD(self.model.parameters(), ...) - - # 4. [Optionally] This can stay here or be extracted to the LightningDataModule to enable higher composability. - def train_dataloader(self): - return DataLoader(MyDataset(...), ...) - - - Lite(...).run(args) - - -Finally, change the :meth:`~pytorch_lightning.lite.LightningLite.run` into a -:meth:`~pytorch_lightning.core.module.LightningModule.__init__` and drop the ``fit`` call from inside. - -.. 
code-block:: python - - from pytorch_lightning import LightningDataModule, LightningModule, Trainer - - - class LightningModel(LightningModule): - def __init__(self, args): - super().__init__() - self.model = MyModel(...) - - def forward(self, x): - return self.model(x) - - def training_step(self, batch, batch_idx): - loss = self(batch) - self.log("train_loss", loss) - return loss - - def configure_optimizers(self): - return torch.optim.SGD(self.model.parameters(), lr=0.001) - - - class BoringDataModule(LightningDataModule): - def train_dataloader(self): - return DataLoader(MyDataset(...), ...) - - - trainer = Trainer(max_epochs=10) - trainer.fit(LightningModel(), datamodule=BoringDataModule()) - - -You have successfully converted to PyTorch Lightning, and can now benefit from its hundred of features! ---------- @@ -538,33 +419,6 @@ Lightning Lite Methods ********************** -run -=== - -The run method serves two purposes: - -1. Override this method from the :class:`~pytorch_lightning.lite.lite.LightningLite` class and put your - training (or inference) code inside. -2. Launch the training procedure by calling the run method. Lite will take care of setting up the distributed backend. - -You can optionally pass arguments to the run method. For example, the hyperparameters or a backbone for the model. - -.. code-block:: python - - from pytorch_lightning.lite import LightningLite - - - class Lite(LightningLite): - - # Input arguments are optional; put whatever you need - def run(self, learning_rate, num_layers): - """Here goes your training loop""" - - - lite = Lite(accelerator="gpu", devices=2) - lite.run(learning_rate=0.01, num_layers=12) - - setup ===== @@ -577,10 +431,10 @@ Moves the model and optimizer to the correct device automatically. optimizer = torch.optim.SGD(model.parameters(), lr=0.001) # Set up model and optimizer for accelerated training - model, optimizer = self.setup(model, optimizer) + model, optimizer = lite.setup(model, optimizer) # If you don't want Lite to set the device - model, optimizer = self.setup(model, optimizer, move_to_device=False) + model, optimizer = lite.setup(model, optimizer, move_to_device=False) The setup method also prepares the model for the selected precision choice so that operations during ``forward()`` get @@ -598,13 +452,13 @@ data tensors to the correct device automatically. train_data = torch.utils.DataLoader(train_dataset, ...) test_data = torch.utils.DataLoader(test_dataset, ...) - train_data, test_data = self.setup_dataloaders(train_data, test_data) + train_data, test_data = lite.setup_dataloaders(train_data, test_data) # If you don't want Lite to move the data to the device - train_data, test_data = self.setup_dataloaders(train_data, test_data, move_to_device=False) + train_data, test_data = lite.setup_dataloaders(train_data, test_data, move_to_device=False) # If you don't want Lite to replace the sampler in the context of distributed training - train_data, test_data = self.setup_dataloaders(train_data, test_data, replace_sampler=False) + train_data, test_data = lite.setup_dataloaders(train_data, test_data, replace_sampler=False) backward @@ -618,7 +472,7 @@ This replaces any occurrences of ``loss.backward()`` and makes your code acceler loss = loss_fn(output, target) # loss.backward() - self.backward(loss) + lite.backward(loss) to_device @@ -632,7 +486,7 @@ device, so calling this method is only necessary for manual operation when neede .. 
code-block:: python data = torch.load("dataset.pt") - data = self.to_device(data) + data = lite.to_device(data) seed_everything @@ -643,7 +497,7 @@ Make your code reproducible by calling this method at the beginning of your run. .. code-block:: python # Instead of `torch.manual_seed(...)`, call: - self.seed_everything(1234) + lite.seed_everything(1234) This covers PyTorch, NumPy and Python random number generators. In addition, Lite takes care of properly initializing @@ -659,15 +513,15 @@ You need this only if you wish to autocast more operations outside the ones in m .. code-block:: python - model, optimizer = self.setup(model, optimizer) + model, optimizer = lite.setup(model, optimizer) # Lite handles precision automatically for the model output = model(inputs) - with self.autocast(): # optional + with lite.autocast(): # optional loss = loss_function(output, target) - self.backward(loss) + lite.backward(loss) ... @@ -681,7 +535,7 @@ This avoids excessive printing and logs when running on multiple devices/nodes. .. code-block:: python # Print only on the main process - self.print(f"{epoch}/{num_epochs}| Train Epoch Loss: {loss}") + lite.print(f"{epoch}/{num_epochs}| Train Epoch Loss: {loss}") save @@ -693,7 +547,7 @@ handling the saving part correctly, no matter if you are running a single device .. code-block:: python # Instead of `torch.save(...)`, call: - self.save(model.state_dict(), "path/to/checkpoint.ckpt") + lite.save(model.state_dict(), "path/to/checkpoint.ckpt") load @@ -705,7 +559,7 @@ handling the loading part correctly, no matter if you are running a single devic .. code-block:: python # Instead of `torch.load(...)`, call: - self.load("path/to/checkpoint.ckpt") + lite.load("path/to/checkpoint.ckpt") barrier @@ -718,11 +572,11 @@ the data is written to disk. .. code-block:: python # Download data only on one process - if self.global_rank == 0: + if lite.global_rank == 0: download_data("http://...") # Wait until all processes meet up here - self.barrier() + lite.barrier() # All processes are allowed to read the data now @@ -738,10 +592,10 @@ It will speed up your training loop by cutting redundant communication between p # Accumulate gradient 8 batches at a time is_accumulating = batch_idx % 8 != 0 - with self.no_backward_sync(model, enabled=is_accumulating): + with lite.no_backward_sync(model, enabled=is_accumulating): output = model(input) loss = ... - self.backward(loss) + lite.backward(loss) ... # Step the optimizer every 8 batches @@ -749,7 +603,7 @@ It will speed up your training loop by cutting redundant communication between p optimizer.step() optimizer.zero_grad() -Both the model's `.forward()` and the `self.backward()` call need to run under this context as shown in the example above. +Both the model's `.forward()` and the `lite.backward()` call need to run under this context as shown in the example above. For single-device strategies, it is a no-op. There are strategies that don't support this: - deepspeed diff --git a/examples/app_multi_node/README.md b/examples/app_multi_node/README.md index 23e7afa23d68e..0fd2f369bb786 100644 --- a/examples/app_multi_node/README.md +++ b/examples/app_multi_node/README.md @@ -28,9 +28,9 @@ lightning run app train_lite.py Using Lite, you retain control over your loops while accessing in a minimal way all Lightning distributed strategies. -## Multi Node with PyTorch Lightning +## Multi Node with Lightning Trainer -Lightning supports running PyTorch Lightning from a script or within a Lightning Work. 
+Lightning supports running Lightning Trainer from a script or within a Lightning Work. You can either run a script directly diff --git a/examples/app_multi_node/train_lite.py b/examples/app_multi_node/train_lite.py index feb8ac2226b77..8e546b270a693 100644 --- a/examples/app_multi_node/train_lite.py +++ b/examples/app_multi_node/train_lite.py @@ -6,23 +6,26 @@ class LitePyTorchDistributed(L.LightningWork): - @staticmethod - def run(): - # 1. Create LightningLite. - lite = LightningLite(strategy="ddp", precision=16) + def run(self): + # 1. Prepare the model + model = torch.nn.Sequential( + torch.nn.Linear(1, 1), + torch.nn.ReLU(), + torch.nn.Linear(1, 1), + ) - # 2. Prepare distributed model and optimizer. - model = torch.nn.Linear(32, 2) - optimizer = torch.optim.SGD(model.parameters(), lr=0.01) - model, optimizer = lite.setup(model, optimizer) + # 2. Create LightningLite. + lite = LightningLite(strategy="ddp", precision=16) + model, optimizer = lite.setup(model, torch.optim.SGD(model.parameters(), lr=0.01)) criterion = torch.nn.MSELoss() - # 3. Train the model for 50 steps. - for step in range(50): + # 3. Train the model for 1000 steps. + for step in range(1000): model.zero_grad() - x = torch.randn(64, 32).to(lite.device) + x = torch.tensor([0.8]).to(lite.device) + target = torch.tensor([1.0]).to(lite.device) output = model(x) - loss = criterion(output, torch.ones_like(output)) + loss = criterion(output, target) print(f"global_rank: {lite.global_rank} step: {step} loss: {loss}") lite.backward(loss) optimizer.step() diff --git a/examples/app_multi_node/train_lt.py b/examples/app_multi_node/train_lt.py index 5cbee32dd8132..c9e2f62392a56 100644 --- a/examples/app_multi_node/train_lt.py +++ b/examples/app_multi_node/train_lt.py @@ -4,11 +4,10 @@ class LightningTrainerDistributed(L.LightningWork): - @staticmethod - def run(): + def run(self): model = BoringModel() trainer = L.Trainer( - max_epochs=10, + max_steps=1000, strategy="ddp", ) trainer.fit(model) diff --git a/examples/app_multi_node/train_pytorch.py b/examples/app_multi_node/train_pytorch.py index 825112a9c17f1..9ce662fa40009 100644 --- a/examples/app_multi_node/train_pytorch.py +++ b/examples/app_multi_node/train_pytorch.py @@ -18,29 +18,28 @@ def distributed_train(local_rank: int, main_address: str, main_port: int, num_no init_method=f"tcp://{main_address}:{main_port}", ) - # 2. Prepare distributed model - model = torch.nn.Linear(32, 2) + # 2. Prepare the model + model = torch.nn.Sequential( + torch.nn.Linear(1, 1), + torch.nn.ReLU(), + torch.nn.Linear(1, 1), + ) # 3. Setup distributed training - if torch.cuda.is_available(): - device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) - else: - device = torch.device("cpu") - - model = model.to(device) - model = DistributedDataParallel(model, device_ids=[device.index] if torch.cuda.is_available() else None) + device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu") + model = DistributedDataParallel(model.to(device), device_ids=[local_rank] if torch.cuda.is_available() else None) # 4. Prepare loss and optimizer criterion = torch.nn.MSELoss() optimizer = torch.optim.SGD(model.parameters(), lr=0.01) - # 5. Train the model for 50 steps. - for step in range(50): + # 5. Train the model for 1000 steps. 
+ for step in range(1000): model.zero_grad() - x = torch.randn(64, 32).to(device) + x = torch.tensor([0.8]).to(device) + target = torch.tensor([1.0]).to(device) output = model(x) - loss = criterion(output, torch.ones_like(output)) + loss = criterion(output, target) print(f"global_rank: {global_rank} step: {step} loss: {loss}") loss.backward() optimizer.step() diff --git a/examples/app_multi_node/train_pytorch_spawn.py b/examples/app_multi_node/train_pytorch_spawn.py index dd3f9442dd829..d29ec83562ffb 100644 --- a/examples/app_multi_node/train_pytorch_spawn.py +++ b/examples/app_multi_node/train_pytorch_spawn.py @@ -6,38 +6,37 @@ class PyTorchDistributed(L.LightningWork): - - # Note: Only staticmethod are support for now with `PyTorchSpawnMultiNode` - @staticmethod def run( + self, world_size: int, node_rank: int, global_rank: str, local_rank: int, ): - # 1. Prepare distributed model - model = torch.nn.Linear(32, 2) + # 1. Prepare the model + model = torch.nn.Sequential( + torch.nn.Linear(1, 1), + torch.nn.ReLU(), + torch.nn.Linear(1, 1), + ) # 2. Setup distributed training - if torch.cuda.is_available(): - device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) - else: - device = torch.device("cpu") - - model = model.to(device) - model = DistributedDataParallel(model, device_ids=[device.index] if torch.cuda.is_available() else None) + device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu") + model = DistributedDataParallel( + model.to(device), device_ids=[local_rank] if torch.cuda.is_available() else None + ) # 3. Prepare loss and optimizer criterion = torch.nn.MSELoss() optimizer = torch.optim.SGD(model.parameters(), lr=0.01) - # 4. Train the model for 50 steps. - for step in range(50): + # 4. Train the model for 1000 steps. 
+ for step in range(1000): model.zero_grad() - x = torch.randn(64, 32).to(device) + x = torch.tensor([0.8]).to(device) + target = torch.tensor([1.0]).to(device) output = model(x) - loss = criterion(output, torch.ones_like(output)) + loss = criterion(output, target) print(f"global_rank: {global_rank} step: {step} loss: {loss}") loss.backward() optimizer.step() diff --git a/pyproject.toml b/pyproject.toml index bc8d9c7658dcd..5e8a2bfa0e481 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,6 +78,7 @@ module = [ "lightning_app.components.serve.streamlit", "lightning_app.components.serve.types.image", "lightning_app.components.serve.types.type", + "lightning_app.components.serve.python_server", "lightning_app.components.training", "lightning_app.core.api", "lightning_app.core.app", diff --git a/requirements/app/test.txt b/requirements/app/test.txt index 3c67611b2dfc5..4b50f1fff4285 100644 --- a/requirements/app/test.txt +++ b/requirements/app/test.txt @@ -1,4 +1,4 @@ -coverage==6.4.2 +coverage==6.5.0 codecov==2.1.12 pytest==7.1.3 pytest-timeout==2.1.0 diff --git a/requirements/lite/base.txt b/requirements/lite/base.txt index b342ecacc0927..fa7182be0f9a3 100644 --- a/requirements/lite/base.txt +++ b/requirements/lite/base.txt @@ -2,7 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment numpy>=1.17.2, <1.23.1 -torch>=1.9.*, <1.13.0 +torch>=1.9.*, <=1.13.0 fsspec[http]>2021.06.0, <2022.6.0 packaging>=17.0, <=21.3 typing-extensions>=4.0.0, <=4.4.0 diff --git a/requirements/lite/test.txt b/requirements/lite/test.txt index fde73e54556f8..01759799ff133 100644 --- a/requirements/lite/test.txt +++ b/requirements/lite/test.txt @@ -1,4 +1,4 @@ -coverage==6.4.2 +coverage==6.5.0 codecov==2.1.12 pytest==7.1.3 pytest-cov==4.0.0 diff --git a/requirements/pytorch/adjust-versions.py b/requirements/pytorch/adjust-versions.py index 9d9f4047e6fc4..69d61e130ca4b 100644 --- a/requirements/pytorch/adjust-versions.py +++ b/requirements/pytorch/adjust-versions.py @@ -5,8 +5,8 @@ # IMPORTANT: this list needs to be sorted in reverse VERSIONS = [ - dict(torch="1.13.0", torchvision="0.14.0"), # RC - dict(torch="1.12.1", torchvision="0.13.1"), # stable + dict(torch="1.13.0", torchvision="0.14.0"), # stable + dict(torch="1.12.1", torchvision="0.13.1"), dict(torch="1.12.0", torchvision="0.13.0"), dict(torch="1.11.0", torchvision="0.12.0"), dict(torch="1.10.2", torchvision="0.11.3"), diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index e3eae1cd66ce8..2f2b9306bd22a 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -2,7 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment numpy>=1.17.2, <1.23.1 -torch>=1.9.*, <1.13.0 +torch>=1.9.*, <=1.13.0 tqdm>=4.57.0, <4.65.0 PyYAML>=5.4, <=6.0 fsspec[http]>2021.06.0, <2022.8.0 diff --git a/requirements/pytorch/check-avail-extras.py b/requirements/pytorch/check-avail-extras.py index 9af53010b605b..3ab8d2848c3f0 100644 --- a/requirements/pytorch/check-avail-extras.py +++ b/requirements/pytorch/check-avail-extras.py @@ -1,5 +1,6 @@ -import hydra # noqa: F401 -import jsonargparse # noqa: F401 -import matplotlib # noqa: F401 -import omegaconf # noqa: F401 -import rich # noqa: F401 +if __name__ == "__main__": + import hydra # noqa: F401 + import jsonargparse # noqa: F401 + import matplotlib # noqa: F401 + import omegaconf # noqa: F401 + import rich # noqa: F401 diff --git 
a/requirements/pytorch/check-avail-strategies.py b/requirements/pytorch/check-avail-strategies.py index db28a1a1fe051..ffe12d024199b 100644 --- a/requirements/pytorch/check-avail-strategies.py +++ b/requirements/pytorch/check-avail-strategies.py @@ -1,7 +1,8 @@ -import bagua # noqa: F401 -import deepspeed # noqa: F401 -import fairscale # noqa: F401 -import horovod.torch +if __name__ == "__main__": + import bagua # noqa: F401 + import deepspeed # noqa: F401 + import fairscale # noqa: F401 + import horovod.torch -# returns an error code -assert horovod.torch.nccl_built() + # returns an error code + assert horovod.torch.nccl_built() diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt index 37a75ba9f45bd..2f0cce54f4158 100644 --- a/requirements/pytorch/strategies.txt +++ b/requirements/pytorch/strategies.txt @@ -5,5 +5,5 @@ colossalai>=0.1.10 fairscale>=0.4.5, <=0.4.6 deepspeed>=0.6.0, <=0.7.0 # no need to install with [pytorch] as pytorch is already installed -horovod>=0.21.2, !=0.24.0, <0.25.1 +horovod>=0.21.2, !=0.24.0, <=0.26.1 hivemind>=1.0.1, <=1.0.1; sys_platform == 'linux' diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt index fbd65ff0ef729..5ba99b269e002 100644 --- a/requirements/pytorch/test.txt +++ b/requirements/pytorch/test.txt @@ -1,4 +1,4 @@ -coverage==6.4.2 +coverage==6.5.0 codecov==2.1.12 pytest==7.1.3 pytest-cov==4.0.0 @@ -9,7 +9,7 @@ pre-commit==2.20.0 # needed in tests cloudpickle>=1.3, <=2.1.0 scikit-learn>0.22.1, <1.1.3 -onnxruntime<1.13.0 +onnxruntime<1.14.0 psutil<5.9.4 # for `DeviceStatsMonitor` pandas>1.0, <1.5.2 # needed in benchmarks fastapi<0.87.0 diff --git a/src/lightning/__init__.py b/src/lightning/__init__.py index 30950d8c6bdbb..2755ce57e48b3 100644 --- a/src/lightning/__init__.py +++ b/src/lightning/__init__.py @@ -36,6 +36,7 @@ def _detail(self: Any, message: str, *args: Any, **kwargs: Any) -> None: from lightning.app.perf import pdb # noqa: E402 from lightning.app.utilities.packaging.build_config import BuildConfig # noqa: E402 from lightning.app.utilities.packaging.cloud_compute import CloudCompute # noqa: E402 +from lightning.lite.lite import LightningLite # noqa: E402 from lightning.pytorch.callbacks import Callback # noqa: E402 from lightning.pytorch.core import LightningDataModule, LightningModule # noqa: E402 from lightning.pytorch.trainer import Trainer # noqa: E402 @@ -59,6 +60,7 @@ def _detail(self: Any, message: str, *args: Any, **kwargs: Any) -> None: "LightningModule", "Callback", "seed_everything", + "LightningLite", "storage", "pdb", ] diff --git a/src/lightning/__setup__.py b/src/lightning/__setup__.py index 6f30e218ab6e5..6254445efea1b 100644 --- a/src/lightning/__setup__.py +++ b/src/lightning/__setup__.py @@ -106,5 +106,6 @@ def _setup_args(**kwargs: Any) -> Dict[str, Any]: "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", ], # todo: consider aggregation/union of tags from particular packages ) diff --git a/src/lightning/__version__.py b/src/lightning/__version__.py index 72126ce16b766..ba22724db3594 100644 --- a/src/lightning/__version__.py +++ b/src/lightning/__version__.py @@ -1 +1 @@ -version = "1.8.1" +version = "1.8.2" diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 28445aec7df7b..f1b6740a9a344 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -4,8 +4,31 @@ All notable changes to this 
project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [1.8.1] - 2022-11-10 +## [UnReleased] - 2022-11-DD + +### Added +- Added title and description to ServeGradio ([#15639](https://github.com/Lightning-AI/lightning/pull/15639)) +- Added a friendly error message when attempting to run the default cloud compute with a custom base image configured ([#14929](https://github.com/Lightning-AI/lightning/pull/14929)) + +### Changed + +- Improved support for running apps when dependencies aren't installed ([#15711](https://github.com/Lightning-AI/lightning/pull/15711)) +- Changed the root directory of the app (which gets uploaded) to be the folder containing the app file, rather than any parent folder containing a `.lightning` file ([#15654](https://github.com/Lightning-AI/lightning/pull/15654)) +- Enabled MultiNode Components to support state broadcasting ([#15607](https://github.com/Lightning-AI/lightning/pull/15607)) +- Prevent artefactual "running from outside your current environment" error ([#15647](https://github.com/Lightning-AI/lightning/pull/15647)) +- Rename failed -> error in tables ([#15608](https://github.com/Lightning-AI/lightning/pull/15608)) + +### Fixed + +- Fixed race condition to over-write the frontend with app infos ([#15398](https://github.com/Lightning-AI/lightning/pull/15398)) +- Fixed bi-directional queues sending delta with Drive Component name changes ([#15642](https://github.com/Lightning-AI/lightning/pull/15642)) +- Fixed CloudRuntime works collection with structures and accelerated multi node startup time ([#15650](https://github.com/Lightning-AI/lightning/pull/15650)) +- Fixed catimage import ([#15712](https://github.com/Lightning-AI/lightning/pull/15712)) +- Parse all lines in app file looking for shebangs to run commands ([#15714](https://github.com/Lightning-AI/lightning/pull/15714)) + + +## [1.8.1] - 2022-11-10 ### Added @@ -38,7 +61,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed an issue with the `lightning` CLI taking a long time to error out when the cloud is not reachable ([#15412](https://github.com/Lightning-AI/lightning/pull/15412)) - ## [1.8.0] - 2022-11-01 ### Added diff --git a/src/lightning_app/__version__.py b/src/lightning_app/__version__.py index 72126ce16b766..ba22724db3594 100644 --- a/src/lightning_app/__version__.py +++ b/src/lightning_app/__version__.py @@ -1 +1 @@ -version = "1.8.1" +version = "1.8.2" diff --git a/src/lightning_app/cli/app-template/tests/requirements.txt b/src/lightning_app/cli/app-template/tests/requirements.txt index 984d177fbe16c..3185d1c44f033 100644 --- a/src/lightning_app/cli/app-template/tests/requirements.txt +++ b/src/lightning_app/cli/app-template/tests/requirements.txt @@ -1,8 +1,8 @@ coverage codecov>=2.1 -pytest>=3.0.5 +pytest>=5.0.0 pytest-cov pytest-flake8 flake8 check-manifest -twine==1.13.0 +twine==4.0.1 diff --git a/src/lightning_app/cli/cmd_clusters.py b/src/lightning_app/cli/cmd_clusters.py index 4bb8b1fdb793f..e76b9c0695842 100644 --- a/src/lightning_app/cli/cmd_clusters.py +++ b/src/lightning_app/cli/cmd_clusters.py @@ -42,7 +42,7 @@ def as_table(self) -> Table: V1ClusterState.QUEUED: Text("queued", style="bold yellow"), V1ClusterState.PENDING: Text("pending", style="bold yellow"), V1ClusterState.RUNNING: Text("running", style="bold green"), - V1ClusterState.FAILED: Text("failed", style="bold red"), + V1ClusterState.FAILED: Text("error", style="bold red"), V1ClusterState.DELETED: Text("deleted", style="bold red"), } diff --git a/src/lightning_app/cli/commands/logs.py b/src/lightning_app/cli/commands/logs.py index 9d53601da0698..fb0746dd50fff 100644 --- a/src/lightning_app/cli/commands/logs.py +++ b/src/lightning_app/cli/commands/logs.py @@ -71,6 +71,7 @@ def _show_logs(app_name: str, components: List[str], follow: bool) -> None: works = client.lightningwork_service_list_lightningwork( project_id=project.project_id, app_id=apps[app_name].id ).lightningworks + app_component_names = ["flow"] + [f.name for f in apps[app_name].spec.flow_servers] + [w.name for w in works] if not components: diff --git a/src/lightning_app/cli/component-template/tests/requirements.txt b/src/lightning_app/cli/component-template/tests/requirements.txt index 984d177fbe16c..3185d1c44f033 100644 --- a/src/lightning_app/cli/component-template/tests/requirements.txt +++ b/src/lightning_app/cli/component-template/tests/requirements.txt @@ -1,8 +1,8 @@ coverage codecov>=2.1 -pytest>=3.0.5 +pytest>=5.0.0 pytest-cov pytest-flake8 flake8 check-manifest -twine==1.13.0 +twine==4.0.1 diff --git a/src/lightning_app/components/database/server.py b/src/lightning_app/components/database/server.py index a5499aaae17b8..01bd8f3b12033 100644 --- a/src/lightning_app/components/database/server.py +++ b/src/lightning_app/components/database/server.py @@ -4,6 +4,7 @@ import sys import tempfile import threading +import traceback from typing import List, Optional, Type, Union import uvicorn @@ -36,6 +37,9 @@ def install_signal_handlers(self): """Ignore Uvicorn Signal Handlers.""" +_lock = threading.Lock() + + class Database(LightningWork): def __init__( self, @@ -146,25 +150,29 @@ class CounterModel(SQLModel, table=True): self._exit_event = None def store_database(self): - with tempfile.TemporaryDirectory() as tmpdir: - tmp_db_filename = os.path.join(tmpdir, os.path.basename(self.db_filename)) + try: + with tempfile.TemporaryDirectory() as tmpdir: + tmp_db_filename = os.path.join(tmpdir, os.path.basename(self.db_filename)) - source = 
sqlite3.connect(self.db_filename) - dest = sqlite3.connect(tmp_db_filename) + source = sqlite3.connect(self.db_filename) + dest = sqlite3.connect(tmp_db_filename) - source.backup(dest) + source.backup(dest) - source.close() - dest.close() + source.close() + dest.close() - drive = Drive("lit://database", component_name=self.name, root_folder=tmpdir) - drive.put(os.path.basename(tmp_db_filename)) + drive = Drive("lit://database", component_name=self.name, root_folder=tmpdir) + drive.put(os.path.basename(tmp_db_filename)) - print("Stored the database to the Drive.") + print("Stored the database to the Drive.") + except Exception: + print(traceback.print_exc()) def periodic_store_database(self, store_interval): while not self._exit_event.is_set(): - self.store_database() + with _lock: + self.store_database() self._exit_event.wait(store_interval) def run(self, token: Optional[str] = None) -> None: @@ -210,4 +218,5 @@ def db_url(self) -> Optional[str]: def on_exit(self): self._exit_event.set() - self.store_database() + with _lock: + self.store_database() diff --git a/src/lightning_app/components/multi_node/base.py b/src/lightning_app/components/multi_node/base.py index 02adf218d3e36..4f2005771212a 100644 --- a/src/lightning_app/components/multi_node/base.py +++ b/src/lightning_app/components/multi_node/base.py @@ -3,7 +3,6 @@ from lightning_app import structures from lightning_app.core.flow import LightningFlow from lightning_app.core.work import LightningWork -from lightning_app.utilities.enum import WorkStageStatus from lightning_app.utilities.packaging.cloud_compute import CloudCompute @@ -52,46 +51,31 @@ def run( work_kwargs: Keywords arguments to be provided to the work on instantiation. """ super().__init__() - self.ws = structures.List() - self._work_cls = work_cls - self.num_nodes = num_nodes - self._cloud_compute = cloud_compute - self._work_args = work_args - self._work_kwargs = work_kwargs - self.has_started = False + self.ws = structures.List( + *[ + work_cls( + *work_args, + cloud_compute=cloud_compute, + **work_kwargs, + parallel=True, + ) + for _ in range(num_nodes) + ] + ) def run(self) -> None: - if not self.has_started: - - # 1. Create & start the works - if not self.ws: - for node_rank in range(self.num_nodes): - self.ws.append( - self._work_cls( - *self._work_args, - cloud_compute=self._cloud_compute, - **self._work_kwargs, - parallel=True, - ) - ) - - # Starting node `node_rank`` ... - self.ws[-1].start() - - # 2. Wait for all machines to be started ! - if not all(w.status.stage == WorkStageStatus.STARTED for w in self.ws): - return - - self.has_started = True + # 1. Wait for all works to be started ! + if not all(w.internal_ip for w in self.ws): + return - # Loop over all node machines - for node_rank in range(self.num_nodes): + # 2. Loop over all node machines + for node_rank in range(len(self.ws)): # 3. Run the user code in a distributed way ! 
self.ws[node_rank].run( main_address=self.ws[0].internal_ip, main_port=self.ws[0].port, - num_nodes=self.num_nodes, + num_nodes=len(self.ws), node_rank=node_rank, ) diff --git a/src/lightning_app/components/multi_node/lite.py b/src/lightning_app/components/multi_node/lite.py index 5295d0beb869e..2a9b33b0880d1 100644 --- a/src/lightning_app/components/multi_node/lite.py +++ b/src/lightning_app/components/multi_node/lite.py @@ -7,7 +7,6 @@ from lightning_app.components.multi_node.base import MultiNode from lightning_app.components.multi_node.pytorch_spawn import _PyTorchSpawnRunExecutor from lightning_app.core.work import LightningWork -from lightning_app.utilities.app_helpers import is_static_method from lightning_app.utilities.packaging.cloud_compute import CloudCompute from lightning_app.utilities.tracer import Tracer @@ -82,11 +81,6 @@ def __init__( **work_kwargs: Any, ) -> None: assert issubclass(work_cls, _LiteWorkProtocol) - if not is_static_method(work_cls, "run"): - raise TypeError( - f"The provided {work_cls} run method needs to be static for now." - "HINT: Remove `self` and add staticmethod decorator." - ) # Note: Private way to modify the work run executor # Probably exposed to the users in the future if needed. diff --git a/src/lightning_app/components/multi_node/pytorch_spawn.py b/src/lightning_app/components/multi_node/pytorch_spawn.py index 62ccfb95174eb..3119ffc51e0b5 100644 --- a/src/lightning_app/components/multi_node/pytorch_spawn.py +++ b/src/lightning_app/components/multi_node/pytorch_spawn.py @@ -3,10 +3,10 @@ from typing_extensions import Protocol, runtime_checkable from lightning_app.components.multi_node.base import MultiNode +from lightning_app.core.queues import MultiProcessQueue from lightning_app.core.work import LightningWork -from lightning_app.utilities.app_helpers import is_static_method from lightning_app.utilities.packaging.cloud_compute import CloudCompute -from lightning_app.utilities.proxies import WorkRunExecutor +from lightning_app.utilities.proxies import _proxy_setattr, unwrap, WorkRunExecutor, WorkStateObserver @runtime_checkable @@ -22,6 +22,9 @@ def run( class _PyTorchSpawnRunExecutor(WorkRunExecutor): + + enable_start_observer: bool = False + def __call__( self, main_address: str, @@ -31,10 +34,31 @@ def __call__( ): import torch - nprocs = torch.cuda.device_count() if torch.cuda.is_available() else 1 - torch.multiprocessing.spawn( - self.run, args=(self.work_run, main_address, main_port, num_nodes, node_rank, nprocs), nprocs=nprocs - ) + with self.enable_spawn(): + nprocs = torch.cuda.device_count() if torch.cuda.is_available() else 1 + queue = self.delta_queue if isinstance(self.delta_queue, MultiProcessQueue) else self.delta_queue.to_dict() + torch.multiprocessing.spawn( + self.dispatch_run, + args=(self.__class__, self.work, queue, main_address, main_port, num_nodes, node_rank, nprocs), + nprocs=nprocs, + ) + + @staticmethod + def dispatch_run(local_rank, cls, work, delta_queue, *args, **kwargs): + if local_rank == 0: + if isinstance(delta_queue, dict): + delta_queue = cls.process_queue(delta_queue) + work._request_queue = cls.process_queue(work._request_queue) + work._response_queue = cls.process_queue(work._response_queue) + + state_observer = WorkStateObserver(work, delta_queue=delta_queue) + state_observer.start() + _proxy_setattr(work, delta_queue, state_observer) + + cls.run(local_rank, unwrap(work.run), *args, **kwargs) + + if local_rank == 0: + state_observer.join(0) @staticmethod def run( @@ -46,6 +70,7 @@ def run( node_rank: int, 
nprocs: int, ): + import torch # 1. Setting distributed environment @@ -76,11 +101,6 @@ def __init__( **work_kwargs: Any, ) -> None: assert issubclass(work_cls, _PyTorchSpawnWorkProtocol) - if not is_static_method(work_cls, "run"): - raise TypeError( - f"The provided {work_cls} run method needs to be static for now." - "HINT: Remove `self` and add staticmethod decorator." - ) # Note: Private way to modify the work run executor # Probably exposed to the users in the future if needed. diff --git a/src/lightning_app/components/multi_node/trainer.py b/src/lightning_app/components/multi_node/trainer.py index ea33106a7ece9..222f71ce59557 100644 --- a/src/lightning_app/components/multi_node/trainer.py +++ b/src/lightning_app/components/multi_node/trainer.py @@ -7,7 +7,6 @@ from lightning_app.components.multi_node.base import MultiNode from lightning_app.components.multi_node.pytorch_spawn import _PyTorchSpawnRunExecutor from lightning_app.core.work import LightningWork -from lightning_app.utilities.app_helpers import is_static_method from lightning_app.utilities.packaging.cloud_compute import CloudCompute from lightning_app.utilities.tracer import Tracer @@ -81,11 +80,6 @@ def __init__( **work_kwargs: Any, ) -> None: assert issubclass(work_cls, _LightningTrainerWorkProtocol) - if not is_static_method(work_cls, "run"): - raise TypeError( - f"The provided {work_cls} run method needs to be static for now." - "HINT: Remove `self` and add staticmethod decorator." - ) # Note: Private way to modify the work run executor # Probably exposed to the users in the future if needed. diff --git a/src/lightning_app/components/serve/gradio.py b/src/lightning_app/components/serve/gradio.py index 7e7801925937f..328e70e743b43 100644 --- a/src/lightning_app/components/serve/gradio.py +++ b/src/lightning_app/components/serve/gradio.py @@ -31,6 +31,8 @@ class ServeGradio(LightningWork, abc.ABC): outputs: Any examples: Optional[List] = None enable_queue: bool = False + title: Optional[str] = None + description: Optional[str] = None def __init__(self, *args, **kwargs): requires("gradio")(super().__init__(*args, **kwargs)) @@ -58,7 +60,14 @@ def run(self, *args, **kwargs): self._model = self.build_model() fn = partial(self.predict, *args, **kwargs) fn.__name__ = self.predict.__name__ - gradio.Interface(fn=fn, inputs=self.inputs, outputs=self.outputs, examples=self.examples).launch( + gradio.Interface( + fn=fn, + inputs=self.inputs, + outputs=self.outputs, + examples=self.examples, + title=self.title, + description=self.description, + ).launch( server_name=self.host, server_port=self.port, enable_queue=self.enable_queue, diff --git a/src/lightning_app/components/serve/python_server.py b/src/lightning_app/components/serve/python_server.py index 03b0ceb26058f..f0361f9db5046 100644 --- a/src/lightning_app/components/serve/python_server.py +++ b/src/lightning_app/components/serve/python_server.py @@ -14,12 +14,6 @@ logger = Logger(__name__) -def image_to_base64(image_path): - with open(image_path, "rb") as image_file: - encoded_string = base64.b64encode(image_file.read()) - return encoded_string.decode("UTF-8") - - class _DefaultInputData(BaseModel): payload: str @@ -33,7 +27,7 @@ class Image(BaseModel): @staticmethod def _get_sample_data() -> Dict[Any, Any]: - imagepath = Path(__file__).absolute().parent / "catimage.png" + imagepath = Path(__file__).parent / "catimage.png" with open(imagepath, "rb") as image_file: encoded_string = base64.b64encode(image_file.read()) return {"image": encoded_string.decode("UTF-8")} diff --git 
a/src/lightning_app/core/app.py b/src/lightning_app/core/app.py index ef848cce54dba..255f498507f67 100644 --- a/src/lightning_app/core/app.py +++ b/src/lightning_app/core/app.py @@ -24,7 +24,7 @@ from lightning_app.core.queues import BaseQueue, SingleProcessQueue from lightning_app.core.work import LightningWork from lightning_app.frontend import Frontend -from lightning_app.storage import Drive, Path +from lightning_app.storage import Drive, Path, Payload from lightning_app.storage.path import _storage_root_dir from lightning_app.utilities import frontend from lightning_app.utilities.app_helpers import ( @@ -100,6 +100,7 @@ def __init__( """ self.root_path = root_path # when running behind a proxy + self.info = info from lightning_app.core.flow import _RootFlow @@ -168,9 +169,10 @@ def __init__( logger.debug(f"ENV: {os.environ}") + def _update_index_file(self): # update index.html, # this should happen once for all apps before the ui server starts running. - frontend.update_index_file(FRONTEND_DIR, info=info, root_path=root_path) + frontend.update_index_file(FRONTEND_DIR, info=self.info, root_path=self.root_path) if _should_dispatch_app(): os.environ["LIGHTNING_DISPATCHED"] = "1" @@ -470,6 +472,8 @@ def _run(self) -> bool: self._original_state = deepcopy(self.state) done = False + self._start_with_flow_works() + if self.should_publish_changes_to_api and self.api_publish_state_queue: logger.debug("Publishing the state with changes") # Push two states to optimize start in the cloud. @@ -628,8 +632,16 @@ def _extract_vars_from_component_name(component_name: str, state): else: return None - # Note: Remove private keys - return {k: v for k, v in child["vars"].items() if not k.startswith("_")} + # Filter private keys and drives + return { + k: v + for k, v in child["vars"].items() + if ( + not k.startswith("_") + and not (isinstance(v, dict) and v.get("type", None) == "__drive__") + and not (isinstance(v, (Payload, Path))) + ) + } def _send_flow_to_work_deltas(self, state) -> None: if not self.flow_to_work_delta_queues: @@ -650,10 +662,6 @@ def _send_flow_to_work_deltas(self, state) -> None: if state_work is None or last_state_work is None: continue - # Note: The flow shouldn't update path or drive manually. 
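# [Illustrative note, not part of the patch] The scrubbing removed below became
# redundant because `_extract_vars_from_component_name` above now filters private
# keys, drive dicts, and `Payload`/`Path` values at the source; for example a work
# state such as
#     {"counter": 1, "_port": 8080, "drive": {"type": "__drive__", "id": "x"}}
# is reduced to {"counter": 1} before the flow-to-work deltas are computed.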
-            last_state_work = apply_to_collection(last_state_work, (Path, Drive), lambda x: None)
-            state_work = apply_to_collection(state_work, (Path, Drive), lambda x: None)
-
             deep_diff = DeepDiff(last_state_work, state_work, verbose_level=2).to_dict()

             if "unprocessed" in deep_diff:
@@ -662,3 +670,11 @@ def _send_flow_to_work_deltas(self, state) -> None:
             if deep_diff:
                 logger.debug(f"Sending deep_diff to {w.name} : {deep_diff}")
                 self.flow_to_work_delta_queues[w.name].put(deep_diff)
+
+    def _start_with_flow_works(self):
+        for w in self.works:
+            if w._start_with_flow:
+                parallel = w.parallel
+                w._parallel = True
+                w.start()
+                w._parallel = parallel
diff --git a/src/lightning_app/core/queues.py b/src/lightning_app/core/queues.py
index 5d8f4e06ad429..a7fee9a3b6e12 100644
--- a/src/lightning_app/core/queues.py
+++ b/src/lightning_app/core/queues.py
@@ -235,12 +235,12 @@ def __init__(
         """
         if name is None:
             raise ValueError("You must specify a name for the queue")
-        host = host or REDIS_HOST
-        port = port or REDIS_PORT
-        password = password or REDIS_PASSWORD
+        self.host = host or REDIS_HOST
+        self.port = port or REDIS_PORT
+        self.password = password or REDIS_PASSWORD
         self.name = name
         self.default_timeout = default_timeout
-        self.redis = redis.Redis(host=host, port=port, password=password)
+        self.redis = redis.Redis(host=self.host, port=self.port, password=self.password)

     def put(self, item: Any) -> None:
         from lightning_app import LightningWork
@@ -329,6 +329,20 @@ def is_running(self) -> bool:
         except redis.exceptions.ConnectionError:
             return False

+    def to_dict(self):
+        return {
+            "type": "redis",
+            "name": self.name,
+            "default_timeout": self.default_timeout,
+            "host": self.host,
+            "port": self.port,
+            "password": self.password,
+        }
+
+    @classmethod
+    def from_dict(cls, state):
+        return cls(**state)
+

 class HTTPQueue(BaseQueue):
     def __init__(self, name: str, default_timeout: float):
@@ -414,6 +428,17 @@ def _split_app_id_and_queue_name(queue_name):
         app_id, queue_name = queue_name.split("_", 1)
         return app_id, queue_name

+    def to_dict(self):
+        return {
+            "type": "http",
+            "name": self.name,
+            "default_timeout": self.default_timeout,
+        }
+
+    @classmethod
+    def from_dict(cls, state):
+        return cls(**state)
+

 def debug_log_callback(message: str, *args: Any, **kwargs: Any) -> None:
     if QUEUE_DEBUG_ENABLED or (Path(LIGHTNING_DIR) / "QUEUE_DEBUG_ENABLED").exists():
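[Illustrative note, not part of the patch] The `to_dict`/`from_dict` pair added above lets queue handles cross a process boundary as plain, picklable dicts; the spawn executor in `proxies.py` rebuilds them with `WorkRunExecutor.process_queue`, which pops the `"type"` key to choose the class. A minimal round-trip sketch, assuming `RedisQueue` accepts `name` and `default_timeout` as keywords per the constructor shown above:

    from lightning_app.core.queues import RedisQueue
    from lightning_app.utilities.proxies import WorkRunExecutor

    queue = RedisQueue(name="delta_queue", default_timeout=5.0)
    state = queue.to_dict()                          # {"type": "redis", "name": ..., ...}
    restored = WorkRunExecutor.process_queue(state)  # pops "type", calls RedisQueue.from_dict
    assert isinstance(restored, RedisQueue) and restored.name == queue.name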
diff --git a/src/lightning_app/frontend/panel/app_state_watcher.py b/src/lightning_app/frontend/panel/app_state_watcher.py
index 2c886bae341f5..2253312a13565 100644
--- a/src/lightning_app/frontend/panel/app_state_watcher.py
+++ b/src/lightning_app/frontend/panel/app_state_watcher.py
@@ -1,9 +1,9 @@
-"""The AppStateWatcher enables a Frontend to.
+"""The ``AppStateWatcher`` enables a Frontend to:

 - subscribe to App state changes
 - to access and change the App state.

-This is particularly useful for the PanelFrontend but can be used by other Frontends too.
+This is particularly useful for the ``PanelFrontend`` but can be used by other frontends too.
 """
 from __future__ import annotations
@@ -26,15 +26,16 @@

 class AppStateWatcher(Parameterized):
-    """The AppStateWatcher enables a Frontend to:
+    """The `AppStateWatcher` enables a Frontend to:

     - Subscribe to any App state changes.
     - To access and change the App state from the UI.

-    This is particularly useful for the PanelFrontend, but can be used by
-    other Frontend's too.
+    This is particularly useful for the `PanelFrontend`, but can be used by
+    other frontends too.

-    Example:
+    Example
+    -------

     .. code-block:: python
@@ -54,10 +55,10 @@ def update(state):

     This would print ``The counter was updated to 2``.

-    The AppStateWatcher is built on top of Param which is a framework like dataclass, attrs and
+    The ``AppStateWatcher`` is built on top of Param, which is a framework like dataclass, attrs and
     Pydantic which additionally provides powerful and unique features for building reactive apps.

-    Please note the AppStateWatcher is a singleton, i.e. only one instance is instantiated
+    Please note the ``AppStateWatcher`` is a singleton, i.e., only one instance is instantiated.
     """

     state: AppState = ClassSelector(
@@ -75,7 +76,7 @@ def __new__(cls):

     @requires("param")
     def __init__(self):
-        # It's critical to initialize only once
+        # It is critical to initialize only once
         # See https://github.com/holoviz/param/issues/643
         if not hasattr(self, "_initialized"):
             super().__init__(name="singleton")
diff --git a/src/lightning_app/frontend/panel/panel_frontend.py b/src/lightning_app/frontend/panel/panel_frontend.py
index 359dca28b2766..48af9235fa796 100644
--- a/src/lightning_app/frontend/panel/panel_frontend.py
+++ b/src/lightning_app/frontend/panel/panel_frontend.py
@@ -27,17 +27,28 @@ def _has_panel_autoreload() -> bool:

 class PanelFrontend(Frontend):
-    """The PanelFrontend enables you to serve Panel code as a Frontend for your LightningFlow.
+    """The `PanelFrontend` enables you to serve Panel code as a Frontend for your LightningFlow.

-    To use this frontend, you must first install the `panel` package:
+    Reference: https://lightning.ai/lightning-docs/workflows/add_web_ui/panel/
+
+    Args:
+        entry_point: The path to a .py or .ipynb file, or a pure function. The file or function must contain your Panel
+            code. The function can optionally accept an ``AppStateWatcher`` argument.
+
+    Raises:
+        TypeError: Raised if the ``entry_point`` provided is a class method
+
+    Example:
+
+    To use the `PanelFrontend`, you must first install the `panel` package:

     .. code-block:: bash

        pip install panel

-    Example:
+    Create the files `panel_app_basic.py` and `app_basic.py` with the content below.

-    `panel_app_basic.py`
+    **panel_app_basic.py**

     .. code-block:: python
@@ -45,7 +56,7 @@ class PanelFrontend(Frontend):

        pn.panel("Hello **Panel ⚡** World").servable()

-    `app_basic.py`
+    **app_basic.py**

     .. code-block:: python
@@ -69,20 +80,15 @@ def configure_layout(self):

        app = L.LightningApp(LitApp())

-    You can start the Lightning server with Panel autoreload by setting the `PANEL_AUTORELOAD`
-    environment variable to 'yes': `PANEL_AUTORELOAD=yes lightning run app app_basic.py`.
+    Start the Lightning server with `lightning run app app_basic.py`.

-    Args:
-        entry_point: A pure function or the path to a .py or .ipynb file.
-            The function must be a pure function that contains your Panel code.
-            The function can optionally accept an `AppStateWatcher` argument.
-
-    Raises:
-        TypeError: Raised if the entry_point is a class method
+    For development you can get Panel autoreload by setting the ``PANEL_AUTORELOAD``
+    environment variable to 'yes', i.e. run
run + ``PANEL_AUTORELOAD=yes lightning run app app_basic.py`` """ @requires("panel") - def __init__(self, entry_point: Callable | str): + def __init__(self, entry_point: str | Callable): super().__init__() if inspect.ismethod(entry_point): diff --git a/src/lightning_app/runners/cloud.py b/src/lightning_app/runners/cloud.py index 81e9e10ebb14a..af3eca424282d 100644 --- a/src/lightning_app/runners/cloud.py +++ b/src/lightning_app/runners/cloud.py @@ -4,7 +4,6 @@ import string import sys import time -import traceback from dataclasses import dataclass from pathlib import Path from typing import Any, Callable, List, Optional, Union @@ -43,6 +42,7 @@ ) from lightning_cloud.openapi.rest import ApiException +from lightning_app import LightningWork from lightning_app.core.app import LightningApp from lightning_app.core.constants import ( CLOUD_QUEUE_TYPE, @@ -62,8 +62,8 @@ from lightning_app.utilities.app_helpers import Logger from lightning_app.utilities.cloud import _get_project from lightning_app.utilities.dependency_caching import get_hash -from lightning_app.utilities.load_app import _prettifiy_exception, load_app_from_file -from lightning_app.utilities.packaging.app_config import AppConfig, find_config_file +from lightning_app.utilities.load_app import load_app_from_file +from lightning_app.utilities.packaging.app_config import _get_config_file, AppConfig from lightning_app.utilities.packaging.lightning_utils import _prepare_lightning_wheels_and_requirements from lightning_app.utilities.secrets import _names_to_ids @@ -95,10 +95,11 @@ def dispatch( # TODO: verify lightning version # _verify_lightning_version() - config_file = find_config_file(self.entrypoint_file) - app_config = AppConfig.load_from_file(config_file) if config_file else AppConfig() - root = config_file.parent if config_file else Path(self.entrypoint_file).absolute().parent + config_file = _get_config_file(self.entrypoint_file) + app_config = AppConfig.load_from_file(config_file) if config_file.exists() else AppConfig() + root = Path(self.entrypoint_file).absolute().parent cleanup_handle = _prepare_lightning_wheels_and_requirements(root) + self.app._update_index_file() repo = LocalSourceCodeDir(path=root) self._check_uploaded_folder(root, repo) requirements_file = root / "requirements.txt" @@ -141,78 +142,79 @@ def dispatch( v1_env_vars.append(V1EnvVar(name="ENABLE_PUSHING_STATE_ENDPOINT", value="0")) works: List[V1Work] = [] - for flow in self.app.flows: - for work in flow.works(recurse=False): - if not work._start_with_flow: - continue - - work_requirements = "\n".join(work.cloud_build_config.requirements) - build_spec = V1BuildSpec( - commands=work.cloud_build_config.build_commands(), - python_dependencies=V1PythonDependencyInfo( - package_manager=V1PackageManager.PIP, packages=work_requirements - ), - image=work.cloud_build_config.image, - ) - user_compute_config = V1UserRequestedComputeConfig( - name=work.cloud_compute.name, - count=1, - disk_size=work.cloud_compute.disk_size, - preemptible=work.cloud_compute.preemptible, - shm_size=work.cloud_compute.shm_size, - ) + for work in self.app.works: + _validate_build_spec_and_compute(work) - drive_specs: List[V1LightningworkDrives] = [] - for drive_attr_name, drive in [ - (k, getattr(work, k)) for k in work._state if isinstance(getattr(work, k), Drive) - ]: - if drive.protocol == "lit://": - drive_type = V1DriveType.NO_MOUNT_S3 - source_type = V1SourceType.S3 - else: - raise RuntimeError( - f"unknown drive protocol `{drive.protocol}`. 
Please verify this " - f"drive type has been configured for use in the cloud dispatcher." - ) + if not work._start_with_flow: + continue - drive_specs.append( - V1LightningworkDrives( - drive=V1Drive( - metadata=V1Metadata( - name=f"{work.name}.{drive_attr_name}", - ), - spec=V1DriveSpec( - drive_type=drive_type, - source_type=source_type, - source=f"{drive.protocol}{drive.id}", - ), - status=V1DriveStatus(), + work_requirements = "\n".join(work.cloud_build_config.requirements) + build_spec = V1BuildSpec( + commands=work.cloud_build_config.build_commands(), + python_dependencies=V1PythonDependencyInfo( + package_manager=V1PackageManager.PIP, packages=work_requirements + ), + image=work.cloud_build_config.image, + ) + user_compute_config = V1UserRequestedComputeConfig( + name=work.cloud_compute.name, + count=1, + disk_size=work.cloud_compute.disk_size, + preemptible=work.cloud_compute.preemptible, + shm_size=work.cloud_compute.shm_size, + ) + + drive_specs: List[V1LightningworkDrives] = [] + for drive_attr_name, drive in [ + (k, getattr(work, k)) for k in work._state if isinstance(getattr(work, k), Drive) + ]: + if drive.protocol == "lit://": + drive_type = V1DriveType.NO_MOUNT_S3 + source_type = V1SourceType.S3 + else: + raise RuntimeError( + f"unknown drive protocol `{drive.protocol}`. Please verify this " + f"drive type has been configured for use in the cloud dispatcher." + ) + + drive_specs.append( + V1LightningworkDrives( + drive=V1Drive( + metadata=V1Metadata( + name=f"{work.name}.{drive_attr_name}", ), - mount_location=str(drive.root_folder), + spec=V1DriveSpec( + drive_type=drive_type, + source_type=source_type, + source=f"{drive.protocol}{drive.id}", + ), + status=V1DriveStatus(), ), - ) + mount_location=str(drive.root_folder), + ), + ) - # TODO: Move this to the CloudCompute class and update backend - if work.cloud_compute.mounts is not None: - mounts = work.cloud_compute.mounts - if isinstance(mounts, Mount): - mounts = [mounts] - for mount in mounts: - drive_specs.append( - _create_mount_drive_spec( - work_name=work.name, - mount=mount, - ) + # TODO: Move this to the CloudCompute class and update backend + if work.cloud_compute.mounts is not None: + mounts = work.cloud_compute.mounts + if isinstance(mounts, Mount): + mounts = [mounts] + for mount in mounts: + drive_specs.append( + _create_mount_drive_spec( + work_name=work.name, + mount=mount, ) + ) - random_name = "".join(random.choice(string.ascii_lowercase) for _ in range(5)) - work_spec = V1LightningworkSpec( - build_spec=build_spec, - drives=drive_specs, - user_requested_compute_config=user_compute_config, - network_config=[V1NetworkConfig(name=random_name, port=work.port)], - ) - works.append(V1Work(name=work.name, spec=work_spec)) + random_name = "".join(random.choice(string.ascii_lowercase) for _ in range(5)) + work_spec = V1LightningworkSpec( + build_spec=build_spec, + drives=drive_specs, + user_requested_compute_config=user_compute_config, + network_config=[V1NetworkConfig(name=random_name, port=work.port)], + ) + works.append(V1Work(name=work.name, spec=work_spec)) # We need to collect a spec for each flow that contains a frontend so that the backend knows # for which flows it needs to start servers by invoking the cli (see the serve_frontend() method below) @@ -472,26 +474,17 @@ def _project_has_sufficient_credits(self, project: V1Membership, app: Optional[L @classmethod def load_app_from_file(cls, filepath: str) -> "LightningApp": - """This is meant to use only locally for cloud runtime.""" + """Load a LightningApp 
from a file, mocking the imports.""" try: - app = load_app_from_file(filepath, raise_exception=True) - except ModuleNotFoundError: - # this is very generic exception. - logger.info("Could not load the app locally. Starting the app directly on the cloud.") - # we want to format the exception as if no frame was on top. - exp, val, tb = sys.exc_info() - listing = traceback.format_exception(exp, val, tb) - # remove the entry for the first frame - del listing[1] - from lightning_app.testing.helpers import EmptyFlow - - # Create a mocking app. - app = LightningApp(EmptyFlow()) - + app = load_app_from_file(filepath, raise_exception=True, mock_imports=True) except FileNotFoundError as e: raise e except Exception: - _prettifiy_exception(filepath) + from lightning_app.testing.helpers import EmptyFlow + + # Create a generic app. + logger.info("Could not load the app locally. Starting the app directly on the cloud.") + app = LightningApp(EmptyFlow()) return app @@ -519,3 +512,12 @@ def _create_mount_drive_spec(work_name: str, mount: Mount) -> V1LightningworkDri ), mount_location=str(mount.mount_path), ) + + +def _validate_build_spec_and_compute(work: LightningWork) -> None: + if work.cloud_build_config.image is not None and work.cloud_compute.name == "default": + raise ValueError( + f"You requested a custom base image for the Work with name '{work.name}', but custom images are currently" + " not supported on the default cloud compute instance. Please choose a different configuration, for example" + " `CloudCompute('cpu-medium')`." + ) diff --git a/src/lightning_app/runners/multiprocess.py b/src/lightning_app/runners/multiprocess.py index 1bc8c7b5cf178..8abd0a443ac32 100644 --- a/src/lightning_app/runners/multiprocess.py +++ b/src/lightning_app/runners/multiprocess.py @@ -30,9 +30,11 @@ def dispatch(self, *args: Any, on_before_run: Optional[Callable] = None, **kwarg """Method to dispatch and run the LightningApp.""" try: _set_flow_context() + self.app.backend = self.backend self.backend._prepare_queues(self.app) self.backend.resolve_url(self.app, "http://127.0.0.1") + self.app._update_index_file() # set env variables os.environ.update(self.env_vars) diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py index f4c8c001acad7..43aa7c55be728 100644 --- a/src/lightning_app/testing/testing.py +++ b/src/lightning_app/testing/testing.py @@ -361,7 +361,7 @@ def run_app_in_cloud( except playwright._impl._api_types.TimeoutError: print("'Create Project' dialog not visible, skipping.") - admin_page.locator(f"text={name}").click() + admin_page.locator(f"role=link[name='{name}']").click() sleep(5) # Scroll to the bottom of the page. Used to capture all logs. admin_page.evaluate( @@ -431,7 +431,18 @@ def fetch_logs(component_names: Optional[List[str]] = None) -> Generator: project_id=project.project_id, app_id=app_id, ).lightningworks + component_names = ["flow"] + [w.name for w in works] + else: + + def add_prefix(c: str) -> str: + if c == "flow": + return c + if not c.startswith("root."): + return "root." 
+ c
+            return c
+
+        component_names = [add_prefix(c) for c in component_names]

     gen = _app_logs_reader(
         logs_api_client=logs_api_client,
diff --git a/src/lightning_app/utilities/app_commands.py b/src/lightning_app/utilities/app_commands.py
index 011cb071299a1..3ec4c6d67dc57 100644
--- a/src/lightning_app/utilities/app_commands.py
+++ b/src/lightning_app/utilities/app_commands.py
@@ -38,12 +38,13 @@ def _extract_commands_from_file(file_name: str) -> CommandLines:
         file_lines = f.readlines()

     for line_number, line in enumerate(file_lines):
-        if line.strip() in APP_COMMAND_LINES_TO_IGNORE:
+        line = line.strip()
+        if line in APP_COMMAND_LINES_TO_IGNORE:
             continue

-        # stop parsing at first non-comment line at top of file
+        # skip non-comment lines; commands may now appear after the first non-comment line
         if not line.startswith("#"):
-            break
+            continue

         # remove comment marker and any leading / trailing whitespaces
         line = line.lstrip("#").strip()
diff --git a/src/lightning_app/utilities/app_helpers.py b/src/lightning_app/utilities/app_helpers.py
index 3f2de886bcc64..d63e33db6addb 100644
--- a/src/lightning_app/utilities/app_helpers.py
+++ b/src/lightning_app/utilities/app_helpers.py
@@ -1,5 +1,6 @@
 import abc
 import asyncio
+import builtins
 import enum
 import functools
 import inspect
@@ -10,9 +11,11 @@
 import threading
 import time
 from abc import ABC, abstractmethod
+from contextlib import contextmanager
 from copy import deepcopy
 from dataclasses import dataclass, field
 from typing import Any, Callable, Dict, Generator, List, Mapping, Optional, Tuple, Type, TYPE_CHECKING
+from unittest.mock import MagicMock

 import websockets
 from deepdiff import Delta
@@ -486,6 +489,29 @@ def _load_state_dict(root_flow: "LightningFlow", state: Dict[str, Any], strict:
         raise Exception(f"The component {component_name} was re-created during state reloading.")

+class _MagicMockJsonSerializable(MagicMock):
+    @staticmethod
+    def __json__():
+        return "{}"
+
+
+def _mock_import(*args, original_fn=None):
+    try:
+        return original_fn(*args)
+    except Exception:
+        return _MagicMockJsonSerializable()
+
+
+@contextmanager
+def _mock_missing_imports():
+    original_fn = builtins.__import__
+    builtins.__import__ = functools.partial(_mock_import, original_fn=original_fn)
+    try:
+        yield
+    finally:
+        builtins.__import__ = original_fn
+
+
 def is_static_method(klass_or_instance, attr) -> bool:
     return isinstance(inspect.getattr_static(klass_or_instance, attr), staticmethod)
diff --git a/src/lightning_app/utilities/cli_helpers.py b/src/lightning_app/utilities/cli_helpers.py
index b3b59aa106074..293944ca82c50 100644
--- a/src/lightning_app/utilities/cli_helpers.py
+++ b/src/lightning_app/utilities/cli_helpers.py
@@ -299,10 +299,11 @@ def _check_environment_and_redirect():
     If not, this utility tries to redirect the ``lightning`` call to the environment executable (prompting the user to
     install lightning for them there if needed).
     """
-    env_executable = shutil.which("python")
+    env_executable = os.path.realpath(shutil.which("python"))
+    sys_executable = os.path.realpath(sys.executable)

     # on windows, the extension might be different, where one uses `.EXE` and the other `.exe`
-    if env_executable.lower() != sys.executable.lower():
+    if env_executable.lower() != sys_executable.lower():
         logger.info(
             "Lightning is running from outside your current environment. Switching to your current environment."
         )
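[Illustrative note, not part of the patch] `_mock_missing_imports` above is what powers the new `mock_imports=True` mode of `load_app_from_file` further down: any import that raises is replaced by a `_MagicMockJsonSerializable`, whose `__json__` hook keeps app layouts serializable. A self-contained sketch of the mechanism:

    from lightning_app.utilities.app_helpers import _MagicMockJsonSerializable, _mock_missing_imports

    with _mock_missing_imports():
        import this_package_is_not_real  # resolves to a serializable MagicMock
    assert isinstance(this_package_is_not_real, _MagicMockJsonSerializable)
    # once the context exits, builtins.__import__ is restored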
diff --git a/src/lightning_app/utilities/layout.py b/src/lightning_app/utilities/layout.py
index ca12ab8b7a616..9235993ad31d1 100644
--- a/src/lightning_app/utilities/layout.py
+++ b/src/lightning_app/utilities/layout.py
@@ -4,6 +4,7 @@
 import lightning_app
 from lightning_app.frontend.frontend import Frontend
+from lightning_app.utilities.app_helpers import _MagicMockJsonSerializable
 from lightning_app.utilities.cloud import is_running_in_cloud
@@ -39,6 +40,9 @@ def _collect_layout(app: "lightning_app.LightningApp", flow: "lightning_app.Ligh
         # When running locally, the target will get overwritten by the dispatcher when launching the frontend servers
         # When running in the cloud, the frontend code will construct the URL based on the flow name
         return flow._layout
+    elif isinstance(layout, _MagicMockJsonSerializable):
+        # Skip layouts that come from mocked (failed) imports
+        pass
     elif isinstance(layout, dict):
         layout = _collect_content_layout([layout], flow)
     elif isinstance(layout, (list, tuple)) and all(isinstance(item, dict) for item in layout):
@@ -103,6 +107,9 @@ def _collect_content_layout(layout: List[Dict], flow: "lightning_app.LightningFl
             else:
                 entry["content"] = ""
                 entry["target"] = ""
+        elif isinstance(entry["content"], _MagicMockJsonSerializable):
+            # Skip content that comes from mocked (failed) imports
+            pass
         else:
             m = f"""
             A dictionary returned by `{flow.__class__.__name__}.configure_layout()` contains an unsupported entry.
diff --git a/src/lightning_app/utilities/load_app.py b/src/lightning_app/utilities/load_app.py
index 2182162f3e0c3..43a6776721cbb 100644
--- a/src/lightning_app/utilities/load_app.py
+++ b/src/lightning_app/utilities/load_app.py
@@ -4,6 +4,7 @@
 import traceback
 import types
 from contextlib import contextmanager
+from copy import copy
 from typing import Dict, List, TYPE_CHECKING, Union

 from lightning_app.utilities.exceptions import MisconfigurationException
@@ -11,7 +12,7 @@
 if TYPE_CHECKING:
     from lightning_app import LightningApp, LightningFlow, LightningWork

-from lightning_app.utilities.app_helpers import Logger
+from lightning_app.utilities.app_helpers import _mock_missing_imports, Logger

 logger = Logger(__name__)
@@ -30,7 +31,7 @@ def _prettifiy_exception(filepath: str):
     sys.exit(1)

-def load_app_from_file(filepath: str, raise_exception: bool = False) -> "LightningApp":
+def load_app_from_file(filepath: str, raise_exception: bool = False, mock_imports: bool = False) -> "LightningApp":
     """Load a LightningApp from a file.
Arguments:
@@ -50,7 +51,11 @@ def load_app_from_file(filepath: str, raise_exception: bool = False) -> "Lightni
     module = _create_fake_main_module(filepath)
     try:
         with _patch_sys_argv():
-            exec(code, module.__dict__)
+            if mock_imports:
+                with _mock_missing_imports():
+                    exec(code, module.__dict__)
+            else:
+                exec(code, module.__dict__)
     except Exception as e:
         if raise_exception:
             raise e
@@ -140,7 +145,7 @@ def _patch_sys_argv():
     """
     from lightning_app.cli.lightning_cli import run_app

-    original_argv = sys.argv
+    original_argv = copy(sys.argv)

     # 1: Remove the CLI command
     if sys.argv[:3] == ["lightning", "run", "app"]:
         sys.argv = sys.argv[3:]
diff --git a/src/lightning_app/utilities/packaging/app_config.py b/src/lightning_app/utilities/packaging/app_config.py
index 59d05debc088c..c3e44159ffb4e 100644
--- a/src/lightning_app/utilities/packaging/app_config.py
+++ b/src/lightning_app/utilities/packaging/app_config.py
@@ -28,7 +28,7 @@ def save_to_file(self, path: Union[str, pathlib.Path]) -> None:

     def save_to_dir(self, directory: Union[str, pathlib.Path]) -> None:
         """Save the configuration to a file '.lightning' in the given folder in YAML format."""
-        self.save_to_file(pathlib.Path(directory, _APP_CONFIG_FILENAME))
+        self.save_to_file(_get_config_file(directory))

     @classmethod
     def load_from_file(cls, path: Union[str, pathlib.Path]) -> "AppConfig":
@@ -47,22 +47,14 @@ def load_from_dir(cls, directory: Union[str, pathlib.Path]) -> "AppConfig":
         return cls.load_from_file(pathlib.Path(directory, _APP_CONFIG_FILENAME))

-def find_config_file(source_path: pathlib.Path = pathlib.Path.cwd()) -> Optional[pathlib.Path]:
-    """Search for the Lightning app config file '.lightning' at the given source path.
-
-    Relative to the given path, it will search for the '.lightning' config file by going up the directory structure
-    until found. Returns ``None`` if no config file is found in any of the parent directories.
+def _get_config_file(source_path: Union[str, pathlib.Path]) -> pathlib.Path:
+    """Get the Lightning app config file '.lightning' at the given source path.

     Args:
-        source_path: A path to a folder or a file. The search for the config file will start relative to this path.
+        source_path: A path to a folder or a file.
     """
     source_path = pathlib.Path(source_path).absolute()
     if source_path.is_file():
         source_path = source_path.parent

-    candidate = pathlib.Path(source_path / _APP_CONFIG_FILENAME)
-    if candidate.is_file():
-        return candidate
-
-    if source_path.parents:
-        return find_config_file(source_path.parent)
+    return pathlib.Path(source_path / _APP_CONFIG_FILENAME)
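[Illustrative note, not part of the patch] Unlike the removed `find_config_file`, `_get_config_file` never walks up parent directories and always returns a path, so callers must now check `exists()` themselves, as `CloudRuntime.dispatch` does above. A small sketch of the new contract:

    from lightning_app.utilities.packaging.app_config import _get_config_file, AppConfig

    config_file = _get_config_file("project/app.py")  # -> <abs>/project/.lightning
    app_config = AppConfig.load_from_file(config_file) if config_file.exists() else AppConfig()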
""" source_path = pathlib.Path(source_path).absolute() if source_path.is_file(): source_path = source_path.parent - candidate = pathlib.Path(source_path / _APP_CONFIG_FILENAME) - if candidate.is_file(): - return candidate - - if source_path.parents: - return find_config_file(source_path.parent) + return pathlib.Path(source_path / _APP_CONFIG_FILENAME) diff --git a/src/lightning_app/utilities/proxies.py b/src/lightning_app/utilities/proxies.py index 16f7044f0c3cb..07b03da7d9201 100644 --- a/src/lightning_app/utilities/proxies.py +++ b/src/lightning_app/utilities/proxies.py @@ -7,11 +7,12 @@ import time import traceback import warnings +from contextlib import contextmanager from copy import deepcopy from dataclasses import dataclass, field from functools import partial from threading import Event, Thread -from typing import Any, Callable, Dict, Optional, Set, Tuple, Type, TYPE_CHECKING, Union +from typing import Any, Callable, Dict, Generator, Optional, Set, Tuple, Type, TYPE_CHECKING, Union from deepdiff import DeepDiff, Delta from lightning_utilities.core.apply_func import apply_to_collection @@ -102,8 +103,6 @@ class ProxyWorkRun: caller_queue: "BaseQueue" def __post_init__(self): - self.cache_calls = self.work.cache_calls - self.parallel = self.work.parallel self.work_state = None def __call__(self, *args, **kwargs): @@ -122,7 +121,7 @@ def __call__(self, *args, **kwargs): # The if/else conditions are left un-compressed to simplify readability # for the readers. - if self.cache_calls: + if self.work.cache_calls: if not entered or stopped_on_sigterm: _send_data_to_caller_queue(self, self.work, self.caller_queue, data, call_hash) else: @@ -136,7 +135,7 @@ def __call__(self, *args, **kwargs): # the previous task has completed and we can re-queue the next one. # overriding the return value for next loop iteration. _send_data_to_caller_queue(self, self.work, self.caller_queue, data, call_hash) - if not self.parallel: + if not self.work.parallel: raise CacheMissException("Task never called before. 
Triggered now") def _validate_call_args(self, args: Tuple[Any, ...], kwargs: Dict[str, Any]) -> None: @@ -314,7 +313,7 @@ def run(self): work_name: str work: "LightningWork" delta_queue: "BaseQueue" - state_observer: "WorkStateObserver" + state_observer: Optional["WorkStateObserver"] def __call__(self, name: str, value: Any) -> None: logger.debug(f"Setting {name}: {value}") @@ -329,7 +328,8 @@ def __call__(self, name: str, value: Any) -> None: self.delta_queue.put(ComponentDelta(id=self.work_name, delta=delta)) # add the delta to the buffer to let WorkStateObserver know we already sent this one to the Flow - self.state_observer._delta_memory.append(delta) + if self.state_observer: + self.state_observer._delta_memory.append(delta) @dataclass @@ -343,10 +343,36 @@ class WorkRunExecutor: work: "LightningWork" work_run: Callable + delta_queue: "BaseQueue" + enable_start_observer: bool = True def __call__(self, *args, **kwargs): return self.work_run(*args, **kwargs) + @contextmanager + def enable_spawn(self) -> Generator: + self.work._setattr_replacement = None + self.work._backend = None + self._clean_queues() + yield + + def _clean_queues(self): + if "LIGHTNING_APP_STATE_URL" in os.environ: + self.work._request_queue = self.work._request_queue.to_dict() + self.work._response_queue = self.work._response_queue.to_dict() + + @staticmethod + def process_queue(queue): + from lightning_app.core.queues import HTTPQueue, RedisQueue + + if isinstance(queue, dict): + queue_type = queue.pop("type") + if queue_type == "redis": + return RedisQueue.from_dict(queue) + else: + return HTTPQueue.from_dict(queue) + return queue + @dataclass class WorkRunner: @@ -442,12 +468,13 @@ def run_once(self): self._transfer_path_attributes() # 6. Create the state observer thread. - self.state_observer = WorkStateObserver( - self.work, - delta_queue=self.delta_queue, - flow_to_work_delta_queue=self.flow_to_work_delta_queue, - error_queue=self.error_queue, - ) + if self.run_executor_cls.enable_start_observer: + self.state_observer = WorkStateObserver( + self.work, + delta_queue=self.delta_queue, + flow_to_work_delta_queue=self.flow_to_work_delta_queue, + error_queue=self.error_queue, + ) # 7. Deepcopy the work state and send the first `RUNNING` status delta to the flow. reference_state = deepcopy(self.work.state) @@ -478,12 +505,13 @@ def run_once(self): # 11. Start the state observer thread. It will look for state changes and send them back to the Flow # The observer has to be initialized here, after the set_state call above so that the thread can start with # the proper initial state of the work - self.state_observer.start() + if self.run_executor_cls.enable_start_observer: + self.state_observer.start() # 12. Run the `work_run` method. # If an exception is raised, send a `FAILED` status delta to the flow and call the `on_exception` hook. try: - ret = self.run_executor_cls(self.work, work_run)(*args, **kwargs) + ret = self.run_executor_cls(self.work, work_run, self.delta_queue)(*args, **kwargs) except LightningSigtermStateException as e: raise e except BaseException as e: @@ -500,7 +528,7 @@ def run_once(self): used_runpy = True if user_exception: trace.append(p) - if "ret = self.run_executor_cls(self.work, work_run)(*args, **kwargs)" in p: + if "ret = self.run_executor_cls(" in p: user_exception = True if used_runpy: @@ -525,7 +553,8 @@ def run_once(self): return # 13. Destroy the state observer. 
- self.state_observer.join(0) + if self.run_executor_cls.enable_start_observer: + self.state_observer.join(0) self.state_observer = None # 14. Copy all artifacts to the shared storage so other Works can access them while this Work gets scaled down @@ -574,14 +603,7 @@ def _sigterm_signal_handler(self, signum, frame, call_hash: str) -> None: raise LightningSigtermStateException(0) def _proxy_setattr(self, cleanup: bool = False): - if cleanup: - setattr_proxy = None - else: - assert self.state_observer - setattr_proxy = LightningWorkSetAttrProxy( - self.work_name, self.work, delta_queue=self.delta_queue, state_observer=self.state_observer - ) - self.work._setattr_replacement = setattr_proxy + _proxy_setattr(self.work, self.delta_queue, self.state_observer, cleanup=cleanup) def _process_call_args( self, args: Tuple[Any, ...], kwargs: Dict[str, Any] @@ -688,3 +710,16 @@ def persist_artifacts(work: "LightningWork") -> None: f"All {destination_paths} artifacts from Work {work.name} successfully " "stored at {artifacts_path(work.name)}." ) + + +def _proxy_setattr(work, delta_queue, state_observer: Optional[WorkStateObserver], cleanup: bool = False): + if cleanup: + setattr_proxy = None + else: + setattr_proxy = LightningWorkSetAttrProxy( + work.name, + work, + delta_queue=delta_queue, + state_observer=state_observer, + ) + work._setattr_replacement = setattr_proxy diff --git a/src/lightning_lite/CHANGELOG.md b/src/lightning_lite/CHANGELOG.md index 03d371681a95d..61a6bfe685c69 100644 --- a/src/lightning_lite/CHANGELOG.md +++ b/src/lightning_lite/CHANGELOG.md @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +## [1.8.2] - 2022-11-17 + +### Fixed + +- Fixed the automatic fallback from `LightningLite(strategy="ddp_spawn", ...)` to `LightningLite(strategy="ddp", ...)` when on an LSF cluster ([#15103](https://github.com/PyTorchLightning/pytorch-lightning/issues/15103)) + ## [1.8.1] - 2022-11-10 diff --git a/src/lightning_lite/__version__.py b/src/lightning_lite/__version__.py index 72126ce16b766..ba22724db3594 100644 --- a/src/lightning_lite/__version__.py +++ b/src/lightning_lite/__version__.py @@ -1 +1 @@ -version = "1.8.1" +version = "1.8.2" diff --git a/src/lightning_lite/connector.py b/src/lightning_lite/connector.py index 788e4f9529115..738f7cc661b05 100644 --- a/src/lightning_lite/connector.py +++ b/src/lightning_lite/connector.py @@ -395,7 +395,10 @@ def _check_strategy_and_fallback(self) -> None: strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and ( - TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or SLURMEnvironment.detect() + TorchElasticEnvironment.detect() + or KubeflowEnvironment.detect() + or SLURMEnvironment.detect() + or LSFEnvironment.detect() ): strategy_flag = "ddp" if strategy_flag == "dp" and self._accelerator_flag == "cpu": diff --git a/src/lightning_lite/lite.py b/src/lightning_lite/lite.py index a25655a5ba409..e6890742e42d9 100644 --- a/src/lightning_lite/lite.py +++ b/src/lightning_lite/lite.py @@ -123,7 +123,7 @@ def world_size(self) -> int: @property def is_global_zero(self) -> bool: - """Wether this rank is rank zero.""" + """Whether this rank is rank zero.""" return self._strategy.is_global_zero @abstractmethod diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 
bd9a346931e2b..21d881f29e8bd 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +## [1.8.2] - 2022-11-17 + +### Fixed + +- Make sure save_dir can be empty str ([#15638](https://github.com/PyTorchLightning/pytorch-lightning/issues/15638)) +- Fixed the automatic fallback from `Trainer(strategy="ddp_spawn", ...)` to `Trainer(strategy="ddp", ...)` when on an LSF cluster ([#15103](https://github.com/PyTorchLightning/pytorch-lightning/issues/15103)) + ## [1.8.1] - 2022-11-10 diff --git a/src/pytorch_lightning/__setup__.py b/src/pytorch_lightning/__setup__.py index 442bda630b884..a7ecff67d0630 100644 --- a/src/pytorch_lightning/__setup__.py +++ b/src/pytorch_lightning/__setup__.py @@ -124,5 +124,6 @@ def _setup_args(**__: Any) -> Dict[str, Any]: "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", ], ) diff --git a/src/pytorch_lightning/__version__.py b/src/pytorch_lightning/__version__.py index 72126ce16b766..ba22724db3594 100644 --- a/src/pytorch_lightning/__version__.py +++ b/src/pytorch_lightning/__version__.py @@ -1 +1 @@ -version = "1.8.1" +version = "1.8.2" diff --git a/src/pytorch_lightning/callbacks/model_checkpoint.py b/src/pytorch_lightning/callbacks/model_checkpoint.py index 8d0596e3bdccd..2e7b9bbb27b29 100644 --- a/src/pytorch_lightning/callbacks/model_checkpoint.py +++ b/src/pytorch_lightning/callbacks/model_checkpoint.py @@ -574,7 +574,10 @@ def __resolve_ckpt_dir(self, trainer: "pl.Trainer") -> _PATH: return self.dirpath if len(trainer.loggers) > 0: - save_dir = trainer.loggers[0].save_dir or trainer.default_root_dir + if trainer.loggers[0].save_dir is not None: + save_dir = trainer.loggers[0].save_dir + else: + save_dir = trainer.default_root_dir name = trainer.loggers[0].name version = trainer.loggers[0].version version = version if isinstance(version, str) else f"version_{version}" diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index d00e13a3194a9..fd8d2d4f4aa76 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -614,7 +614,10 @@ def _check_strategy_and_fallback(self) -> None: strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and ( - TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or SLURMEnvironment.detect() + TorchElasticEnvironment.detect() + or KubeflowEnvironment.detect() + or SLURMEnvironment.detect() + or LSFEnvironment.detect() ): strategy_flag = "ddp" if strategy_flag == "dp" and self._accelerator_flag == "cpu": diff --git a/tests/tests_app/components/database/test_client_server.py b/tests/tests_app/components/database/test_client_server.py index 6ebec90ff9b1e..7b193d8f74c20 100644 --- a/tests/tests_app/components/database/test_client_server.py +++ b/tests/tests_app/components/database/test_client_server.py @@ -2,6 +2,7 @@ import sys import tempfile import time +import traceback from pathlib import Path from time import sleep from typing import List, Optional @@ -197,7 +198,9 @@ def run(self): assert 
len(self._client.select_all()) == 1
             self._exit()

-    with tempfile.TemporaryDirectory() as tmpdir:
-
-        app = LightningApp(Flow(tmpdir))
-        MultiProcessRuntime(app).dispatch()
+    try:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            app = LightningApp(Flow(tmpdir))
+            MultiProcessRuntime(app).dispatch()
+    except Exception:
+        traceback.print_exc()
diff --git a/tests/tests_app/components/serve/test_gradio.py b/tests/tests_app/components/serve/test_gradio.py
index 8dcdeec70a341..0b57656e6aa31 100644
--- a/tests/tests_app/components/serve/test_gradio.py
+++ b/tests/tests_app/components/serve/test_gradio.py
@@ -27,4 +27,6 @@ def predict(self, *args, **kwargs):
     comp.run()
     assert comp.model == "model"
     assert comp.predict() == "prediction"
-    gradio_mock.Interface.assert_called_once_with(fn=ANY, inputs=ANY, outputs=ANY, examples=ANY)
+    gradio_mock.Interface.assert_called_once_with(
+        fn=ANY, inputs=ANY, outputs=ANY, examples=ANY, title=None, description=None
+    )
diff --git a/tests/tests_app/core/test_lightning_api.py b/tests/tests_app/core/test_lightning_api.py
index a0069f1314841..d81c72c06f071 100644
--- a/tests/tests_app/core/test_lightning_api.py
+++ b/tests/tests_app/core/test_lightning_api.py
@@ -42,7 +42,7 @@

 class WorkA(LightningWork):
     def __init__(self):
-        super().__init__(parallel=True)
+        super().__init__(parallel=True, start_with_flow=False)
         self.var_a = 0
         self.drive = Drive("lit://test_app_state_api")
diff --git a/tests/tests_app/core/test_lightning_app.py b/tests/tests_app/core/test_lightning_app.py
index d95cac9899761..1b438f14632bb 100644
--- a/tests/tests_app/core/test_lightning_app.py
+++ b/tests/tests_app/core/test_lightning_app.py
@@ -247,10 +247,9 @@ def test_get_component_by_name_raises():
         app.get_component_by_name("root.b.w_b.c")

-@pytest.mark.parametrize("runtime_cls", [SingleProcessRuntime, MultiProcessRuntime])
-def test_nested_component(runtime_cls):
+def test_nested_component():
     app = LightningApp(A(), log_level="debug")
-    runtime_cls(app, start_server=False).dispatch()
+    MultiProcessRuntime(app, start_server=False).dispatch()
     assert app.root.w_a.c == 1
     assert app.root.b.w_b.c == 1
     assert app.root.b.c.w_c.c == 1
@@ -601,9 +600,10 @@ def run(self):

 class CheckpointFlow(LightningFlow):
-    def __init__(self, work: LightningWork, depth=0):
+    def __init__(self, work: CheckpointCounter, depth=0):
         super().__init__()
         self.depth = depth
+
         if depth == 0:
             self.counter = 0
@@ -613,10 +613,9 @@ def __init__(self, work: LightningWork, depth=0):
             self.flow = CheckpointFlow(work, depth + 1)

     def run(self):
-        if hasattr(self, "counter"):
-            self.counter += 1
-            if self.counter > 5:
-                self._exit()
+        if self.works()[0].counter == 5:
+            self._exit()
+
         if self.depth >= 10:
             self.work.run()
         else:
@@ -627,19 +626,16 @@ def test_lightning_app_checkpointing_with_nested_flows():
     work = CheckpointCounter()
     app = LightningApp(CheckpointFlow(work))
     app.checkpointing = True
-    SingleProcessRuntime(app, start_server=False).dispatch()
+    MultiProcessRuntime(app, start_server=False).dispatch()

-    assert app.root.counter == 6
     assert app.root.flow.flow.flow.flow.flow.flow.flow.flow.flow.flow.work.counter == 5

     work = CheckpointCounter()
     app = LightningApp(CheckpointFlow(work))
-    assert app.root.counter == 0
     assert app.root.flow.flow.flow.flow.flow.flow.flow.flow.flow.flow.work.counter == 0

     app.load_state_dict_from_checkpoint_dir(app.checkpoint_dir)
     # The counter was incremented to 6 after the latest checkpoint was created.
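# [Illustrative note, not part of the patch] `load_state_dict_from_checkpoint_dir`
# is what restores the deeply nested work counter on the freshly constructed app,
# without re-running the flow, e.g.:
#
#     app = LightningApp(CheckpointFlow(CheckpointCounter()))
#     app.load_state_dict_from_checkpoint_dir(app.checkpoint_dir)
#     # the nested work counter is 5 again, as asserted below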
- assert app.root.counter == 5 assert app.root.flow.flow.flow.flow.flow.flow.flow.flow.flow.flow.work.counter == 5 @@ -956,8 +952,8 @@ def run(self): def test_state_size_constant_growth(): app = LightningApp(SizeFlow()) MultiProcessRuntime(app, start_server=False).dispatch() - assert app.root._state_sizes[0] <= 6952 - assert app.root._state_sizes[20] <= 26080 + assert app.root._state_sizes[0] <= 7824 + assert app.root._state_sizes[20] <= 26500 class FlowUpdated(LightningFlow): diff --git a/tests/tests_app/runners/test_cloud.py b/tests/tests_app/runners/test_cloud.py index 23a465968efc8..25bc590893280 100644 --- a/tests/tests_app/runners/test_cloud.py +++ b/tests/tests_app/runners/test_cloud.py @@ -1,5 +1,6 @@ import logging import os +import sys from copy import copy from pathlib import Path from unittest import mock @@ -39,10 +40,11 @@ V1Work, ) -from lightning_app import _PROJECT_ROOT, LightningApp, LightningWork +from lightning_app import _PROJECT_ROOT, BuildConfig, LightningApp, LightningWork from lightning_app.runners import backends, cloud, CloudRuntime +from lightning_app.runners.cloud import _validate_build_spec_and_compute from lightning_app.storage import Drive, Mount -from lightning_app.testing.helpers import EmptyFlow +from lightning_app.testing.helpers import EmptyFlow, EmptyWork from lightning_app.utilities.cloud import _get_project from lightning_app.utilities.dependency_caching import get_hash from lightning_app.utilities.packaging.cloud_compute import CloudCompute @@ -54,8 +56,8 @@ def run(self): class WorkWithSingleDrive(LightningWork): - def __init__(self): - super().__init__() + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) self.drive = None def run(self): @@ -63,8 +65,8 @@ def run(self): class WorkWithTwoDrives(LightningWork): - def __init__(self): - super().__init__() + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) self.lit_drive_1 = None self.lit_drive_2 = None @@ -402,18 +404,16 @@ def test_call_with_work_app(self, lightningapps, start_with_flow, monkeypatch, t monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock()) monkeypatch.setattr(cloud, "_prepare_lightning_wheels_and_requirements", mock.MagicMock()) app = mock.MagicMock() - flow = mock.MagicMock() - work = MyWork(start_with_flow=start_with_flow) - monkeypatch.setattr(work, "_name", "test-work") - monkeypatch.setattr(work._cloud_build_config, "build_commands", lambda: ["echo 'start'"]) - monkeypatch.setattr(work._cloud_build_config, "requirements", ["torch==1.0.0", "numpy==1.0.0"]) - monkeypatch.setattr(work._cloud_build_config, "image", "random_base_public_image") - monkeypatch.setattr(work._cloud_compute, "disk_size", 0) - monkeypatch.setattr(work, "_port", 8080) + work = MyWork(start_with_flow=start_with_flow, cloud_compute=CloudCompute("custom")) + work._name = "test-work" + work._cloud_build_config.build_commands = lambda: ["echo 'start'"] + work._cloud_build_config.requirements = ["torch==1.0.0", "numpy==1.0.0"] + work._cloud_build_config.image = "random_base_public_image" + work._cloud_compute.disk_size = 0 + work._port = 8080 - flow.works = lambda recurse: [work] - app.flows = [flow] + app.works = [work] cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file=(source_code_root_dir / "entrypoint.py")) monkeypatch.setattr( "lightning_app.runners.cloud._get_project", @@ -452,7 +452,7 @@ def test_call_with_work_app(self, lightningapps, start_with_flow, monkeypatch, t ), drives=[], user_requested_compute_config=V1UserRequestedComputeConfig( 
- name="default", + name="custom", count=1, disk_size=0, shm_size=0, @@ -575,7 +575,6 @@ def test_call_with_work_app_and_attached_drives(self, lightningapps, monkeypatch monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock()) monkeypatch.setattr(cloud, "_prepare_lightning_wheels_and_requirements", mock.MagicMock()) app = mock.MagicMock() - flow = mock.MagicMock() mocked_drive = MagicMock(spec=Drive) setattr(mocked_drive, "id", "foobar") @@ -588,7 +587,7 @@ def test_call_with_work_app_and_attached_drives(self, lightningapps, monkeypatch # should be the results of the deepcopy operation (an instance of the original class) mocked_drive.__deepcopy__.return_value = copy(mocked_drive) - work = WorkWithSingleDrive() + work = WorkWithSingleDrive(cloud_compute=CloudCompute("custom")) monkeypatch.setattr(work, "drive", mocked_drive) monkeypatch.setattr(work, "_state", {"_port", "drive"}) monkeypatch.setattr(work, "_name", "test-work") @@ -598,8 +597,7 @@ def test_call_with_work_app_and_attached_drives(self, lightningapps, monkeypatch monkeypatch.setattr(work._cloud_compute, "disk_size", 0) monkeypatch.setattr(work, "_port", 8080) - flow.works = lambda recurse: [work] - app.flows = [flow] + app.works = [work] cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file=(source_code_root_dir / "entrypoint.py")) monkeypatch.setattr( "lightning_app.runners.cloud._get_project", @@ -650,7 +648,7 @@ def test_call_with_work_app_and_attached_drives(self, lightningapps, monkeypatch ), ], user_requested_compute_config=V1UserRequestedComputeConfig( - name="default", + name="custom", count=1, disk_size=0, shm_size=0, @@ -712,19 +710,17 @@ def test_call_with_work_app_and_app_comment_command_execution_set(self, lightnin monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock()) monkeypatch.setattr(cloud, "_prepare_lightning_wheels_and_requirements", mock.MagicMock()) app = mock.MagicMock() - flow = mock.MagicMock() - work = MyWork() - monkeypatch.setattr(work, "_state", {"_port"}) - monkeypatch.setattr(work, "_name", "test-work") - monkeypatch.setattr(work._cloud_build_config, "build_commands", lambda: ["echo 'start'"]) - monkeypatch.setattr(work._cloud_build_config, "requirements", ["torch==1.0.0", "numpy==1.0.0"]) - monkeypatch.setattr(work._cloud_build_config, "image", "random_base_public_image") - monkeypatch.setattr(work._cloud_compute, "disk_size", 0) - monkeypatch.setattr(work, "_port", 8080) + work = MyWork(cloud_compute=CloudCompute("custom")) + work._state = {"_port"} + work._name = "test-work" + work._cloud_build_config.build_commands = lambda: ["echo 'start'"] + work._cloud_build_config.requirements = ["torch==1.0.0", "numpy==1.0.0"] + work._cloud_build_config.image = "random_base_public_image" + work._cloud_compute.disk_size = 0 + work._port = 8080 - flow.works = lambda recurse: [work] - app.flows = [flow] + app.works = [work] cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file=(source_code_root_dir / "entrypoint.py")) monkeypatch.setattr( "lightning_app.runners.cloud._get_project", @@ -761,7 +757,7 @@ def test_call_with_work_app_and_app_comment_command_execution_set(self, lightnin ), drives=[], user_requested_compute_config=V1UserRequestedComputeConfig( - name="default", count=1, disk_size=0, shm_size=0, preemptible=mock.ANY + name="custom", count=1, disk_size=0, shm_size=0, preemptible=mock.ANY ), network_config=[V1NetworkConfig(name=mock.ANY, host=None, port=8080)], cluster_id=mock.ANY, @@ -829,7 +825,6 @@ def test_call_with_work_app_and_multiple_attached_drives(self, 
lightningapps, mo monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock()) monkeypatch.setattr(cloud, "_prepare_lightning_wheels_and_requirements", mock.MagicMock()) app = mock.MagicMock() - flow = mock.MagicMock() mocked_lit_drive = MagicMock(spec=Drive) setattr(mocked_lit_drive, "id", "foobar") @@ -842,19 +837,18 @@ def test_call_with_work_app_and_multiple_attached_drives(self, lightningapps, mo # should be the results of the deepcopy operation (an instance of the original class) mocked_lit_drive.__deepcopy__.return_value = copy(mocked_lit_drive) - work = WorkWithTwoDrives() - monkeypatch.setattr(work, "lit_drive_1", mocked_lit_drive) - monkeypatch.setattr(work, "lit_drive_2", mocked_lit_drive) - monkeypatch.setattr(work, "_state", {"_port", "_name", "lit_drive_1", "lit_drive_2"}) - monkeypatch.setattr(work, "_name", "test-work") - monkeypatch.setattr(work._cloud_build_config, "build_commands", lambda: ["echo 'start'"]) - monkeypatch.setattr(work._cloud_build_config, "requirements", ["torch==1.0.0", "numpy==1.0.0"]) - monkeypatch.setattr(work._cloud_build_config, "image", "random_base_public_image") - monkeypatch.setattr(work._cloud_compute, "disk_size", 0) - monkeypatch.setattr(work, "_port", 8080) - - flow.works = lambda recurse: [work] - app.flows = [flow] + work = WorkWithTwoDrives(cloud_compute=CloudCompute("custom")) + work.lit_drive_1 = mocked_lit_drive + work.lit_drive_2 = mocked_lit_drive + work._state = {"_port", "_name", "lit_drive_1", "lit_drive_2"} + work._name = "test-work" + work._cloud_build_config.build_commands = lambda: ["echo 'start'"] + work._cloud_build_config.requirements = ["torch==1.0.0", "numpy==1.0.0"] + work._cloud_build_config.image = "random_base_public_image" + work._cloud_compute.disk_size = 0 + work._port = 8080 + + app.works = [work] cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file=(source_code_root_dir / "entrypoint.py")) monkeypatch.setattr( "lightning_app.runners.cloud._get_project", @@ -922,7 +916,7 @@ def test_call_with_work_app_and_multiple_attached_drives(self, lightningapps, mo ), drives=[lit_drive_2_spec, lit_drive_1_spec], user_requested_compute_config=V1UserRequestedComputeConfig( - name="default", + name="custom", count=1, disk_size=0, shm_size=0, @@ -961,7 +955,7 @@ def test_call_with_work_app_and_multiple_attached_drives(self, lightningapps, mo ), drives=[lit_drive_1_spec, lit_drive_2_spec], user_requested_compute_config=V1UserRequestedComputeConfig( - name="default", + name="custom", count=1, disk_size=0, shm_size=0, @@ -1034,7 +1028,6 @@ def test_call_with_work_app_and_attached_mount_and_drive(self, lightningapps, mo monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock()) monkeypatch.setattr(cloud, "_prepare_lightning_wheels_and_requirements", mock.MagicMock()) app = mock.MagicMock() - flow = mock.MagicMock() mocked_drive = MagicMock(spec=Drive) setattr(mocked_drive, "id", "foobar") @@ -1052,7 +1045,7 @@ def test_call_with_work_app_and_attached_mount_and_drive(self, lightningapps, mo setattr(mocked_mount, "mount_path", "/content/foo") setattr(mocked_mount, "protocol", "s3://") - work = WorkWithSingleDrive() + work = WorkWithSingleDrive(cloud_compute=CloudCompute("custom")) monkeypatch.setattr(work, "drive", mocked_drive) monkeypatch.setattr(work, "_state", {"_port", "drive"}) monkeypatch.setattr(work, "_name", "test-work") @@ -1063,8 +1056,7 @@ def test_call_with_work_app_and_attached_mount_and_drive(self, lightningapps, mo monkeypatch.setattr(work._cloud_compute, "mounts", mocked_mount) 
monkeypatch.setattr(work, "_port", 8080) - flow.works = lambda recurse: [work] - app.flows = [flow] + app.works = [work] cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file=(source_code_root_dir / "entrypoint.py")) monkeypatch.setattr( "lightning_app.runners.cloud._get_project", @@ -1129,7 +1121,7 @@ def test_call_with_work_app_and_attached_mount_and_drive(self, lightningapps, mo ), ], user_requested_compute_config=V1UserRequestedComputeConfig( - name="default", + name="custom", count=1, disk_size=0, shm_size=0, @@ -1237,3 +1229,74 @@ def test_load_app_from_file_module_error(): empty_app = CloudRuntime.load_app_from_file(os.path.join(_PROJECT_ROOT, "examples", "app_v0", "app.py")) assert isinstance(empty_app, LightningApp) assert isinstance(empty_app.root, EmptyFlow) + + +@pytest.mark.parametrize( + "lines", + [ + [ + "import this_package_is_not_real", + "from lightning_app import LightningApp", + "from lightning_app.testing.helpers import EmptyWork", + "app = LightningApp(EmptyWork())", + ], + [ + "from this_package_is_not_real import this_module_is_not_real", + "from lightning_app import LightningApp", + "from lightning_app.testing.helpers import EmptyWork", + "app = LightningApp(EmptyWork())", + ], + [ + "import this_package_is_not_real", + "from this_package_is_not_real import this_module_is_not_real", + "from lightning_app import LightningApp", + "from lightning_app.testing.helpers import EmptyWork", + "app = LightningApp(EmptyWork())", + ], + [ + "import this_package_is_not_real", + "from lightning_app import LightningApp", + "from lightning_app.core.flow import _RootFlow", + "from lightning_app.testing.helpers import EmptyWork", + "class MyFlow(_RootFlow):", + " def configure_layout(self):", + " return [{'name': 'test', 'content': this_package_is_not_real()}]", + "app = LightningApp(MyFlow(EmptyWork()))", + ], + ], +) +@pytest.mark.skipif(sys.platform != "linux", reason="Causing conflicts on non-linux") +def test_load_app_from_file_mock_imports(tmpdir, lines): + path = copy(sys.path) + app_file = os.path.join(tmpdir, "app.py") + + with open(app_file, "w") as f: + f.write("\n".join(lines)) + + app = CloudRuntime.load_app_from_file(app_file) + assert isinstance(app, LightningApp) + assert isinstance(app.root.work, EmptyWork) + + # Cleanup PATH to prevent conflict with other tests + sys.path = path + os.remove(app_file) + + +def test_incompatible_cloud_compute_and_build_config(): + """Test that an exception is raised when a build config has a custom image defined, but the cloud compute is + the default. + + This combination is not supported by the platform. 
+ """ + + class Work(LightningWork): + def __init__(self): + super().__init__() + self.cloud_compute = CloudCompute(name="default") + self.cloud_build_config = BuildConfig(image="custom") + + def run(self): + pass + + with pytest.raises(ValueError, match="You requested a custom base image for the Work with name"): + _validate_build_spec_and_compute(Work()) diff --git a/tests/tests_app/storage/test_drive.py b/tests/tests_app/storage/test_drive.py index bee8de5e093a8..d39623bd74296 100644 --- a/tests/tests_app/storage/test_drive.py +++ b/tests/tests_app/storage/test_drive.py @@ -50,7 +50,8 @@ def test_synchronization_lit_drive(tmpdir): os.remove("a.txt") app = LightningApp(SyncFlowLITDrives(tmpdir)) MultiProcessRuntime(app, start_server=False).dispatch() - os.remove("a.txt") + if os.path.exists("a.txt"): + os.remove("a.txt") class LITDriveWork(LightningWork): diff --git a/tests/tests_app/utilities/packaging/test_app_config.py b/tests/tests_app/utilities/packaging/test_app_config.py index 2666f0a769ace..60da494a47fb8 100644 --- a/tests/tests_app/utilities/packaging/test_app_config.py +++ b/tests/tests_app/utilities/packaging/test_app_config.py @@ -1,6 +1,6 @@ import pathlib -from lightning_app.utilities.packaging.app_config import AppConfig, find_config_file +from lightning_app.utilities.packaging.app_config import _get_config_file, AppConfig def _make_empty_config_file(folder): @@ -10,24 +10,12 @@ def _make_empty_config_file(folder): return file -def test_find_config_file(tmpdir, monkeypatch): - monkeypatch.chdir(pathlib.Path("/")) - assert find_config_file() is None - - monkeypatch.chdir(pathlib.Path.home()) - assert find_config_file() is None - +def test_get_config_file(tmpdir): _ = _make_empty_config_file(tmpdir) - config_file1 = _make_empty_config_file(tmpdir / "a" / "b") - - assert find_config_file(tmpdir) == pathlib.Path(tmpdir, ".lightning") - assert find_config_file(config_file1) == pathlib.Path(tmpdir, "a", "b", ".lightning") - assert find_config_file(pathlib.Path(tmpdir, "a")) == pathlib.Path(tmpdir, ".lightning") + config_file1 = _make_empty_config_file(tmpdir) - # the config must be a file, a folder of the same name gets ignored - fake_config_folder = pathlib.Path(tmpdir, "fake", ".lightning") - fake_config_folder.mkdir(parents=True) - assert find_config_file(tmpdir) == pathlib.Path(tmpdir, ".lightning") + assert _get_config_file(tmpdir) == pathlib.Path(tmpdir, ".lightning") + assert _get_config_file(config_file1) == pathlib.Path(tmpdir, ".lightning") def test_app_config_save_load(tmpdir): diff --git a/tests/tests_app/utilities/test_app_commands.py b/tests/tests_app/utilities/test_app_commands.py index 35f08509dca62..7e3b9beed4104 100644 --- a/tests/tests_app/utilities/test_app_commands.py +++ b/tests/tests_app/utilities/test_app_commands.py @@ -14,7 +14,7 @@ ("multiple_commands.txt", ['echo "foo"', 'echo "bar"'], [1, 2]), ("commands_with_mixed_comments_1.txt", ['echo "foo"', 'echo "bar"'], [1, 3]), ("commands_with_mixed_comments_2.txt", ['echo "foo"', 'echo "bar"'], [2, 4]), - ("command_after_first_non_comment_line.txt", ['echo "foo"'], [1]), + ("command_after_first_non_comment_line.txt", ['echo "foo"', 'echo "bar"'], [2, 4]), ("bang_not_at_start_of_line.txt", ['echo "foo"'], [2]), ("space_between_bang_and_command.txt", ['echo "foo"'], [1]), ("multiple_spaces_between_band_and_command.txt", ['echo "foo"'], [1]), diff --git a/tests/tests_app/utilities/test_proxies.py b/tests/tests_app/utilities/test_proxies.py index fccbaaa671588..4b8a5f25f71e3 100644 --- 
a/tests/tests_app/utilities/test_proxies.py +++ b/tests/tests_app/utilities/test_proxies.py @@ -14,7 +14,7 @@ from lightning_app import LightningApp, LightningFlow, LightningWork from lightning_app.runners import MultiProcessRuntime -from lightning_app.storage import Path +from lightning_app.storage import Drive, Path from lightning_app.storage.path import _artifacts_path from lightning_app.storage.requests import _GetRequest from lightning_app.testing.helpers import _MockQueue, EmptyFlow @@ -67,6 +67,7 @@ def proxy_setattr(): @pytest.mark.parametrize("parallel", [True, False]) @pytest.mark.parametrize("cache_calls", [False, True]) +@pytest.mark.skipif(sys.platform == "win32", reason="TODO (@ethanwharris): Fix this on Windows") def test_work_runner(parallel, cache_calls): """This test validates the `WorkRunner` runs the work.run method and properly populates the `delta_queue`, `error_queue` and `readiness_queue`.""" @@ -216,7 +217,7 @@ def __init__(self): class WorkTimeout(LightningWork): def __init__(self): - super().__init__(parallel=True) + super().__init__(parallel=True, start_with_flow=False) self.counter = 0 def run(self): @@ -761,3 +762,31 @@ def test_bi_directional_proxy_forbidden(monkeypatch): MultiProcessRuntime(app, start_server=False).dispatch() assert app.stage == AppStage.FAILED assert "A forbidden operation to update the work" in str(app.exception) + + +class WorkDrive(LightningFlow): + def __init__(self, drive): + super().__init__() + self.drive = drive + self.path = Path("data") + + def run(self): + pass + + +class FlowDrive(LightningFlow): + def __init__(self): + super().__init__() + self.data = Drive("lit://data") + self.counter = 0 + + def run(self): + if not hasattr(self, "w"): + self.w = WorkDrive(self.data) + self.counter += 1 + + +def test_bi_directional_proxy_filtering(): + app = LightningApp(FlowDrive()) + app.root.run() + assert app._extract_vars_from_component_name(app.root.w.name, app.state) == {} diff --git a/tests/tests_app/utilities/testdata/app_commands/command_after_first_non_comment_line.txt b/tests/tests_app/utilities/testdata/app_commands/command_after_first_non_comment_line.txt index c9d90d8eff892..1cd80f15779df 100644 --- a/tests/tests_app/utilities/testdata/app_commands/command_after_first_non_comment_line.txt +++ b/tests/tests_app/utilities/testdata/app_commands/command_after_first_non_comment_line.txt @@ -1,3 +1,4 @@ + # !echo "foo" import lighting # !echo "bar" diff --git a/tests/tests_lite/conftest.py b/tests/tests_lite/conftest.py index 1455709e5c82f..af023504b5473 100644 --- a/tests/tests_lite/conftest.py +++ b/tests/tests_lite/conftest.py @@ -54,6 +54,7 @@ def restore_env_variables(): "HOROVOD_FUSION_THRESHOLD", "RANK", # set by DeepSpeed "POPLAR_ENGINE_OPTIONS", # set by IPUStrategy + "CUDA_MODULE_LOADING", # leaked since PyTorch 1.13 } leaked_vars.difference_update(allowlist) assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}" diff --git a/tests/tests_lite/plugins/precision/test_native_amp.py b/tests/tests_lite/plugins/precision/test_native_amp.py index dbf2e1c9ec5c0..c64e0e8f19df0 100644 --- a/tests/tests_lite/plugins/precision/test_native_amp.py +++ b/tests/tests_lite/plugins/precision/test_native_amp.py @@ -43,19 +43,21 @@ def test_native_amp_precision_bf16_min_torch(): @RunIf(min_torch="1.10") def test_native_amp_precision_forward_context(): - """Test to ensure that the context manager correctly is set to CPU + bfloat16.""" + """Test to ensure that the context manager is set correctly on CPU 
and CUDA.""" precision = NativeMixedPrecision(precision=16, device="cuda") assert precision.device == "cuda" assert isinstance(precision.scaler, torch.cuda.amp.GradScaler) assert torch.get_default_dtype() == torch.float32 with precision.forward_context(): - assert torch.get_autocast_gpu_dtype() == torch.float16 + # check with str due to a bug upstream: https://github.com/pytorch/pytorch/issues/65786 + assert str(torch.get_autocast_gpu_dtype()) in ("torch.float16", "torch.half") precision = NativeMixedPrecision(precision="bf16", device="cpu") assert precision.device == "cpu" assert precision.scaler is None with precision.forward_context(): - assert torch.get_autocast_cpu_dtype() == torch.bfloat16 + # check with str due to a bug upstream: https://github.com/pytorch/pytorch/issues/65786 + assert str(torch.get_autocast_cpu_dtype()) == str(torch.bfloat16) context_manager = precision._autocast_context_manager() assert isinstance(context_manager, torch.autocast) diff --git a/tests/tests_lite/test_connector.py b/tests/tests_lite/test_connector.py index 683d3cfe23d9d..a254a37bc68b5 100644 --- a/tests/tests_lite/test_connector.py +++ b/tests/tests_lite/test_connector.py @@ -32,6 +32,7 @@ from lightning_lite.plugins.environments import ( KubeflowEnvironment, LightningEnvironment, + LSFEnvironment, SLURMEnvironment, TorchElasticEnvironment, ) @@ -200,24 +201,41 @@ class Strat(DDPStrategy): assert connector.strategy is strategy -@mock.patch.dict( - os.environ, - { - "SLURM_NTASKS": "2", - "SLURM_NTASKS_PER_NODE": "1", - "SLURM_JOB_NAME": "SOME_NAME", - "SLURM_NODEID": "0", - "LOCAL_RANK": "0", - "SLURM_PROCID": "0", - "SLURM_LOCALID": "0", - }, +@pytest.mark.parametrize( + "env_vars,expected_environment", + [ + ( + { + "SLURM_NTASKS": "2", + "SLURM_NTASKS_PER_NODE": "1", + "SLURM_JOB_NAME": "SOME_NAME", + "SLURM_NODEID": "0", + "LOCAL_RANK": "0", + "SLURM_PROCID": "0", + "SLURM_LOCALID": "0", + }, + SLURMEnvironment, + ), + ( + { + "LSB_JOBID": "1", + "LSB_DJOB_RANKFILE": "SOME_RANK_FILE", + "JSM_NAMESPACE_LOCAL_RANK": "1", + "JSM_NAMESPACE_SIZE": "20", + "JSM_NAMESPACE_RANK": "1", + }, + LSFEnvironment, + ), + ], ) -@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=0) -def test_dist_backend_accelerator_mapping(*_): - connector = _Connector(strategy="ddp_spawn", accelerator="cpu", devices=2) - assert isinstance(connector.accelerator, CPUAccelerator) - assert isinstance(connector.strategy, DDPStrategy) - assert connector.strategy.local_rank == 0 +@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._read_hosts", return_value=["node0", "node1"]) +@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._get_node_rank", return_value=0) +def test_fallback_from_ddp_spawn_to_ddp_on_cluster(_, __, env_vars, expected_environment): + with mock.patch.dict(os.environ, env_vars, clear=True): + trainer = _Connector(strategy="ddp_spawn", accelerator="cpu", devices=2) + assert isinstance(trainer.accelerator, CPUAccelerator) + assert isinstance(trainer.strategy, DDPStrategy) + assert isinstance(trainer.strategy.cluster_environment, expected_environment) @RunIf(mps=False) diff --git a/tests/tests_pytorch/conftest.py b/tests/tests_pytorch/conftest.py index 5d8616ad00657..2f5607828a232 100644 --- a/tests/tests_pytorch/conftest.py +++ b/tests/tests_pytorch/conftest.py @@ -72,6 +72,9 @@ def restore_env_variables(): "HOROVOD_FUSION_THRESHOLD", "RANK", # set by DeepSpeed "POPLAR_ENGINE_OPTIONS", # set by IPUStrategy + "CUDA_MODULE_LOADING", # leaked since PyTorch 1.13 + 
"KMP_INIT_AT_FORK", # leaked since PyTorch 1.13 + "KMP_DUPLICATE_LIB_OK", # leaked since PyTorch 1.13 } leaked_vars.difference_update(allowlist) assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}" diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py index be8bced2cbf5f..97e3d27760ea8 100644 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py @@ -101,9 +101,6 @@ def on_predict_batch_end(self, outputs, batch, batch_idx, dataloader_idx) -> Non def _assert_layer_fsdp_instance(self) -> None: assert isinstance(self.layer, FullyShardedDataParallel) assert isinstance(self.trainer.strategy.precision_plugin, FullyShardedNativeNativeMixedPrecisionPlugin) - # root should not be resharding - assert self.layer.reshard_after_forward is False - precision = torch.float16 if self.precision == 16 else torch.bfloat16 assert self.layer.mixed_precision.param_dtype == precision assert self.layer.mixed_precision.reduce_dtype == precision @@ -111,9 +108,6 @@ def _assert_layer_fsdp_instance(self) -> None: for layer_num in [0, 2]: assert isinstance(self.layer.module[layer_num], FullyShardedDataParallel) - # Assert that the nested layers are set reshard_after_forward to True - assert self.layer.module[layer_num].reshard_after_forward is True - assert self.layer[layer_num].mixed_precision.param_dtype == precision assert self.layer[layer_num].mixed_precision.reduce_dtype == precision assert self.layer[layer_num].mixed_precision.buffer_dtype == precision @@ -146,9 +140,6 @@ def _assert_layer_fsdp_instance(self) -> None: precision = torch.float16 if self.precision == 16 else torch.bfloat16 for layer_num in [0, 2]: assert isinstance(self.layer[layer_num], FullyShardedDataParallel) - # Assert that the nested layers are set reshard_after_forward to True - assert self.layer[layer_num].reshard_after_forward - assert self.layer[layer_num].mixed_precision.param_dtype == precision assert self.layer[layer_num].mixed_precision.reduce_dtype == precision assert self.layer[layer_num].mixed_precision.buffer_dtype == precision diff --git a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py index 9ff650c2768e3..77a4888351cf2 100644 --- a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py @@ -25,6 +25,7 @@ from lightning_lite.plugins.environments import ( KubeflowEnvironment, LightningEnvironment, + LSFEnvironment, SLURMEnvironment, TorchElasticEnvironment, ) @@ -193,24 +194,41 @@ class Strat(DDPStrategy): assert trainer._accelerator_connector.strategy is strategy -@mock.patch.dict( - os.environ, - { - "SLURM_NTASKS": "2", - "SLURM_NTASKS_PER_NODE": "1", - "SLURM_JOB_NAME": "SOME_NAME", - "SLURM_NODEID": "0", - "LOCAL_RANK": "0", - "SLURM_PROCID": "0", - "SLURM_LOCALID": "0", - }, +@pytest.mark.parametrize( + "env_vars,expected_environment", + [ + ( + { + "SLURM_NTASKS": "2", + "SLURM_NTASKS_PER_NODE": "1", + "SLURM_JOB_NAME": "SOME_NAME", + "SLURM_NODEID": "0", + "LOCAL_RANK": "0", + "SLURM_PROCID": "0", + "SLURM_LOCALID": "0", + }, + SLURMEnvironment, + ), + ( + { + "LSB_JOBID": "1", + "LSB_DJOB_RANKFILE": "SOME_RANK_FILE", + "JSM_NAMESPACE_LOCAL_RANK": "1", + "JSM_NAMESPACE_SIZE": "20", + "JSM_NAMESPACE_RANK": "1", + }, + LSFEnvironment, + ), + ], 
) -@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True) -def test_dist_backend_accelerator_mapping(cuda_count_0): - trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="cpu", devices=2) +@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._read_hosts", return_value=["node0", "node1"]) +@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._get_node_rank", return_value=0) +def test_fallback_from_ddp_spawn_to_ddp_on_cluster(_, __, env_vars, expected_environment): + with mock.patch.dict(os.environ, env_vars, clear=True): + trainer = Trainer(strategy="ddp_spawn", accelerator="cpu", devices=2) assert isinstance(trainer.accelerator, CPUAccelerator) assert isinstance(trainer.strategy, DDPStrategy) - assert trainer.strategy.local_rank == 0 + assert isinstance(trainer.strategy.cluster_environment, expected_environment) def test_interactive_incompatible_backend_error(mps_count_2, cuda_count_2, monkeypatch):