From 37fd1ab427eb3e987ddab770f6340d04a58efd8d Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 21 Nov 2022 14:06:06 +0100 Subject: [PATCH 01/29] chlog update --- src/lightning_app/CHANGELOG.md | 19 ++++++++++++++++++- src/lightning_lite/CHANGELOG.md | 17 +++++++++++++++++ src/pytorch_lightning/CHANGELOG.md | 17 +++++++++++++++++ 3 files changed, 52 insertions(+), 1 deletion(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index f1b6740a9a344..69dc901e6ba18 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -4,7 +4,24 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [UnReleased] - 2022-11-DD +## [1.8.3] - 2022-11-DD + +### Added + +- + + +### Changed + +- + + +### Fixed + +- + + +## [1.8.2] - 2022-11-17 ### Added diff --git a/src/lightning_lite/CHANGELOG.md b/src/lightning_lite/CHANGELOG.md index 61a6bfe685c69..63802002395a5 100644 --- a/src/lightning_lite/CHANGELOG.md +++ b/src/lightning_lite/CHANGELOG.md @@ -4,6 +4,23 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +## [1.8.3] - 2022-11-DD + +### Added + +- + + +### Changed + +- + + +### Fixed + +- + + ## [1.8.2] - 2022-11-17 ### Fixed diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 21d881f29e8bd..72107da99044b 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -4,6 +4,23 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
+## [1.8.3] - 2022-11-DD + +### Added + +- + + +### Changed + +- + + +### Fixed + +- + + ## [1.8.2] - 2022-11-17 ### Fixed From 6694c3deb49b542f657eb04cd1bbf2923fbea397 Mon Sep 17 00:00:00 2001 From: yiftachbeer Date: Sat, 19 Nov 2022 13:26:35 +0200 Subject: [PATCH 02/29] Fix typo in script name (#15724) (cherry picked from commit d925077253811f47f256b00b96bf1094dafb8c15) --- docs/source-pytorch/cli/lightning_cli_intermediate.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source-pytorch/cli/lightning_cli_intermediate.rst b/docs/source-pytorch/cli/lightning_cli_intermediate.rst index db8b6cf4c77ec..3e728daf8d46b 100644 --- a/docs/source-pytorch/cli/lightning_cli_intermediate.rst +++ b/docs/source-pytorch/cli/lightning_cli_intermediate.rst @@ -107,7 +107,7 @@ Which prints out: .. code:: bash - usage: a.py [-h] [-c CONFIG] [--print_config [={comments,skip_null,skip_default}+]] + usage: main.py [-h] [-c CONFIG] [--print_config [={comments,skip_null,skip_default}+]] {fit,validate,test,predict,tune} ... 
pytorch-lightning trainer command line tool From a151ff531b6030df21faa7faa6f9db97156d12dd Mon Sep 17 00:00:00 2001 From: Sherin Thomas Date: Sat, 19 Nov 2022 23:35:55 +0530 Subject: [PATCH 03/29] Torch inference mode for prediction (#15719) torch inference mode for prediction (cherry picked from commit 08d14ec190fd7dace6fa51997e6926a362cc9543) --- src/lightning_app/components/serve/python_server.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/lightning_app/components/serve/python_server.py b/src/lightning_app/components/serve/python_server.py index f0361f9db5046..731bf1c37e969 100644 --- a/src/lightning_app/components/serve/python_server.py +++ b/src/lightning_app/components/serve/python_server.py @@ -3,6 +3,7 @@ from pathlib import Path from typing import Any, Dict, Optional +import torch import uvicorn from fastapi import FastAPI from pydantic import BaseModel @@ -105,7 +106,7 @@ def predict(self, request): self._input_type = input_type self._output_type = output_type - def setup(self) -> None: + def setup(self, *args, **kwargs) -> None: """This method is called before the server starts. Override this if you need to download the model or initialize the weights, setting up pipelines etc. @@ -154,7 +155,8 @@ def _attach_predict_fn(self, fastapi_app: FastAPI) -> None: output_type: type = self.configure_output_type() def predict_fn(request: input_type): # type: ignore - return self.predict(request) + with torch.inference_mode(): + return self.predict(request) fastapi_app.post("/predict", response_model=output_type)(predict_fn) @@ -207,7 +209,7 @@ def run(self, *args: Any, **kwargs: Any) -> Any: Normally, you don't need to override this method. 
""" - self.setup() + self.setup(*args, **kwargs) fastapi_app = FastAPI() self._attach_predict_fn(fastapi_app) From 8b91b49cb807378aaa7302808b5f0ab62403aeb9 Mon Sep 17 00:00:00 2001 From: Ethan Harris Date: Mon, 21 Nov 2022 10:41:59 +0000 Subject: [PATCH 04/29] [App] Update multi-node examples (#15700) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> Co-authored-by: Carlos Mocholí (cherry picked from commit 83067977af858c3147f2b7e9bcb10ef7d2e4b4df) --- .github/workflows/ci-app-tests.yml | 2 +- .../basic/hello_components/pl_multinode.py | 3 +- .../basic/hello_components/pt_multinode.py | 3 +- .../levels/basic/hero_components.rst | 2 +- ...al_lightning_component_implementations.rst | 2 +- examples/app_multi_node/train_lt.py | 19 +++---- examples/app_multi_node/train_pytorch.py | 54 ++++++++----------- 7 files changed, 36 insertions(+), 49 deletions(-) diff --git a/.github/workflows/ci-app-tests.yml b/.github/workflows/ci-app-tests.yml index 826dfc70b552f..8ddbf2a5ddb8d 100644 --- a/.github/workflows/ci-app-tests.yml +++ b/.github/workflows/ci-app-tests.yml @@ -11,7 +11,7 @@ on: - ".github/workflows/ci-app-tests.yml" - "src/lightning_app/**" - "tests/tests_app/**" - - "examples/app_*" # some tests_app tests call examples files + - "examples/app_*/**" # some tests_app tests call examples files - "requirements/app/**" - "setup.py" - ".actions/**" diff --git a/docs/source-app/levels/basic/hello_components/pl_multinode.py b/docs/source-app/levels/basic/hello_components/pl_multinode.py index 5feed8a8864c3..e6764ee8fafae 100644 --- a/docs/source-app/levels/basic/hello_components/pl_multinode.py +++ b/docs/source-app/levels/basic/hello_components/pl_multinode.py @@ -5,8 +5,7 @@ class LightningTrainerDistributed(L.LightningWork): - @staticmethod - def run(): + def run(self): model = BoringModel() trainer = L.Trainer(max_epochs=10, strategy="ddp") trainer.fit(model) 
diff --git a/docs/source-app/levels/basic/hello_components/pt_multinode.py b/docs/source-app/levels/basic/hello_components/pt_multinode.py index 585b85540bf61..86bd7da10c6ff 100644 --- a/docs/source-app/levels/basic/hello_components/pt_multinode.py +++ b/docs/source-app/levels/basic/hello_components/pt_multinode.py @@ -22,8 +22,7 @@ def distributed_train(local_rank: int, main_address: str, main_port: int, num_no # 2. PREPARE DISTRIBUTED MODEL model = torch.nn.Linear(32, 2) device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu") - device_ids = device if torch.cuda.is_available() else None - model = DistributedDataParallel(model, device_ids=device_ids).to(device) + model = DistributedDataParallel(model, device_ids=[local_rank]).to(device) # 3. SETUP LOSS AND OPTIMIZER criterion = torch.nn.MSELoss() diff --git a/docs/source-app/levels/basic/hero_components.rst b/docs/source-app/levels/basic/hero_components.rst index 81fecc9461403..396cd96ed0d26 100644 --- a/docs/source-app/levels/basic/hero_components.rst +++ b/docs/source-app/levels/basic/hero_components.rst @@ -1,7 +1,7 @@ .. 
lit_tabs:: :titles: Hello world; Hello GPU world; PyTorch & ⚡⚡⚡ Trainer (1+ cloud GPUs); Train PyTorch (cloud GPU); Train PyTorch (32 cloud GPUs); Deploy a model on cloud GPUs; Run a model script; XGBoost; Streamlit demo :code_files: /levels/basic/hello_components/hello_world.py; /levels/basic/hello_components/hello_world_gpu.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/train_pytorch.py; /levels/basic/hello_components/pt_multinode.py; /levels/basic/hello_components/deploy_model.py; /levels/basic/hello_components/run_ptl_script.py; /levels/basic/hello_components/xgboost.py; /levels/basic/hello_components/streamlit_demo.py - :highlights: 7; 10, 11; 10-12, 17, 18; 4, 8, 12, 18-19, 26; 5, 10, 22, 28, 32, 42, 58-60; 3, 11-12, 25, 29; 7, 10; 15, 21; 9, 15, 24 + :highlights: 7; 10, 11; 9-11, 16, 17; 4, 8, 12, 18-19, 26; 5, 10, 22, 27, 31, 41, 57-59; 3, 11-12, 25, 29; 7, 10; 15, 21; 9, 15, 24 :enable_run: true :tab_rows: 3 :height: 620px diff --git a/docs/source-app/levels/basic/real_lightning_component_implementations.rst b/docs/source-app/levels/basic/real_lightning_component_implementations.rst index da413f459234a..268517463c612 100644 --- a/docs/source-app/levels/basic/real_lightning_component_implementations.rst +++ b/docs/source-app/levels/basic/real_lightning_component_implementations.rst @@ -26,7 +26,7 @@ or cloud GPUs without code changes. .. lit_tabs:: :descriptions: import Lightning; We're using a demo LightningModule; Move your training code here (usually your main.py); Pass your component to the multi-node executor (it works on CPU or single GPUs also); Select the number of machines (nodes). Here we choose 2.; Choose from over 15+ machine types. This one has 4 v100 GPUs.; Initialize the App object that executes the component logic. 
:code_files: /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; - :highlights: 2; 4; 10-12; 15-18; 17; 18; 20 + :highlights: 2; 4; 9-11; 14-17; 16; 17; 19 :enable_run: true :tab_rows: 5 :height: 420px diff --git a/examples/app_multi_node/train_lt.py b/examples/app_multi_node/train_lt.py index c9e2f62392a56..4abe375c89b9b 100644 --- a/examples/app_multi_node/train_lt.py +++ b/examples/app_multi_node/train_lt.py @@ -1,3 +1,4 @@ +# app.py import lightning as L from lightning.app.components import LightningTrainerMultiNode from lightning.pytorch.demos.boring_classes import BoringModel @@ -6,18 +7,14 @@ class LightningTrainerDistributed(L.LightningWork): def run(self): model = BoringModel() - trainer = L.Trainer( - max_steps=1000, - strategy="ddp", - ) + trainer = L.Trainer(max_epochs=10, strategy="ddp") trainer.fit(model) -# Run over 2 nodes of 4 x V100 -app = L.LightningApp( - LightningTrainerMultiNode( - LightningTrainerDistributed, - num_nodes=2, - cloud_compute=L.CloudCompute("gpu-fast-multi"), # 4 x V100 - ) +# 8 GPU: (2 nodes of 4 x v100) +component = LightningTrainerMultiNode( + LightningTrainerDistributed, + num_nodes=4, + cloud_compute=L.CloudCompute("gpu-fast-multi"), # 4 x v100 ) +app = L.LightningApp(component) diff --git a/examples/app_multi_node/train_pytorch.py b/examples/app_multi_node/train_pytorch.py index 9ce662fa40009..9599bce5bbd85 100644 --- a/examples/app_multi_node/train_pytorch.py +++ b/examples/app_multi_node/train_pytorch.py @@ -1,3 +1,5 @@ +# app.py +# ! 
pip install torch import torch from torch.nn.parallel.distributed import DistributedDataParallel @@ -6,7 +8,7 @@ def distributed_train(local_rank: int, main_address: str, main_port: int, num_nodes: int, node_rank: int, nprocs: int): - # 1. Setting distributed environment + # 1. SET UP DISTRIBUTED ENVIRONMENT global_rank = local_rank + node_rank * nprocs world_size = num_nodes * nprocs @@ -18,52 +20,42 @@ def distributed_train(local_rank: int, main_address: str, main_port: int, num_no init_method=f"tcp://{main_address}:{main_port}", ) - # 2. Prepare the model - model = torch.nn.Sequential( - torch.nn.Linear(1, 1), - torch.nn.ReLU(), - torch.nn.Linear(1, 1), - ) - - # 3. Setup distributed training + # 2. PREPARE DISTRIBUTED MODEL + model = torch.nn.Linear(32, 2) device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu") - model = DistributedDataParallel(model.to(device), device_ids=[local_rank] if torch.cuda.is_available() else None) + model = DistributedDataParallel(model, device_ids=[local_rank]).to(device) - # 4. Prepare loss and optimizer + # 3. SETUP LOSS AND OPTIMIZER criterion = torch.nn.MSELoss() optimizer = torch.optim.SGD(model.parameters(), lr=0.01) - # 5. Train the model for 1000 steps. - for step in range(1000): + # 4.TRAIN THE MODEL FOR 50 STEPS + for step in range(50): model.zero_grad() - x = torch.tensor([0.8]).to(device) - target = torch.tensor([1.0]).to(device) + x = torch.randn(64, 32).to(device) output = model(x) - loss = criterion(output, target) + loss = criterion(output, torch.ones_like(output)) print(f"global_rank: {global_rank} step: {step} loss: {loss}") loss.backward() optimizer.step() + # 5. 
VERIFY ALL COPIES OF THE MODEL HAVE THE SAME WEIGTHS AT END OF TRAINING + weight = model.module.weight.clone() + torch.distributed.all_reduce(weight) + assert torch.equal(model.module.weight, weight / world_size) + + print("Multi Node Distributed Training Done!") + class PyTorchDistributed(L.LightningWork): - def run( - self, - main_address: str, - main_port: int, - num_nodes: int, - node_rank: int, - ): + def run(self, main_address: str, main_port: int, num_nodes: int, node_rank: int): nprocs = torch.cuda.device_count() if torch.cuda.is_available() else 1 torch.multiprocessing.spawn( distributed_train, args=(main_address, main_port, num_nodes, node_rank, nprocs), nprocs=nprocs ) -# Run over 2 nodes of 4 x V100 -app = L.LightningApp( - MultiNode( - PyTorchDistributed, - num_nodes=2, - cloud_compute=L.CloudCompute("gpu-fast-multi"), # 4 x V100 - ) -) +# 32 GPUs: (8 nodes x 4 v 100) +compute = L.CloudCompute("gpu-fast-multi") # 4xV100 +component = MultiNode(PyTorchDistributed, num_nodes=2, cloud_compute=compute) +app = L.LightningApp(component) From 8e2222583fd5f8d5e485e974acdcae395c137f6e Mon Sep 17 00:00:00 2001 From: Yurij Mikhalevich Date: Mon, 21 Nov 2022 13:46:01 +0100 Subject: [PATCH 05/29] feature(docs/app/lit_tabs): add works (#15731) (cherry picked from commit 1a31d13521fb54a210b302e7bad2b9eb0b2b60b4) --- docs/source-app/levels/basic/hero_components.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source-app/levels/basic/hero_components.rst b/docs/source-app/levels/basic/hero_components.rst index 396cd96ed0d26..da1d9d076a794 100644 --- a/docs/source-app/levels/basic/hero_components.rst +++ b/docs/source-app/levels/basic/hero_components.rst @@ -2,6 +2,7 @@ :titles: Hello world; Hello GPU world; PyTorch & ⚡⚡⚡ Trainer (1+ cloud GPUs); Train PyTorch (cloud GPU); Train PyTorch (32 cloud GPUs); Deploy a model on cloud GPUs; Run a model script; XGBoost; Streamlit demo :code_files: /levels/basic/hello_components/hello_world.py; 
/levels/basic/hello_components/hello_world_gpu.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/train_pytorch.py; /levels/basic/hello_components/pt_multinode.py; /levels/basic/hello_components/deploy_model.py; /levels/basic/hello_components/run_ptl_script.py; /levels/basic/hello_components/xgboost.py; /levels/basic/hello_components/streamlit_demo.py :highlights: 7; 10, 11; 9-11, 16, 17; 4, 8, 12, 18-19, 26; 5, 10, 22, 27, 31, 41, 57-59; 3, 11-12, 25, 29; 7, 10; 15, 21; 9, 15, 24 + :works: [{"name":"root.work","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"default","preemptible":false,"shmSize":0},"networkConfig":[{"name":"dzodf","port":61304}]}}];[{"name":"root.work","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"qnlgd","port":61516}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu","preemptible":false,"shmSize":0}}}];[{"name":"root.ws.0","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"ajfrc","port":61553}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi","preemptible":false,"shmSize":0}}},{"name":"root.ws.1","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"ttyqc","port":61554}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi","preemptible":false,"shmSize":0}}},{"name":"root.ws.2","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"svyej","port":61555}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi",
"preemptible":false,"shmSize":0}}},{"name":"root.ws.3","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"parme","port":61556}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi","preemptible":false,"shmSize":0}}}];[{"name":"root.work","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"cutdu","port":61584}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu","preemptible":false,"shmSize":0}}}];[{"name":"root.ws.0","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"whhby","port":61613}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi","preemptible":false,"shmSize":0}}},{"name":"root.ws.1","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"yhjtf","port":61614}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi","preemptible":false,"shmSize":0}}},{"name":"root.ws.2","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"rqwkt","port":61615}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi","preemptible":false,"shmSize":0}}},{"name":"root.ws.3","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"pjdsj","port":61616}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi","preemptible":false,"shmSize":0}}},{"name":"root.ws.4","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"net
workConfig":[{"name":"efdor","port":61617}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi","preemptible":false,"shmSize":0}}},{"name":"root.ws.5","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"pxmso","port":61618}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi","preemptible":false,"shmSize":0}}},{"name":"root.ws.6","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"feevy","port":61619}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi","preemptible":false,"shmSize":0}}},{"name":"root.ws.7","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"tbmse","port":61620}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi","preemptible":false,"shmSize":0}}}];[{"name":"root.work","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"umqqg","port":7777}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu","preemptible":false,"shmSize":0}}}];[];[{"name":"root.work","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"tggba","port":61729}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"default","preemptible":false,"shmSize":0}}}];[{"name":"root.work","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"hpyaz","port":61763}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"default","preemptible":false,"shmSize":0}}}] :enable_run: true :tab_rows: 3 
:height: 620px From 6a4f2215fffc8a80c8c76a889f37758fd0c140ec Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 21 Nov 2022 14:06:06 +0100 Subject: [PATCH 06/29] [App] Fix VSCode IDE debugger (#15747) (cherry picked from commit 6714ca7132c4b10e995feec073ec21c1b4f97a02) --- src/lightning_app/CHANGELOG.md | 3 +++ src/lightning_app/core/app.py | 10 +++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 69dc901e6ba18..c0062d63bb9f1 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -18,6 +18,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +- Fixed debugging with VSCode IDE ([#15747](https://github.com/Lightning-AI/lightning/pull/15747)) + + - diff --git a/src/lightning_app/core/app.py b/src/lightning_app/core/app.py index 255f498507f67..128b6cfb2980f 100644 --- a/src/lightning_app/core/app.py +++ b/src/lightning_app/core/app.py @@ -169,17 +169,17 @@ def __init__( logger.debug(f"ENV: {os.environ}") - def _update_index_file(self): - # update index.html, - # this should happen once for all apps before the ui server starts running. - frontend.update_index_file(FRONTEND_DIR, info=self.info, root_path=self.root_path) - if _should_dispatch_app(): os.environ["LIGHTNING_DISPATCHED"] = "1" from lightning_app.runners import MultiProcessRuntime MultiProcessRuntime(self).dispatch() + def _update_index_file(self): + # update index.html, + # this should happen once for all apps before the ui server starts running. 
+ frontend.update_index_file(FRONTEND_DIR, info=self.info, root_path=self.root_path) + def get_component_by_name(self, component_name: str): """Returns the instance corresponding to the given component name.""" from lightning_app.structures import Dict as LightningDict From 669b28a7fb277a2d608822d814ee004981871f5b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 21 Nov 2022 23:20:23 +0900 Subject: [PATCH 07/29] Update tensorboard requirement from <2.11.0,>=2.9.1 to >=2.9.1,<2.12.0 in /requirements (#15746) Update tensorboard requirement in /requirements Updates the requirements on [tensorboard](https://github.com/tensorflow/tensorboard) to permit the latest version. - [Release notes](https://github.com/tensorflow/tensorboard/releases) - [Changelog](https://github.com/tensorflow/tensorboard/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorboard/compare/2.9.1...2.11.0) --- updated-dependencies: - dependency-name: tensorboard dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> (cherry picked from commit 0b58b6981ee155ba9173a83aef91a3deb4ac6874) --- requirements/pytorch/base.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index 2f2b9306bd22a..ad9573493ae6f 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -6,7 +6,7 @@ torch>=1.9.*, <=1.13.0 tqdm>=4.57.0, <4.65.0 PyYAML>=5.4, <=6.0 fsspec[http]>2021.06.0, <2022.8.0 -tensorboard>=2.9.1, <2.11.0 +tensorboard>=2.9.1, <2.12.0 torchmetrics>=0.7.0, <0.10.1 # needed for using fixed compare_version packaging>=17.0, <=21.3 typing-extensions>=4.0.0, <=4.4.0 From 49f3da7ebd371896b3e6674d36bd06aa83701b3a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 21 Nov 2022 23:22:43 +0900 Subject: [PATCH 08/29] Update beautifulsoup4 requirement from <=4.8.2 to <4.11.2 in /requirements (#15745) * Update beautifulsoup4 requirement in /requirements Updates the requirements on [beautifulsoup4](https://www.crummy.com/software/BeautifulSoup/bs4/) to permit the latest version. --- updated-dependencies: - dependency-name: beautifulsoup4 dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] * Apply suggestions from code review Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> (cherry picked from commit 1ffbe1bf1e50f7ee1f2d06ea86ba097733f09196) --- requirements/app/base.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/app/base.txt b/requirements/app/base.txt index d772c242376fd..094c6dea6a7c5 100644 --- a/requirements/app/base.txt +++ b/requirements/app/base.txt @@ -9,7 +9,7 @@ croniter>=1.3.0, <1.4.0 # strict; TODO: for now until we find something more ro traitlets>=5.3.0, <=5.4.0 arrow>=1.2.0, <1.2.4 lightning-utilities==0.3.* -beautifulsoup4<=4.8.2 +beautifulsoup4>=4.8.0, <4.11.2 inquirer>=2.10.0 psutil<5.9.4 click<=8.1.3 From 9351d412aecb9cde0989f90dfb785edc591a92ec Mon Sep 17 00:00:00 2001 From: Ethan Harris Date: Mon, 21 Nov 2022 16:02:30 +0000 Subject: [PATCH 09/29] [App] Fix multi-node pytorch example CI (#15753) (cherry picked from commit bc797fd37613f18ddf0fd5122776b4cdcc4922ae) --- .github/checkgroup.yml | 2 +- .github/workflows/ci-app-examples.yml | 2 +- docs/source-app/levels/basic/hello_components/pt_multinode.py | 2 +- examples/app_multi_node/train_pytorch.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 53e8348626c25..9a55c50198879 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -244,7 +244,7 @@ subprojects: - ".github/workflows/ci-app-examples.yml" - "src/lightning_app/**" - "tests/tests_app_examples/**" - - "examples/app_*" + - "examples/app_*/**" - "requirements/app/**" - "setup.py" - ".actions/**" diff --git a/.github/workflows/ci-app-examples.yml b/.github/workflows/ci-app-examples.yml index 88eadcfd920f8..9646efa27db66 100644 --- a/.github/workflows/ci-app-examples.yml +++ b/.github/workflows/ci-app-examples.yml @@ -11,7 +11,7 @@ on: - 
".github/workflows/ci-app-examples.yml" - "src/lightning_app/**" - "tests/tests_app_examples/**" - - "examples/app_*" + - "examples/app_*/**" - "requirements/app/**" - "setup.py" - ".actions/**" diff --git a/docs/source-app/levels/basic/hello_components/pt_multinode.py b/docs/source-app/levels/basic/hello_components/pt_multinode.py index 86bd7da10c6ff..8b39c74cdcc2e 100644 --- a/docs/source-app/levels/basic/hello_components/pt_multinode.py +++ b/docs/source-app/levels/basic/hello_components/pt_multinode.py @@ -22,7 +22,7 @@ def distributed_train(local_rank: int, main_address: str, main_port: int, num_no # 2. PREPARE DISTRIBUTED MODEL model = torch.nn.Linear(32, 2) device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu") - model = DistributedDataParallel(model, device_ids=[local_rank]).to(device) + model = DistributedDataParallel(model, device_ids=[local_rank] if torch.cuda.is_available() else None).to(device) # 3. SETUP LOSS AND OPTIMIZER criterion = torch.nn.MSELoss() diff --git a/examples/app_multi_node/train_pytorch.py b/examples/app_multi_node/train_pytorch.py index 9599bce5bbd85..cc9e84297c151 100644 --- a/examples/app_multi_node/train_pytorch.py +++ b/examples/app_multi_node/train_pytorch.py @@ -23,7 +23,7 @@ def distributed_train(local_rank: int, main_address: str, main_port: int, num_no # 2. PREPARE DISTRIBUTED MODEL model = torch.nn.Linear(32, 2) device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu") - model = DistributedDataParallel(model, device_ids=[local_rank]).to(device) + model = DistributedDataParallel(model, device_ids=[local_rank] if torch.cuda.is_available() else None).to(device) # 3. 
SETUP LOSS AND OPTIMIZER criterion = torch.nn.MSELoss() @@ -55,7 +55,7 @@ def run(self, main_address: str, main_port: int, num_nodes: int, node_rank: int) ) -# 32 GPUs: (8 nodes x 4 v 100) +# 8 GPUs: (2 nodes x 4 v 100) compute = L.CloudCompute("gpu-fast-multi") # 4xV100 component = MultiNode(PyTorchDistributed, num_nodes=2, cloud_compute=compute) app = L.LightningApp(component) From abf0a406c8baf001361d8786fe8438007e995dca Mon Sep 17 00:00:00 2001 From: Ethan Harris Date: Mon, 21 Nov 2022 17:13:14 +0000 Subject: [PATCH 10/29] [App] Improve `LightningTrainerScript` start-up time (#15751) (cherry picked from commit c2c197486a96852dfc755ef115cd6139e97b49e7) --- src/lightning_app/components/training.py | 31 ++++++++++-------------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py index 4618b5aa9e9cb..6d3c86eb50374 100644 --- a/src/lightning_app/components/training.py +++ b/src/lightning_app/components/training.py @@ -147,33 +147,28 @@ def __init__( the ServableModule API """ super().__init__() - self.ws = structures.List() - self.has_initialized = False self.script_path = script_path self.script_args = script_args self.num_nodes = num_nodes - self._cloud_compute = cloud_compute # TODO: Add support for cloudCompute self.sanity_serving = sanity_serving self._script_runner = script_runner self._script_runner_kwargs = script_runner_kwargs - def run(self, **run_kwargs): - if not self.has_initialized: - for node_rank in range(self.num_nodes): - self.ws.append( - self._script_runner( - script_path=self.script_path, - script_args=self.script_args, - cloud_compute=self._cloud_compute, - node_rank=node_rank, - sanity_serving=self.sanity_serving, - num_nodes=self.num_nodes, - **self._script_runner_kwargs, - ) + self.ws = structures.List() + for node_rank in range(self.num_nodes): + self.ws.append( + self._script_runner( + script_path=self.script_path, + script_args=self.script_args, 
+ cloud_compute=cloud_compute, + node_rank=node_rank, + sanity_serving=self.sanity_serving, + num_nodes=self.num_nodes, + **self._script_runner_kwargs, ) + ) - self.has_initialized = True - + def run(self, **run_kwargs): for work in self.ws: if all(w.internal_ip for w in self.ws): internal_urls = [(w.internal_ip, w.port) for w in self.ws] From 69f4aeaa7f7d3ce15b66eb5cc0f392d5db5ee646 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 21 Nov 2022 18:58:28 +0100 Subject: [PATCH 11/29] Enable Probot CheckGroup v5 (#15670) (cherry picked from commit 6c8ee019f7a5dc8a62aa6e705cab172f3b026e38) --- .github/workflows/probot-check-group.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/probot-check-group.yml b/.github/workflows/probot-check-group.yml index 15965ca7eba47..1aafee679da07 100644 --- a/.github/workflows/probot-check-group.yml +++ b/.github/workflows/probot-check-group.yml @@ -14,12 +14,12 @@ jobs: if: github.event.pull_request.draft == false timeout-minutes: 61 # in case something is wrong with the internal timeout steps: - - uses: Lightning-AI/probot@v4 + - uses: Lightning-AI/probot@v5 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: job: check-group interval: 180 # seconds timeout: 60 # minutes - maintainers: '@Lightning-AI/lai-frameworks' - owner: '@carmocca' + maintainers: 'Lightning-AI/lai-frameworks' + owner: 'carmocca' From 636a46ce4f46f1e79e136826d85937079185eb26 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 21 Nov 2022 18:39:51 +0000 Subject: [PATCH 12/29] [App] Enable properties for the Lightning flow (#15750) (cherry picked from commit 5cfb176214c293ff9e589d9747adc9d8f85b47a8) --- src/lightning_app/CHANGELOG.md | 3 ++ src/lightning_app/core/flow.py | 6 +++- tests/tests_app/core/test_lightning_app.py | 33 ++++++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 
c0062d63bb9f1..f1a975c2bb93f 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -47,6 +47,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed catimage import ([#15712](https://github.com/Lightning-AI/lightning/pull/15712)) - Parse all lines in app file looking for shebangs to run commands ([#15714](https://github.com/Lightning-AI/lightning/pull/15714)) +- Fixed setting property to the LightningFlow ([#15750](https://github.com/Lightning-AI/lightning/pull/15750)) + + ## [1.8.1] - 2022-11-10 diff --git a/src/lightning_app/core/flow.py b/src/lightning_app/core/flow.py index ac8a7ff325049..18b6fd40f756a 100644 --- a/src/lightning_app/core/flow.py +++ b/src/lightning_app/core/flow.py @@ -110,7 +110,11 @@ def name(self): """Return the current LightningFlow name.""" return self._name or "root" - def __setattr__(self, name, value): + def __setattr__(self, name: str, value: Any) -> None: + attr = getattr(self.__class__, name, None) + if isinstance(attr, property) and attr.fset is not None: + return attr.fset(self, value) + from lightning_app.structures import Dict, List if ( diff --git a/tests/tests_app/core/test_lightning_app.py b/tests/tests_app/core/test_lightning_app.py index 1b438f14632bb..4f2c0d8a50358 100644 --- a/tests/tests_app/core/test_lightning_app.py +++ b/tests/tests_app/core/test_lightning_app.py @@ -1108,3 +1108,36 @@ def test_cloud_compute_binding(): with pytest.raises(Exception, match="A Cloud Compute can be assigned only to a single Work"): FlowCC() + + +class FlowValue(LightningFlow): + def __init__(self): + super().__init__() + self._value = None + self._has_found = False + + @property + def value(self): + return self._value + + @value.setter + def value(self, value): + self._value = value + + def run(self): + if self.value is None: + self.value = True + + def __setattr__(self, name, value): + if name == "_value" and value is True: + self._has_found = True + 
super().__setattr__(name, value) + + +def test_lightning_flow_properties(): + """Validates setting properties to the LightningFlow properly calls property.fset.""" + + flow = FlowValue() + assert not flow._has_found + flow.run() + assert flow._has_found From 0e6a3a3c6a37ef88bc2335d2faa28ce6d7f881d9 Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Mon, 21 Nov 2022 20:35:45 +0100 Subject: [PATCH 13/29] test for Enable setting property (#15755) Co-authored-by: thomas chaton Co-authored-by: Ethan Harris (cherry picked from commit ba140387f928a04a8b0e090824059b52e6df0bc5) --- tests/tests_app/core/test_lightning_app.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/tests/tests_app/core/test_lightning_app.py b/tests/tests_app/core/test_lightning_app.py index 4f2c0d8a50358..64bee758af5d5 100644 --- a/tests/tests_app/core/test_lightning_app.py +++ b/tests/tests_app/core/test_lightning_app.py @@ -1114,7 +1114,6 @@ class FlowValue(LightningFlow): def __init__(self): super().__init__() self._value = None - self._has_found = False @property def value(self): @@ -1125,19 +1124,13 @@ def value(self, value): self._value = value def run(self): - if self.value is None: - self.value = True - - def __setattr__(self, name, value): - if name == "_value" and value is True: - self._has_found = True - super().__setattr__(name, value) + self.value = True def test_lightning_flow_properties(): """Validates setting properties to the LightningFlow properly calls property.fset.""" flow = FlowValue() - assert not flow._has_found + assert flow._value is None flow.run() - assert flow._has_found + assert flow._value is True From 23ec3c499c5ead38ab5a0b282c4d28a7f89b7339 Mon Sep 17 00:00:00 2001 From: Luca Antiga Date: Mon, 21 Nov 2022 20:57:04 +0100 Subject: [PATCH 14/29] Move s3fs to cloud extras (#15729) Co-authored-by: Luca Antiga (cherry picked from commit dd75906785bf2aa17c0d31fe874d3994c2b7c7ae) --- 
requirements/app/base.txt | 1 - requirements/app/cloud.txt | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/app/base.txt b/requirements/app/base.txt index 094c6dea6a7c5..2d0c3482eea79 100644 --- a/requirements/app/base.txt +++ b/requirements/app/base.txt @@ -4,7 +4,6 @@ typing-extensions>=4.0.0, <=4.4.0 deepdiff>=5.7.0, <=5.8.1 starsessions>=1.2.1, <2.0 # strict fsspec>=2022.5.0, <=2022.7.1 -s3fs>=2022.5.0, <2022.8.3 croniter>=1.3.0, <1.4.0 # strict; TODO: for now until we find something more robust. traitlets>=5.3.0, <=5.4.0 arrow>=1.2.0, <1.2.4 diff --git a/requirements/app/cloud.txt b/requirements/app/cloud.txt index 4685a596d2337..acda01289da6f 100644 --- a/requirements/app/cloud.txt +++ b/requirements/app/cloud.txt @@ -1,3 +1,4 @@ redis>=4.0.1, <=4.2.4 docker>=5.0.0, <=5.0.3 +s3fs>=2022.5.0, <2022.8.3 # setuptools==59.5.0 From 7d8d21bc9f110f3d8192ce597cf6d6a4900b0d90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 21 Nov 2022 21:19:13 +0100 Subject: [PATCH 15/29] Revert new Hydra launch behavior (#15737) * revert new hydra cwd behavior * remove debug statements * changelog Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jirka Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> (cherry picked from commit 88b2e5a2584c8daea4ea09eba66656b9b7682872) --- .../strategies/launchers/subprocess_script.py | 39 ++----- src/pytorch_lightning/CHANGELOG.md | 6 +- .../strategies/launchers/subprocess_script.py | 5 +- .../launchers/test_subprocess_script.py | 8 +- .../launchers/test_subprocess_script.py | 106 +----------------- 5 files changed, 22 insertions(+), 142 deletions(-) diff --git a/src/lightning_lite/strategies/launchers/subprocess_script.py b/src/lightning_lite/strategies/launchers/subprocess_script.py index 07af414e23bc3..c9649f4639818 100644 --- a/src/lightning_lite/strategies/launchers/subprocess_script.py +++ 
b/src/lightning_lite/strategies/launchers/subprocess_script.py @@ -14,8 +14,7 @@ import os import subprocess import sys -from time import sleep -from typing import Any, Callable, Sequence +from typing import Any, Callable, Optional, Sequence, Tuple import numpy as np from lightning_utilities.core.imports import RequirementCache @@ -116,15 +115,16 @@ def _call_children_scripts(self) -> None: # start process # if hydra is available and initialized, make sure to set the cwd correctly hydra_in_use = False + cwd: Optional[str] = None if _HYDRA_AVAILABLE: from hydra.core.hydra_config import HydraConfig hydra_in_use = HydraConfig.initialized() if hydra_in_use: - command = _hydra_subprocess_cmd(local_rank=local_rank) + command, cwd = _hydra_subprocess_cmd(local_rank=local_rank) else: command = _basic_subprocess_cmd() - subprocess.Popen(command, env=env_copy) + subprocess.Popen(command, env=env_copy, cwd=cwd) # starting all processes at once can cause issues # with dataloaders delay between 1-10 seconds @@ -149,10 +149,9 @@ def _basic_subprocess_cmd() -> Sequence[str]: return [sys.executable, "-m", __main__.__spec__.name] + sys.argv[1:] -def _hydra_subprocess_cmd(local_rank: int) -> Sequence[str]: +def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]: import __main__ # local import to avoid https://github.com/Lightning-AI/lightning/issues/15218 - from hydra.core.hydra_config import HydraConfig - from hydra.utils import to_absolute_path + from hydra.utils import get_original_cwd, to_absolute_path # when user is using hydra find the absolute path if __main__.__spec__ is None: # pragma: no-cover @@ -160,25 +159,9 @@ def _hydra_subprocess_cmd(local_rank: int) -> Sequence[str]: else: command = [sys.executable, "-m", __main__.__spec__.name] - # extract the hydra configuration - hydra_cfg = HydraConfig.get() + command += sys.argv[1:] - # the location of the hydra configuration files saved for the current job - hydra_output = hydra_cfg.runtime.output_dir - if 
hydra_cfg.output_subdir is not None: - hydra_output = os.path.join(hydra_output, hydra_cfg.output_subdir) - - # check if experimental re-run capability exists - # otherwise use existing config.yaml which may have issues - pickled_config = os.path.join(hydra_output, "config.pickle") - if os.path.exists(pickled_config): - command += ["--experimental-rerun", pickled_config] - - else: - command += ["-cp", hydra_output, "-cn", "config.yaml"] - command += [ - f"hydra.output_subdir=.pl_ddp_hydra_{local_rank}", - f"hydra.run.dir={hydra_cfg.runtime.output_dir}", - ] - - return command + cwd = get_original_cwd() + os_cwd = f'"{os.getcwd()}"' + command += [f"hydra.run.dir={os_cwd}", f"hydra.job.name=train_ddp_process_{local_rank}"] + return command, cwd diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 72107da99044b..5420f2ed2a446 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -13,7 +13,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed -- +- Temporarily removed support for Hydra multi-run ([#15737](https://github.com/Lightning-AI/lightning/pull/15737)) ### Fixed @@ -79,7 +79,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added a sanity check that scripts are executed with the `srun` command in SLURM and that environment variables are not conflicting ([#15011](https://github.com/Lightning-AI/lightning/pull/15011)) - Added an error message when attempting to launch processes with `python -i` and an interactive-incompatible strategy ([#15293](https://github.com/Lightning-AI/lightning/pull/15293)) - ### Changed - The `Trainer.{fit,validate,test,predict,tune}` methods now raise a useful error message if the input is not a `LightningModule` ([#13892](https://github.com/Lightning-AI/lightning/pull/13892)) @@ -107,7 +106,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- To avoid issues with forking processes, from PyTorch 1.13 and higher, Lightning will directly use the PyTorch NVML-based check for `torch.cuda.device_count` and from PyTorch 1.14 and higher, Lightning will configure PyTorch to use a NVML-based check for `torch.cuda.is_available`. ([#15110](https://github.com/Lightning-AI/lightning/pull/15110), [#15133](https://github.com/Lightning-AI/lightning/pull/15133)) - The `NeptuneLogger` now uses `neptune.init_run` instead of the deprecated `neptune.init` to initialize a run ([#15393](https://github.com/Lightning-AI/lightning/pull/15393)) - ### Deprecated - Deprecated `LightningDeepSpeedModule` ([#14000](https://github.com/Lightning-AI/lightning/pull/14000)) @@ -137,7 +135,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Deprecated `TrainerFn.TUNING`, `RunningStage.TUNING` and `trainer.tuning` property ([#15100](https://github.com/Lightning-AI/lightning/pull/15100)) - Deprecated custom `pl.utilities.distributed.AllGatherGrad` implementation in favor of PyTorch's ([#15364](https://github.com/Lightnign-AI/lightning/pull/15364)) - ### Removed - Removed the deprecated `Trainer.training_type_plugin` property in favor of `Trainer.strategy` ([#14011](https://github.com/Lightning-AI/lightning/pull/14011)) @@ -195,7 +192,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Removed the deprecated `LightningDataModule.on_save/load_checkpoint` hooks ([#14909](https://github.com/Lightning-AI/lightning/pull/14909)) - Removed support for returning a value in `Callback.on_save_checkpoint` in favor of implementing `Callback.state_dict` ([#14835](https://github.com/Lightning-AI/lightning/pull/14835)) - ### Fixed - Fixed an issue with `LightningLite.setup()` not setting the `.device` attribute correctly on the returned wrapper ([#14822](https://github.com/Lightning-AI/lightning/pull/14822)) diff --git a/src/pytorch_lightning/strategies/launchers/subprocess_script.py b/src/pytorch_lightning/strategies/launchers/subprocess_script.py index b5f4d39973ede..a55d8d71cf7e8 100644 --- a/src/pytorch_lightning/strategies/launchers/subprocess_script.py +++ b/src/pytorch_lightning/strategies/launchers/subprocess_script.py @@ -111,17 +111,18 @@ def _call_children_scripts(self) -> None: del env_copy["PL_GLOBAL_SEED"] hydra_in_use = False + cwd: Optional[str] = None if _HYDRA_AVAILABLE: from hydra.core.hydra_config import HydraConfig hydra_in_use = HydraConfig.initialized() if hydra_in_use: - command = _hydra_subprocess_cmd(local_rank) + command, cwd = _hydra_subprocess_cmd(local_rank) else: command = _basic_subprocess_cmd() - subprocess.Popen(command, env=env_copy) + subprocess.Popen(command, env=env_copy, cwd=cwd) # starting all processes at once can cause issues # with dataloaders delay between 1-10 seconds diff --git a/tests/tests_lite/strategies/launchers/test_subprocess_script.py b/tests/tests_lite/strategies/launchers/test_subprocess_script.py index 1b2e4360e2432..b16bab30ecf4d 100644 --- a/tests/tests_lite/strategies/launchers/test_subprocess_script.py +++ b/tests/tests_lite/strategies/launchers/test_subprocess_script.py @@ -84,7 +84,7 @@ def test_subprocess_script_launcher_launch_processes(popen_mock, _): @mock.patch("lightning_lite.strategies.launchers.subprocess_script.subprocess.Popen") def 
test_subprocess_script_launcher_hydra_in_use(popen_mock, _, monkeypatch): basic_command = Mock(return_value="basic_command") - hydra_command = Mock(return_value="hydra_command") + hydra_command = Mock(return_value=("hydra_command", "hydra_cwd")) monkeypatch.setattr(lightning_lite.strategies.launchers.subprocess_script, "_basic_subprocess_cmd", basic_command) monkeypatch.setattr(lightning_lite.strategies.launchers.subprocess_script, "_hydra_subprocess_cmd", hydra_command) @@ -101,7 +101,7 @@ def simulate_launch(): # when hydra not available monkeypatch.setattr(lightning_lite.strategies.launchers.subprocess_script, "_HYDRA_AVAILABLE", False) simulate_launch() - popen_mock.assert_called_with("basic_command", env=ANY) + popen_mock.assert_called_with("basic_command", env=ANY, cwd=None) popen_mock.reset_mock() import hydra @@ -112,7 +112,7 @@ def simulate_launch(): HydraConfigMock.initialized.return_value = False monkeypatch.setattr(hydra.core.hydra_config, "HydraConfig", HydraConfigMock) simulate_launch() - popen_mock.assert_called_with("basic_command", env=ANY) + popen_mock.assert_called_with("basic_command", env=ANY, cwd=None) popen_mock.reset_mock() # when hydra available and initialized @@ -121,5 +121,5 @@ def simulate_launch(): HydraConfigMock.initialized.return_value = True monkeypatch.setattr(hydra.core.hydra_config, "HydraConfig", HydraConfigMock) simulate_launch() - popen_mock.assert_called_with("hydra_command", env=ANY) + popen_mock.assert_called_with("hydra_command", env=ANY, cwd="hydra_cwd") popen_mock.reset_mock() diff --git a/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py b/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py index 83605f53873db..5b495f13ca316 100644 --- a/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py +++ b/tests/tests_pytorch/strategies/launchers/test_subprocess_script.py @@ -1,35 +1,17 @@ -import logging -import os import sys -from pathlib import Path import pytest from 
lightning_utilities.core.imports import RequirementCache -from pytorch_lightning.strategies.launchers.subprocess_script import _HYDRA_AVAILABLE from tests_pytorch.helpers.runif import RunIf _HYDRA_WITH_RERUN = RequirementCache("hydra-core>=1.2") _HYDRA_WITH_RUN_PROCESS = RequirementCache("hydra-core>=1.0.7") -if _HYDRA_AVAILABLE: - from omegaconf import OmegaConf if _HYDRA_WITH_RUN_PROCESS: from hydra.test_utils.test_utils import run_process -# fixture to run hydra jobs in a clean temporary directory -# Hydra creates its own output directories and logs -@pytest.fixture -def cleandir(tmp_path): - """Run function in a temporary directory.""" - old_dir = os.getcwd() # get current working directory (cwd) - os.chdir(tmp_path) # change cwd to the temp-directory - yield tmp_path # yields control to the test to be run - os.chdir(old_dir) - logging.shutdown() - - # Script to run from command line script = """ import hydra @@ -64,7 +46,9 @@ def task_fn(cfg): @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True) @pytest.mark.skipif(not _HYDRA_WITH_RUN_PROCESS, reason=str(_HYDRA_WITH_RUN_PROCESS)) @pytest.mark.parametrize("subdir", [None, "dksa", ".hello"]) -def test_ddp_with_hydra_runjob(cleandir, subdir): +def test_ddp_with_hydra_runjob(subdir, tmpdir, monkeypatch): + monkeypatch.chdir(tmpdir) + # Save script locally with open("temp.py", "w") as fn: fn.write(script) @@ -75,87 +59,3 @@ def test_ddp_with_hydra_runjob(cleandir, subdir): if subdir is not None: cmd += [f"hydra.output_subdir={subdir}"] run_process(cmd) - - # Make sure config.yaml was created for additional - # processes. 
- logs = list(Path.cwd().glob("**/config.yaml")) - assert len(logs) == devices - - # Make sure the parameter was set and used - cfg = OmegaConf.load(logs[0]) - assert cfg.devices == devices - - # Make sure PL spawned a job that is logged by Hydra - logs = list(Path.cwd().glob("**/*.log")) - assert len(logs) == 1 - - -@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True) -@pytest.mark.skipif(not _HYDRA_WITH_RUN_PROCESS, reason=str(_HYDRA_WITH_RUN_PROCESS)) -@pytest.mark.parametrize("num_jobs", [1, 2]) -def test_ddp_with_hydra_multirunjob(cleandir, num_jobs): - # Save script locally - with open("temp.py", "w") as fn: - fn.write(script) - - # create fake multirun params based on `num_jobs` - fake_param = "+foo=" + ",".join(str(i) for i in range(num_jobs)) - - # Run CLI - run_process([sys.executable, "temp.py", "+devices=2", '+strategy="ddp"', fake_param, "--multirun"]) - - # Make sure config.yaml was created for each job - configs = sorted(Path.cwd().glob("**/.pl_ddp_hydra_*/config.yaml")) - assert len(configs) == num_jobs - - # Make sure the parameter was set and used for each job - for i, config in enumerate(configs): - cfg = OmegaConf.load(config) - local_rank = int(config.parent.parent.parts[-1]) - assert cfg.devices == 2 - assert cfg.foo == local_rank - - logs = list(Path.cwd().glob("**/*.log")) - assert len(logs) == num_jobs - - -yaml_file = """ -hydra: - callbacks: - save_job_info: - _target_: hydra.experimental.callbacks.PickleJobInfoCallback -""" - - -@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True) -@pytest.mark.skipif(not _HYDRA_WITH_RERUN, reason=str(_HYDRA_WITH_RERUN)) -@pytest.mark.parametrize("num_jobs", [1, 2]) -def test_ddp_with_hydra_multirunjob_rerun(cleandir, num_jobs): - # Save script locally - with open("temp.py", "w") as fn: - fn.write(script) - - with open("config.yaml", "w") as fn: - fn.write(yaml_file) - - # create fake multirun params based on `num_jobs` - fake_param = "+foo=" + ",".join(str(i) for i in range(num_jobs)) - - 
# Run CLI - run_process( - [ - sys.executable, - "temp.py", - "-cp", - ".", - "-cn", - "config.yaml", - "+devices=2", - '+strategy="ddp"', - fake_param, - "--multirun", - ] - ) - - pickles = sorted(Path.cwd().glob("**/.hydra/config.pickle")) - assert len(pickles) == num_jobs From eeb7166ced89941fce24ac7408229d2ff52ae462 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Mon, 21 Nov 2022 21:30:58 +0100 Subject: [PATCH 16/29] FCCV Docs (#15598) * add custom data iter docs * add custom data iter docs * Update docs/source-pytorch/data/custom_data_iterables.rst * remove ToDevice * nit * Update docs/source-pytorch/data/custom_data_iterables.rst Co-authored-by: Luca Antiga * clarification for @lantiga * typo * Update docs/source-pytorch/data/custom_data_iterables.rst * Update docs/source-pytorch/data/custom_data_iterables.rst * Update docs/source-pytorch/data/custom_data_iterables.rst Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> Co-authored-by: Akihiro Nitta Co-authored-by: Luca Antiga (cherry picked from commit 006fde974719d4aec2991c60db22e1ed79ddaa2a) --- .../data/custom_data_iterables.rst | 122 ++++++++++++++++++ docs/source-pytorch/index.rst | 1 + 2 files changed, 123 insertions(+) create mode 100644 docs/source-pytorch/data/custom_data_iterables.rst diff --git a/docs/source-pytorch/data/custom_data_iterables.rst b/docs/source-pytorch/data/custom_data_iterables.rst new file mode 100644 index 0000000000000..3b124c1356aee --- /dev/null +++ b/docs/source-pytorch/data/custom_data_iterables.rst @@ -0,0 +1,122 @@ +.. _dataiters: + +################################## +Injecting 3rd Party Data Iterables +################################## + +When training a model on a specific task, data loading and preprocessing might become a bottleneck. +Lightning does not enforce a specific data loading approach nor does it try to control it. 
+The only assumption Lightning makes is that the data is returned as an iterable of batches. + +For PyTorch-based programs, these iterables are typically instances of :class:`~torch.utils.data.DataLoader`. + +However, Lightning also supports other data types such as plain lists of batches, generators or other custom iterables. + +.. code-block:: python + + # random list of batches + data = [(torch.rand(32, 3, 32, 32), torch.randint(0, 10, (32,))) for _ in range(100)] + model = LitClassifier() + trainer = Trainer() + trainer.fit(model, data) + +Examples of custom iterables include `NVIDIA DALI <https://github.com/NVIDIA/DALI>`__ or `FFCV <https://github.com/libffcv/ffcv>`__ for computer vision. +Both libraries offer support for custom data loading and preprocessing (also hardware accelerated) and can be used with Lightning. + + +For example, taking the example from FFCV's readme, we can use it with Lightning by just removing the hardcoded ``ToDevice(0)`` +as Lightning takes care of GPU placement. In case you want to use some data transformations on GPUs, change the +``ToDevice(0)`` to ``ToDevice(self.trainer.local_rank)`` to correctly map to the desired GPU in your pipeline. + +.. 
code-block:: python + + from ffcv.loader import Loader, OrderOption + from ffcv.transforms import ToTensor, ToDevice, ToTorchImage, Cutout + from ffcv.fields.decoders import IntDecoder, RandomResizedCropRGBImageDecoder + + + class CustomClassifier(LitClassifier): + def train_dataloader(self): + + # Random resized crop + decoder = RandomResizedCropRGBImageDecoder((224, 224)) + + # Data decoding and augmentation + image_pipeline = [decoder, Cutout(), ToTensor(), ToTorchImage()] + label_pipeline = [IntDecoder(), ToTensor()] + + # Pipeline for each data field + pipelines = {"image": image_pipeline, "label": label_pipeline} + + # Replaces PyTorch data loader (`torch.utils.data.Dataloader`) + loader = Loader( + write_path, batch_size=bs, num_workers=num_workers, order=OrderOption.RANDOM, pipelines=pipelines + ) + + return loader + +When moving data to a specific device, you can always refer to ``self.trainer.local_rank`` to get the accelerator +used by the current process. + +By just changing ``device_id=0`` to ``device_id=self.trainer.local_rank`` we can also leverage DALI's GPU decoding: + +.. 
code-block:: python + + from nvidia.dali.pipeline import pipeline_def + import nvidia.dali.types as types + import nvidia.dali.fn as fn + from nvidia.dali.plugin.pytorch import DALIGenericIterator + import os + + + class CustomLitClassifier(LitClassifier): + def train_dataloader(self): + + # To run with different data, see documentation of nvidia.dali.fn.readers.file + # points to https://github.com/NVIDIA/DALI_extra + data_root_dir = os.environ["DALI_EXTRA_PATH"] + images_dir = os.path.join(data_root_dir, "db", "single", "jpeg") + + @pipeline_def(num_threads=4, device_id=self.trainer.local_rank) + def get_dali_pipeline(): + images, labels = fn.readers.file(file_root=images_dir, random_shuffle=True, name="Reader") + # decode data on the GPU + images = fn.decoders.image_random_crop(images, device="mixed", output_type=types.RGB) + # the rest of processing happens on the GPU as well + images = fn.resize(images, resize_x=256, resize_y=256) + images = fn.crop_mirror_normalize( + images, + crop_h=224, + crop_w=224, + mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], + std=[0.229 * 255, 0.224 * 255, 0.225 * 255], + mirror=fn.random.coin_flip(), + ) + return images, labels + + train_data = DALIGenericIterator( + [get_dali_pipeline(batch_size=16)], + ["data", "label"], + reader_name="Reader", + ) + + return train_data + + +Limitations +------------ +Lightning works with all kinds of custom data iterables as shown above. There are, however, a few features that cannot +be supported this way. These restrictions come from the fact that for their support, +Lightning needs to know a lot about the internals of these iterables. + +- In a distributed multi-GPU setting (ddp), + Lightning automatically replaces the DataLoader's sampler with its distributed counterpart. + This makes sure that each GPU sees a different part of the dataset. + As sampling can be implemented in arbitrary ways with custom iterables, + there is no way for Lightning to know how to replace the sampler. 
+ +- When training fails for some reason, Lightning is able to extract all of the relevant data from the model, + optimizers, trainer and dataloader to resume it at the exact same batch it crashed. + This feature is called fault-tolerance and is limited to PyTorch DataLoaders. + Lightning needs to know a lot about sampling, fast forwarding and random number handling to enable fault tolerance, + meaning that it cannot be supported for arbitrary iterables. diff --git a/docs/source-pytorch/index.rst b/docs/source-pytorch/index.rst index 1c867e1e345e9..e8803ba147e83 100644 --- a/docs/source-pytorch/index.rst +++ b/docs/source-pytorch/index.rst @@ -207,6 +207,7 @@ Current Lightning Users Train on single or multiple TPUs Train on MPS Use a pretrained model + Inject Custom Data Iterables model/own_your_loop .. toctree:: From eac6c3fadd78306170026d40708f85a1eee8944f Mon Sep 17 00:00:00 2001 From: Luca Antiga Date: Mon, 21 Nov 2022 21:53:25 +0100 Subject: [PATCH 17/29] Switch from tensorboard to tensorboardx in logger (#15728) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Switch from tensorboard to tensorboardx in logger * Warn if log_graph is set to True but tensorboard is not installed * Fix warning message formatting * Apply suggestions from code review * simplify for TBX as required pkg * docs example * chlog * tbx 2.2 Co-authored-by: Luca Antiga Co-authored-by: William Falcon Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Carlos Mocholí Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> Co-authored-by: Jirka (cherry picked from commit 9c2eb52c866d98768c3fb43c61cf035b2e3da0ae) --- requirements/pytorch/base.txt | 2 +- requirements/pytorch/extra.txt | 1 - requirements/pytorch/test.txt | 3 +++ src/pytorch_lightning/CHANGELOG.md | 4 ++- src/pytorch_lightning/loggers/tensorboard.py | 25 +++++++++++++++---- tests/tests_pytorch/conftest.py | 1 + 
.../tests_pytorch/loggers/test_tensorboard.py | 2 ++ 7 files changed, 30 insertions(+), 8 deletions(-) diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index ad9573493ae6f..374fbcb41e54a 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -6,7 +6,7 @@ torch>=1.9.*, <=1.13.0 tqdm>=4.57.0, <4.65.0 PyYAML>=5.4, <=6.0 fsspec[http]>2021.06.0, <2022.8.0 -tensorboard>=2.9.1, <2.12.0 +tensorboardX>=2.2, <=2.5.1 # min version is set by torch.onnx missing attribute torchmetrics>=0.7.0, <0.10.1 # needed for using fixed compare_version packaging>=17.0, <=21.3 typing-extensions>=4.0.0, <=4.4.0 diff --git a/requirements/pytorch/extra.txt b/requirements/pytorch/extra.txt index 471f0aafbd50b..3eb221d020230 100644 --- a/requirements/pytorch/extra.txt +++ b/requirements/pytorch/extra.txt @@ -7,4 +7,3 @@ omegaconf>=2.0.5, <2.3.0 hydra-core>=1.0.5, <1.3.0 jsonargparse[signatures]>=4.15.2, <4.16.0 rich>=10.14.0, !=10.15.0.a, <13.0.0 -protobuf<=3.20.1 # strict # an extra is updating protobuf, this pin prevents TensorBoard failure diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt index 5ba99b269e002..d27e3677690a5 100644 --- a/requirements/pytorch/test.txt +++ b/requirements/pytorch/test.txt @@ -14,3 +14,6 @@ psutil<5.9.4 # for `DeviceStatsMonitor` pandas>1.0, <1.5.2 # needed in benchmarks fastapi<0.87.0 uvicorn<0.19.1 + +tensorboard>=2.9.1, <2.12.0 +protobuf<=3.20.1 # strict # an extra is updating protobuf, this pin prevents TensorBoard failure diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 5420f2ed2a446..b22ee5c8f994c 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -16,6 +16,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Temporarily removed support for Hydra multi-run ([#15737](https://github.com/Lightning-AI/lightning/pull/15737)) +- Switch from `tensorboard` to `tensorboardx` in `TensorBoardLogger` ([#15728](https://github.com/Lightning-AI/lightning/pull/15728)) + + ### Fixed - @@ -46,7 +49,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ## [1.8.0] - 2022-11-01 - ### Added - Added support for requeueing slurm array jobs ([#15040](https://github.com/Lightning-AI/lightning/pull/15040)) diff --git a/src/pytorch_lightning/loggers/tensorboard.py b/src/pytorch_lightning/loggers/tensorboard.py index 50d6e95add25b..1c840a3dea7e1 100644 --- a/src/pytorch_lightning/loggers/tensorboard.py +++ b/src/pytorch_lightning/loggers/tensorboard.py @@ -22,9 +22,10 @@ from typing import Any, Dict, Mapping, Optional, Union import numpy as np +from lightning_utilities.core.imports import RequirementCache +from tensorboardX import SummaryWriter +from tensorboardX.summary import hparams from torch import Tensor -from torch.utils.tensorboard import SummaryWriter -from torch.utils.tensorboard.summary import hparams import pytorch_lightning as pl from lightning_lite.utilities.cloud_io import get_filesystem @@ -38,6 +39,8 @@ log = logging.getLogger(__name__) +_TENSORBOARD_AVAILABLE = RequirementCache("tensorboard") + if _OMEGACONF_AVAILABLE: from omegaconf import Container, OmegaConf @@ -46,7 +49,7 @@ class TensorBoardLogger(Logger): r""" Log to local file system in `TensorBoard `_ format. - Implemented using :class:`~torch.utils.tensorboard.SummaryWriter`. Logs are saved to + Implemented using :class:`~tensorboardX.SummaryWriter`. Logs are saved to ``os.path.join(save_dir, name, version)``. This is the default logger in Lightning, it comes preinstalled. @@ -77,11 +80,20 @@ class TensorBoardLogger(Logger): sub_dir: Sub-directory to group TensorBoard logs. If a sub_dir argument is passed then logs are saved in ``/save_dir/name/version/sub_dir/``. 
Defaults to ``None`` in which logs are saved in ``/save_dir/name/version/``. - \**kwargs: Additional arguments used by :class:`SummaryWriter` can be passed as keyword + \**kwargs: Additional arguments used by :class:`tensorboardX.SummaryWriter` can be passed as keyword arguments in this logger. To automatically flush to disk, `max_queue` sets the size of the queue for pending logs before flushing. `flush_secs` determines how many seconds elapses before flushing. + Example: + >>> import shutil, tempfile + >>> tmp = tempfile.mkdtemp() + >>> tbl = TensorBoardLogger(tmp) + >>> tbl.log_hyperparams({"epochs": 5, "optimizer": "Adam"}) + >>> tbl.log_metrics({"acc": 0.75}) + >>> tbl.log_metrics({"acc": 0.9}) + >>> tbl.finalize("success") + >>> shutil.rmtree(tmp) """ NAME_HPARAMS_FILE = "hparams.yaml" LOGGER_JOIN_CHAR = "-" @@ -103,7 +115,10 @@ def __init__( self._name = name or "" self._version = version self._sub_dir = None if sub_dir is None else os.fspath(sub_dir) - self._log_graph = log_graph + if log_graph and not _TENSORBOARD_AVAILABLE: + rank_zero_warn("You set `TensorBoardLogger(log_graph=True)` but `tensorboard` is not available.") + self._log_graph = log_graph and _TENSORBOARD_AVAILABLE + self._default_hp_metric = default_hp_metric self._prefix = prefix self._fs = get_filesystem(save_dir) diff --git a/tests/tests_pytorch/conftest.py b/tests/tests_pytorch/conftest.py index 2f5607828a232..a4ddd88a39ae5 100644 --- a/tests/tests_pytorch/conftest.py +++ b/tests/tests_pytorch/conftest.py @@ -75,6 +75,7 @@ def restore_env_variables(): "CUDA_MODULE_LOADING", # leaked since PyTorch 1.13 "KMP_INIT_AT_FORK", # leaked since PyTorch 1.13 "KMP_DUPLICATE_LIB_OK", # leaked since PyTorch 1.13 + "CRC32C_SW_MODE", # leaked by tensorboardX } leaked_vars.difference_update(allowlist) assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}" diff --git a/tests/tests_pytorch/loggers/test_tensorboard.py b/tests/tests_pytorch/loggers/test_tensorboard.py index 
90d15c06d7bf1..ddab738269904 100644 --- a/tests/tests_pytorch/loggers/test_tensorboard.py +++ b/tests/tests_pytorch/loggers/test_tensorboard.py @@ -24,6 +24,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.loggers import TensorBoardLogger +from pytorch_lightning.loggers.tensorboard import _TENSORBOARD_AVAILABLE from pytorch_lightning.utilities.imports import _OMEGACONF_AVAILABLE from tests_pytorch.helpers.runif import RunIf @@ -220,6 +221,7 @@ def test_tensorboard_log_graph(tmpdir, example_input_array): logger.log_graph(model, example_input_array) +@pytest.mark.skipif(not _TENSORBOARD_AVAILABLE, reason=str(_TENSORBOARD_AVAILABLE)) def test_tensorboard_log_graph_warning_no_example_input_array(tmpdir): """test that log graph throws warning if model.example_input_array is None.""" model = BoringModel() From 2ac70609705c5a09af5cff5c1b6f17acfefb5633 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 21 Nov 2022 12:19:47 +0100 Subject: [PATCH 18/29] resolve conflicts --- src/lightning_app/CHANGELOG.md | 6 +++--- src/lightning_lite/CHANGELOG.md | 6 +++--- .../strategies/launchers/subprocess_script.py | 1 + src/pytorch_lightning/CHANGELOG.md | 4 ++-- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index f1a975c2bb93f..2bf7043dcce7d 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -8,12 +8,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added -- +- ### Changed -- +- ### Fixed @@ -21,7 +21,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed debugging with VSCode IDE ([#15747](https://github.com/Lightning-AI/lightning/pull/15747)) -- +- ## [1.8.2] - 2022-11-17 diff --git a/src/lightning_lite/CHANGELOG.md b/src/lightning_lite/CHANGELOG.md index 63802002395a5..515bd44ce2415 100644 --- a/src/lightning_lite/CHANGELOG.md +++ b/src/lightning_lite/CHANGELOG.md @@ -8,17 +8,17 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added -- +- ### Changed -- +- ### Fixed -- +- ## [1.8.2] - 2022-11-17 diff --git a/src/lightning_lite/strategies/launchers/subprocess_script.py b/src/lightning_lite/strategies/launchers/subprocess_script.py index c9649f4639818..1fb8b9686bb41 100644 --- a/src/lightning_lite/strategies/launchers/subprocess_script.py +++ b/src/lightning_lite/strategies/launchers/subprocess_script.py @@ -14,6 +14,7 @@ import os import subprocess import sys +from time import sleep from typing import Any, Callable, Optional, Sequence, Tuple import numpy as np diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index b22ee5c8f994c..088454ba873a5 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -8,7 +8,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added -- +- ### Changed @@ -21,7 +21,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed -- +- ## [1.8.2] - 2022-11-17 From 9f383cb443f9b479673a3cfcbb46d2db904db498 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 22 Nov 2022 02:25:01 +0100 Subject: [PATCH 19/29] Fix azure path excludes (#15756) Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> (cherry picked from commit aef94ce52bd13c370225875dc5466503cff4b70e) --- .azure/app-cloud-e2e.yml | 7 ++++--- .azure/gpu-benchmark.yml | 7 ++++--- .azure/gpu-tests-lite.yml | 7 ++++--- .azure/gpu-tests-pytorch.yml | 7 ++++--- .azure/hpu-tests.yml | 7 ++++--- .azure/ipu-tests.yml | 7 ++++--- 6 files changed, 24 insertions(+), 18 deletions(-) diff --git a/.azure/app-cloud-e2e.yml b/.azure/app-cloud-e2e.yml index 24458c0daa385..0b0372e225adb 100644 --- a/.azure/app-cloud-e2e.yml +++ b/.azure/app-cloud-e2e.yml @@ -35,9 +35,10 @@ pr: - "tests/tests_app_examples/**" - "setup.py" - ".actions/**" - - "!requirements/app/docs.txt" - - "!*.md" - - "!**/*.md" + exclude: + - "requirements/app/docs.txt" + - "*.md" + - "**/*.md" # variables are automatically exported as environment variables so this will override pip's default cache dir variables: diff --git a/.azure/gpu-benchmark.yml b/.azure/gpu-benchmark.yml index 52ad4251d4300..6d16dcaf7ace5 100644 --- a/.azure/gpu-benchmark.yml +++ b/.azure/gpu-benchmark.yml @@ -23,9 +23,10 @@ pr: - ".azure/gpu-benchmark.yml" - "tests/tests_pytorch/benchmarks/**" - "requirements/pytorch/**" - - "!requirements/pytorch/docs.txt" - - "!*.md" - - "!**/*.md" + exclude: + - "requirements/pytorch/docs.txt" + - "*.md" + - "**/*.md" schedules: - cron: "0 0 * * *" # At the end of every day diff --git a/.azure/gpu-tests-lite.yml b/.azure/gpu-tests-lite.yml index 98ff44f879a71..63badbde16250 100644 --- a/.azure/gpu-tests-lite.yml +++ b/.azure/gpu-tests-lite.yml @@ -30,9 +30,10 @@ pr: - "tests/tests_lite/**" - "setup.cfg" # includes pytest config - ".actions/**" - - "!requirements/lite/docs.txt" - - "!*.md" - - "!**/*.md" + exclude: + - 
"requirements/lite/docs.txt" + - "*.md" + - "**/*.md" jobs: - job: testing diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 91fe0b6107bd1..2cd83595ed249 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -37,9 +37,10 @@ pr: - "requirements/lite/**" - "src/lightning_lite/**" - ".actions/**" - - "!requirements/**/docs.txt" - - "!*.md" - - "!**/*.md" + exclude: + - "requirements/**/docs.txt" + - "*.md" + - "**/*.md" jobs: - job: testing diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index 0c6851754f2a0..bf7ef020de038 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -26,9 +26,10 @@ pr: - "tests/tests_pytorch/**" - "setup.cfg" # includes pytest config - ".actions/**" - - "!requirements/**/docs.txt" - - "!*.md" - - "!**/*.md" + exclude: + - "requirements/**/docs.txt" + - "*.md" + - "**/*.md" jobs: - job: testing diff --git a/.azure/ipu-tests.yml b/.azure/ipu-tests.yml index d96adabf4a1ff..c5009a139c5f1 100644 --- a/.azure/ipu-tests.yml +++ b/.azure/ipu-tests.yml @@ -23,9 +23,10 @@ pr: - "tests/tests_pytorch/**" - "setup.cfg" # includes pytest config - ".actions/**" - - "!requirements/**/docs.txt" - - "!*.md" - - "!**/*.md" + exclude: + - "requirements/**/docs.txt" + - "*.md" + - "**/*.md" variables: - name: poplar_sdk From 50e2ec73467442e9b8618c39d0cdff3c3bc00e73 Mon Sep 17 00:00:00 2001 From: Luca Antiga Date: Tue, 22 Nov 2022 08:49:10 +0100 Subject: [PATCH 20/29] Disable XSRF protection in StreamlitFrontend to support upload in localhost (#15684) * Enable CORS in StreamlitFrontend to support upload * Only disable XSRF when running on localhost * Update test * Use utility fn to detect if localhost Co-authored-by: Luca Antiga (cherry picked from commit ed3eef0a3f253da99dde6cd1f067abe86e8ca176) --- src/lightning_app/frontend/stream_lit.py | 3 +++ tests/tests_app/frontend/test_stream_lit.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/src/lightning_app/frontend/stream_lit.py 
b/src/lightning_app/frontend/stream_lit.py index bc5ec21fb830c..e72248432441b 100644 --- a/src/lightning_app/frontend/stream_lit.py +++ b/src/lightning_app/frontend/stream_lit.py @@ -6,6 +6,7 @@ import lightning_app from lightning_app.frontend.frontend import Frontend +from lightning_app.utilities.cloud import is_running_in_cloud from lightning_app.utilities.imports import requires from lightning_app.utilities.log import get_logfile @@ -83,6 +84,8 @@ def start_server(self, host: str, port: int) -> None: self.flow.name, "--server.headless", "true", # do not open the browser window when running locally + "--server.enableXsrfProtection", + "true" if is_running_in_cloud() else "false", ], env=env, stdout=stdout, diff --git a/tests/tests_app/frontend/test_stream_lit.py b/tests/tests_app/frontend/test_stream_lit.py index cd3a394a89d75..1b3be42c0ae6f 100644 --- a/tests/tests_app/frontend/test_stream_lit.py +++ b/tests/tests_app/frontend/test_stream_lit.py @@ -54,6 +54,8 @@ def test_streamlit_frontend_start_stop_server(subprocess_mock): "root.my.flow", "--server.headless", "true", + "--server.enableXsrfProtection", + "false", ] assert env_variables["LIGHTNING_FLOW_NAME"] == "root.my.flow" From de7b37a1e417b35a4f0bab1ab24cb7d2ca2ebff9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 22 Nov 2022 09:15:33 +0100 Subject: [PATCH 21/29] Enable Probot CheckGroup v5.1 (#15763) (cherry picked from commit c55f80fcc886839bab194ff454a853c4575a8f26) --- .github/workflows/probot-check-group.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/probot-check-group.yml b/.github/workflows/probot-check-group.yml index 1aafee679da07..47a60061cc8a3 100644 --- a/.github/workflows/probot-check-group.yml +++ b/.github/workflows/probot-check-group.yml @@ -14,7 +14,7 @@ jobs: if: github.event.pull_request.draft == false timeout-minutes: 61 # in case something is wrong with the internal timeout steps: - - uses: Lightning-AI/probot@v5 + - 
uses: Lightning-AI/probot@v5.1 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: From a2c7ccef838a4ae1b7bfdcc314a4210d7175aabb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 22 Nov 2022 08:31:21 +0000 Subject: [PATCH 22/29] Bump pytest from 7.1.3 to 7.2.0 in /requirements (#15677) Bumps [pytest](https://github.com/pytest-dev/pytest) from 7.1.3 to 7.2.0. - [Release notes](https://github.com/pytest-dev/pytest/releases) - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest/compare/7.1.3...7.2.0) --- updated-dependencies: - dependency-name: pytest dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> (cherry picked from commit cfb27bd75b92dba86291e25abee24d52cdb1adb8) --- requirements/app/test.txt | 2 +- requirements/lite/test.txt | 2 +- requirements/pytorch/test.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements/app/test.txt b/requirements/app/test.txt index 4b50f1fff4285..3fc0e51c42215 100644 --- a/requirements/app/test.txt +++ b/requirements/app/test.txt @@ -1,6 +1,6 @@ coverage==6.5.0 codecov==2.1.12 -pytest==7.1.3 +pytest==7.2.0 pytest-timeout==2.1.0 pytest-cov==4.0.0 playwright==1.27.1 diff --git a/requirements/lite/test.txt b/requirements/lite/test.txt index 01759799ff133..98d5f23a7a8aa 100644 --- a/requirements/lite/test.txt +++ b/requirements/lite/test.txt @@ -1,5 +1,5 @@ coverage==6.5.0 codecov==2.1.12 -pytest==7.1.3 +pytest==7.2.0 pytest-cov==4.0.0 pre-commit==2.20.0 diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt index d27e3677690a5..537c897620229 100644 --- a/requirements/pytorch/test.txt +++ b/requirements/pytorch/test.txt @@ -1,6 +1,6 @@ coverage==6.5.0 codecov==2.1.12 -pytest==7.1.3 
+pytest==7.2.0 pytest-cov==4.0.0 pytest-forked==1.4.0 pytest-rerunfailures==10.2 From 00e9292add3d284c3a2bf1b45d91c8e45bc51252 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Tue, 22 Nov 2022 14:09:44 +0530 Subject: [PATCH 23/29] Fix the `examples/app_dag` App (#14359) * Fix app dag example * Add test * Update doc * Update tests/tests_app_examples/test_app_dag.py Co-authored-by: Sherin Thomas (cherry picked from commit 2b61c92ceb773da91d27fd0e33969699c3966799) --- .../examples/dag/dag_from_scratch.rst | 5 ++--- examples/app_dag/app.py | 6 ++---- src/lightning_app/CHANGELOG.md | 3 +++ src/lightning_app/testing/testing.py | 2 +- tests/tests_app_examples/test_app_dag.py | 21 +++++++++++++++++++ 5 files changed, 29 insertions(+), 8 deletions(-) create mode 100644 tests/tests_app_examples/test_app_dag.py diff --git a/docs/source-app/examples/dag/dag_from_scratch.rst b/docs/source-app/examples/dag/dag_from_scratch.rst index cde46953328bd..4af39f1af794e 100644 --- a/docs/source-app/examples/dag/dag_from_scratch.rst +++ b/docs/source-app/examples/dag/dag_from_scratch.rst @@ -39,10 +39,9 @@ First, let's define the component we need: :lines: 55-79 And its run method executes the steps described above. -Additionally, ``work.stop`` is used to reduce cost when running in the cloud. .. literalinclude:: ../../../examples/app_dag/app.py - :lines: 81-108 + :lines: 80-103 ---- @@ -51,4 +50,4 @@ Step 2: Define the scheduling ***************************** .. literalinclude:: ../../../examples/app_dag/app.py - :lines: 109-137 + :lines: 106-135 diff --git a/examples/app_dag/app.py b/examples/app_dag/app.py index 2c1cfb4309fd6..531e39028420e 100644 --- a/examples/app_dag/app.py +++ b/examples/app_dag/app.py @@ -56,7 +56,7 @@ class DAG(L.LightningFlow): """This component is a DAG.""" - def __init__(self, models_paths): + def __init__(self, models_paths: list): super().__init__() # Step 1: Create a work to get the data. 
self.data_collector = GetDataWork() @@ -80,12 +80,10 @@ def __init__(self, models_paths): def run(self): # Step 1 and 2: Download and process the data. self.data_collector.run() - self.data_collector.stop() # Stop the data_collector to reduce cost self.processing.run( df_data=self.data_collector.df_data, df_target=self.data_collector.df_target, ) - self.processing.stop() # Stop the processing to reduce cost # Step 3: Launch n models training in parallel. for model, work in self.dict.items(): @@ -128,7 +126,7 @@ def run(self): app = L.LightningApp( ScheduledDAG( DAG, - models=[ + models_paths=[ "svm.SVR", "linear_model.LinearRegression", "tree.DecisionTreeRegressor", diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 2bf7043dcce7d..3391e9c4626bf 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -208,6 +208,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Resolved a bug where the `install` command was not installing the latest version of an app/component by default ([#14181](https://github.com/Lightning-AI/lightning/pull/14181)) +- Fixed the `examples/app_dag` example ([#14359](https://github.com/Lightning-AI/lightning/pull/14359)) + + ## [0.5.5] - 2022-08-9 ### Deprecated diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py index 43aa7c55be728..fb087a2d62b71 100644 --- a/src/lightning_app/testing/testing.py +++ b/src/lightning_app/testing/testing.py @@ -219,7 +219,7 @@ def _run_cli(args) -> Generator: def run_app_in_cloud( app_folder: str, app_name: str = "app.py", extra_args: List[str] = [], debug: bool = True ) -> Generator: - """This utility is used to automate testing e2e application with lightning_app.ai.""" + """This utility is used to automate testing e2e application with lightning.ai.""" # 1. Validate the provide app_folder is correct. 
if not os.path.exists(os.path.join(app_folder, "app.py")): raise Exception("The app folder should contain an app.py file.") diff --git a/tests/tests_app_examples/test_app_dag.py b/tests/tests_app_examples/test_app_dag.py new file mode 100644 index 0000000000000..6d9a865a0a000 --- /dev/null +++ b/tests/tests_app_examples/test_app_dag.py @@ -0,0 +1,21 @@ +import os +from time import sleep + +import pytest +from tests_app import _PROJECT_ROOT + +from lightning_app.testing.testing import run_app_in_cloud + + +@pytest.mark.cloud +def test_app_dag_example_cloud() -> None: + with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_dag")) as (_, _, fetch_logs, _): + + launch_log, finish_log = False, False + while not (launch_log and finish_log): + for log in fetch_logs(["flow"]): + if "Launching a new DAG" in log: + launch_log = True + elif "Finished training and evaluating" in log: + finish_log = True + sleep(1) From afa51e389fbddb4e2f5ceb1fddffb206c0fc9543 Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Tue, 22 Nov 2022 10:12:58 +0100 Subject: [PATCH 24/29] mergify: drop ready for draft (#15766) (cherry picked from commit 1a07a9c7835e40c12d1118a3b3f21536ee5ed514) --- .github/mergify.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/mergify.yml b/.github/mergify.yml index eb69860666c06..efd58381e4a9b 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -50,6 +50,7 @@ pull_request_rules: - name: Not ready yet conditions: - or: + - draft # filter-out GH draft PRs - label="has conflicts" - "#approved-reviews-by=0" # number of review approvals - "#changes-requested-reviews-by>=1" # no requested changes From 555fe823eb88a9774522f64f69cc7e2a8e2a3c74 Mon Sep 17 00:00:00 2001 From: Rick Izzo Date: Tue, 22 Nov 2022 04:13:42 -0500 Subject: [PATCH 25/29] lightning delete cluster CLI command help text update (#15760) * updated the lighting delete cluster CLI command help text output * updated changelog * typo fix 
* Apply suggestions from code review Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> (cherry picked from commit 75b05733ab6044d1fc6c72ea3b848d0a61c15d66) --- src/lightning_app/cli/lightning_cli_delete.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/lightning_app/cli/lightning_cli_delete.py b/src/lightning_app/cli/lightning_cli_delete.py index 91b0222b28c04..d315fd2397799 100644 --- a/src/lightning_app/cli/lightning_cli_delete.py +++ b/src/lightning_app/cli/lightning_cli_delete.py @@ -33,17 +33,19 @@ def delete() -> None: help="Enabling this flag makes the CLI wait until the cluster is deleted.", ) def delete_cluster(cluster: str, force: bool = False, wait: bool = False) -> None: - """Delete a Lightning AI BYOC compute cluster and all associated cloud provider resources. + """Delete a Lightning AI BYOC cluster and all associated cloud provider resources. - Deleting a run also deletes all Runs and Experiments that were started on the cluster. - Deletion permanently removes not only the record of all runs on a cluster, but all associated experiments, - artifacts, metrics, logs, etc. + Deleting a cluster also deletes all apps that were started on the cluster. + Deletion permanently removes not only the record of all apps run on a cluster, + but all associated data, artifacts, metrics, logs, web-UIs, etc. - WARNING: This process may take a few minutes to complete, but once started it CANNOT be rolled back. - Deletion permanently removes not only the BYOC cluster from being managed by Lightning AI, but tears down - every BYOC resource Lightning AI managed (for that cluster id) in the host cloud. + WARNING: This process may take a few minutes to complete, but once started it + CANNOT be rolled back. Deletion tears down every cloud provider resource + managed by Lightning AI and permanently revokes the ability for Lightning AI + to create, manage, or access any resources within the host cloud account. 
- All object stores, container registries, logs, compute nodes, volumes, etc. are deleted and cannot be recovered. + All object stores, container registries, logs, compute nodes, volumes, + VPC components, etc. are irreversibly deleted and cannot be recovered! """ cluster_manager = AWSClusterManager() cluster_manager.delete(cluster_id=cluster, force=force, wait=wait) From eeacf693ecc57590f72681d35a9bdeaa63c5732d Mon Sep 17 00:00:00 2001 From: Rick Izzo Date: Tue, 22 Nov 2022 04:41:33 -0500 Subject: [PATCH 26/29] Deduplicate top level lighting CLI command groups (#15761) * unify remove and delete command groups & the add and delete command groups * added changelog * fix tests * Apply suggestions from code review Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> (cherry picked from commit 7b2788e39c113f03abaad37161653ce7f24941ea) --- src/lightning_app/CHANGELOG.md | 3 +- src/lightning_app/cli/cmd_ssh_keys.py | 2 +- src/lightning_app/cli/lightning_cli.py | 4 -- src/lightning_app/cli/lightning_cli_add.py | 40 ------------------- src/lightning_app/cli/lightning_cli_create.py | 32 ++++++++++++++- src/lightning_app/cli/lightning_cli_delete.py | 9 +++++ src/lightning_app/cli/lightning_cli_remove.py | 17 -------- tests/tests_app/cli/test_cli.py | 14 +++++-- 8 files changed, 53 insertions(+), 68 deletions(-) delete mode 100644 src/lightning_app/cli/lightning_cli_add.py delete mode 100644 src/lightning_app/cli/lightning_cli_remove.py diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 3391e9c4626bf..a793abb1e1776 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -13,7 +13,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Changed -- +- `lightning add ssh-key` CLI command has been transitioned to `lightning create ssh-key` with the same calling signature ([#15761](https://github.com/Lightning-AI/lightning/pull/15761)) +- `lightning remove ssh-key` CLI command has been transitioned to `lightning delete ssh-key` with the same calling signature ([#15761](https://github.com/Lightning-AI/lightning/pull/15761)) ### Fixed diff --git a/src/lightning_app/cli/cmd_ssh_keys.py b/src/lightning_app/cli/cmd_ssh_keys.py index f831df89c3367..1f32796076ea1 100644 --- a/src/lightning_app/cli/cmd_ssh_keys.py +++ b/src/lightning_app/cli/cmd_ssh_keys.py @@ -19,7 +19,7 @@ def as_json(self) -> str: return json.dumps(self.ssh_keys) def as_table(self) -> Table: - table = Table("id", "public_key", "created", show_header=True) + table = Table("id", "public_key", "created", show_header=True, header_style="bold green") for ssh_key in self.ssh_keys: table.add_row( ssh_key.id, diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py index 5a41f4b511ac0..2fbed3cd36a6c 100644 --- a/src/lightning_app/cli/lightning_cli.py +++ b/src/lightning_app/cli/lightning_cli.py @@ -24,11 +24,9 @@ disconnect, ) from lightning_app.cli.commands.logs import logs -from lightning_app.cli.lightning_cli_add import cli_add from lightning_app.cli.lightning_cli_create import create from lightning_app.cli.lightning_cli_delete import delete from lightning_app.cli.lightning_cli_list import get_list -from lightning_app.cli.lightning_cli_remove import cli_remove from lightning_app.core.constants import DEBUG, ENABLE_APP_COMMENT_COMMAND_EXECUTION, get_lightning_cloud_url from lightning_app.runners.runtime import dispatch from lightning_app.runners.runtime_type import RuntimeType @@ -381,8 +379,6 @@ def stop() -> None: _main.add_command(get_list) _main.add_command(delete) _main.add_command(create) -_main.add_command(cli_add) -_main.add_command(cli_remove) @_main.command("ssh") diff --git 
a/src/lightning_app/cli/lightning_cli_add.py b/src/lightning_app/cli/lightning_cli_add.py deleted file mode 100644 index 9e061d1810690..0000000000000 --- a/src/lightning_app/cli/lightning_cli_add.py +++ /dev/null @@ -1,40 +0,0 @@ -import os -from pathlib import Path -from typing import Optional, Union - -import click -from lightning_cloud.openapi.rest import ApiException - -from lightning_app.cli.cmd_ssh_keys import _SSHKeyManager - - -@click.group("add") -def cli_add() -> None: - """Add Lightning AI self-managed resources (ssh-keys, etc…)""" - pass - - -@cli_add.command("ssh-key") -@click.option("--name", "key_name", default=None, help="name of ssh key") -@click.option("--comment", "comment", default="", help="comment detailing your SSH key") -@click.option( - "--public-key", - "public_key", - help="public key or path to public key file", - required=True, -) -def add_ssh_key( - public_key: Union[str, "os.PathLike[str]"], key_name: Optional[str] = None, comment: Optional[str] = None -) -> None: - """Add a new Lightning AI ssh-key to your account.""" - ssh_key_manager = _SSHKeyManager() - - new_public_key = Path(str(public_key)).read_text() if os.path.isfile(str(public_key)) else public_key - try: - ssh_key_manager.add_key(name=key_name, comment=comment, public_key=str(new_public_key)) - except ApiException as e: - # if we got an exception it might be the user passed the private key file - if os.path.isfile(str(public_key)) and os.path.isfile(f"{public_key}.pub"): - ssh_key_manager.add_key(name=key_name, comment=comment, public_key=Path(f"{public_key}.pub").read_text()) - else: - raise e diff --git a/src/lightning_app/cli/lightning_cli_create.py b/src/lightning_app/cli/lightning_cli_create.py index 34c5d356f4989..75803056a85ce 100644 --- a/src/lightning_app/cli/lightning_cli_create.py +++ b/src/lightning_app/cli/lightning_cli_create.py @@ -1,8 +1,12 @@ -from typing import Any +import os +from pathlib import Path +from typing import Any, Optional, Union import click 
+from lightning_cloud.openapi.rest import ApiException from lightning_app.cli.cmd_clusters import _check_cluster_name_is_valid, AWSClusterManager +from lightning_app.cli.cmd_ssh_keys import _SSHKeyManager @click.group("create") @@ -77,3 +81,29 @@ def create_cluster( cost_savings=not enable_performance, wait=wait, ) + + +@create.command("ssh-key") +@click.option("--name", "key_name", default=None, help="name of ssh key") +@click.option("--comment", "comment", default="", help="comment detailing your SSH key") +@click.option( + "--public-key", + "public_key", + help="public key or path to public key file", + required=True, +) +def add_ssh_key( + public_key: Union[str, "os.PathLike[str]"], key_name: Optional[str] = None, comment: Optional[str] = None +) -> None: + """Add a new Lightning AI ssh-key to your account.""" + ssh_key_manager = _SSHKeyManager() + + new_public_key = Path(str(public_key)).read_text() if os.path.isfile(str(public_key)) else public_key + try: + ssh_key_manager.add_key(name=key_name, comment=comment, public_key=str(new_public_key)) + except ApiException as e: + # if we got an exception it might be the user passed the private key file + if os.path.isfile(str(public_key)) and os.path.isfile(f"{public_key}.pub"): + ssh_key_manager.add_key(name=key_name, comment=comment, public_key=Path(f"{public_key}.pub").read_text()) + else: + raise e diff --git a/src/lightning_app/cli/lightning_cli_delete.py b/src/lightning_app/cli/lightning_cli_delete.py index d315fd2397799..bbe2508c27d3a 100644 --- a/src/lightning_app/cli/lightning_cli_delete.py +++ b/src/lightning_app/cli/lightning_cli_delete.py @@ -1,6 +1,7 @@ import click from lightning_app.cli.cmd_clusters import AWSClusterManager +from lightning_app.cli.cmd_ssh_keys import _SSHKeyManager @click.group("delete") @@ -49,3 +50,11 @@ def delete_cluster(cluster: str, force: bool = False, wait: bool = False) -> Non """ cluster_manager = AWSClusterManager() cluster_manager.delete(cluster_id=cluster, force=force, 
wait=wait) + + +@delete.command("ssh-key") +@click.argument("key_id") +def remove_ssh_key(key_id: str) -> None: + """Delete a ssh-key from your Lightning AI account.""" + ssh_key_manager = _SSHKeyManager() + ssh_key_manager.remove_key(key_id=key_id) diff --git a/src/lightning_app/cli/lightning_cli_remove.py b/src/lightning_app/cli/lightning_cli_remove.py deleted file mode 100644 index 30112f29ced3d..0000000000000 --- a/src/lightning_app/cli/lightning_cli_remove.py +++ /dev/null @@ -1,17 +0,0 @@ -import click - -from lightning_app.cli.cmd_ssh_keys import _SSHKeyManager - - -@click.group("remove") -def cli_remove() -> None: - """Remove Lightning AI self-managed resources (ssh-keys, etc…)""" - pass - - -@cli_remove.command("ssh-key") -@click.argument("key_id") -def remove_ssh_key(key_id: str) -> None: - """Remove a ssh-key from your Lightning AI account.""" - ssh_key_manager = _SSHKeyManager() - ssh_key_manager.remove_key(key_id=key_id) diff --git a/tests/tests_app/cli/test_cli.py b/tests/tests_app/cli/test_cli.py index 620bd6ea9e74f..c3f5085d9c322 100644 --- a/tests/tests_app/cli/test_cli.py +++ b/tests/tests_app/cli/test_cli.py @@ -61,8 +61,6 @@ def test_main_lightning_cli_no_arguments(): assert "create " in res assert "show " in res assert "ssh " in res - assert "add " in res - assert "remove " in res def test_main_lightning_cli_help(): @@ -76,8 +74,6 @@ def test_main_lightning_cli_help(): assert "create " in res assert "show " in res assert "ssh " in res - assert "add " in res - assert "remove " in res res = os.popen("lightning run --help").read() assert "app " in res @@ -97,6 +93,16 @@ def test_main_lightning_cli_help(): res = os.popen("lightning show cluster --help").read() assert "logs " in res + # inspect create group + res = os.popen("lightning create --help").read() + assert "cluster " in res + assert "ssh-key " in res + + # inspect delete group + res = os.popen("lightning delete --help").read() + assert "cluster " in res + assert "ssh-key " in res + 
@mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) @mock.patch("lightning_app.cli.cmd_clusters.AWSClusterManager.create") From 696fe7de0564d1e0d3a83e1eb915bedb1400044e Mon Sep 17 00:00:00 2001 From: Jirka Date: Tue, 22 Nov 2022 16:53:03 +0100 Subject: [PATCH 27/29] releasing 1.8.3 --- src/lightning/__version__.py | 2 +- src/lightning_app/CHANGELOG.md | 41 ++++++++++++---------------- src/lightning_app/__version__.py | 2 +- src/lightning_lite/CHANGELOG.md | 14 ++-------- src/lightning_lite/__version__.py | 2 +- src/pytorch_lightning/CHANGELOG.md | 14 +--------- src/pytorch_lightning/__version__.py | 2 +- 7 files changed, 24 insertions(+), 53 deletions(-) diff --git a/src/lightning/__version__.py b/src/lightning/__version__.py index ba22724db3594..eaf9d19e02fdc 100644 --- a/src/lightning/__version__.py +++ b/src/lightning/__version__.py @@ -1 +1 @@ -version = "1.8.2" +version = "1.8.3" diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index a793abb1e1776..23419a48c92b3 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -4,25 +4,21 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
-## [1.8.3] - 2022-11-DD - -### Added - -- - +## [1.8.3] - 2022-11-22 ### Changed -- `lightning add ssh-key` CLI command has been transitioned to `lightning create ssh-key` with the same calling signature ([#15761](https://github.com/Lightning-AI/lightning/pull/15761)) -- `lightning remove ssh-key` CLI command has been transitioned to `lightning delete ssh-key` with the same calling signature ([#15761](https://github.com/Lightning-AI/lightning/pull/15761)) - +- Deduplicate top-level lightning CLI command groups ([#15761](https://github.com/Lightning-AI/lightning/pull/15761)) + * `lightning add ssh-key` CLI command has been transitioned to `lightning create ssh-key` + * `lightning remove ssh-key` CLI command has been transitioned to `lightning delete ssh-key` +- Set Torch inference mode for prediction ([#15719](https://github.com/Lightning-AI/lightning/pull/15719)) +- Improved `LightningTrainerScript` start-up time ([#15751](https://github.com/Lightning-AI/lightning/pull/15751)) +- Disable XSRF protection in `StreamlitFrontend` to support upload in localhost ([#15684](https://github.com/Lightning-AI/lightning/pull/15684)) ### Fixed - Fixed debugging with VSCode IDE ([#15747](https://github.com/Lightning-AI/lightning/pull/15747)) - - -- +- Fixed setting property to the `LightningFlow` ([#15750](https://github.com/Lightning-AI/lightning/pull/15750)) ## [1.8.2] - 2022-11-17 @@ -48,9 +44,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed catimage import ([#15712](https://github.com/Lightning-AI/lightning/pull/15712)) - Parse all lines in app file looking for shebangs to run commands ([#15714](https://github.com/Lightning-AI/lightning/pull/15714)) -- Fixed setting property to the LightningFlow ([#15750](https://github.com/Lightning-AI/lightning/pull/15750)) - - ## [1.8.1] - 2022-11-10 @@ -145,18 +138,18 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Changed -- Application storage prefix moved from `app_id` to `project_id/app_id` (#14583) -- LightningCloud client calls to use keyword arguments instead of positional arguments (#14685) +- Application storage prefix moved from `app_id` to `project_id/app_id` ([#14583](https://github.com/Lightning-AI/lightning/pull/14583)) +- LightningCloud client calls to use keyword arguments instead of positional arguments ([#14685](https://github.com/Lightning-AI/lightning/pull/14685)) ### Fixed -- Making `threadpool` non-default from LightningCloud client (#14757) -- Resolved a bug where the state change detection using DeepDiff won't work with Path, Drive objects (#14465) -- Resolved a bug where the wrong client was passed to collect cloud logs (#14684) -- Resolved the memory leak issue with the Lightning Cloud package and bumped the requirements to use the latest version (#14697) -- Fixing 5000 log line limitation for Lightning AI BYOC cluster logs (#14458) -- Fixed a bug where the uploaded command file wasn't properly parsed (#14532) -- Resolved `LightningApp(..., debug=True)` (#14464) +- Making `threadpool` non-default from LightningCloud client ([#14757](https://github.com/Lightning-AI/lightning/pull/14757)) +- Resolved a bug where the state change detection using DeepDiff won't work with Path, Drive objects ([#14465](https://github.com/Lightning-AI/lightning/pull/14465)) +- Resolved a bug where the wrong client was passed to collect cloud logs ([#14684](https://github.com/Lightning-AI/lightning/pull/14684)) +- Resolved the memory leak issue with the Lightning Cloud package and bumped the requirements to use the latest version ([#14697](https://github.com/Lightning-AI/lightning/pull/14697)) +- Fixing 5000 log line limitation for Lightning AI BYOC cluster logs ([#14458](https://github.com/Lightning-AI/lightning/pull/14458)) +- Fixed a bug where the uploaded command file wasn't properly parsed ([#14532](https://github.com/Lightning-AI/lightning/pull/14532)) +- Resolved 
`LightningApp(..., debug=True)` ([#14464](https://github.com/Lightning-AI/lightning/pull/14464)) ## [0.6.0] - 2022-09-08 diff --git a/src/lightning_app/__version__.py b/src/lightning_app/__version__.py index ba22724db3594..eaf9d19e02fdc 100644 --- a/src/lightning_app/__version__.py +++ b/src/lightning_app/__version__.py @@ -1 +1 @@ -version = "1.8.2" +version = "1.8.3" diff --git a/src/lightning_lite/CHANGELOG.md b/src/lightning_lite/CHANGELOG.md index 515bd44ce2415..3baf4a52c1054 100644 --- a/src/lightning_lite/CHANGELOG.md +++ b/src/lightning_lite/CHANGELOG.md @@ -4,21 +4,11 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [1.8.3] - 2022-11-DD - -### Added - -- - +## [1.8.3] - 2022-11-22 ### Changed -- - - -### Fixed - -- +- Temporarily removed support for Hydra multi-run ([#15737](https://github.com/Lightning-AI/lightning/pull/15737)) ## [1.8.2] - 2022-11-17 diff --git a/src/lightning_lite/__version__.py b/src/lightning_lite/__version__.py index ba22724db3594..eaf9d19e02fdc 100644 --- a/src/lightning_lite/__version__.py +++ b/src/lightning_lite/__version__.py @@ -1 +1 @@ -version = "1.8.2" +version = "1.8.3" diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 088454ba873a5..5a8d856713f27 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -4,26 +4,14 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
-## [1.8.3] - 2022-11-DD - -### Added - -- - +## [1.8.3] - 2022-11-22 ### Changed - Temporarily removed support for Hydra multi-run ([#15737](https://github.com/Lightning-AI/lightning/pull/15737)) - - - Switch from `tensorboard` to `tensorboardx` in `TensorBoardLogger` ([#15728](https://github.com/Lightning-AI/lightning/pull/15728)) -### Fixed - -- - - ## [1.8.2] - 2022-11-17 ### Fixed diff --git a/src/pytorch_lightning/__version__.py b/src/pytorch_lightning/__version__.py index ba22724db3594..eaf9d19e02fdc 100644 --- a/src/pytorch_lightning/__version__.py +++ b/src/pytorch_lightning/__version__.py @@ -1 +1 @@ -version = "1.8.2" +version = "1.8.3" From e002452508cebd56722bc6389228f185d7075e8b Mon Sep 17 00:00:00 2001 From: Jirka Date: Tue, 22 Nov 2022 17:27:55 +0100 Subject: [PATCH 28/29] CI: lite on GPU --- .azure/gpu-tests-lite.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.azure/gpu-tests-lite.yml b/.azure/gpu-tests-lite.yml index 63badbde16250..5533e72a72227 100644 --- a/.azure/gpu-tests-lite.yml +++ b/.azure/gpu-tests-lite.yml @@ -75,7 +75,6 @@ jobs: - bash: | PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") python ./requirements/pytorch/adjust-versions.py requirements/lite/base.txt ${PYTORCH_VERSION} - python ./requirements/pytorch/adjust-versions.py requirements/lite/examples.txt ${PYTORCH_VERSION} displayName: 'Adjust dependencies' - bash: | From ed26322f25ef392f3c31aaa2f672242ea245f6e0 Mon Sep 17 00:00:00 2001 From: Rick Izzo Date: Tue, 22 Nov 2022 15:39:20 -0500 Subject: [PATCH 29/29] Fix App Docs for lightning ssh-keys command (#15773) fixed ssh-keys docs (cherry picked from commit 317591d2e2144b4837bf54763b493ac7bd9cc049) --- docs/source-app/workflows/ssh/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source-app/workflows/ssh/index.rst b/docs/source-app/workflows/ssh/index.rst index 28c25409dfbb4..e0bcbc103739c 100644 --- a/docs/source-app/workflows/ssh/index.rst +++ 
b/docs/source-app/workflows/ssh/index.rst @@ -48,7 +48,7 @@ You can add SSH keys using Lightning.ai website (Lightning.ai > Profile > Keys) .. code:: bash - $ lightning add ssh-key --public-key ~/.ssh/id_ed25519.pub + $ lightning create ssh-key --public-key ~/.ssh/id_ed25519.pub You are now ready to access your Lightning Flow and Work containers.