From 4f0e6adee56b3322378272e6d7a08394394ad162 Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Thu, 15 Dec 2022 16:20:35 +0000 Subject: [PATCH 01/10] Add function to remove checkpoint to allow override for extended classes (#16067) (cherry picked from commit 10cc6773e66515dcbda126c442cc4d9a9bbf669d) --- src/pytorch_lightning/callbacks/model_checkpoint.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/pytorch_lightning/callbacks/model_checkpoint.py b/src/pytorch_lightning/callbacks/model_checkpoint.py index 2e7b9bbb27b29..b48a31c882306 100644 --- a/src/pytorch_lightning/callbacks/model_checkpoint.py +++ b/src/pytorch_lightning/callbacks/model_checkpoint.py @@ -640,7 +640,7 @@ def _save_last_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: Dict[ previous, self.last_model_path = self.last_model_path, filepath self._save_checkpoint(trainer, filepath) if previous and previous != filepath: - trainer.strategy.remove_checkpoint(previous) + self._remove_checkpoint(trainer, previous) def _save_monitor_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: Dict[str, Tensor]) -> None: assert self.monitor @@ -659,7 +659,7 @@ def _save_none_monitor_checkpoint(self, trainer: "pl.Trainer", monitor_candidate previous, self.best_model_path = self.best_model_path, filepath self._save_checkpoint(trainer, filepath) if self.save_top_k == 1 and previous and previous != filepath: - trainer.strategy.remove_checkpoint(previous) + self._remove_checkpoint(trainer, previous) def _update_best_and_save( self, current: Tensor, trainer: "pl.Trainer", monitor_candidates: Dict[str, Tensor] @@ -701,7 +701,7 @@ def _update_best_and_save( self._save_checkpoint(trainer, filepath) if del_filepath is not None and filepath != del_filepath: - trainer.strategy.remove_checkpoint(del_filepath) + self._remove_checkpoint(trainer, del_filepath) def to_yaml(self, filepath: Optional[_PATH] = None) -> None: """Saves the `best_k_models` dict containing the checkpoint paths with the corresponding scores to a YAML @@ -718,3 +718,7 @@ def file_exists(self, filepath: _PATH, trainer: "pl.Trainer") -> bool: state to diverge between ranks.""" exists = self._fs.exists(filepath) return trainer.strategy.broadcast(exists) + + def _remove_checkpoint(self, trainer: "pl.Trainer", filepath: str) -> None: + """Calls the strategy to remove the checkpoint file.""" + trainer.strategy.remove_checkpoint(filepath) From 688957441caadf9deeb29fe9a7ff9e583f37b4ca Mon Sep 17 00:00:00 2001 From: Qiushi Pan <17402261+qqpann@users.noreply.github.com> Date: Fri, 16 Dec 2022 01:44:10 +0900 Subject: [PATCH 02/10] minor fix: indent spaces in comment-out (#16076) (cherry picked from commit 385e5e2a406109866f4cb0f54490b706d2479471) --- examples/pl_loops/kfold.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/pl_loops/kfold.py b/examples/pl_loops/kfold.py index af777df6f211d..38103d95ff3a6 100644 --- a/examples/pl_loops/kfold.py +++ b/examples/pl_loops/kfold.py @@ -152,12 +152,12 @@ def test_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> None # self.reset(...) # # self.on_run_start(...) # # # -# while not self.done: # -# self.on_advance_start(...) # -# self.advance(...) # -# self.on_advance_end(...) # +# while not self.done: # +# self.on_advance_start(...) # +# self.advance(...) # +# self.on_advance_end(...) # # # -# return self.on_run_end(...) # +# return self.on_run_end(...) # ############################################################################################# From f11da17611fab98b4956d01146e358cca7a54748 Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Fri, 16 Dec 2022 08:22:22 +0900 Subject: [PATCH 03/10] ci: print existing candidates (#16077) (cherry picked from commit 9e89aed037d258fc6477c7666cc2d59025c8c312) --- .github/workflows/release-pypi.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml index 41b84d6a343a0..23d931ce96256 100644 --- a/.github/workflows/release-pypi.yml +++ b/.github/workflows/release-pypi.yml @@ -108,6 +108,7 @@ jobs: branch = f"origin/builds/{os.getenv('TAG')}" while True: remote_refs = [b.name for b in repo.remote().refs] + print([n for n in remote_refs if "builds" in n]) if branch in remote_refs: break time.sleep(60) From e647b88b474f356b35575cbbc8bf3be925b1c0b5 Mon Sep 17 00:00:00 2001 From: Ethan Harris Date: Fri, 16 Dec 2022 03:06:05 +0000 Subject: [PATCH 04/10] [App] Fix bug where previously deleted apps cannot be re-run from the CLI (#16082) (cherry picked from commit 5f7403e99cbf4b4f7e7aca8574d91fc4879b7c6e) --- src/lightning_app/CHANGELOG.md | 2 + src/lightning_app/runners/cloud.py | 38 +++++++----- tests/tests_app/cli/test_cloud_cli.py | 6 +- tests/tests_app/runners/test_cloud.py | 84 +++++++++++++++++++++++++++ 4 files changed, 113 insertions(+), 17 deletions(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 1b38b2d35546d..7f0000d1e448d 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -25,6 +25,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed and improvements of login flow ([#16052](https://github.com/Lightning-AI/lightning/pull/16052)) - Fixed the debugger detection mechanism for lightning App in VSCode ([#16068](https://github.com/Lightning-AI/lightning/pull/16068)) +- Fixed a bug where apps that had previously been deleted could not be run again from the CLI ([#16082](https://github.com/Lightning-AI/lightning/pull/16082)) + ## [1.8.4] - 2022-12-08 diff --git a/src/lightning_app/runners/cloud.py b/src/lightning_app/runners/cloud.py index 8a7170ee0244a..0011c29ce0e6b 100644 --- a/src/lightning_app/runners/cloud.py +++ b/src/lightning_app/runners/cloud.py @@ -320,52 +320,58 @@ def dispatch( self._ensure_cluster_project_binding(project.project_id, cluster_id) # Resolve the app name, instance, and cluster ID + existing_app = None existing_instance = None app_name = app_config.name - # List existing instances + # List existing apps # TODO: Add pagination, otherwise this could break if users have a lot of apps. - find_instances_resp = self.backend.client.lightningapp_instance_service_list_lightningapp_instances( + all_apps = self.backend.client.lightningapp_v2_service_list_lightningapps_v2( project_id=project.project_id - ) + ).lightningapps - # Seach for instances with the given name (possibly with some random characters appended) + # Seach for apps with the given name (possibly with some random characters appended) pattern = re.escape(f"{app_name}-") + ".{4}" - instances = [ + all_apps = [ lightningapp - for lightningapp in find_instances_resp.lightningapps + for lightningapp in all_apps if lightningapp.name == app_name or (re.fullmatch(pattern, lightningapp.name) is not None) ] - # If instances exist and cluster is None, mimic cluster selection logic to choose a default - if cluster_id is None and len(instances) > 0: + # If apps exist and cluster is None, mimic cluster selection logic to choose a default + if cluster_id is None and len(all_apps) > 0: # Determine the cluster ID cluster_id = self._get_default_cluster(project.project_id) # If an instance exists on the cluster with the same base name - restart it - for instance in instances: - if instance.spec.cluster_id == cluster_id: - existing_instance = instance + for app in all_apps: + instances = self.backend.client.lightningapp_instance_service_list_lightningapp_instances( + project_id=project.project_id, + app_id=app.id, + ).lightningapps + if instances and instances[0].spec.cluster_id == cluster_id: + existing_app = app + existing_instance = instances[0] break - # If instances exist but not on the cluster - choose a randomised name - if len(instances) > 0 and existing_instance is None: + # If apps exist but not on the cluster - choose a randomised name + if len(all_apps) > 0 and existing_app is None: name_exists = True while name_exists: random_name = self._randomise_name(app_name) - name_exists = any([instance.name == random_name for instance in instances]) + name_exists = any([app.name == random_name for app in all_apps]) app_name = random_name # Create the app if it doesn't exist - if existing_instance is None: + if existing_app is None: app_body = Body7(name=app_name, can_download_source_code=True) lit_app = self.backend.client.lightningapp_v2_service_create_lightningapp_v2( project_id=project.project_id, body=app_body ) app_id = lit_app.id else: - app_id = existing_instance.spec.app_id + app_id = existing_app.id # check if user has sufficient credits to run an app # if so set the desired state to running otherwise, create the app in stopped state, diff --git a/tests/tests_app/cli/test_cloud_cli.py b/tests/tests_app/cli/test_cloud_cli.py index 598a90368da88..3b9317a3a9613 100644 --- a/tests/tests_app/cli/test_cloud_cli.py +++ b/tests/tests_app/cli/test_cloud_cli.py @@ -11,6 +11,7 @@ from lightning_cloud.openapi import ( V1LightningappV2, V1ListLightningappInstancesResponse, + V1ListLightningappsV2Response, V1ListMembershipsResponse, V1Membership, ) @@ -36,6 +37,9 @@ class FakeResponse: class FakeLightningClient: + def lightningapp_v2_service_list_lightningapps_v2(self, *args, **kwargs): + return V1ListLightningappsV2Response(lightningapps=[]) + def lightningapp_instance_service_list_lightningapp_instances(self, *args, **kwargs): return V1ListLightningappInstancesResponse(lightningapps=[]) @@ -182,7 +186,7 @@ def __init__(self, *args, message, **kwargs): super().__init__() self.message = message - def lightningapp_instance_service_list_lightningapp_instances(self, *args, **kwargs): + def lightningapp_v2_service_list_lightningapps_v2(self, *args, **kwargs): raise ApiException( http_resp=HttpHeaderDict( data=self.message, diff --git a/tests/tests_app/runners/test_cloud.py b/tests/tests_app/runners/test_cloud.py index 4d1ebda6ccdc5..b5d05c3e7c5cd 100644 --- a/tests/tests_app/runners/test_cloud.py +++ b/tests/tests_app/runners/test_cloud.py @@ -31,6 +31,7 @@ V1LightningworkSpec, V1ListClustersResponse, V1ListLightningappInstancesResponse, + V1ListLightningappsV2Response, V1ListMembershipsResponse, V1ListProjectClusterBindingsResponse, V1Membership, @@ -210,6 +211,13 @@ def test_new_instance_on_different_cluster(self, cloud_backend, project_id, old_ app.flows = [] app.frontend = {} + existing_app = MagicMock() + existing_app.name = app_name + existing_app.id = "test-id" + mock_client.lightningapp_v2_service_list_lightningapps_v2.return_value = V1ListLightningappsV2Response( + lightningapps=[existing_app] + ) + existing_instance = MagicMock() existing_instance.name = app_name existing_instance.status.phase = V1LightningappInstanceState.STOPPED @@ -234,6 +242,67 @@ def test_new_instance_on_different_cluster(self, cloud_backend, project_id, old_ assert args[1]["body"].name.startswith(app_name) assert args[1]["body"].cluster_id == new_cluster + def test_running_deleted_app(self, cloud_backend, project_id): + """Deleted apps show up in list apps but not in list instances. + + This tests that we don't try to reacreate a previously deleted app. + """ + app_name = "test-app" + + mock_client = mock.MagicMock() + mock_client.projects_service_list_memberships.return_value = V1ListMembershipsResponse( + memberships=[V1Membership(name="Default Project", project_id=project_id)] + ) + mock_client.lightningapp_v2_service_create_lightningapp_release.return_value = V1LightningappRelease( + cluster_id=DEFAULT_CLUSTER + ) + + mock_client.cluster_service_list_clusters.return_value = V1ListClustersResponse( + [ + Externalv1Cluster(id=DEFAULT_CLUSTER), + ] + ) + + mock_client.projects_service_list_project_cluster_bindings.return_value = V1ListProjectClusterBindingsResponse( + clusters=[ + V1ProjectClusterBinding(cluster_id=DEFAULT_CLUSTER), + ] + ) + + # Mock all clusters as global clusters + mock_client.cluster_service_get_cluster.side_effect = lambda cluster_id: V1GetClusterResponse( + id=cluster_id, spec=V1ClusterSpec(cluster_type=V1ClusterType.GLOBAL) + ) + + cloud_backend.client = mock_client + + app = mock.MagicMock() + app.flows = [] + app.frontend = {} + + existing_app = MagicMock() + existing_app.name = app_name + existing_app.id = "test-id" + mock_client.lightningapp_v2_service_list_lightningapps_v2.return_value = V1ListLightningappsV2Response( + lightningapps=[existing_app] + ) + + # Simulate the app as deleted so no instance to return + mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = ( + V1ListLightningappInstancesResponse(lightningapps=[]) + ) + + cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file="entrypoint.py") + cloud_runtime._check_uploaded_folder = mock.MagicMock() + + cloud_runtime.dispatch(name=app_name) + + # Check that a new name was used which starts with and does not equal the old name + mock_client.lightningapp_v2_service_create_lightningapp_release_instance.assert_called_once() + args = mock_client.lightningapp_v2_service_create_lightningapp_release_instance.call_args + assert args[1]["body"].name != app_name + assert args[1]["body"].name.startswith(app_name) + @pytest.mark.parametrize("flow_cloud_compute", [None, CloudCompute(name="t2.medium")]) @mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock()) def test_run_with_default_flow_compute_config(self, monkeypatch, flow_cloud_compute): @@ -458,6 +527,9 @@ def test_call_with_work_app(self, lightningapps, start_with_flow, monkeypatch, t lightningapps[0].name = "myapp" lightningapps[0].status.phase = V1LightningappInstanceState.STOPPED lightningapps[0].spec.cluster_id = "test" + mock_client.lightningapp_v2_service_list_lightningapps_v2.return_value = V1ListLightningappsV2Response( + lightningapps=lightningapps + ) mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = ( V1ListLightningappInstancesResponse(lightningapps=lightningapps) ) @@ -632,6 +704,9 @@ def test_call_with_work_app_and_attached_drives(self, lightningapps, monkeypatch lightningapps[0].name = "myapp" lightningapps[0].status.phase = V1LightningappInstanceState.STOPPED lightningapps[0].spec.cluster_id = "test" + mock_client.lightningapp_v2_service_list_lightningapps_v2.return_value = V1ListLightningappsV2Response( + lightningapps=lightningapps + ) mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = ( V1ListLightningappInstancesResponse(lightningapps=lightningapps) ) @@ -786,6 +861,9 @@ def test_call_with_work_app_and_app_comment_command_execution_set(self, lightnin mock_client.cluster_service_get_cluster.side_effect = lambda cluster_id: V1GetClusterResponse( id=cluster_id, spec=V1ClusterSpec(cluster_type=V1ClusterType.GLOBAL) ) + mock_client.lightningapp_v2_service_list_lightningapps_v2.return_value = V1ListLightningappsV2Response( + lightningapps=lightningapps + ) mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = ( V1ListLightningappInstancesResponse(lightningapps=lightningapps) ) @@ -912,6 +990,9 @@ def test_call_with_work_app_and_multiple_attached_drives(self, lightningapps, mo mock_client.cluster_service_get_cluster.side_effect = lambda cluster_id: V1GetClusterResponse( id=cluster_id, spec=V1ClusterSpec(cluster_type=V1ClusterType.GLOBAL) ) + mock_client.lightningapp_v2_service_list_lightningapps_v2.return_value = V1ListLightningappsV2Response( + lightningapps=lightningapps + ) mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = ( V1ListLightningappInstancesResponse(lightningapps=lightningapps) ) @@ -1127,6 +1208,9 @@ def test_call_with_work_app_and_attached_mount_and_drive(self, lightningapps, mo mock_client.cluster_service_get_cluster.side_effect = lambda cluster_id: V1GetClusterResponse( id=cluster_id, spec=V1ClusterSpec(cluster_type=V1ClusterType.GLOBAL) ) + mock_client.lightningapp_v2_service_list_lightningapps_v2.return_value = V1ListLightningappsV2Response( + lightningapps=lightningapps + ) mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = ( V1ListLightningappInstancesResponse(lightningapps=lightningapps) ) From 4ecb340f1b6ff59b37320882ec156ec0fad1a089 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 16 Dec 2022 10:49:17 +0100 Subject: [PATCH 05/10] Better check for programmatic lightningignore (#16080) Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> (cherry picked from commit b1ce2639f4371b9cac55dcf2fc6ece3945e8a000) --- .github/workflows/ci-app-examples.yml | 3 +- .../basic/hello_components/pl_multinode.py | 2 +- examples/app_multi_node/train_lite.py | 2 +- examples/app_multi_node/train_lt.py | 4 +- examples/app_multi_node/train_lt_script.py | 4 +- examples/app_multi_node/train_pytorch.py | 2 +- .../app_multi_node/train_pytorch_spawn.py | 4 +- src/lightning_app/CHANGELOG.md | 4 ++ src/lightning_app/core/flow.py | 10 +---- src/lightning_app/core/work.py | 9 +--- .../components/multi_node/test_trainer.py | 2 +- tests/tests_app/runners/test_cloud.py | 4 -- tests/tests_examples_app/conftest.py | 10 ++--- tests/tests_examples_app/local/__init__.py | 6 +-- tests/tests_examples_app/public/__init__.py | 6 +-- .../public/test_multi_node.py | 43 +++++++------------ 16 files changed, 44 insertions(+), 71 deletions(-) diff --git a/.github/workflows/ci-app-examples.yml b/.github/workflows/ci-app-examples.yml index 2046cd940aafb..201f3981f2619 100644 --- a/.github/workflows/ci-app-examples.yml +++ b/.github/workflows/ci-app-examples.yml @@ -89,7 +89,8 @@ jobs: - name: Install Lightning package env: PACKAGE_NAME: ${{ matrix.pkg-name }} - run: pip install -e . + # do not use -e because it will make both packages available since it adds `src` to `sys.path` automatically + run: pip install . - name: Adjust tests if: ${{ matrix.pkg-name == 'lightning' }} diff --git a/docs/source-app/levels/basic/hello_components/pl_multinode.py b/docs/source-app/levels/basic/hello_components/pl_multinode.py index e6764ee8fafae..9df12ec732684 100644 --- a/docs/source-app/levels/basic/hello_components/pl_multinode.py +++ b/docs/source-app/levels/basic/hello_components/pl_multinode.py @@ -10,7 +10,7 @@ def run(self): trainer = L.Trainer(max_epochs=10, strategy="ddp") trainer.fit(model) -# 8 GPU: (2 nodes of 4 x v100) +# 8 GPUs: (2 nodes of 4 x v100) component = LightningTrainerMultiNode( LightningTrainerDistributed, num_nodes=4, diff --git a/examples/app_multi_node/train_lite.py b/examples/app_multi_node/train_lite.py index 8e546b270a693..89c983a1f4004 100644 --- a/examples/app_multi_node/train_lite.py +++ b/examples/app_multi_node/train_lite.py @@ -31,7 +31,7 @@ def run(self): optimizer.step() -# Run over 2 nodes of 4 x V100 +# 8 GPUs: (2 nodes of 4 x v100) app = L.LightningApp( LiteMultiNode( LitePyTorchDistributed, diff --git a/examples/app_multi_node/train_lt.py b/examples/app_multi_node/train_lt.py index 4abe375c89b9b..8ed62a10fb9de 100644 --- a/examples/app_multi_node/train_lt.py +++ b/examples/app_multi_node/train_lt.py @@ -11,10 +11,10 @@ def run(self): trainer.fit(model) -# 8 GPU: (2 nodes of 4 x v100) +# 8 GPUs: (2 nodes of 4 x v100) component = LightningTrainerMultiNode( LightningTrainerDistributed, - num_nodes=4, + num_nodes=2, cloud_compute=L.CloudCompute("gpu-fast-multi"), # 4 x v100 ) app = L.LightningApp(component) diff --git a/examples/app_multi_node/train_lt_script.py b/examples/app_multi_node/train_lt_script.py index d2254e19daac0..58f847368346c 100644 --- a/examples/app_multi_node/train_lt_script.py +++ b/examples/app_multi_node/train_lt_script.py @@ -2,11 +2,11 @@ from lightning.app.components import LightningTrainerScript from lightning.app.utilities.packaging.cloud_compute import CloudCompute -# Run over 2 nodes of 4 x V100 +# 8 GPUs: (2 nodes of 4 x v100) app = L.LightningApp( LightningTrainerScript( "pl_boring_script.py", num_nodes=2, - cloud_compute=CloudCompute("gpu-fast-multi"), + cloud_compute=CloudCompute("gpu-fast-multi"), # 4 x v100 ), ) diff --git a/examples/app_multi_node/train_pytorch.py b/examples/app_multi_node/train_pytorch.py index cc9e84297c151..e5a9a1fc93e3b 100644 --- a/examples/app_multi_node/train_pytorch.py +++ b/examples/app_multi_node/train_pytorch.py @@ -56,6 +56,6 @@ def run(self, main_address: str, main_port: int, num_nodes: int, node_rank: int) # 8 GPUs: (2 nodes x 4 v 100) -compute = L.CloudCompute("gpu-fast-multi") # 4xV100 +compute = L.CloudCompute("gpu-fast-multi") # 4 x v100 component = MultiNode(PyTorchDistributed, num_nodes=2, cloud_compute=compute) app = L.LightningApp(component) diff --git a/examples/app_multi_node/train_pytorch_spawn.py b/examples/app_multi_node/train_pytorch_spawn.py index d29ec83562ffb..165a0c77dbfa9 100644 --- a/examples/app_multi_node/train_pytorch_spawn.py +++ b/examples/app_multi_node/train_pytorch_spawn.py @@ -42,11 +42,11 @@ def run( optimizer.step() -# Run over 2 nodes of 4 x V100 +# 8 GPUs: (2 nodes x 4 v 100) app = L.LightningApp( PyTorchSpawnMultiNode( PyTorchDistributed, num_nodes=2, - cloud_compute=L.CloudCompute("gpu-fast-multi"), # 4 x V100 + cloud_compute=L.CloudCompute("gpu-fast-multi"), # 4 x v100 ) ) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 7f0000d1e448d..9e570f9807f4d 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -25,6 +25,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed and improvements of login flow ([#16052](https://github.com/Lightning-AI/lightning/pull/16052)) - Fixed the debugger detection mechanism for lightning App in VSCode ([#16068](https://github.com/Lightning-AI/lightning/pull/16068)) + +- Fixed bug where components that are re-instantiated several times failed to initialize if they were modifying `self.lightningignore` ([#16080](https://github.com/Lightning-AI/lightning/pull/16080)) + + - Fixed a bug where apps that had previously been deleted could not be run again from the CLI ([#16082](https://github.com/Lightning-AI/lightning/pull/16082)) diff --git a/src/lightning_app/core/flow.py b/src/lightning_app/core/flow.py index 302ba344320d1..0be6b6f8ade98 100644 --- a/src/lightning_app/core/flow.py +++ b/src/lightning_app/core/flow.py @@ -10,13 +10,7 @@ from lightning_app.frontend import Frontend from lightning_app.storage import Path from lightning_app.storage.drive import _maybe_create_drive, Drive -from lightning_app.utilities.app_helpers import ( - _is_json_serializable, - _lightning_dispatched, - _LightningAppRef, - _set_child_name, - is_overridden, -) +from lightning_app.utilities.app_helpers import _is_json_serializable, _LightningAppRef, _set_child_name, is_overridden from lightning_app.utilities.component import _sanitize_state from lightning_app.utilities.exceptions import ExitAppException from lightning_app.utilities.introspection import _is_init_context, _is_run_context @@ -325,7 +319,7 @@ def lightningignore(self) -> Tuple[str, ...]: @lightningignore.setter def lightningignore(self, lightningignore: Tuple[str, ...]) -> None: - if _lightning_dispatched(): + if self._backend is not None: raise RuntimeError( f"Your app has been already dispatched, so modifying the `{self.name}.lightningignore` does not have an" " effect" diff --git a/src/lightning_app/core/work.py b/src/lightning_app/core/work.py index 43ffc0006d5ea..029f01fd2f7ae 100644 --- a/src/lightning_app/core/work.py +++ b/src/lightning_app/core/work.py @@ -11,12 +11,7 @@ from lightning_app.storage import Path from lightning_app.storage.drive import _maybe_create_drive, Drive from lightning_app.storage.payload import Payload -from lightning_app.utilities.app_helpers import ( - _is_json_serializable, - _lightning_dispatched, - _LightningAppRef, - is_overridden, -) +from lightning_app.utilities.app_helpers import _is_json_serializable, _LightningAppRef, is_overridden from lightning_app.utilities.component import _is_flow_context, _sanitize_state from lightning_app.utilities.enum import ( CacheCallsKeys, @@ -267,7 +262,7 @@ def lightningignore(self) -> Tuple[str, ...]: @lightningignore.setter def lightningignore(self, lightningignore: Tuple[str, ...]) -> None: - if _lightning_dispatched(): + if self._backend is not None: raise RuntimeError( f"Your app has been already dispatched, so modifying the `{self.name}.lightningignore` does not have an" " effect" diff --git a/tests/tests_app/components/multi_node/test_trainer.py b/tests/tests_app/components/multi_node/test_trainer.py index c86e0968e2ab0..3cfb26f1441b6 100644 --- a/tests/tests_app/components/multi_node/test_trainer.py +++ b/tests/tests_app/components/multi_node/test_trainer.py @@ -66,7 +66,7 @@ def test_trainer_run_executor_mps_forced_cpu(accelerator_given, accelerator_expe ({"strategy": "ddp_sharded_spawn"}, {"strategy": "ddp_sharded"}), ], ) -@pytest.mark.skipif(not module_available("pytorch"), reason="Lightning is not available") +@pytest.mark.skipif(not module_available("torch"), reason="PyTorch is not available") def test_trainer_run_executor_arguments_choices( args_given: dict, args_expected: dict, diff --git a/tests/tests_app/runners/test_cloud.py b/tests/tests_app/runners/test_cloud.py index b5d05c3e7c5cd..cb4bd5ddaa3c0 100644 --- a/tests/tests_app/runners/test_cloud.py +++ b/tests/tests_app/runners/test_cloud.py @@ -1582,8 +1582,6 @@ def run(self): def test_programmatic_lightningignore(monkeypatch, caplog, tmpdir): - monkeypatch.setenv("LIGHTNING_DISPATCHED", "0") # this is not cleaned up - mock_client = mock.MagicMock() mock_client.projects_service_list_memberships.return_value = V1ListMembershipsResponse( memberships=[V1Membership(name="test-project", project_id="test-project-id")] @@ -1650,8 +1648,6 @@ def run(self): assert "2 files were uploaded" # a.txt and .lightningignore assert "files:\n5.0 MB: a.txt\nPerhaps" in caplog.text # only this file appears - # replicate how the app would dispatch the app, and call `run` - monkeypatch.setenv("LIGHTNING_DISPATCHED", "1") flow.run() diff --git a/tests/tests_examples_app/conftest.py b/tests/tests_examples_app/conftest.py index fcefa6287c3c6..fa2c2a14dcbba 100644 --- a/tests/tests_examples_app/conftest.py +++ b/tests/tests_examples_app/conftest.py @@ -5,8 +5,8 @@ import psutil import pytest +from tests_examples_app.public import _PATH_EXAMPLES -from lightning_app import _PROJECT_ROOT from lightning_app.storage.path import _storage_root_dir from lightning_app.utilities.component import _set_context from lightning_app.utilities.packaging import cloud_compute @@ -24,11 +24,11 @@ def pytest_sessionstart(*_): """Pytest hook that get called after the Session object has been created and before performing collection and entering the run test loop.""" for name, url in GITHUB_APP_URLS.items(): - if not os.path.exists(os.path.join(_PROJECT_ROOT, "examples", name)): - path_examples = os.path.join(_PROJECT_ROOT, "examples") - Popen(["git", "clone", url, name], cwd=path_examples).wait(timeout=90) + app_path = _PATH_EXAMPLES / name + if not os.path.exists(app_path): + Popen(["git", "clone", url, name], cwd=_PATH_EXAMPLES).wait(timeout=90) else: - Popen(["git", "pull", "main"], cwd=os.path.join(_PROJECT_ROOT, "examples", name)).wait(timeout=90) + Popen(["git", "pull", "main"], cwd=app_path).wait(timeout=90) def pytest_sessionfinish(session, exitstatus): diff --git a/tests/tests_examples_app/local/__init__.py b/tests/tests_examples_app/local/__init__.py index dbd6181f8e89a..1e7d17cc6b536 100644 --- a/tests/tests_examples_app/local/__init__.py +++ b/tests/tests_examples_app/local/__init__.py @@ -1,5 +1,3 @@ -import os +from pathlib import Path -from lightning_app import _PROJECT_ROOT - -_PATH_APPS = os.path.join(_PROJECT_ROOT, "tests", "tests_examples_app", "apps") +_PATH_APPS = Path(__file__).resolve().parents[1] / "apps" diff --git a/tests/tests_examples_app/public/__init__.py b/tests/tests_examples_app/public/__init__.py index b70149ce10b11..1f1db5e15eaee 100644 --- a/tests/tests_examples_app/public/__init__.py +++ b/tests/tests_examples_app/public/__init__.py @@ -1,5 +1,3 @@ -import os +from pathlib import Path -from lightning_app import _PROJECT_ROOT - -_PATH_EXAMPLES = os.path.join(_PROJECT_ROOT, "examples") +_PATH_EXAMPLES = Path(__file__).resolve().parents[3] / "examples" diff --git a/tests/tests_examples_app/public/test_multi_node.py b/tests/tests_examples_app/public/test_multi_node.py index a5cd2de40811f..e344fab41eeaa 100644 --- a/tests/tests_examples_app/public/test_multi_node.py +++ b/tests/tests_examples_app/public/test_multi_node.py @@ -1,10 +1,11 @@ import os -import sys from unittest import mock import pytest +from lightning_utilities.core.imports import package_available from tests_examples_app.public import _PATH_EXAMPLES +from lightning_app.testing.helpers import _RunIf from lightning_app.testing.testing import application_testing, LightningTestApp @@ -12,33 +13,13 @@ class LightningTestMultiNodeApp(LightningTestApp): def on_before_run_once(self): res = super().on_before_run_once() if self.works and all(w.has_stopped for w in self.works): - assert len([w for w in self.works]) == 2 + assert len(self.works) == 2 return True return res -@pytest.mark.skip(reason="flaky") -@mock.patch("lightning_app.components.multi_node.base.is_running_in_cloud", return_value=True) -def test_multi_node_example(_, monkeypatch): - monkeypatch.chdir(os.path.join(_PATH_EXAMPLES, "app_multi_node")) - command_line = [ - "app.py", - "--blocking", - "False", - "--open-ui", - "False", - ] - result = application_testing(LightningTestMultiNodeApp, command_line) - assert result.exit_code == 0 - - -class LightningTestMultiNodeWorksApp(LightningTestApp): - def on_before_run_once(self): - res = super().on_before_run_once() - if self.works and all(w.has_stopped for w in self.works): - assert len([w for w in self.works]) == 2 - return True - return res +# for the skip to work, the package needs to be installed without editable mode +_SKIP_LIGHTNING_UNAVAILABLE = pytest.mark.skipif(not package_available("lightning"), reason="script requires lightning") @pytest.mark.parametrize( @@ -46,14 +27,20 @@ def on_before_run_once(self): [ "train_pytorch.py", "train_any.py", - # "app_lite_work.py", "train_pytorch_spawn.py", - # "app_pl_work.py": TODO Add once https://github.com/Lightning-AI/lightning/issues/15556 is resolved. + pytest.param("train_lite.py", marks=_SKIP_LIGHTNING_UNAVAILABLE), + pytest.param("train_lt_script.py", marks=_SKIP_LIGHTNING_UNAVAILABLE), + pytest.param("train_lt.py", marks=_SKIP_LIGHTNING_UNAVAILABLE), ], ) -@pytest.mark.skipif(sys.platform == "win32", reason="flaky") +@_RunIf(skip_windows=True) # flaky @mock.patch("lightning_app.components.multi_node.base.is_running_in_cloud", return_value=True) def test_multi_node_examples(_, app_name, monkeypatch): + # note: this test will fail locally: + # * if you installed `lightning_app`, then the examples need to be + # rewritten to use `lightning_app` imports (CI does this) + # * if you installed `lightning`, then the imports in this file and mocks + # need to be changed to use `lightning`. monkeypatch.chdir(os.path.join(_PATH_EXAMPLES, "app_multi_node")) command_line = [ app_name, @@ -62,5 +49,5 @@ def test_multi_node_examples(_, app_name, monkeypatch): "--open-ui", "False", ] - result = application_testing(LightningTestMultiNodeWorksApp, command_line) + result = application_testing(LightningTestMultiNodeApp, command_line) assert result.exit_code == 0 From af17267ce74160058b0c8255d916ee16fa17d01e Mon Sep 17 00:00:00 2001 From: Sherin Thomas Date: Fri, 16 Dec 2022 10:16:04 +0000 Subject: [PATCH 06/10] [App] Removing single quote (#16079) (cherry picked from commit 005b6f237472319dee7475a28fbfb2e1dac08d68) --- src/lightning_app/utilities/cli_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_app/utilities/cli_helpers.py b/src/lightning_app/utilities/cli_helpers.py index 0ec6eabd3022c..b45840a0d9489 100644 --- a/src/lightning_app/utilities/cli_helpers.py +++ b/src/lightning_app/utilities/cli_helpers.py @@ -281,7 +281,7 @@ def _check_version_and_upgrade(): prompt = f"A newer version of {__package_name__} is available ({new_version}). Would you like to upgrade?" if click.confirm(prompt, default=True): - command = f"pip install '{__package_name__}=={new_version}'" + command = f"pip install {__package_name__}=={new_version}" logger.info(f"⚡ RUN: {command}") From 0b3c3eb6332d08d67f1ee145d222918bb2e51729 Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 16 Dec 2022 11:24:37 +0100 Subject: [PATCH 07/10] version 1.8.5.post0 --- src/lightning/__version__.py | 2 +- src/lightning_app/CHANGELOG.md | 5 +---- src/lightning_app/__version__.py | 2 +- src/lightning_lite/__version__.py | 2 +- src/pytorch_lightning/CHANGELOG.md | 2 +- src/pytorch_lightning/__version__.py | 2 +- 6 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/lightning/__version__.py b/src/lightning/__version__.py index cb9f51643e523..4a295c3c7c531 100644 --- a/src/lightning/__version__.py +++ b/src/lightning/__version__.py @@ -1 +1 @@ -version = "1.8.5" +version = "1.8.5.post0" diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 9e570f9807f4d..eb428e3876642 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -24,12 +24,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed `AutoScaler` raising an exception when non-default cloud compute is specified ([#15991](https://github.com/Lightning-AI/lightning/pull/15991)) - Fixed and improvements of login flow ([#16052](https://github.com/Lightning-AI/lightning/pull/16052)) - Fixed the debugger detection mechanism for lightning App in VSCode ([#16068](https://github.com/Lightning-AI/lightning/pull/16068)) - - - Fixed bug where components that are re-instantiated several times failed to initialize if they were modifying `self.lightningignore` ([#16080](https://github.com/Lightning-AI/lightning/pull/16080)) - - - Fixed a bug where apps that had previously been deleted could not be run again from the CLI ([#16082](https://github.com/Lightning-AI/lightning/pull/16082)) +- Fixed install/upgrade - removing single quote ([#16079](https://github.com/Lightning-AI/lightning/pull/16079)) ## [1.8.4] - 2022-12-08 diff --git a/src/lightning_app/__version__.py b/src/lightning_app/__version__.py index cb9f51643e523..4a295c3c7c531 100644 --- a/src/lightning_app/__version__.py +++ b/src/lightning_app/__version__.py @@ -1 +1 @@ -version = "1.8.5" +version = "1.8.5.post0" diff --git a/src/lightning_lite/__version__.py b/src/lightning_lite/__version__.py index cb9f51643e523..4a295c3c7c531 100644 --- a/src/lightning_lite/__version__.py +++ b/src/lightning_lite/__version__.py @@ -1 +1 @@ -version = "1.8.5" +version = "1.8.5.post0" diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 7d9559ba97ad9..29cf1e28428d2 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -7,7 +7,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ## [1.8.5] - 2022-12-15 -- minor cleaning +- Add function to remove checkpoint to allow override for extended classes ([#16067](https://github.com/Lightning-AI/lightning/pull/16067)) ## [1.8.4] - 2022-12-08 diff --git a/src/pytorch_lightning/__version__.py b/src/pytorch_lightning/__version__.py index cb9f51643e523..4a295c3c7c531 100644 --- a/src/pytorch_lightning/__version__.py +++ b/src/pytorch_lightning/__version__.py @@ -1 +1 @@ -version = "1.8.5" +version = "1.8.5.post0" From dd8c8d5ba78856ea8bbe5a55009c53e2c6dbe532 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Fri, 16 Dec 2022 13:17:36 +0100 Subject: [PATCH 08/10] skip example test that relies on unreleased lite code The examples use LightningLite syntax without the run method, which is only available in master --- tests/tests_examples_app/public/test_multi_node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests_examples_app/public/test_multi_node.py b/tests/tests_examples_app/public/test_multi_node.py index e344fab41eeaa..de9b6f8679b04 100644 --- a/tests/tests_examples_app/public/test_multi_node.py +++ b/tests/tests_examples_app/public/test_multi_node.py @@ -28,7 +28,7 @@ def on_before_run_once(self): "train_pytorch.py", "train_any.py", "train_pytorch_spawn.py", - pytest.param("train_lite.py", marks=_SKIP_LIGHTNING_UNAVAILABLE), + # pytest.param("train_lite.py", marks=_SKIP_LIGHTNING_UNAVAILABLE), pytest.param("train_lt_script.py", marks=_SKIP_LIGHTNING_UNAVAILABLE), pytest.param("train_lt.py", marks=_SKIP_LIGHTNING_UNAVAILABLE), ], From 61bb90e47547e8d2c096c0ca8a06f4fc2e632efc Mon Sep 17 00:00:00 2001 From: awaelchli Date: Fri, 16 Dec 2022 13:52:17 +0100 Subject: [PATCH 09/10] fix can't instantiate abstract class [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci fix --- .../precision/test_native_amp_integration.py | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tests/tests_lite/plugins/precision/test_native_amp_integration.py b/tests/tests_lite/plugins/precision/test_native_amp_integration.py index cd927229cd8f8..cc111ca1aaa7a 100644 --- a/tests/tests_lite/plugins/precision/test_native_amp_integration.py +++ b/tests/tests_lite/plugins/precision/test_native_amp_integration.py @@ -74,17 +74,15 @@ def test_native_mixed_precision(accelerator, precision, expected_dtype): lite.run() -@RunIf(min_torch="1.13", min_cuda_gpus=1) -def test_native_mixed_precision_fused_optimizer_parity(): - def run(fused=False): +class FusedTest(LightningLite): + def run(self, fused=False): seed_everything(1234) - lite = LightningLite(accelerator="cuda", precision=16, devices=1) - model = nn.Linear(10, 10).to(lite.device) # TODO: replace with individual setup_model call + model = nn.Linear(10, 10).to(self.device) # TODO: replace with individual setup_model call optimizer = torch.optim.Adam(model.parameters(), lr=1.0, fused=fused) - model, optimizer = lite.setup(model, optimizer) - assert isinstance(lite._precision.scaler, torch.cuda.amp.GradScaler) + model, optimizer = self.setup(model, optimizer) + assert isinstance(self._precision.scaler, torch.cuda.amp.GradScaler) data = torch.randn(10, 10, device="cuda") target = torch.randn(10, 10, device="cuda") @@ -94,13 +92,18 @@ def run(fused=False): optimizer.zero_grad() output = model(data) loss = (output - target).abs().sum() - lite.backward(loss) + self.backward(loss) optimizer.step() losses.append(loss.detach()) return torch.stack(losses), model.parameters() - losses, params = run(fused=False) - losses_fused, params_fused = run(fused=True) + +@RunIf(min_torch="1.13", min_cuda_gpus=1) +def test_native_mixed_precision_fused_optimizer_parity(): + lite = FusedTest(accelerator="cuda", precision=16, devices=1) + losses, params = lite.run(fused=False) + lite = FusedTest(accelerator="cuda", precision=16, devices=1) + losses_fused, params_fused = lite.run(fused=True) # Both the regular and the fused version of Adam produce the same losses and model weights torch.testing.assert_close(losses, losses_fused) From 4ec2ce6b4b50247ad61b4af9dfb2f08eeaa1ab40 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Fri, 16 Dec 2022 14:20:05 +0100 Subject: [PATCH 10/10] skip bagua --- tests/tests_pytorch/strategies/test_bagua_strategy.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/tests_pytorch/strategies/test_bagua_strategy.py b/tests/tests_pytorch/strategies/test_bagua_strategy.py index 27a965d4b1a05..962dd363f38e5 100644 --- a/tests/tests_pytorch/strategies/test_bagua_strategy.py +++ b/tests/tests_pytorch/strategies/test_bagua_strategy.py @@ -45,6 +45,8 @@ def test_bagua_default(tmpdir): assert isinstance(trainer.strategy, BaguaStrategy) +@pytest.mark.skip(reason="Bagua has issues in 1.8.x release.") +# Exception: Warning! autotune service not ready after 300 seconds... @RunIf(min_cuda_gpus=1, bagua=True) def test_manual_optimization(tmpdir): model = ManualOptimBoringModel()