Skip to content

Commit

Permalink
release: 1.8.5.post (#16086)
Browse files Browse the repository at this point in the history
* Add function to remove checkpoint to allow override for extended classes (#16067)

(cherry picked from commit 10cc677)

* minor fix: indent spaces in comment-out (#16076)

(cherry picked from commit 385e5e2)

* ci: print existing candidates (#16077)

(cherry picked from commit 9e89aed)

* [App] Fix bug where previously deleted apps cannot be re-run from the CLI (#16082)

(cherry picked from commit 5f7403e)

* Better check for programmatic lightningignore (#16080)

Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com>

(cherry picked from commit b1ce263)

* [App] Removing single quote (#16079)

(cherry picked from commit 005b6f2)

* version 1.8.5.post0

* skip example test that relies on unreleased lite code

The examples use LightningLite syntax without the run method, which is only available in master

* fix can't instantiate abstract class


[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

fix

* skip bagua

Co-authored-by: Sean Naren <snarenthiran@nvidia.com>
Co-authored-by: Qiushi Pan <17402261+qqpann@users.noreply.github.com>
Co-authored-by: Ethan Harris <ethanwharris@gmail.com>
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
Co-authored-by: Sherin Thomas <sherin@lightning.ai>
Co-authored-by: awaelchli <aedu.waelchli@gmail.com>
  • Loading branch information
7 people committed Dec 16, 2022
1 parent e5d5901 commit a8a3519
Show file tree
Hide file tree
Showing 29 changed files with 188 additions and 112 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/ci-app-examples.yml
Expand Up @@ -89,7 +89,8 @@ jobs:
- name: Install Lightning package
env:
PACKAGE_NAME: ${{ matrix.pkg-name }}
run: pip install -e .
# do not use -e because it will make both packages available since it adds `src` to `sys.path` automatically
run: pip install .

- name: Adjust tests
if: ${{ matrix.pkg-name == 'lightning' }}
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/release-pypi.yml
Expand Up @@ -108,6 +108,7 @@ jobs:
branch = f"origin/builds/{os.getenv('TAG')}"
while True:
remote_refs = [b.name for b in repo.remote().refs]
print([n for n in remote_refs if "builds" in n])
if branch in remote_refs:
break
time.sleep(60)
Expand Down
Expand Up @@ -10,7 +10,7 @@ def run(self):
trainer = L.Trainer(max_epochs=10, strategy="ddp")
trainer.fit(model)

# 8 GPU: (2 nodes of 4 x v100)
# 8 GPUs: (2 nodes of 4 x v100)
component = LightningTrainerMultiNode(
LightningTrainerDistributed,
num_nodes=4,
Expand Down
2 changes: 1 addition & 1 deletion examples/app_multi_node/train_lite.py
Expand Up @@ -31,7 +31,7 @@ def run(self):
optimizer.step()


# Run over 2 nodes of 4 x V100
# 8 GPUs: (2 nodes of 4 x v100)
app = L.LightningApp(
LiteMultiNode(
LitePyTorchDistributed,
Expand Down
4 changes: 2 additions & 2 deletions examples/app_multi_node/train_lt.py
Expand Up @@ -11,10 +11,10 @@ def run(self):
trainer.fit(model)


# 8 GPU: (2 nodes of 4 x v100)
# 8 GPUs: (2 nodes of 4 x v100)
component = LightningTrainerMultiNode(
LightningTrainerDistributed,
num_nodes=4,
num_nodes=2,
cloud_compute=L.CloudCompute("gpu-fast-multi"), # 4 x v100
)
app = L.LightningApp(component)
4 changes: 2 additions & 2 deletions examples/app_multi_node/train_lt_script.py
Expand Up @@ -2,11 +2,11 @@
from lightning.app.components import LightningTrainerScript
from lightning.app.utilities.packaging.cloud_compute import CloudCompute

# Run over 2 nodes of 4 x V100
# 8 GPUs: (2 nodes of 4 x v100)
app = L.LightningApp(
LightningTrainerScript(
"pl_boring_script.py",
num_nodes=2,
cloud_compute=CloudCompute("gpu-fast-multi"),
cloud_compute=CloudCompute("gpu-fast-multi"), # 4 x v100
),
)
2 changes: 1 addition & 1 deletion examples/app_multi_node/train_pytorch.py
Expand Up @@ -56,6 +56,6 @@ def run(self, main_address: str, main_port: int, num_nodes: int, node_rank: int)


# 8 GPUs: (2 nodes of 4 x v100)
compute = L.CloudCompute("gpu-fast-multi") # 4xV100
compute = L.CloudCompute("gpu-fast-multi") # 4 x v100
component = MultiNode(PyTorchDistributed, num_nodes=2, cloud_compute=compute)
app = L.LightningApp(component)
4 changes: 2 additions & 2 deletions examples/app_multi_node/train_pytorch_spawn.py
Expand Up @@ -42,11 +42,11 @@ def run(
optimizer.step()


# Run over 2 nodes of 4 x V100
# 8 GPUs: (2 nodes of 4 x v100)
app = L.LightningApp(
PyTorchSpawnMultiNode(
PyTorchDistributed,
num_nodes=2,
cloud_compute=L.CloudCompute("gpu-fast-multi"), # 4 x V100
cloud_compute=L.CloudCompute("gpu-fast-multi"), # 4 x v100
)
)
10 changes: 5 additions & 5 deletions examples/pl_loops/kfold.py
Expand Up @@ -152,12 +152,12 @@ def test_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> None
# self.reset(...) #
# self.on_run_start(...) #
# #
# while not self.done: #
# self.on_advance_start(...) #
# self.advance(...) #
# self.on_advance_end(...) #
# while not self.done: #
# self.on_advance_start(...) #
# self.advance(...) #
# self.on_advance_end(...) #
# #
# return self.on_run_end(...) #
# return self.on_run_end(...) #
#############################################################################################


Expand Down
2 changes: 1 addition & 1 deletion src/lightning/__version__.py
@@ -1 +1 @@
version = "1.8.5"
version = "1.8.5.post0"
3 changes: 3 additions & 0 deletions src/lightning_app/CHANGELOG.md
Expand Up @@ -24,6 +24,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed `AutoScaler` raising an exception when non-default cloud compute is specified ([#15991](https://github.com/Lightning-AI/lightning/pull/15991))
- Fixed and improved the login flow ([#16052](https://github.com/Lightning-AI/lightning/pull/16052))
- Fixed the debugger detection mechanism for lightning App in VSCode ([#16068](https://github.com/Lightning-AI/lightning/pull/16068))
- Fixed bug where components that are re-instantiated several times failed to initialize if they were modifying `self.lightningignore` ([#16080](https://github.com/Lightning-AI/lightning/pull/16080))
- Fixed a bug where apps that had previously been deleted could not be run again from the CLI ([#16082](https://github.com/Lightning-AI/lightning/pull/16082))
- Fixed the install/upgrade command by removing the single quotes around the package specifier ([#16079](https://github.com/Lightning-AI/lightning/pull/16079))


## [1.8.4] - 2022-12-08
Expand Down
2 changes: 1 addition & 1 deletion src/lightning_app/__version__.py
@@ -1 +1 @@
version = "1.8.5"
version = "1.8.5.post0"
10 changes: 2 additions & 8 deletions src/lightning_app/core/flow.py
Expand Up @@ -10,13 +10,7 @@
from lightning_app.frontend import Frontend
from lightning_app.storage import Path
from lightning_app.storage.drive import _maybe_create_drive, Drive
from lightning_app.utilities.app_helpers import (
_is_json_serializable,
_lightning_dispatched,
_LightningAppRef,
_set_child_name,
is_overridden,
)
from lightning_app.utilities.app_helpers import _is_json_serializable, _LightningAppRef, _set_child_name, is_overridden
from lightning_app.utilities.component import _sanitize_state
from lightning_app.utilities.exceptions import ExitAppException
from lightning_app.utilities.introspection import _is_init_context, _is_run_context
Expand Down Expand Up @@ -325,7 +319,7 @@ def lightningignore(self) -> Tuple[str, ...]:

@lightningignore.setter
def lightningignore(self, lightningignore: Tuple[str, ...]) -> None:
if _lightning_dispatched():
if self._backend is not None:
raise RuntimeError(
f"Your app has been already dispatched, so modifying the `{self.name}.lightningignore` does not have an"
" effect"
Expand Down
9 changes: 2 additions & 7 deletions src/lightning_app/core/work.py
Expand Up @@ -11,12 +11,7 @@
from lightning_app.storage import Path
from lightning_app.storage.drive import _maybe_create_drive, Drive
from lightning_app.storage.payload import Payload
from lightning_app.utilities.app_helpers import (
_is_json_serializable,
_lightning_dispatched,
_LightningAppRef,
is_overridden,
)
from lightning_app.utilities.app_helpers import _is_json_serializable, _LightningAppRef, is_overridden
from lightning_app.utilities.component import _is_flow_context, _sanitize_state
from lightning_app.utilities.enum import (
CacheCallsKeys,
Expand Down Expand Up @@ -267,7 +262,7 @@ def lightningignore(self) -> Tuple[str, ...]:

@lightningignore.setter
def lightningignore(self, lightningignore: Tuple[str, ...]) -> None:
if _lightning_dispatched():
if self._backend is not None:
raise RuntimeError(
f"Your app has been already dispatched, so modifying the `{self.name}.lightningignore` does not have an"
" effect"
Expand Down
38 changes: 22 additions & 16 deletions src/lightning_app/runners/cloud.py
Expand Up @@ -320,52 +320,58 @@ def dispatch(
self._ensure_cluster_project_binding(project.project_id, cluster_id)

# Resolve the app name, instance, and cluster ID
existing_app = None
existing_instance = None
app_name = app_config.name

# List existing instances
# List existing apps
# TODO: Add pagination, otherwise this could break if users have a lot of apps.
find_instances_resp = self.backend.client.lightningapp_instance_service_list_lightningapp_instances(
all_apps = self.backend.client.lightningapp_v2_service_list_lightningapps_v2(
project_id=project.project_id
)
).lightningapps

# Search for instances with the given name (possibly with some random characters appended)
# Search for apps with the given name (possibly with some random characters appended)
pattern = re.escape(f"{app_name}-") + ".{4}"
instances = [
all_apps = [
lightningapp
for lightningapp in find_instances_resp.lightningapps
for lightningapp in all_apps
if lightningapp.name == app_name or (re.fullmatch(pattern, lightningapp.name) is not None)
]

# If instances exist and cluster is None, mimic cluster selection logic to choose a default
if cluster_id is None and len(instances) > 0:
# If apps exist and cluster is None, mimic cluster selection logic to choose a default
if cluster_id is None and len(all_apps) > 0:
# Determine the cluster ID
cluster_id = self._get_default_cluster(project.project_id)

# If an instance exists on the cluster with the same base name - restart it
for instance in instances:
if instance.spec.cluster_id == cluster_id:
existing_instance = instance
for app in all_apps:
instances = self.backend.client.lightningapp_instance_service_list_lightningapp_instances(
project_id=project.project_id,
app_id=app.id,
).lightningapps
if instances and instances[0].spec.cluster_id == cluster_id:
existing_app = app
existing_instance = instances[0]
break

# If instances exist but not on the cluster - choose a randomised name
if len(instances) > 0 and existing_instance is None:
# If apps exist but not on the cluster - choose a randomised name
if len(all_apps) > 0 and existing_app is None:
name_exists = True
while name_exists:
random_name = self._randomise_name(app_name)
name_exists = any([instance.name == random_name for instance in instances])
name_exists = any([app.name == random_name for app in all_apps])

app_name = random_name

# Create the app if it doesn't exist
if existing_instance is None:
if existing_app is None:
app_body = Body7(name=app_name, can_download_source_code=True)
lit_app = self.backend.client.lightningapp_v2_service_create_lightningapp_v2(
project_id=project.project_id, body=app_body
)
app_id = lit_app.id
else:
app_id = existing_instance.spec.app_id
app_id = existing_app.id

# check if user has sufficient credits to run an app
# if so set the desired state to running otherwise, create the app in stopped state,
Expand Down
2 changes: 1 addition & 1 deletion src/lightning_app/utilities/cli_helpers.py
Expand Up @@ -281,7 +281,7 @@ def _check_version_and_upgrade():
prompt = f"A newer version of {__package_name__} is available ({new_version}). Would you like to upgrade?"

if click.confirm(prompt, default=True):
command = f"pip install '{__package_name__}=={new_version}'"
command = f"pip install {__package_name__}=={new_version}"

logger.info(f"⚡ RUN: {command}")

Expand Down
2 changes: 1 addition & 1 deletion src/lightning_lite/__version__.py
@@ -1 +1 @@
version = "1.8.5"
version = "1.8.5.post0"
2 changes: 1 addition & 1 deletion src/pytorch_lightning/CHANGELOG.md
Expand Up @@ -7,7 +7,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

## [1.8.5] - 2022-12-15

- minor cleaning
- Add function to remove checkpoint to allow override for extended classes ([#16067](https://github.com/Lightning-AI/lightning/pull/16067))


## [1.8.4] - 2022-12-08
Expand Down
2 changes: 1 addition & 1 deletion src/pytorch_lightning/__version__.py
@@ -1 +1 @@
version = "1.8.5"
version = "1.8.5.post0"
10 changes: 7 additions & 3 deletions src/pytorch_lightning/callbacks/model_checkpoint.py
Expand Up @@ -640,7 +640,7 @@ def _save_last_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: Dict[
previous, self.last_model_path = self.last_model_path, filepath
self._save_checkpoint(trainer, filepath)
if previous and previous != filepath:
trainer.strategy.remove_checkpoint(previous)
self._remove_checkpoint(trainer, previous)

def _save_monitor_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: Dict[str, Tensor]) -> None:
assert self.monitor
Expand All @@ -659,7 +659,7 @@ def _save_none_monitor_checkpoint(self, trainer: "pl.Trainer", monitor_candidate
previous, self.best_model_path = self.best_model_path, filepath
self._save_checkpoint(trainer, filepath)
if self.save_top_k == 1 and previous and previous != filepath:
trainer.strategy.remove_checkpoint(previous)
self._remove_checkpoint(trainer, previous)

def _update_best_and_save(
self, current: Tensor, trainer: "pl.Trainer", monitor_candidates: Dict[str, Tensor]
Expand Down Expand Up @@ -701,7 +701,7 @@ def _update_best_and_save(
self._save_checkpoint(trainer, filepath)

if del_filepath is not None and filepath != del_filepath:
trainer.strategy.remove_checkpoint(del_filepath)
self._remove_checkpoint(trainer, del_filepath)

def to_yaml(self, filepath: Optional[_PATH] = None) -> None:
"""Saves the `best_k_models` dict containing the checkpoint paths with the corresponding scores to a YAML
Expand All @@ -718,3 +718,7 @@ def file_exists(self, filepath: _PATH, trainer: "pl.Trainer") -> bool:
state to diverge between ranks."""
exists = self._fs.exists(filepath)
return trainer.strategy.broadcast(exists)

def _remove_checkpoint(self, trainer: "pl.Trainer", filepath: str) -> None:
"""Calls the strategy to remove the checkpoint file."""
trainer.strategy.remove_checkpoint(filepath)
6 changes: 5 additions & 1 deletion tests/tests_app/cli/test_cloud_cli.py
Expand Up @@ -11,6 +11,7 @@
from lightning_cloud.openapi import (
V1LightningappV2,
V1ListLightningappInstancesResponse,
V1ListLightningappsV2Response,
V1ListMembershipsResponse,
V1Membership,
)
Expand All @@ -36,6 +37,9 @@ class FakeResponse:


class FakeLightningClient:
def lightningapp_v2_service_list_lightningapps_v2(self, *args, **kwargs):
return V1ListLightningappsV2Response(lightningapps=[])

def lightningapp_instance_service_list_lightningapp_instances(self, *args, **kwargs):
return V1ListLightningappInstancesResponse(lightningapps=[])

Expand Down Expand Up @@ -182,7 +186,7 @@ def __init__(self, *args, message, **kwargs):
super().__init__()
self.message = message

def lightningapp_instance_service_list_lightningapp_instances(self, *args, **kwargs):
def lightningapp_v2_service_list_lightningapps_v2(self, *args, **kwargs):
raise ApiException(
http_resp=HttpHeaderDict(
data=self.message,
Expand Down
2 changes: 1 addition & 1 deletion tests/tests_app/components/multi_node/test_trainer.py
Expand Up @@ -66,7 +66,7 @@ def test_trainer_run_executor_mps_forced_cpu(accelerator_given, accelerator_expe
({"strategy": "ddp_sharded_spawn"}, {"strategy": "ddp_sharded"}),
],
)
@pytest.mark.skipif(not module_available("pytorch"), reason="Lightning is not available")
@pytest.mark.skipif(not module_available("torch"), reason="PyTorch is not available")
def test_trainer_run_executor_arguments_choices(
args_given: dict,
args_expected: dict,
Expand Down

0 comments on commit a8a3519

Please sign in to comment.