
release: 1.8.5.post #16086

Merged: 10 commits, Dec 16, 2022
3 changes: 2 additions & 1 deletion .github/workflows/ci-app-examples.yml
@@ -89,7 +89,8 @@ jobs:
- name: Install Lightning package
env:
PACKAGE_NAME: ${{ matrix.pkg-name }}
run: pip install -e .
# do not use -e because it will make both packages available since it adds `src` to `sys.path` automatically
run: pip install .

- name: Adjust tests
if: ${{ matrix.pkg-name == 'lightning' }}
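For context on the `-e` comment above (an illustrative sketch, not part of this diff): with a src-layout checkout, an editable install typically puts the repository's `src` directory on `sys.path`, which makes the sibling packages importable even though only one of them was requested. A quick way to observe this from a Python shell after `pip install -e .`:

    import importlib.util
    import sys

    # After an editable install in the repo root, `src` usually appears on sys.path,
    # so every package that lives under it resolves, not just the one being tested.
    print([p for p in sys.path if p.rstrip("/").endswith("src")])
    for name in ("lightning", "pytorch_lightning", "lightning_app"):
        print(name, "importable:", importlib.util.find_spec(name) is not None)

A plain `pip install .` installs only the package selected via `PACKAGE_NAME`, so the job exercises what is actually shipped.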
1 change: 1 addition & 0 deletions .github/workflows/release-pypi.yml
@@ -108,6 +108,7 @@ jobs:
branch = f"origin/builds/{os.getenv('TAG')}"
while True:
remote_refs = [b.name for b in repo.remote().refs]
print([n for n in remote_refs if "builds" in n])
if branch in remote_refs:
break
time.sleep(60)
@@ -10,7 +10,7 @@ def run(self):
trainer = L.Trainer(max_epochs=10, strategy="ddp")
trainer.fit(model)

# 8 GPU: (2 nodes of 4 x v100)
# 8 GPUs: (2 nodes of 4 x v100)
component = LightningTrainerMultiNode(
LightningTrainerDistributed,
num_nodes=4,
2 changes: 1 addition & 1 deletion examples/app_multi_node/train_lite.py
@@ -31,7 +31,7 @@ def run(self):
optimizer.step()


# Run over 2 nodes of 4 x V100
# 8 GPUs: (2 nodes of 4 x v100)
app = L.LightningApp(
LiteMultiNode(
LitePyTorchDistributed,
4 changes: 2 additions & 2 deletions examples/app_multi_node/train_lt.py
@@ -11,10 +11,10 @@ def run(self):
trainer.fit(model)


# 8 GPU: (2 nodes of 4 x v100)
# 8 GPUs: (2 nodes of 4 x v100)
component = LightningTrainerMultiNode(
LightningTrainerDistributed,
num_nodes=4,
num_nodes=2,
cloud_compute=L.CloudCompute("gpu-fast-multi"), # 4 x v100
)
app = L.LightningApp(component)
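With this change the comment and the arguments agree: the total GPU count is num_nodes times the GPUs provided by each machine. A trivial sanity check (the 4-GPU figure for "gpu-fast-multi" is taken from the comments in these examples, not verified independently):

    num_nodes = 2
    gpus_per_node = 4  # "gpu-fast-multi" is annotated as 4 x V100 in these examples
    assert num_nodes * gpus_per_node == 8  # matches the "# 8 GPUs" comment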
4 changes: 2 additions & 2 deletions examples/app_multi_node/train_lt_script.py
@@ -2,11 +2,11 @@
from lightning.app.components import LightningTrainerScript
from lightning.app.utilities.packaging.cloud_compute import CloudCompute

# Run over 2 nodes of 4 x V100
# 8 GPUs: (2 nodes of 4 x v100)
app = L.LightningApp(
LightningTrainerScript(
"pl_boring_script.py",
num_nodes=2,
cloud_compute=CloudCompute("gpu-fast-multi"),
cloud_compute=CloudCompute("gpu-fast-multi"), # 4 x v100
),
)
2 changes: 1 addition & 1 deletion examples/app_multi_node/train_pytorch.py
@@ -56,6 +56,6 @@ def run(self, main_address: str, main_port: int, num_nodes: int, node_rank: int)


# 8 GPUs: (2 nodes x 4 v 100)
compute = L.CloudCompute("gpu-fast-multi") # 4xV100
compute = L.CloudCompute("gpu-fast-multi") # 4 x v100
component = MultiNode(PyTorchDistributed, num_nodes=2, cloud_compute=compute)
app = L.LightningApp(component)
4 changes: 2 additions & 2 deletions examples/app_multi_node/train_pytorch_spawn.py
@@ -42,11 +42,11 @@ def run(
optimizer.step()


# Run over 2 nodes of 4 x V100
# 8 GPUs: (2 nodes x 4 v 100)
app = L.LightningApp(
PyTorchSpawnMultiNode(
PyTorchDistributed,
num_nodes=2,
cloud_compute=L.CloudCompute("gpu-fast-multi"), # 4 x V100
cloud_compute=L.CloudCompute("gpu-fast-multi"), # 4 x v100
)
)
10 changes: 5 additions & 5 deletions examples/pl_loops/kfold.py
@@ -152,12 +152,12 @@ def test_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> None
# self.reset(...) #
# self.on_run_start(...) #
# #
# while not self.done: #
# self.on_advance_start(...) #
# self.advance(...) #
# self.on_advance_end(...) #
# while not self.done: #
# self.on_advance_start(...) #
# self.advance(...) #
# self.on_advance_end(...) #
# #
# return self.on_run_end(...) #
# return self.on_run_end(...) #
#############################################################################################


2 changes: 1 addition & 1 deletion src/lightning/__version__.py
@@ -1 +1 @@
version = "1.8.5"
version = "1.8.5.post0"
3 changes: 3 additions & 0 deletions src/lightning_app/CHANGELOG.md
@@ -24,6 +24,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed `AutoScaler` raising an exception when non-default cloud compute is specified ([#15991](https://github.com/Lightning-AI/lightning/pull/15991))
- Fixed and improved the login flow ([#16052](https://github.com/Lightning-AI/lightning/pull/16052))
- Fixed the debugger detection mechanism for lightning App in VSCode ([#16068](https://github.com/Lightning-AI/lightning/pull/16068))
- Fixed bug where components that are re-instantiated several times failed to initialize if they were modifying `self.lightningignore` ([#16080](https://github.com/Lightning-AI/lightning/pull/16080))
- Fixed a bug where apps that had previously been deleted could not be run again from the CLI ([#16082](https://github.com/Lightning-AI/lightning/pull/16082))
- Fixed install/upgrade - removing single quote ([#16079](https://github.com/Lightning-AI/lightning/pull/16079))


## [1.8.4] - 2022-12-08
2 changes: 1 addition & 1 deletion src/lightning_app/__version__.py
@@ -1 +1 @@
version = "1.8.5"
version = "1.8.5.post0"
10 changes: 2 additions & 8 deletions src/lightning_app/core/flow.py
@@ -10,13 +10,7 @@
from lightning_app.frontend import Frontend
from lightning_app.storage import Path
from lightning_app.storage.drive import _maybe_create_drive, Drive
from lightning_app.utilities.app_helpers import (
_is_json_serializable,
_lightning_dispatched,
_LightningAppRef,
_set_child_name,
is_overridden,
)
from lightning_app.utilities.app_helpers import _is_json_serializable, _LightningAppRef, _set_child_name, is_overridden
from lightning_app.utilities.component import _sanitize_state
from lightning_app.utilities.exceptions import ExitAppException
from lightning_app.utilities.introspection import _is_init_context, _is_run_context
@@ -325,7 +319,7 @@ def lightningignore(self) -> Tuple[str, ...]:

@lightningignore.setter
def lightningignore(self, lightningignore: Tuple[str, ...]) -> None:
if _lightning_dispatched():
if self._backend is not None:
raise RuntimeError(
f"Your app has been already dispatched, so modifying the `{self.name}.lightningignore` does not have an"
" effect"
9 changes: 2 additions & 7 deletions src/lightning_app/core/work.py
@@ -11,12 +11,7 @@
from lightning_app.storage import Path
from lightning_app.storage.drive import _maybe_create_drive, Drive
from lightning_app.storage.payload import Payload
from lightning_app.utilities.app_helpers import (
_is_json_serializable,
_lightning_dispatched,
_LightningAppRef,
is_overridden,
)
from lightning_app.utilities.app_helpers import _is_json_serializable, _LightningAppRef, is_overridden
from lightning_app.utilities.component import _is_flow_context, _sanitize_state
from lightning_app.utilities.enum import (
CacheCallsKeys,
@@ -267,7 +262,7 @@ def lightningignore(self) -> Tuple[str, ...]:

@lightningignore.setter
def lightningignore(self, lightningignore: Tuple[str, ...]) -> None:
if _lightning_dispatched():
if self._backend is not None:
raise RuntimeError(
f"Your app has been already dispatched, so modifying the `{self.name}.lightningignore` does not have an"
" effect"
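In both `flow.py` and `work.py` the setter now checks the component's own `_backend` instead of the global `_lightning_dispatched()` flag, so a component that is instantiated several times can still set its ignore patterns as long as it has not been attached to a backend. A minimal sketch of the intended usage, assuming the unified `lightning` package (the flow below is illustrative, not taken from this PR):

    import lightning as L

    class Flow(L.LightningFlow):
        def __init__(self):
            super().__init__()
            # Allowed: no backend is attached yet, so the guard does not trigger.
            self.lightningignore = ("*.ckpt", "data/")

        def run(self):
            pass

    # Re-instantiating the flow (e.g. when the app module is imported again) now works;
    # setting `lightningignore` after dispatch still raises a RuntimeError.
    app = L.LightningApp(Flow())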
38 changes: 22 additions & 16 deletions src/lightning_app/runners/cloud.py
@@ -320,52 +320,58 @@ def dispatch(
self._ensure_cluster_project_binding(project.project_id, cluster_id)

# Resolve the app name, instance, and cluster ID
existing_app = None
existing_instance = None
app_name = app_config.name

# List existing instances
# List existing apps
# TODO: Add pagination, otherwise this could break if users have a lot of apps.
find_instances_resp = self.backend.client.lightningapp_instance_service_list_lightningapp_instances(
all_apps = self.backend.client.lightningapp_v2_service_list_lightningapps_v2(
project_id=project.project_id
)
).lightningapps

# Search for instances with the given name (possibly with some random characters appended)
# Search for apps with the given name (possibly with some random characters appended)
pattern = re.escape(f"{app_name}-") + ".{4}"
instances = [
all_apps = [
lightningapp
for lightningapp in find_instances_resp.lightningapps
for lightningapp in all_apps
if lightningapp.name == app_name or (re.fullmatch(pattern, lightningapp.name) is not None)
]

# If instances exist and cluster is None, mimic cluster selection logic to choose a default
if cluster_id is None and len(instances) > 0:
# If apps exist and cluster is None, mimic cluster selection logic to choose a default
if cluster_id is None and len(all_apps) > 0:
# Determine the cluster ID
cluster_id = self._get_default_cluster(project.project_id)

# If an instance exists on the cluster with the same base name - restart it
for instance in instances:
if instance.spec.cluster_id == cluster_id:
existing_instance = instance
for app in all_apps:
instances = self.backend.client.lightningapp_instance_service_list_lightningapp_instances(
project_id=project.project_id,
app_id=app.id,
).lightningapps
if instances and instances[0].spec.cluster_id == cluster_id:
existing_app = app
existing_instance = instances[0]
break

# If instances exist but not on the cluster - choose a randomised name
if len(instances) > 0 and existing_instance is None:
# If apps exist but not on the cluster - choose a randomised name
if len(all_apps) > 0 and existing_app is None:
name_exists = True
while name_exists:
random_name = self._randomise_name(app_name)
name_exists = any([instance.name == random_name for instance in instances])
name_exists = any([app.name == random_name for app in all_apps])

app_name = random_name

# Create the app if it doesn't exist
if existing_instance is None:
if existing_app is None:
app_body = Body7(name=app_name, can_download_source_code=True)
lit_app = self.backend.client.lightningapp_v2_service_create_lightningapp_v2(
project_id=project.project_id, body=app_body
)
app_id = lit_app.id
else:
app_id = existing_instance.spec.app_id
app_id = existing_app.id

# check if user has sufficient credits to run an app
# if so set the desired state to running otherwise, create the app in stopped state,
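In outline, name resolution is now keyed on apps rather than on instances, which is what allows an app whose instances were deleted to be run again under the same name. A condensed sketch of the new lookup, using only the client calls visible in this hunk (hypothetical helper; error handling and pagination omitted):

    import re

    def _find_existing_app(client, project_id, app_name, cluster_id):
        """Hypothetical helper mirroring the dispatch logic above."""
        pattern = re.escape(f"{app_name}-") + ".{4}"
        apps = client.lightningapp_v2_service_list_lightningapps_v2(project_id=project_id).lightningapps
        apps = [a for a in apps if a.name == app_name or re.fullmatch(pattern, a.name)]
        for app in apps:
            instances = client.lightningapp_instance_service_list_lightningapp_instances(
                project_id=project_id, app_id=app.id
            ).lightningapps
            # Restart an existing instance only if it lives on the selected cluster.
            if instances and instances[0].spec.cluster_id == cluster_id:
                return app, instances[0]
        return None, None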
2 changes: 1 addition & 1 deletion src/lightning_app/utilities/cli_helpers.py
@@ -281,7 +281,7 @@ def _check_version_and_upgrade():
prompt = f"A newer version of {__package_name__} is available ({new_version}). Would you like to upgrade?"

if click.confirm(prompt, default=True):
command = f"pip install '{__package_name__}=={new_version}'"
command = f"pip install {__package_name__}=={new_version}"

logger.info(f"⚡ RUN: {command}")

2 changes: 1 addition & 1 deletion src/lightning_lite/__version__.py
@@ -1 +1 @@
version = "1.8.5"
version = "1.8.5.post0"
2 changes: 1 addition & 1 deletion src/pytorch_lightning/CHANGELOG.md
@@ -7,7 +7,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

## [1.8.5] - 2022-12-15

- minor cleaning
- Add function to remove checkpoint to allow override for extended classes ([#16067](https://github.com/Lightning-AI/lightning/pull/16067))


## [1.8.4] - 2022-12-08
2 changes: 1 addition & 1 deletion src/pytorch_lightning/__version__.py
@@ -1 +1 @@
version = "1.8.5"
version = "1.8.5.post0"
10 changes: 7 additions & 3 deletions src/pytorch_lightning/callbacks/model_checkpoint.py
@@ -640,7 +640,7 @@ def _save_last_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: Dict[
previous, self.last_model_path = self.last_model_path, filepath
self._save_checkpoint(trainer, filepath)
if previous and previous != filepath:
trainer.strategy.remove_checkpoint(previous)
self._remove_checkpoint(trainer, previous)

def _save_monitor_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: Dict[str, Tensor]) -> None:
assert self.monitor
@@ -659,7 +659,7 @@ def _save_none_monitor_checkpoint(self, trainer: "pl.Trainer", monitor_candidate
previous, self.best_model_path = self.best_model_path, filepath
self._save_checkpoint(trainer, filepath)
if self.save_top_k == 1 and previous and previous != filepath:
trainer.strategy.remove_checkpoint(previous)
self._remove_checkpoint(trainer, previous)

def _update_best_and_save(
self, current: Tensor, trainer: "pl.Trainer", monitor_candidates: Dict[str, Tensor]
@@ -701,7 +701,7 @@ def _update_best_and_save(
self._save_checkpoint(trainer, filepath)

if del_filepath is not None and filepath != del_filepath:
trainer.strategy.remove_checkpoint(del_filepath)
self._remove_checkpoint(trainer, del_filepath)

def to_yaml(self, filepath: Optional[_PATH] = None) -> None:
"""Saves the `best_k_models` dict containing the checkpoint paths with the corresponding scores to a YAML
@@ -718,3 +718,7 @@ def file_exists(self, filepath: _PATH, trainer: "pl.Trainer") -> bool:
state to diverge between ranks."""
exists = self._fs.exists(filepath)
return trainer.strategy.broadcast(exists)

def _remove_checkpoint(self, trainer: "pl.Trainer", filepath: str) -> None:
"""Calls the strategy to remove the checkpoint file."""
trainer.strategy.remove_checkpoint(filepath)
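The new `_remove_checkpoint` hook exists so that subclasses can intercept checkpoint deletion, which is the motivation given in the pytorch_lightning changelog entry above. A minimal sketch of such an override; the archiving behaviour is illustrative, not part of this PR:

    import shutil

    from pytorch_lightning.callbacks import ModelCheckpoint

    class ArchivingModelCheckpoint(ModelCheckpoint):
        def _remove_checkpoint(self, trainer, filepath: str) -> None:
            # Keep a copy before the strategy deletes the file.
            shutil.copyfile(filepath, filepath + ".bak")
            super()._remove_checkpoint(trainer, filepath)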
6 changes: 5 additions & 1 deletion tests/tests_app/cli/test_cloud_cli.py
@@ -11,6 +11,7 @@
from lightning_cloud.openapi import (
V1LightningappV2,
V1ListLightningappInstancesResponse,
V1ListLightningappsV2Response,
V1ListMembershipsResponse,
V1Membership,
)
@@ -36,6 +37,9 @@ class FakeResponse:


class FakeLightningClient:
def lightningapp_v2_service_list_lightningapps_v2(self, *args, **kwargs):
return V1ListLightningappsV2Response(lightningapps=[])

def lightningapp_instance_service_list_lightningapp_instances(self, *args, **kwargs):
return V1ListLightningappInstancesResponse(lightningapps=[])

@@ -182,7 +186,7 @@ def __init__(self, *args, message, **kwargs):
super().__init__()
self.message = message

def lightningapp_instance_service_list_lightningapp_instances(self, *args, **kwargs):
def lightningapp_v2_service_list_lightningapps_v2(self, *args, **kwargs):
raise ApiException(
http_resp=HttpHeaderDict(
data=self.message,
2 changes: 1 addition & 1 deletion tests/tests_app/components/multi_node/test_trainer.py
@@ -66,7 +66,7 @@ def test_trainer_run_executor_mps_forced_cpu(accelerator_given, accelerator_expe
({"strategy": "ddp_sharded_spawn"}, {"strategy": "ddp_sharded"}),
],
)
@pytest.mark.skipif(not module_available("pytorch"), reason="Lightning is not available")
@pytest.mark.skipif(not module_available("torch"), reason="PyTorch is not available")
def test_trainer_run_executor_arguments_choices(
args_given: dict,
args_expected: dict,
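The previous marker probed for a module named "pytorch", which is never importable (the distribution is PyTorch but the module is `torch`), so the test was always skipped regardless of whether PyTorch was installed. A quick illustration, assuming the `module_available` helper from `lightning_utilities`:

    from lightning_utilities.core.imports import module_available

    print(module_available("pytorch"))  # False: there is no top-level module called "pytorch"
    print(module_available("torch"))    # True whenever PyTorch is installed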