Skip to content

Commit

Permalink
Cleanup cluster waiting (#16054)
Browse files Browse the repository at this point in the history
  • Loading branch information
luca3rd authored and awaelchli committed Dec 15, 2022
1 parent 7ce2453 commit 82c62a6
Showing 1 changed file with 20 additions and 13 deletions.
33 changes: 20 additions & 13 deletions src/lightning_app/cli/cmd_clusters.py
Expand Up @@ -231,7 +231,7 @@ def _wait_for_cluster_state(
cluster_id: str,
target_state: V1ClusterState,
timeout_seconds: int = MAX_CLUSTER_WAIT_TIME,
poll_duration_seconds: int = 10,
poll_duration_seconds: int = 60,
) -> None:
"""_wait_for_cluster_state waits until the provided cluster has reached a desired state, or failed.
Expand Down Expand Up @@ -307,21 +307,24 @@ def _cluster_status_long(cluster: V1GetClusterResponse, desired_state: V1Cluster
duration = _format_elapsed_seconds(elapsed)

if current_state == V1ClusterState.FAILED:
return dedent(
f"""\
The requested cluster operation for cluster {cluster_id} has errors:
{current_reason}
if not _is_retryable_error(current_reason):
return dedent(
f"""\
The requested cluster operation for cluster {cluster_id} has errors:
---
We are automatically retrying, and an automated alert has been created
{current_reason}
WARNING: Any non-deleted cluster may be using resources.
To avoid incuring cost on your cloud provider, delete the cluster using the following command:
lightning delete cluster {cluster_id}
--------------------------------------------------------------
Contact support@lightning.ai for additional help
"""
)
We are automatically retrying, and an automated alert has been created
WARNING: Any non-deleted cluster may be using resources.
To avoid incuring cost on your cloud provider, delete the cluster using the following command:
lightning delete cluster {cluster_id}
Contact support@lightning.ai for additional help
"""
)

if desired_state == current_state == V1ClusterState.RUNNING:
return dedent(
Expand Down Expand Up @@ -352,6 +355,10 @@ def _cluster_status_long(cluster: V1GetClusterResponse, desired_state: V1Cluster
raise click.ClickException(f"Unknown cluster desired state {desired_state}")


def _is_retryable_error(error_message: str) -> bool:
return "resources failed to delete" in error_message


def _format_elapsed_seconds(seconds: Union[float, int]) -> str:
"""Turns seconds into a duration string.
Expand Down

0 comments on commit 82c62a6

Please sign in to comment.