Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup cluster waiting #16054

Merged
merged 3 commits into from Dec 14, 2022
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
33 changes: 20 additions & 13 deletions src/lightning_app/cli/cmd_clusters.py
Expand Up @@ -231,7 +231,7 @@ def _wait_for_cluster_state(
cluster_id: str,
target_state: V1ClusterState,
timeout_seconds: int = MAX_CLUSTER_WAIT_TIME,
poll_duration_seconds: int = 10,
poll_duration_seconds: int = 60,
) -> None:
"""_wait_for_cluster_state waits until the provided cluster has reached a desired state, or failed.

Expand Down Expand Up @@ -307,21 +307,24 @@ def _cluster_status_long(cluster: V1GetClusterResponse, desired_state: V1Cluster
duration = _format_elapsed_seconds(elapsed)

if current_state == V1ClusterState.FAILED:
return dedent(
f"""\
The requested cluster operation for cluster {cluster_id} has errors:
{current_reason}
if not _is_retryable_error(current_reason):
return dedent(
f"""\
The requested cluster operation for cluster {cluster_id} has errors:

---
We are automatically retrying, and an automated alert has been created
{current_reason}

WARNING: Any non-deleted cluster may be using resources.
To avoid incuring cost on your cloud provider, delete the cluster using the following command:
lightning delete cluster {cluster_id}
--------------------------------------------------------------

Contact support@lightning.ai for additional help
"""
)
We are automatically retrying, and an automated alert has been created
ethanwharris marked this conversation as resolved.
Show resolved Hide resolved

WARNING: Any non-deleted cluster may be using resources.
To avoid incuring cost on your cloud provider, delete the cluster using the following command:
lightning delete cluster {cluster_id}

Contact support@lightning.ai for additional help
"""
)

if desired_state == current_state == V1ClusterState.RUNNING:
return dedent(
Expand Down Expand Up @@ -352,6 +355,10 @@ def _cluster_status_long(cluster: V1GetClusterResponse, desired_state: V1Cluster
raise click.ClickException(f"Unknown cluster desired state {desired_state}")


def _is_retryable_error(error_message: str) -> bool:
return "resources failed to delete" in error_message


def _format_elapsed_seconds(seconds: Union[float, int]) -> str:
"""Turns seconds into a duration string.

Expand Down