Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add better error handling when tearing down service #4161

Merged
merged 13 commits into from Aug 26, 2022
30 changes: 30 additions & 0 deletions tests/unit_tests/test_wandb.py
Expand Up @@ -582,6 +582,36 @@ def test_attach_usage_errors(wandb_init):
run.finish()


# ----------------------------------
# wandb.teardown
# ----------------------------------

# In a notebook environment we might get into a situation where the service process will be removed
# but the singleton setup instance still exists, hence it will try to do the teardown.
# Howeverwandb.teardown will encounter an error because the service process is already gone.
# but since we have an error handle logic in the teardown, we don't see the error
# only informational message about the error.
def test_teardown_error_path(capsys):
with mock.patch.dict(
os.environ, {wandb.env.SERVICE: "2-96604-tcp-localhost-57337"}
):
with mock.patch.object(
wandb.sdk.wandb_manager._Manager,
"_get_service_interface",
return_value=mock.MagicMock(),
):
wandb.setup()
assert wandb.wandb_sdk.wandb_setup._WandbSetup._instance
wandb.teardown()
assert wandb.env.SERVICE not in os.environ
assert not wandb.wandb_sdk.wandb_setup._WandbSetup._instance
_, err = capsys.readouterr()
assert (
"While tearing down the service manager. The following error has occured:"
in err
)


# TODO: test these or make sure they are tested somewhere
# run.use_artifact()
# run.log_artifact()
19 changes: 14 additions & 5 deletions wandb/sdk/wandb_manager.py
Expand Up @@ -7,6 +7,7 @@
import os
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional

import wandb
from wandb import env, trigger
from wandb.sdk.lib.exit_hooks import ExitHooks
from wandb.sdk.lib.import_hooks import unregister_all_post_import_hooks
Expand Down Expand Up @@ -90,6 +91,7 @@ class _Manager:
_atexit_lambda: Optional[Callable[[], None]]
_hooks: Optional[ExitHooks]
_settings: "Settings"
_service: "service._Service"

def __init__(self, settings: "Settings", _use_grpc: bool = False) -> None:
# TODO: warn if user doesnt have grpc installed
Expand Down Expand Up @@ -141,11 +143,18 @@ def _teardown(self, exit_code: int) -> None:
atexit.unregister(self._atexit_lambda)
self._atexit_lambda = None

self._inform_teardown(exit_code)
result = self._service.join()
if result and not self._settings._jupyter:
os._exit(result)
self._token.reset_environment()
try:
self._inform_teardown(exit_code)
result = self._service.join()
if result and not self._settings._jupyter:
os._exit(result)
except Exception as e:
wandb.termlog(
f"While tearing down the service manager. The following error has occured: {e}",
repeat=False,
)
finally:
self._token.reset_environment()

def _get_service(self) -> "service._Service":
return self._service
Expand Down