From 12ece168df5c23603e22bc2c5d21a3cbd634fbcc Mon Sep 17 00:00:00 2001 From: Katia Date: Mon, 22 Aug 2022 16:40:08 -0700 Subject: [PATCH 01/10] check this --- wandb/sdk/lib/sock_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wandb/sdk/lib/sock_client.py b/wandb/sdk/lib/sock_client.py index cc47c1a685f..a748216df8a 100644 --- a/wandb/sdk/lib/sock_client.py +++ b/wandb/sdk/lib/sock_client.py @@ -157,7 +157,7 @@ def send_server_request(self, msg: Any) -> None: def send_server_response(self, msg: Any) -> None: try: self._send_message(msg) - except BrokenPipeError: + except (BrokenPipeError, SockClientClosedError): # TODO(jhr): user thread might no longer be around to receive responses to # things like network status poll loop, there might be a better way to quiesce pass From 6f63b8dac56fe70a479e718be2b39a0ea39abb97 Mon Sep 17 00:00:00 2001 From: Katia Date: Mon, 22 Aug 2022 16:46:55 -0700 Subject: [PATCH 02/10] print message --- wandb/sdk/lib/sock_client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/wandb/sdk/lib/sock_client.py b/wandb/sdk/lib/sock_client.py index a748216df8a..751db0217ed 100644 --- a/wandb/sdk/lib/sock_client.py +++ b/wandb/sdk/lib/sock_client.py @@ -158,6 +158,7 @@ def send_server_response(self, msg: Any) -> None: try: self._send_message(msg) except (BrokenPipeError, SockClientClosedError): + print("HERE!!!!") # TODO(jhr): user thread might no longer be around to receive responses to # things like network status poll loop, there might be a better way to quiesce pass From 8588d5000041ec572baf44bf8f273b3fb790fffe Mon Sep 17 00:00:00 2001 From: Katia Date: Mon, 22 Aug 2022 16:49:44 -0700 Subject: [PATCH 03/10] more debug --- wandb/sdk/lib/sock_client.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/wandb/sdk/lib/sock_client.py b/wandb/sdk/lib/sock_client.py index 751db0217ed..0b7c7fed28c 100644 --- a/wandb/sdk/lib/sock_client.py +++ b/wandb/sdk/lib/sock_client.py @@ -141,6 +141,9 @@ def _sendall_with_error_handle(self, data: bytes) -> None: delta_time = time.monotonic() - start_time if delta_time < self._retry_delay: time.sleep(self._retry_delay - delta_time) + except BrokenPipeError: + print("broken pipe!!!!!!") + raise SockClientClosedError("socket connection broken") def _send_message(self, msg: Any) -> None: tracelog.log_message_send(msg, self._sockid) From 34baf07db02f922c2bcc668b37cf4eee1066c965 Mon Sep 17 00:00:00 2001 From: Katia Date: Mon, 22 Aug 2022 20:44:08 -0700 Subject: [PATCH 04/10] manager --- wandb/sdk/lib/sock_client.py | 6 +----- wandb/sdk/wandb_manager.py | 15 ++++++++++----- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/wandb/sdk/lib/sock_client.py b/wandb/sdk/lib/sock_client.py index 0b7c7fed28c..cc47c1a685f 100644 --- a/wandb/sdk/lib/sock_client.py +++ b/wandb/sdk/lib/sock_client.py @@ -141,9 +141,6 @@ def _sendall_with_error_handle(self, data: bytes) -> None: delta_time = time.monotonic() - start_time if delta_time < self._retry_delay: time.sleep(self._retry_delay - delta_time) - except BrokenPipeError: - print("broken pipe!!!!!!") - raise SockClientClosedError("socket connection broken") def _send_message(self, msg: Any) -> None: tracelog.log_message_send(msg, self._sockid) @@ -160,8 +157,7 @@ def send_server_request(self, msg: Any) -> None: def send_server_response(self, msg: Any) -> None: try: self._send_message(msg) - except (BrokenPipeError, SockClientClosedError): - print("HERE!!!!") + except BrokenPipeError: # TODO(jhr): user thread might no longer be around to receive responses to # things like network status poll loop, there might be a better way to quiesce pass diff --git a/wandb/sdk/wandb_manager.py b/wandb/sdk/wandb_manager.py index bac66e0693d..1af57906669 100644 --- a/wandb/sdk/wandb_manager.py +++ b/wandb/sdk/wandb_manager.py @@ -141,11 +141,16 @@ def _teardown(self, exit_code: int) -> None: atexit.unregister(self._atexit_lambda) self._atexit_lambda = None - self._inform_teardown(exit_code) - result = self._service.join() - if result and not self._settings._jupyter: - os._exit(result) - self._token.reset_environment() + try: + self._inform_teardown(exit_code) + result = self._service.join() + if result and not self._settings._jupyter: + os._exit(result) + except Exception as e: + print("teardown error", e) + pass + finally: + self._token.reset_environment() def _get_service(self) -> "service._Service": return self._service From c33e19fe728395d78093a78d42311fd00d5dc690 Mon Sep 17 00:00:00 2001 From: Katia Date: Mon, 22 Aug 2022 20:55:01 -0700 Subject: [PATCH 05/10] teardown --- wandb/sdk/wandb_run.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/wandb/sdk/wandb_run.py b/wandb/sdk/wandb_run.py index 990c1119d01..cdd45bd5288 100644 --- a/wandb/sdk/wandb_run.py +++ b/wandb/sdk/wandb_run.py @@ -1933,6 +1933,8 @@ def _atexit_cleanup(self, exit_code: int = None) -> None: wandb.termerror("Control-C detected -- Run data was not synced") if not self._settings._jupyter: os._exit(-1) + else: + wandb.teardown() except Exception as e: if not self._settings._jupyter: report_failure = True @@ -1946,6 +1948,8 @@ def _atexit_cleanup(self, exit_code: int = None) -> None: finally: if report_failure: os._exit(-1) + else: + wandb.teardown() def _console_start(self) -> None: logger.info("atexit reg") From ee6daab8d1dae7d667d01492222bd9f7e3aaff9e Mon Sep 17 00:00:00 2001 From: Katia Date: Mon, 22 Aug 2022 20:58:35 -0700 Subject: [PATCH 06/10] restore run --- wandb/sdk/wandb_run.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/wandb/sdk/wandb_run.py b/wandb/sdk/wandb_run.py index cdd45bd5288..990c1119d01 100644 --- a/wandb/sdk/wandb_run.py +++ b/wandb/sdk/wandb_run.py @@ -1933,8 +1933,6 @@ def _atexit_cleanup(self, exit_code: int = None) -> None: wandb.termerror("Control-C detected -- Run data was not synced") if not self._settings._jupyter: os._exit(-1) - else: - wandb.teardown() except Exception as e: if not self._settings._jupyter: report_failure = True @@ -1948,8 +1946,6 @@ def _atexit_cleanup(self, exit_code: int = None) -> None: finally: if report_failure: os._exit(-1) - else: - wandb.teardown() def _console_start(self) -> None: logger.info("atexit reg") From bc0ecf9654ee90b85c4d6aad95e9101ae138d462 Mon Sep 17 00:00:00 2001 From: Katia Date: Tue, 23 Aug 2022 19:46:05 -0700 Subject: [PATCH 07/10] infomration about teardown error --- wandb/sdk/wandb_manager.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/wandb/sdk/wandb_manager.py b/wandb/sdk/wandb_manager.py index 1af57906669..828be005960 100644 --- a/wandb/sdk/wandb_manager.py +++ b/wandb/sdk/wandb_manager.py @@ -7,6 +7,7 @@ import os from typing import TYPE_CHECKING, Any, Callable, Dict, Optional +import wandb from wandb import env, trigger from wandb.sdk.lib.exit_hooks import ExitHooks from wandb.sdk.lib.import_hooks import unregister_all_post_import_hooks @@ -114,7 +115,9 @@ def __init__(self, settings: "Settings", _use_grpc: bool = False) -> None: assert port token = _ManagerToken.from_params(transport=transport, host=host, port=port) token.set_environment() - self._atexit_setup() + # don't setup atexit hook when using a nootebook + if not self._settings._jupyter: + self._atexit_setup() self._token = token @@ -147,8 +150,10 @@ def _teardown(self, exit_code: int) -> None: if result and not self._settings._jupyter: os._exit(result) except Exception as e: - print("teardown error", e) - pass + wandb.termlog( + f"While tearing down the service manager. The following error has occured: {e}", + repeat=False, + ) finally: self._token.reset_environment() From c4ec543104492c8c56d7596c714499876980cbf6 Mon Sep 17 00:00:00 2001 From: Katia Date: Tue, 23 Aug 2022 20:41:51 -0700 Subject: [PATCH 08/10] add coverage test --- tests/unit_tests/test_wandb.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/unit_tests/test_wandb.py b/tests/unit_tests/test_wandb.py index 015a63fcbe8..9cd435858be 100644 --- a/tests/unit_tests/test_wandb.py +++ b/tests/unit_tests/test_wandb.py @@ -582,6 +582,32 @@ def test_attach_usage_errors(wandb_init): run.finish() +# ---------------------------------- +# wandb.teardown +# ---------------------------------- + +# In a notebook environment we might get into a situation where the service process will be removed +# but the singleton setup instance still exists, hence it will try to do the teardown. +# Howeverwandb.teardown will encounter an error because the service process is already gone. +# but since we have an error handle logic in the teardown, we don't see the error +# only informational message about the error. +def test_teardown_error_path(capsys): + with mock.patch.dict( + os.environ, {wandb.env.SERVICE: "2-96604-tcp-localhost-57337"} + ): + with mock.patch.object( + wandb.sdk.wandb_manager._Manager, "_get_service_interface", mock.MagicMock() + ): + wandb.setup() + assert wandb.wandb_sdk.wandb_setup._WandbSetup._instance + wandb.teardown() + _, err = capsys.readouterr() + assert ( + "While tearing down the service manager. The following error has occured:" + in err + ) + + # TODO: test these or make sure they are tested somewhere # run.use_artifact() # run.log_artifact() From 6b56a6b2749208926a12ba1f660ec52cee9074bf Mon Sep 17 00:00:00 2001 From: Katia Date: Tue, 23 Aug 2022 20:58:32 -0700 Subject: [PATCH 09/10] add assertion for env var --- tests/unit_tests/test_wandb.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit_tests/test_wandb.py b/tests/unit_tests/test_wandb.py index 9cd435858be..1628bcf0656 100644 --- a/tests/unit_tests/test_wandb.py +++ b/tests/unit_tests/test_wandb.py @@ -601,6 +601,7 @@ def test_teardown_error_path(capsys): wandb.setup() assert wandb.wandb_sdk.wandb_setup._WandbSetup._instance wandb.teardown() + assert wandb.env.SERVICE not in os.environ _, err = capsys.readouterr() assert ( "While tearing down the service manager. The following error has occured:" From 4cdb6dceafdba227e333fb109941396fdeaeb4ac Mon Sep 17 00:00:00 2001 From: Katia Date: Wed, 24 Aug 2022 12:17:20 -0700 Subject: [PATCH 10/10] remove atexit condition --- tests/unit_tests/test_wandb.py | 5 ++++- wandb/sdk/wandb_manager.py | 5 ++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/unit_tests/test_wandb.py b/tests/unit_tests/test_wandb.py index 1628bcf0656..48025d29921 100644 --- a/tests/unit_tests/test_wandb.py +++ b/tests/unit_tests/test_wandb.py @@ -596,12 +596,15 @@ def test_teardown_error_path(capsys): os.environ, {wandb.env.SERVICE: "2-96604-tcp-localhost-57337"} ): with mock.patch.object( - wandb.sdk.wandb_manager._Manager, "_get_service_interface", mock.MagicMock() + wandb.sdk.wandb_manager._Manager, + "_get_service_interface", + return_value=mock.MagicMock(), ): wandb.setup() assert wandb.wandb_sdk.wandb_setup._WandbSetup._instance wandb.teardown() assert wandb.env.SERVICE not in os.environ + assert not wandb.wandb_sdk.wandb_setup._WandbSetup._instance _, err = capsys.readouterr() assert ( "While tearing down the service manager. The following error has occured:" diff --git a/wandb/sdk/wandb_manager.py b/wandb/sdk/wandb_manager.py index 828be005960..8946cb41baf 100644 --- a/wandb/sdk/wandb_manager.py +++ b/wandb/sdk/wandb_manager.py @@ -91,6 +91,7 @@ class _Manager: _atexit_lambda: Optional[Callable[[], None]] _hooks: Optional[ExitHooks] _settings: "Settings" + _service: "service._Service" def __init__(self, settings: "Settings", _use_grpc: bool = False) -> None: # TODO: warn if user doesnt have grpc installed @@ -115,9 +116,7 @@ def __init__(self, settings: "Settings", _use_grpc: bool = False) -> None: assert port token = _ManagerToken.from_params(transport=transport, host=host, port=port) token.set_environment() - # don't setup atexit hook when using a nootebook - if not self._settings._jupyter: - self._atexit_setup() + self._atexit_setup() self._token = token