From 6d2a4a3b7f9bd842c6ffbe0647706a9a6d8da75a Mon Sep 17 00:00:00 2001 From: Dmitry Duev Date: Thu, 3 Nov 2022 17:16:09 -0700 Subject: [PATCH 1/8] debug sm join error --- wandb/sdk/internal/system/system_monitor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/wandb/sdk/internal/system/system_monitor.py b/wandb/sdk/internal/system/system_monitor.py index 81082f551c1..9fdc0bd5f9f 100644 --- a/wandb/sdk/internal/system/system_monitor.py +++ b/wandb/sdk/internal/system/system_monitor.py @@ -83,6 +83,7 @@ def __init__( shutdown_event=self._shutdown_event, ) ) + print(self.assets[-1].name) # static system info, both hardware and software self.system_info: SystemInfo = SystemInfo( From 369efe86e1626a64d7de2a9791cecdcc5e5ae427 Mon Sep 17 00:00:00 2001 From: Dmitry Duev Date: Thu, 3 Nov 2022 17:20:28 -0700 Subject: [PATCH 2/8] debug sm join error --- wandb/sdk/internal/system/system_monitor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/wandb/sdk/internal/system/system_monitor.py b/wandb/sdk/internal/system/system_monitor.py index 9fdc0bd5f9f..00b7f70b731 100644 --- a/wandb/sdk/internal/system/system_monitor.py +++ b/wandb/sdk/internal/system/system_monitor.py @@ -4,6 +4,7 @@ import threading from typing import TYPE_CHECKING, List, Optional, Union +import wandb from .assets.asset_registry import asset_registry from .assets.interfaces import Asset, Interface from .system_info import SystemInfo @@ -83,7 +84,7 @@ def __init__( shutdown_event=self._shutdown_event, ) ) - print(self.assets[-1].name) + wandb.termerror(self.assets[-1].name) # static system info, both hardware and software self.system_info: SystemInfo = SystemInfo( From 050255fe748ad6d91ec2af808053bbf37835a1cc Mon Sep 17 00:00:00 2001 From: Dmitry Duev Date: Thu, 3 Nov 2022 17:57:03 -0700 Subject: [PATCH 3/8] exercise extra caution when joining threads --- wandb/sdk/internal/system/assets/interfaces.py | 6 ++++-- wandb/sdk/internal/system/system_monitor.py | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/wandb/sdk/internal/system/assets/interfaces.py b/wandb/sdk/internal/system/assets/interfaces.py index 74062d0030e..24a2b15d136 100644 --- a/wandb/sdk/internal/system/assets/interfaces.py +++ b/wandb/sdk/internal/system/assets/interfaces.py @@ -168,7 +168,9 @@ def start(self) -> None: logger.info(f"Started {self._process.name}") def finish(self) -> None: - if self._process is not None: + if self._process is None: + return None + if self._process.is_alive(): self._process.join() logger.info(f"Joined {self._process.name}") - self._process = None + self._process = None diff --git a/wandb/sdk/internal/system/system_monitor.py b/wandb/sdk/internal/system/system_monitor.py index 00b7f70b731..cd46c00cd35 100644 --- a/wandb/sdk/internal/system/system_monitor.py +++ b/wandb/sdk/internal/system/system_monitor.py @@ -84,7 +84,6 @@ def __init__( shutdown_event=self._shutdown_event, ) ) - wandb.termerror(self.assets[-1].name) # static system info, both hardware and software self.system_info: SystemInfo = SystemInfo( @@ -163,7 +162,8 @@ def finish(self) -> None: self._shutdown_event.set() for asset in self.assets: asset.finish() - self._process.join() + if self._process.is_alive(): + self._process.join() self._process = None def probe(self, publish: bool = True) -> None: From 4abccba7267ab5eed9303a6225bb5449d41a3a46 Mon Sep 17 00:00:00 2001 From: Dmitry Duev Date: Thu, 3 Nov 2022 18:09:13 -0700 Subject: [PATCH 4/8] fix potential error raised when git root probing fails --- wandb/sdk/lib/git.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/wandb/sdk/lib/git.py b/wandb/sdk/lib/git.py index 7b1a3117653..32e9dd5f9bc 100644 --- a/wandb/sdk/lib/git.py +++ b/wandb/sdk/lib/git.py @@ -62,7 +62,11 @@ def enabled(self) -> bool: def root(self) -> Optional[str]: if not self.repo: return None - return self.repo.git.rev_parse("--show-toplevel") + try: + return self.repo.git.rev_parse("--show-toplevel") + except exc.GitCommandError as e: + logger.error(f"git root error: {e}") + return None @property def dirty(self) -> bool: From 68ea237ce2703838c7bfd27bbc3d6db27c691a58 Mon Sep 17 00:00:00 2001 From: Dmitry Duev Date: Thu, 3 Nov 2022 18:10:28 -0700 Subject: [PATCH 5/8] clean up --- wandb/sdk/internal/system/system_monitor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/wandb/sdk/internal/system/system_monitor.py b/wandb/sdk/internal/system/system_monitor.py index cd46c00cd35..6628f451e5c 100644 --- a/wandb/sdk/internal/system/system_monitor.py +++ b/wandb/sdk/internal/system/system_monitor.py @@ -4,7 +4,6 @@ import threading from typing import TYPE_CHECKING, List, Optional, Union -import wandb from .assets.asset_registry import asset_registry from .assets.interfaces import Asset, Interface from .system_info import SystemInfo From c8bfd3b5f7c7de34bfb734316b73764f7c9a76c0 Mon Sep 17 00:00:00 2001 From: Dmitry Duev Date: Thu, 10 Nov 2022 13:43:53 -0800 Subject: [PATCH 6/8] add more guardrails --- wandb/sdk/internal/system/assets/cpu.py | 3 +++ wandb/sdk/internal/system/assets/memory.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/wandb/sdk/internal/system/assets/cpu.py b/wandb/sdk/internal/system/assets/cpu.py index 9394243c14b..3f0e48ddb0d 100644 --- a/wandb/sdk/internal/system/assets/cpu.py +++ b/wandb/sdk/internal/system/assets/cpu.py @@ -167,3 +167,6 @@ def start(self) -> None: def finish(self) -> None: self.metrics_monitor.finish() + for metric in self.metrics: + if hasattr(metric, "process"): + metric.process = None diff --git a/wandb/sdk/internal/system/assets/memory.py b/wandb/sdk/internal/system/assets/memory.py index 75a0eb576fc..d4c248b2657 100644 --- a/wandb/sdk/internal/system/assets/memory.py +++ b/wandb/sdk/internal/system/assets/memory.py @@ -156,6 +156,9 @@ def start(self) -> None: def finish(self) -> None: self.metrics_monitor.finish() + for metric in self.metrics: + if hasattr(metric, "process"): + metric.process = None @classmethod def is_available(cls) -> bool: From 000c6c390eb3cd4ded721f88e224c1b88182952b Mon Sep 17 00:00:00 2001 From: Dmitry Duev Date: Thu, 10 Nov 2022 13:47:45 -0800 Subject: [PATCH 7/8] add more guardrails --- wandb/sdk/internal/system/assets/cpu.py | 3 --- wandb/sdk/internal/system/assets/interfaces.py | 4 ++++ wandb/sdk/internal/system/assets/memory.py | 3 --- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/wandb/sdk/internal/system/assets/cpu.py b/wandb/sdk/internal/system/assets/cpu.py index 3f0e48ddb0d..9394243c14b 100644 --- a/wandb/sdk/internal/system/assets/cpu.py +++ b/wandb/sdk/internal/system/assets/cpu.py @@ -167,6 +167,3 @@ def start(self) -> None: def finish(self) -> None: self.metrics_monitor.finish() - for metric in self.metrics: - if hasattr(metric, "process"): - metric.process = None diff --git a/wandb/sdk/internal/system/assets/interfaces.py b/wandb/sdk/internal/system/assets/interfaces.py index 24a2b15d136..85ae705600b 100644 --- a/wandb/sdk/internal/system/assets/interfaces.py +++ b/wandb/sdk/internal/system/assets/interfaces.py @@ -174,3 +174,7 @@ def finish(self) -> None: self._process.join() logger.info(f"Joined {self._process.name}") self._process = None + + for metric in self.metrics: + if hasattr(metric, "process"): + metric.process = None diff --git a/wandb/sdk/internal/system/assets/memory.py b/wandb/sdk/internal/system/assets/memory.py index d4c248b2657..75a0eb576fc 100644 --- a/wandb/sdk/internal/system/assets/memory.py +++ b/wandb/sdk/internal/system/assets/memory.py @@ -156,9 +156,6 @@ def start(self) -> None: def finish(self) -> None: self.metrics_monitor.finish() - for metric in self.metrics: - if hasattr(metric, "process"): - metric.process = None @classmethod def is_available(cls) -> bool: From a0606c200fb8155a2e6787ea0b6cfd602d941e86 Mon Sep 17 00:00:00 2001 From: Dmitry Duev Date: Sun, 4 Dec 2022 23:51:28 -0800 Subject: [PATCH 8/8] simplify hardening --- wandb/sdk/internal/system/assets/interfaces.py | 8 +++----- wandb/sdk/internal/system/system_monitor.py | 4 +++- wandb/sdk/lib/git.py | 1 + wandb/sdk/wandb_settings.py | 2 +- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/wandb/sdk/internal/system/assets/interfaces.py b/wandb/sdk/internal/system/assets/interfaces.py index 85ae705600b..bfb33f51865 100644 --- a/wandb/sdk/internal/system/assets/interfaces.py +++ b/wandb/sdk/internal/system/assets/interfaces.py @@ -170,11 +170,9 @@ def start(self) -> None: def finish(self) -> None: if self._process is None: return None - if self._process.is_alive(): + try: self._process.join() logger.info(f"Joined {self._process.name}") + except Exception as e: + logger.warning(f"Failed to join {self._process.name}: {e}") self._process = None - - for metric in self.metrics: - if hasattr(metric, "process"): - metric.process = None diff --git a/wandb/sdk/internal/system/system_monitor.py b/wandb/sdk/internal/system/system_monitor.py index 6628f451e5c..5d856ec4d19 100644 --- a/wandb/sdk/internal/system/system_monitor.py +++ b/wandb/sdk/internal/system/system_monitor.py @@ -161,8 +161,10 @@ def finish(self) -> None: self._shutdown_event.set() for asset in self.assets: asset.finish() - if self._process.is_alive(): + try: self._process.join() + except Exception as e: + logger.error(f"Error joining system monitor process: {e}") self._process = None def probe(self, publish: bool = True) -> None: diff --git a/wandb/sdk/lib/git.py b/wandb/sdk/lib/git.py index 32e9dd5f9bc..13ecc2964ed 100644 --- a/wandb/sdk/lib/git.py +++ b/wandb/sdk/lib/git.py @@ -65,6 +65,7 @@ def root(self) -> Optional[str]: try: return self.repo.git.rev_parse("--show-toplevel") except exc.GitCommandError as e: + # todo: collect telemetry on this logger.error(f"git root error: {e}") return None diff --git a/wandb/sdk/wandb_settings.py b/wandb/sdk/wandb_settings.py index 9b059d2a8bd..5e8a37a0dbc 100644 --- a/wandb/sdk/wandb_settings.py +++ b/wandb/sdk/wandb_settings.py @@ -1008,7 +1008,7 @@ def __init__(self, **kwargs: Any) -> None: self.__unexpected_args: Set[str] = set() # Set default settings values - # We start off with the class attributes and `default_props`' dicts + # We start off with the class attributes and `default_props` dicts # and then create Property objects. # Once initialized, attributes are to only be updated using the `update` method default_props = self._default_props()