From 4ea2383123179b8e53d593014ad7436a653312bc Mon Sep 17 00:00:00 2001 From: Estella Xin Date: Thu, 25 Aug 2022 10:49:16 -0400 Subject: [PATCH 1/8] artifact wait timeout code --- wandb/sdk/wandb_artifacts.py | 7 +++---- wandb/sdk/wandb_run.py | 9 +++++++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/wandb/sdk/wandb_artifacts.py b/wandb/sdk/wandb_artifacts.py index 0c3d4083e02..3e6f8fd16d3 100644 --- a/wandb/sdk/wandb_artifacts.py +++ b/wandb/sdk/wandb_artifacts.py @@ -25,7 +25,6 @@ from urllib.parse import parse_qsl, quote, urlparse import requests -import urllib3 import wandb from wandb import env from wandb import util @@ -60,7 +59,7 @@ # This makes the first sleep 1s, and then doubles it up to total times, # which makes for ~18 hours. -_REQUEST_RETRY_STRATEGY = urllib3.util.retry.Retry( +_REQUEST_RETRY_STRATEGY = requests.packages.urllib3.util.retry.Retry( backoff_factor=1, total=16, status_forcelist=(308, 408, 409, 429, 500, 502, 503, 504), @@ -647,9 +646,9 @@ def delete(self) -> None: "Cannot call delete on an artifact before it has been logged or in offline mode" ) - def wait(self) -> ArtifactInterface: + def wait(self, wait_timeout_secs: int = None) -> ArtifactInterface: if self._logged_artifact: - return self._logged_artifact.wait() + return self._logged_artifact.wait(wait_timeout_secs) raise ValueError( "Cannot call wait on an artifact before it has been logged or in offline mode" diff --git a/wandb/sdk/wandb_run.py b/wandb/sdk/wandb_run.py index 38c28793c64..5df8f96aed3 100644 --- a/wandb/sdk/wandb_run.py +++ b/wandb/sdk/wandb_run.py @@ -3591,9 +3591,14 @@ def __getattr__(self, item: str) -> Any: self._assert_instance() return getattr(self._instance, item) - def wait(self) -> ArtifactInterface: + def wait(self, wait_timeout_secs: int = None) -> ArtifactInterface: if not self._instance: - resp = self._future.get().response.log_artifact_response + future_get = self._future.get(wait_timeout_secs) + if not future_get: + raise TimeoutError( + "Artifact wait timed out, failed to fetch Artifact response" + ) + resp = future_get.response.log_artifact_response if resp.error_message: raise ValueError(resp.error_message) self._instance = public.Artifact.from_id(resp.artifact_id, self._api.client) From d01858f077e2ebac53b6f581bc5566293545e24f Mon Sep 17 00:00:00 2001 From: Estella Xin Date: Thu, 25 Aug 2022 11:07:05 -0400 Subject: [PATCH 2/8] add unit test for wait() timeout --- tests/unit_tests/test_wandb_run.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/unit_tests/test_wandb_run.py b/tests/unit_tests/test_wandb_run.py index 3f88e7cb3cb..97254689a65 100644 --- a/tests/unit_tests/test_wandb_run.py +++ b/tests/unit_tests/test_wandb_run.py @@ -2,6 +2,7 @@ import pickle import platform import sys +import time from unittest import mock import numpy as np @@ -454,3 +455,29 @@ def test_settings_unexpected_args_telemetry(runner, relay_server, capsys, user): # whose field 2 corresponds to unexpected arguments in Settings assert 2 in telemetry.get("11", []) run.finish() + + +def test_artifact_wait_success(user, test_settings): + # Test artifact wait() timeout parameter + timeout = 2 + leeway = 0.50 + run = wandb.init(settings=test_settings()) + artifact = wandb.Artifact("art", type="dataset") + start_timestamp = time.time() + run.log_artifact(artifact).wait(wait_timeout_secs=timeout) + elapsed_time = time.time() - start_timestamp + assert elapsed_time < timeout * (1 + leeway) + run.finish() + + +@pytest.mark.parametrize("timeout", [0, 2]) +def test_artifact_wait_failure(user, test_settings, timeout): + # Test to expect TimeoutError when wait timeout is reached and large image + # wasn't uploaded yet + large_image = wandb.Image(np.random.randint(0, 255, (10000, 10000, 3))) + run = wandb.init(settings=test_settings()) + with pytest.raises(TimeoutError): + artifact = wandb.Artifact("art", type="image") + artifact.add(large_image, "image") + run.log_artifact(artifact).wait(wait_timeout_secs=timeout) + run.finish() From cb8513e8a8087571dd915dd5f8f91b568a72ead0 Mon Sep 17 00:00:00 2001 From: Estella Xin Date: Thu, 25 Aug 2022 13:18:09 -0400 Subject: [PATCH 3/8] change request_retry_strategy back --- wandb/sdk/wandb_artifacts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wandb/sdk/wandb_artifacts.py b/wandb/sdk/wandb_artifacts.py index 360c8c36a52..4199c3906fa 100644 --- a/wandb/sdk/wandb_artifacts.py +++ b/wandb/sdk/wandb_artifacts.py @@ -60,7 +60,7 @@ # This makes the first sleep 1s, and then doubles it up to total times, # which makes for ~18 hours. -_REQUEST_RETRY_STRATEGY = requests.packages.urllib3.util.retry.Retry( +_REQUEST_RETRY_STRATEGY = urllib3.util.retry.Retry( backoff_factor=1, total=16, status_forcelist=(308, 408, 409, 429, 500, 502, 503, 504), From e41b799d94565d30dda1e7a31e57ad5f9bb7c3c3 Mon Sep 17 00:00:00 2001 From: Estella Xin Date: Thu, 25 Aug 2022 16:50:17 -0400 Subject: [PATCH 4/8] move unit test and reduce image size --- tests/unit_tests/test_wandb_artifacts_full.py | 27 +++++++++++++++++++ tests/unit_tests/test_wandb_run.py | 26 ------------------ wandb/sdk/wandb_run.py | 8 +++++- 3 files changed, 34 insertions(+), 27 deletions(-) diff --git a/tests/unit_tests/test_wandb_artifacts_full.py b/tests/unit_tests/test_wandb_artifacts_full.py index d160dcecce7..af4f83ca839 100644 --- a/tests/unit_tests/test_wandb_artifacts_full.py +++ b/tests/unit_tests/test_wandb_artifacts_full.py @@ -4,6 +4,7 @@ import numpy as np import pytest import wandb +from wandb.sdk.wandb_run import WaitTimeoutError sm = wandb.wandb_sdk.internal.sender.SendManager @@ -193,3 +194,29 @@ def make_table(): artifact2.add(t1, "t2") assert artifact2.manifest.entries["t2.table.json"].ref is not None run.finish() + + +def test_artifact_wait_success(wandb_init): + # Test artifact wait() timeout parameter + timeout = 2 + leeway = 0.50 + run = wandb_init() + artifact = wandb.Artifact("art", type="dataset") + start_timestamp = time.time() + run.log_artifact(artifact).wait(wait_timeout_secs=timeout) + elapsed_time = time.time() - start_timestamp + assert elapsed_time < timeout * (1 + leeway) + run.finish() + + +@pytest.mark.parametrize("timeout", [0, 1e-1]) +def test_artifact_wait_failure(wandb_init, timeout): + # Test to expect WaitTimeoutError when wait timeout is reached and large image + # wasn't uploaded yet + large_image = wandb.Image(np.zeros((1000, 1000))) + run = wandb_init() + with pytest.raises(WaitTimeoutError): + artifact = wandb.Artifact("art", type="image") + artifact.add(large_image, "image") + run.log_artifact(artifact).wait(wait_timeout_secs=timeout) + run.finish() diff --git a/tests/unit_tests/test_wandb_run.py b/tests/unit_tests/test_wandb_run.py index 97254689a65..efa8bd6bfc5 100644 --- a/tests/unit_tests/test_wandb_run.py +++ b/tests/unit_tests/test_wandb_run.py @@ -455,29 +455,3 @@ def test_settings_unexpected_args_telemetry(runner, relay_server, capsys, user): # whose field 2 corresponds to unexpected arguments in Settings assert 2 in telemetry.get("11", []) run.finish() - - -def test_artifact_wait_success(user, test_settings): - # Test artifact wait() timeout parameter - timeout = 2 - leeway = 0.50 - run = wandb.init(settings=test_settings()) - artifact = wandb.Artifact("art", type="dataset") - start_timestamp = time.time() - run.log_artifact(artifact).wait(wait_timeout_secs=timeout) - elapsed_time = time.time() - start_timestamp - assert elapsed_time < timeout * (1 + leeway) - run.finish() - - -@pytest.mark.parametrize("timeout", [0, 2]) -def test_artifact_wait_failure(user, test_settings, timeout): - # Test to expect TimeoutError when wait timeout is reached and large image - # wasn't uploaded yet - large_image = wandb.Image(np.random.randint(0, 255, (10000, 10000, 3))) - run = wandb.init(settings=test_settings()) - with pytest.raises(TimeoutError): - artifact = wandb.Artifact("art", type="image") - artifact.add(large_image, "image") - run.log_artifact(artifact).wait(wait_timeout_secs=timeout) - run.finish() diff --git a/wandb/sdk/wandb_run.py b/wandb/sdk/wandb_run.py index b2a4f67e70b..6168effff99 100644 --- a/wandb/sdk/wandb_run.py +++ b/wandb/sdk/wandb_run.py @@ -135,6 +135,12 @@ class TeardownHook(NamedTuple): stage: TeardownStage +class WaitTimeoutError(Exception): + def __init__(self, message): + super().__init__(message) + self.message = message + + class RunStatusChecker: """Periodically polls the background process for relevant updates. @@ -3579,7 +3585,7 @@ def wait(self, wait_timeout_secs: int = None) -> ArtifactInterface: if not self._instance: future_get = self._future.get(wait_timeout_secs) if not future_get: - raise TimeoutError( + raise WaitTimeoutError( "Artifact wait timed out, failed to fetch Artifact response" ) resp = future_get.response.log_artifact_response From e35fb38027b28959559e4ee8155194cf532be112 Mon Sep 17 00:00:00 2001 From: Estella Xin Date: Mon, 29 Aug 2022 16:54:04 -0400 Subject: [PATCH 5/8] make testing timeout looser and rename wait_timeout_secs to timeout --- tests/unit_tests/test_wandb_artifacts_full.py | 8 ++++---- tests/unit_tests/test_wandb_run.py | 1 - wandb/sdk/wandb_artifacts.py | 8 ++++++-- wandb/sdk/wandb_run.py | 6 +++--- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/tests/unit_tests/test_wandb_artifacts_full.py b/tests/unit_tests/test_wandb_artifacts_full.py index af4f83ca839..eedb4cdd8ce 100644 --- a/tests/unit_tests/test_wandb_artifacts_full.py +++ b/tests/unit_tests/test_wandb_artifacts_full.py @@ -198,18 +198,18 @@ def make_table(): def test_artifact_wait_success(wandb_init): # Test artifact wait() timeout parameter - timeout = 2 + timeout = 60 leeway = 0.50 run = wandb_init() artifact = wandb.Artifact("art", type="dataset") start_timestamp = time.time() - run.log_artifact(artifact).wait(wait_timeout_secs=timeout) + run.log_artifact(artifact).wait(timeout=timeout) elapsed_time = time.time() - start_timestamp assert elapsed_time < timeout * (1 + leeway) run.finish() -@pytest.mark.parametrize("timeout", [0, 1e-1]) +@pytest.mark.parametrize("timeout", [0, 1e-6]) def test_artifact_wait_failure(wandb_init, timeout): # Test to expect WaitTimeoutError when wait timeout is reached and large image # wasn't uploaded yet @@ -218,5 +218,5 @@ def test_artifact_wait_failure(wandb_init, timeout): with pytest.raises(WaitTimeoutError): artifact = wandb.Artifact("art", type="image") artifact.add(large_image, "image") - run.log_artifact(artifact).wait(wait_timeout_secs=timeout) + run.log_artifact(artifact).wait(timeout=timeout) run.finish() diff --git a/tests/unit_tests/test_wandb_run.py b/tests/unit_tests/test_wandb_run.py index efa8bd6bfc5..3f88e7cb3cb 100644 --- a/tests/unit_tests/test_wandb_run.py +++ b/tests/unit_tests/test_wandb_run.py @@ -2,7 +2,6 @@ import pickle import platform import sys -import time from unittest import mock import numpy as np diff --git a/wandb/sdk/wandb_artifacts.py b/wandb/sdk/wandb_artifacts.py index 4199c3906fa..36e695f7a79 100644 --- a/wandb/sdk/wandb_artifacts.py +++ b/wandb/sdk/wandb_artifacts.py @@ -647,9 +647,13 @@ def delete(self) -> None: "Cannot call delete on an artifact before it has been logged or in offline mode" ) - def wait(self, wait_timeout_secs: int = None) -> ArtifactInterface: + def wait(self, timeout: Optional[int] = None) -> ArtifactInterface: + """ + Arguments: + timeout: (int, optional) Waits in seconds for artifact to finish logging if needed. + """ if self._logged_artifact: - return self._logged_artifact.wait(wait_timeout_secs) + return self._logged_artifact.wait(timeout) raise ValueError( "Cannot call wait on an artifact before it has been logged or in offline mode" diff --git a/wandb/sdk/wandb_run.py b/wandb/sdk/wandb_run.py index 6168effff99..32be1f702c7 100644 --- a/wandb/sdk/wandb_run.py +++ b/wandb/sdk/wandb_run.py @@ -3581,12 +3581,12 @@ def __getattr__(self, item: str) -> Any: self._assert_instance() return getattr(self._instance, item) - def wait(self, wait_timeout_secs: int = None) -> ArtifactInterface: + def wait(self, timeout: Optional[int] = None) -> ArtifactInterface: if not self._instance: - future_get = self._future.get(wait_timeout_secs) + future_get = self._future.get(timeout) if not future_get: raise WaitTimeoutError( - "Artifact wait timed out, failed to fetch Artifact response" + "Artifact upload wait timed out, failed to fetch Artifact response" ) resp = future_get.response.log_artifact_response if resp.error_message: From c5c3362086631ed0646a32440fc440475fc8e37e Mon Sep 17 00:00:00 2001 From: Estella Xin Date: Tue, 30 Aug 2022 14:12:07 -0400 Subject: [PATCH 6/8] retrigger checks From f92c3adc87af76517feb683be56bf7d733c3f455 Mon Sep 17 00:00:00 2001 From: Estella Xin Date: Tue, 30 Aug 2022 18:47:10 -0400 Subject: [PATCH 7/8] move error to __init__.py and fix mypy error --- tests/unit_tests/test_wandb_artifacts_full.py | 2 +- wandb/errors/__init__.py | 7 +++++++ wandb/sdk/wandb_artifacts.py | 2 +- wandb/sdk/wandb_run.py | 8 +------- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/unit_tests/test_wandb_artifacts_full.py b/tests/unit_tests/test_wandb_artifacts_full.py index eedb4cdd8ce..eb52c9716b7 100644 --- a/tests/unit_tests/test_wandb_artifacts_full.py +++ b/tests/unit_tests/test_wandb_artifacts_full.py @@ -4,7 +4,7 @@ import numpy as np import pytest import wandb -from wandb.sdk.wandb_run import WaitTimeoutError +from wandb.errors import WaitTimeoutError sm = wandb.wandb_sdk.internal.sender.SendManager diff --git a/wandb/errors/__init__.py b/wandb/errors/__init__.py index 6659f6a2af3..b50dba7cbe9 100644 --- a/wandb/errors/__init__.py +++ b/wandb/errors/__init__.py @@ -102,6 +102,12 @@ class SweepError(Error): pass +class WaitTimeoutError(Error): + """Raised when wait() timeout occurs before process is finished""" + + pass + + __all__ = [ "Error", "UsageError", @@ -114,4 +120,5 @@ class SweepError(Error): "ExecutionError", "LaunchError", "SweepError", + "WaitTimeoutError", ] diff --git a/wandb/sdk/wandb_artifacts.py b/wandb/sdk/wandb_artifacts.py index 36e695f7a79..096ad17c150 100644 --- a/wandb/sdk/wandb_artifacts.py +++ b/wandb/sdk/wandb_artifacts.py @@ -653,7 +653,7 @@ def wait(self, timeout: Optional[int] = None) -> ArtifactInterface: timeout: (int, optional) Waits in seconds for artifact to finish logging if needed. """ if self._logged_artifact: - return self._logged_artifact.wait(timeout) + return self._logged_artifact.wait(timeout) # type: ignore [call-arg] raise ValueError( "Cannot call wait on an artifact before it has been logged or in offline mode" diff --git a/wandb/sdk/wandb_run.py b/wandb/sdk/wandb_run.py index dc0960e8c31..d85312209ba 100644 --- a/wandb/sdk/wandb_run.py +++ b/wandb/sdk/wandb_run.py @@ -138,12 +138,6 @@ class TeardownHook(NamedTuple): stage: TeardownStage -class WaitTimeoutError(Exception): - def __init__(self, message): - super().__init__(message) - self.message = message - - class RunStatusChecker: """Periodically polls the background process for relevant updates. @@ -3596,7 +3590,7 @@ def wait(self, timeout: Optional[int] = None) -> ArtifactInterface: if not self._instance: future_get = self._future.get(timeout) if not future_get: - raise WaitTimeoutError( + raise errors.WaitTimeoutError( "Artifact upload wait timed out, failed to fetch Artifact response" ) resp = future_get.response.log_artifact_response From 145fc6e7bb00c3edc351e956525c4f0dbf0c8b69 Mon Sep 17 00:00:00 2001 From: Estella Xin Date: Tue, 30 Aug 2022 22:56:20 -0400 Subject: [PATCH 8/8] make image smaller for timeout test --- tests/unit_tests/test_wandb_artifacts_full.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit_tests/test_wandb_artifacts_full.py b/tests/unit_tests/test_wandb_artifacts_full.py index eb52c9716b7..75ea605c679 100644 --- a/tests/unit_tests/test_wandb_artifacts_full.py +++ b/tests/unit_tests/test_wandb_artifacts_full.py @@ -213,10 +213,10 @@ def test_artifact_wait_success(wandb_init): def test_artifact_wait_failure(wandb_init, timeout): # Test to expect WaitTimeoutError when wait timeout is reached and large image # wasn't uploaded yet - large_image = wandb.Image(np.zeros((1000, 1000))) + image = wandb.Image(np.random.randint(0, 255, (10, 10))) run = wandb_init() with pytest.raises(WaitTimeoutError): artifact = wandb.Artifact("art", type="image") - artifact.add(large_image, "image") + artifact.add(image, "image") run.log_artifact(artifact).wait(timeout=timeout) run.finish()