Skip to content

Commit

Permalink
Dump cluster state on all test failures
Browse files Browse the repository at this point in the history
  • Loading branch information
crusaderky committed Jan 18, 2022
1 parent a26c770 commit 24387cb
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 8 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yaml
Expand Up @@ -147,5 +147,5 @@ jobs:
uses: actions/upload-artifact@v2
with:
name: ${{ env.TEST_ID }}-timeouts
path: test_timeout_dump
path: test_cluster_dump
if-no-files-found: ignore
4 changes: 2 additions & 2 deletions .gitignore
Expand Up @@ -31,5 +31,5 @@ tags
.ipynb_checkpoints
.venv/

# Test timeouts will dump the cluster state in here
test_timeout_dump/
# Test failures will dump the cluster state in here
test_cluster_dump/
28 changes: 23 additions & 5 deletions distributed/utils_test.py
Expand Up @@ -895,7 +895,7 @@ def gen_cluster(
config: dict[str, Any] = {},
clean_kwargs: dict[str, Any] = {},
allow_unclosed: bool = False,
cluster_dump_directory: str | Literal[False] = "test_timeout_dump",
cluster_dump_directory: str | Literal[False] = "test_cluster_dump",
) -> Callable[[Callable], Callable]:
from distributed import Client

Expand Down Expand Up @@ -979,15 +979,16 @@ async def coro():
**client_kwargs,
)
args = [c] + args

try:
coro = func(*args, *outer_args, **kwargs)
task = asyncio.create_task(coro)

coro2 = asyncio.wait_for(asyncio.shield(task), timeout)
result = await coro2
if s.validate:
s.validate_state()
except asyncio.TimeoutError as e:

except asyncio.TimeoutError:
assert task
buffer = io.StringIO()
# This stack indicates where the coro/test is suspended
Expand All @@ -1004,9 +1005,26 @@ async def coro():
task.cancel()
while not task.cancelled():
await asyncio.sleep(0.01)

# Remove as much of the traceback as possible; it's
# uninteresting boilerplate from utils_test and asyncio and
# not from the code being tested.
raise TimeoutError(
f"Test timeout after {timeout}s.\n{buffer.getvalue()}"
) from e
f"Test timeout after {timeout}s.\n"
"========== Test stack trace starts here ==========\n"
f"{buffer.getvalue()}"
) from None

except Exception:
if cluster_dump_directory:
await dump_cluster_state(
s,
ws,
output_dir=cluster_dump_directory,
func_name=func.__name__,
)
raise

finally:
if client and c.status not in ("closing", "closed"):
await c._close(fast=s.status == Status.closed)
Expand Down

0 comments on commit 24387cb

Please sign in to comment.