Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose paused and retired workers separately in prometheus #8613

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 10 additions & 0 deletions distributed/http/scheduler/prometheus/core.py
Expand Up @@ -7,6 +7,7 @@
import toolz
from prometheus_client.core import CounterMetricFamily, GaugeMetricFamily

from distributed.core import Status
from distributed.http.prometheus import PrometheusCollector
from distributed.http.scheduler.prometheus.semaphore import SemaphoreMetricCollector
from distributed.http.scheduler.prometheus.stealing import WorkStealingMetricCollector
Expand Down Expand Up @@ -50,6 +51,15 @@ def collect(self) -> Iterator[GaugeMetricFamily | CounterMetricFamily]:
worker_states.add_metric(
["paused_or_retiring"], len(self.server.workers) - len(self.server.running)
)
paused_workers = len(
[w for w in self.server.workers.values() if w.status == Status.paused]
)
worker_states.add_metric(["paused"], paused_workers)
worker_states.add_metric(
["retiring"],
len(self.server.workers) - paused_workers - len(self.server.running),
)

yield worker_states

if self.server.monitor.monitor_gil_contention:
Expand Down
20 changes: 20 additions & 0 deletions distributed/http/scheduler/tests/test_scheduler_http.py
Expand Up @@ -282,6 +282,8 @@ async def fetch_metrics():
"partially_saturated": 0,
"saturated": 0,
"paused_or_retiring": 0,
"paused": 0,
"retiring": 0,
}

ev = Event()
Expand All @@ -292,6 +294,8 @@ async def fetch_metrics():
"partially_saturated": 1,
"saturated": 0,
"paused_or_retiring": 0,
"paused": 0,
"retiring": 0,
}

y = c.submit(lambda ev: ev.wait(), ev, key="y", workers=[a.address])
Expand All @@ -304,6 +308,8 @@ async def fetch_metrics():
"partially_saturated": 0,
"saturated": 1,
"paused_or_retiring": 0,
"paused": 0,
"retiring": 0,
}

a.monitor.get_process_memory = lambda: 2**40
Expand All @@ -314,8 +320,22 @@ async def fetch_metrics():
"partially_saturated": 0,
"saturated": 0,
"paused_or_retiring": 1,
"paused": 1,
"retiring": 0,
}

sa.status = Status.stopping
while sa.status != Status.stopping:
await asyncio.sleep(0.01)

assert await fetch_metrics() == {
"idle": 1,
"partially_saturated": 0,
"saturated": 0,
"paused_or_retiring": 1,
"paused": 0,
"retiring": 1,
}
await ev.set()


Expand Down