Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add metric for kernel restarts #1241

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
15 changes: 15 additions & 0 deletions jupyter_server/prometheus/metrics.py
Expand Up @@ -5,7 +5,16 @@
conventions for metrics & labels.
"""

from prometheus_client import Counter

try:
import notebook # type: ignore

if notebook.__name__ != "notebook":
# avoid double-importing myself if nbclassic is shimming jupyter_server into notebook,
# in which case notebook.__name__ will be 'jupyter_server'
_msg = "Not importing jupyter_server metrics under two names"
raise ImportError(_msg)

Check warning on line 17 in jupyter_server/prometheus/metrics.py

View check run for this annotation

Codecov / codecov/patch

jupyter_server/prometheus/metrics.py#L16-L17

Added lines #L16 - L17 were not covered by tests
# Jupyter Notebook also defines these metrics. Re-defining them results in a ValueError.
# Try to de-duplicate by using the ones in Notebook if available.
# See https://github.com/jupyter/jupyter_server/issues/209
Expand Down Expand Up @@ -34,3 +43,9 @@
"counter for how many kernels are running labeled by type",
["type"],
)

# Counts kernel restarts. Each sample is labeled with the kernel's name and
# with what triggered the restart: "user" (an explicit restart request through
# the kernels API handler) or "restarter" (the kernel manager's restart callback).
KERNEL_RESTARTS = Counter(
    name="jupyter_kernel_restarts",
    documentation="counter for how many kernel restarts, labeled by kernel_name and source (user or restarter)",
    labelnames=["kernel_name", "source"],
)
2 changes: 2 additions & 0 deletions jupyter_server/services/kernels/handlers.py
Expand Up @@ -16,6 +16,7 @@
from tornado import web

from jupyter_server.auth import authorized
from jupyter_server.prometheus.metrics import KERNEL_RESTARTS
from jupyter_server.utils import url_escape, url_path_join

from ...base.handlers import APIHandler
Expand Down Expand Up @@ -104,6 +105,7 @@ async def post(self, kernel_id, action):
self.set_status(500)
else:
model = await ensure_async(km.kernel_model(kernel_id))
KERNEL_RESTARTS.labels(kernel_name=model["name"], source="user").inc()
self.write(json.dumps(model, default=json_default))
self.finish()

Expand Down
11 changes: 10 additions & 1 deletion jupyter_server/services/kernels/kernelmanager.py
Expand Up @@ -38,7 +38,7 @@
)

from jupyter_server._tz import isoformat, utcnow
from jupyter_server.prometheus.metrics import KERNEL_CURRENTLY_RUNNING_TOTAL
from jupyter_server.prometheus.metrics import KERNEL_CURRENTLY_RUNNING_TOTAL, KERNEL_RESTARTS
from jupyter_server.utils import ApiPath, import_item, to_os_path


Expand Down Expand Up @@ -179,6 +179,10 @@
# Methods for managing kernels and sessions
# -------------------------------------------------------------------------

def _handle_kernel_restart(self, kernel_id, kernel_name):
    """Record one kernel restart in the ``KERNEL_RESTARTS`` metric.

    Intended to be invoked from a restart callback registered for the kernel.
    The sample is attributed to the ``"restarter"`` source.

    ``kernel_id`` is accepted for the callback's signature but is not used
    here; only ``kernel_name`` ends up as a metric label.
    """
    restart_counter = KERNEL_RESTARTS.labels(
        kernel_name=kernel_name,
        source="restarter",
    )
    restart_counter.inc()

Check warning on line 184 in jupyter_server/services/kernels/kernelmanager.py

View check run for this annotation

Codecov / codecov/patch

jupyter_server/services/kernels/kernelmanager.py#L184

Added line #L184 was not covered by tests

def _handle_kernel_died(self, kernel_id):
"""notice that a kernel died"""
self.log.warning("Kernel %s died, removing from map.", kernel_id)
Expand Down Expand Up @@ -279,6 +283,11 @@
lambda: self._handle_kernel_died(kernel_id),
"dead",
)
# register callback to count restarts
self.add_restart_callback(
kernel_id,
lambda: self._handle_kernel_restart(kernel_id, km.kernel_name),
)

def ports_changed(self, kernel_id):
"""Used by ZMQChannelsHandler to determine how to coordinate nudge and replays.
Expand Down