# distributed/active_memory_manager.py

from __future__ import annotations

import logging
from collections import defaultdict
from collections.abc import Generator
from typing import TYPE_CHECKING

from tornado.ioloop import PeriodicCallback

import dask
from dask.utils import parse_timedelta

from .core import Status
from .metrics import time
from .utils import import_term, log_errors

if TYPE_CHECKING:  # pragma: nocover
    from .client import Client
    from .scheduler import Scheduler, TaskState, WorkerState

logger = logging.getLogger(__name__)


class ActiveMemoryManagerExtension:
    """Scheduler extension that optimizes memory usage across the cluster.
    It can be either triggered by hand or automatically every few seconds; at every
    iteration it performs one or both of the following:

    - create new replicas of in-memory tasks
    - destroy replicas of in-memory tasks; this never destroys the last available copy.

    There are no 'move' operations. A move is performed in two passes: first you create
    a copy and, in the next iteration, you delete the original (if the copy succeeded).

    This extension is configured by the dask config section
    ``distributed.scheduler.active-memory-manager``.
    """

    scheduler: Scheduler
    policies: set[ActiveMemoryManagerPolicy]
    interval: float

    # These attributes only exist within the scope of self.run_once()
    # Current memory (in bytes) allocated on each worker, plus/minus pending actions
    workers_memory: dict[WorkerState, int]
    # Pending replications and deletions for each task
    pending: defaultdict[TaskState, tuple[set[WorkerState], set[WorkerState]]]

    def __init__(
        self,
        scheduler: Scheduler,
        # The following parameters are exposed so that one may create, run, and throw
        # away on the fly a specialized manager, separate from the main one.
        policies: set[ActiveMemoryManagerPolicy] | None = None,
        *,
        register: bool = True,
        start: bool | None = None,
        interval: float | None = None,
    ):
        """
        Parameters
        ----------
        scheduler:
            Scheduler this manager operates on
        policies:
            Policies to attach. If omitted, they are built from the dask config key
            ``distributed.scheduler.active-memory-manager.policies``: each entry
            must have a ``class`` item, which is resolved with ``import_term``; all
            other items are passed to the class as keyword arguments.
        register:
            If True, store this object as ``scheduler.extensions["amm"]`` and expose
            ``amm_handler`` through ``scheduler.handlers``
        start:
            Whether to immediately start the periodic execution. If omitted, read
            ``distributed.scheduler.active-memory-manager.start`` from the dask
            config.
        interval:
            Time, in seconds, between periodic runs. If omitted, read
            ``distributed.scheduler.active-memory-manager.interval`` from the dask
            config.
        """
        self.scheduler = scheduler
        self.policies = set()

        if policies is None:
            # Initialize policies from config
            policies = set()
            for kwargs in dask.config.get(
                "distributed.scheduler.active-memory-manager.policies"
            ):
                # Don't alter the config when popping the class name
                kwargs = kwargs.copy()
                cls = import_term(kwargs.pop("class"))
                policies.add(cls(**kwargs))

        for policy in policies:
            self.add_policy(policy)

        if register:
            scheduler.extensions["amm"] = self
            scheduler.handlers["amm_handler"] = self.amm_handler

        if interval is None:
            interval = parse_timedelta(
                dask.config.get("distributed.scheduler.active-memory-manager.interval")
            )
        self.interval = interval
        if start is None:
            start = dask.config.get("distributed.scheduler.active-memory-manager.start")
        if start:
            self.start()

    def amm_handler(self, comm, method: str):
        """Scheduler handler, invoked from the Client by
        :class:`~distributed.active_memory_manager.AMMClientProxy`

        ``method`` must be one of ``start``, ``stop``, ``run_once``, ``running``.
        Methods are invoked without arguments and their output is returned;
        non-callable attributes (the ``running`` property) are returned as they are.
        """
        assert method in {"start", "stop", "run_once", "running"}
        out = getattr(self, method)
        return out() if callable(out) else out

    def start(self) -> None:
        """Start executing every ``self.interval`` seconds until scheduler shutdown"""
        if self.running:
            return
        # self.interval is in seconds; it is multiplied by 1000 for PeriodicCallback
        pc = PeriodicCallback(self.run_once, self.interval * 1000.0)
        # The key embeds id(self) so that multiple managers can coexist on the
        # same scheduler
        self.scheduler.periodic_callbacks[f"amm-{id(self)}"] = pc
        pc.start()

    def stop(self) -> None:
        """Stop periodic execution. Do nothing if the AMM is not running."""
        pc = self.scheduler.periodic_callbacks.pop(f"amm-{id(self)}", None)
        if pc:
            pc.stop()

    @property
    def running(self) -> bool:
        """Return True if the AMM is being triggered periodically; False otherwise"""
        return f"amm-{id(self)}" in self.scheduler.periodic_callbacks

    def add_policy(self, policy: ActiveMemoryManagerPolicy) -> None:
        """Attach a policy to this manager and set its ``manager`` attribute.

        Raises
        ------
        TypeError
            If ``policy`` is not an instance of ActiveMemoryManagerPolicy
        """
        if not isinstance(policy, ActiveMemoryManagerPolicy):
            raise TypeError(f"Expected ActiveMemoryManagerPolicy; got {policy!r}")
        self.policies.add(policy)
        policy.manager = self

    def run_once(self) -> None:
        """Run all policies once and asynchronously (fire and forget) enact their
        recommendations to replicate/drop keys
        """
        with log_errors():
            # This should never fail since this is a synchronous method
            assert not hasattr(self, "pending")

            self.pending = defaultdict(lambda: (set(), set()))
            self.workers_memory = {
                w: w.memory.optimistic for w in self.scheduler.workers.values()
            }
            try:
                # populate self.pending
                self._run_policies()

                if self.pending:
                    logger.debug("Enacting suggestions for %d keys", len(self.pending))
                    self._enact_suggestions()
            finally:
                # Temporary state must not outlive this iteration, even on error
                del self.workers_memory
                del self.pending

    def _run_policies(self) -> None:
        """Sequentially run ActiveMemoryManagerPolicy.run() for all registered policies,
        obtain replicate/drop suggestions, and use them to populate self.pending.
        """
        candidates: set[WorkerState] | None
        cmd: str
        ws: WorkerState | None
        ts: TaskState

        for policy in list(self.policies):  # a policy may remove itself
            policy_gen = policy.run()
            # The first send() to a generator must be None; afterwards ws is the
            # outcome of the previous suggestion (the chosen worker, or None if the
            # suggestion was rejected).
            ws = None
            while True:
                try:
                    cmd, ts, candidates = policy_gen.send(ws)
                except StopIteration:
                    break  # next policy

                pending_repl, pending_drop = self.pending[ts]

                if cmd == "replicate":
                    ws = self._find_recipient(ts, candidates, pending_repl)
                    if ws:
                        pending_repl.add(ws)
                        self.workers_memory[ws] += ts.nbytes

                elif cmd == "drop":
                    ws = self._find_dropper(ts, candidates, pending_drop)
                    if ws:
                        pending_drop.add(ws)
                        # Never let the projected memory go negative
                        self.workers_memory[ws] = max(
                            0, self.workers_memory[ws] - ts.nbytes
                        )

                else:
                    raise ValueError(f"Unknown command: {cmd}")  # pragma: nocover

    def _find_recipient(
        self,
        ts: TaskState,
        candidates: set[WorkerState] | None,
        pending_repl: set[WorkerState],
    ) -> WorkerState | None:
        """Choose a worker to acquire a new replica of an in-memory task among a set of
        candidates. If candidates is None, default to all running workers in the
        cluster. Regardless, workers that either already hold a replica or are
        scheduled to receive one at the end of this AMM iteration are not considered.

        The ``candidates`` set passed by the caller is never modified.

        Returns
        -------
        The worker with the lowest memory usage (downstream of pending replications and
        drops), or None if no eligible candidates are available.
        """
        if ts.state != "memory":
            return None
        if candidates is None:
            candidates = self.scheduler.running.copy()
        else:
            # Build a new set instead of using &=: the original object belongs to
            # the policy, which may reuse it across multiple suggestions.
            candidates = candidates & self.scheduler.running

        # From here on, candidates is a private copy that is safe to mutate
        candidates -= ts.who_has
        candidates -= pending_repl
        if not candidates:
            return None

        # Select candidate with the lowest memory usage
        return min(candidates, key=self.workers_memory.__getitem__)

    def _find_dropper(
        self,
        ts: TaskState,
        candidates: set[WorkerState] | None,
        pending_drop: set[WorkerState],
    ) -> WorkerState | None:
        """Choose a worker to drop its replica of an in-memory task among a set of
        candidates. If candidates is None, default to all workers that hold a replica.
        Regardless, workers that either do not hold a replica, are already scheduled
        to drop theirs at the end of this AMM iteration, or have a dependent task
        assigned to them are not considered.
        This method also ensures that a key will not lose its last replica.

        The ``candidates`` set passed by the caller is never modified.

        Returns
        -------
        The worker with the highest memory usage (downstream of pending replications and
        drops), or None if no eligible candidates are available.
        """
        # Always leave at least one replica after all pending drops are enacted
        if len(ts.who_has) - len(pending_drop) < 2:
            return None
        if candidates is None:
            candidates = ts.who_has.copy()
        else:
            # Build a new set instead of using &=: the original object belongs to
            # the policy, which may reuse it across multiple suggestions.
            candidates = candidates & ts.who_has

        # From here on, candidates is a private copy that is safe to mutate
        candidates -= pending_drop
        # Don't drop from workers where a dependent task has been assigned
        candidates -= {waiter_ts.processing_on for waiter_ts in ts.waiters}
        if not candidates:
            return None

        # Select candidate with the highest memory usage.
        # Drop from workers whose status is not running first.
        return max(
            candidates,
            key=lambda ws: (ws.status != Status.running, self.workers_memory[ws]),
        )

    def _enact_suggestions(self) -> None:
        """Iterate through self.pending, which was filled by self._run_policies(), and
        push the suggestions to the workers through bulk comms. Return immediately.
        """
        drop_by_worker: defaultdict[WorkerState, set[TaskState]] = defaultdict(set)
        repl_by_worker: (
            defaultdict[WorkerState, dict[TaskState, set[str]]]
        ) = defaultdict(dict)

        for ts, (pending_repl, pending_drop) in self.pending.items():
            if not ts.who_has:
                continue
            # Recipients must only fetch from workers that will retain their replica
            who_has = {ws_snd.address for ws_snd in ts.who_has - pending_drop}
            assert who_has  # Never drop the last replica
            for ws_rec in pending_repl:
                assert ws_rec not in ts.who_has
                repl_by_worker[ws_rec][ts] = who_has
            for ws in pending_drop:
                assert ws in ts.who_has
                drop_by_worker[ws].add(ts)

        # Fire-and-forget enact recommendations from policies
        stimulus_id = str(time())
        for ws_rec, ts_to_who_has in repl_by_worker.items():
            self.scheduler.stream_comms[ws_rec.address].send(
                {
                    "op": "acquire-replicas",
                    "keys": [ts.key for ts in ts_to_who_has],
                    "stimulus_id": "acquire-replicas-" + stimulus_id,
                    "priorities": {ts.key: ts.priority for ts in ts_to_who_has},
                    "who_has": {ts.key: v for ts, v in ts_to_who_has.items()},
                },
            )

        for ws, tss in drop_by_worker.items():
            # The scheduler immediately forgets about the replica and suggests the
            # worker to drop it. The worker may refuse, at which point it will send back
            # an add-keys message to reinstate it.
            for ts in tss:
                self.scheduler.remove_replica(ts, ws)
            self.scheduler.stream_comms[ws.address].send(
                {
                    "op": "remove-replicas",
                    "keys": [ts.key for ts in tss],
                    "stimulus_id": "remove-replicas-" + stimulus_id,
                }
            )


class ActiveMemoryManagerPolicy:
    """Base class of all Active Memory Manager policies.

    Subclasses must override :meth:`run`.
    """

    # Set by ActiveMemoryManagerExtension.add_policy()
    manager: ActiveMemoryManagerExtension

    def __repr__(self) -> str:
        cls_name = type(self).__name__
        return f"{cls_name}()"

    def run(
        self,
    ) -> Generator[
        tuple[str, TaskState, set[WorkerState] | None],
        WorkerState | None,
        None,
    ]:
        """Produce the suggestions of this policy. The ActiveMemoryManager calls this
        method every few seconds, as well as whenever the user invokes
        ``client.amm.run_once``.

        Implementations are generators that yield any number of the following tuples:

        - "replicate", <TaskState>, None
        - "replicate", <TaskState>, {subset of potential workers to replicate to}
        - "drop", <TaskState>, None
        - "drop", <TaskState>, {subset of potential workers to drop from}

        Every tuple asks to create or destroy exactly one replica of a key. When the
        subset of workers is None, all workers on the cluster are eligible. A request
        is not binding: the ActiveMemoryManager or the Worker may later disregard it,
        e.g. because it would delete the last copy of a key or because the key is
        currently needed on that worker.

        The worker that was picked for the replication or the drop can be optionally
        obtained from the yield expression:

        .. code-block:: python

           choice = (yield "replicate", ts, None)

        ``choice`` is a WorkerState, or None if the ActiveMemoryManager chose to
        disregard the request.

        ``self.manager.pending`` exposes the commands that were accepted so far,
        including those previously yielded by this same method.

        ``self.manager.workers_memory`` exposes the memory usage of each worker,
        *downstream of all pending commands*.
        """
        raise NotImplementedError("Virtual method")  # pragma: nocover


class AMMClientProxy:
    """Thin client-side facade to control the AMM running on the scheduler

    Usage: ``client.amm.start()`` etc.

    Every method is forwarded through ``Client.sync``, so it is asynchronous if the
    client is asynchronous and synchronous if the client is synchronous.
    """

    _client: Client

    def __init__(self, client: Client):
        self._client = client

    def _run(self, method: str):
        """Remotely invoke ActiveMemoryManagerExtension.amm_handler, asking it to
        execute ``method``, and return whatever ``Client.sync`` returns
        """
        client = self._client
        return client.sync(client.scheduler.amm_handler, method=method)

    def start(self):
        """Remote counterpart of ActiveMemoryManagerExtension.start"""
        return self._run("start")

    def stop(self):
        """Remote counterpart of ActiveMemoryManagerExtension.stop"""
        return self._run("stop")

    def run_once(self):
        """Remote counterpart of ActiveMemoryManagerExtension.run_once"""
        return self._run("run_once")

    def running(self):
        """Remote counterpart of ActiveMemoryManagerExtension.running"""
        return self._run("running")


class ReduceReplicas(ActiveMemoryManagerPolicy):
    """Policy that trims in-memory tasks held by more workers than desired by
    suggesting to drop the replicas in excess.
    """

    def run(self):
        """Yield one ``"drop", ts, None`` suggestion for every replica in excess of
        each task in ``scheduler.replicated_tasks``.
        """
        total_drops = 0
        affected_keys = 0

        for ts in self.manager.scheduler.replicated_tasks:
            min_replicas = 1  # TODO have a marker on TaskState

            # Count the consumers of this key. Dependents that are already running
            # on the same worker collapse into a single entry; dependents that were
            # not assigned to a worker yet count individually, to err on the side of
            # caution and preserve an additional replica for each of them.
            consumers = {dep.processing_on or dep for dep in ts.waiters}

            excess = len(ts.who_has) - max(min_replicas, len(consumers))
            # Account for what other suggestions already accepted in this iteration.
            # The membership test avoids adding an entry to the defaultdict.
            if ts in self.manager.pending:
                to_acquire, to_release = self.manager.pending[ts]
                excess += len(to_acquire) - len(to_release)

            if excess <= 0:
                continue

            affected_keys += 1
            total_drops += excess
            # Explicit loop: the manager drives this generator with send()
            for _ in range(excess):
                yield "drop", ts, None

        if total_drops:
            logger.debug(
                "Dropping %d superfluous replicas of %d tasks",
                total_drops,
                affected_keys,
            )


class RetireWorker(ActiveMemoryManagerPolicy):
    """Replicate somewhere else all unique keys on a worker, preparing for its shutdown.
    Once the worker has been retired, this policy automatically removes itself from the
    Active Memory Manager it's attached to.

    **Retiring a worker with spilled keys**

    On its very first iteration, this policy suggests other workers to fetch all unique
    in-memory tasks. Frequently, this means that in the next few moments the worker to
    be retired will be bombarded with ``Worker.get_data`` calls from the rest of the
    cluster. This can be a problem if most of the managed memory of the worker has been
    spilled out, as it could send the worker above the terminate threshold.
    Two things are in place in order to avoid this:

    1. At every iteration, this policy drops all keys that have already been replicated
       somewhere else. This makes room for further keys to be moved out of the spill
       file in order to be replicated onto another worker.
    2. Once a worker passes the ``pause`` threshold, ``Worker.get_data`` throttles the
       number of outgoing connections to 1.
    """

    # Address of the worker to be retired
    address: str

    def __init__(self, address: str):
        self.address = address

    def __repr__(self) -> str:
        # ``done`` is a method and must be called. Formatting the bound method
        # itself would render the repr of ``self`` and recurse infinitely.
        # ``manager`` is only set once the policy is attached to an AMM, and done()
        # needs it; repr() must not fail on a detached policy.
        if hasattr(self, "manager"):
            return f"RetireWorker({self.address}, done={self.done()})"
        return f"RetireWorker({self.address})"

    def run(self):
        """Suggest to replicate elsewhere all keys that exist exclusively on the
        retiring worker, and to drop from it all keys that already have another
        replica. If the worker is no longer known to the scheduler, remove this policy
        from the manager instead.
        """
        ws = self.manager.scheduler.workers.get(self.address)
        if ws is None:
            self.manager.policies.remove(self)
            return

        n_repl = 0  # unique keys that need to be moved away
        n_no_rec = 0  # unique keys for which no recipient was found

        for ts in ws.has_what:
            if len(ts.who_has) == 1:
                n_repl += 1
                pending_repl, _ = self.manager.pending[ts]
                # Don't ask twice if a replication was already accepted in this
                # iteration
                if not pending_repl:
                    rec_ws = (yield "replicate", ts, None)
                    if not rec_ws:
                        # replication was rejected by the AMM (see _find_recipient)
                        n_no_rec += 1
            else:
                # This may be rejected by either the AMM (see _find_dropper) or by the
                # Worker; e.g. a running task may depend on this key. If so we'll try
                # again at the next iteration. Anyway, this is just to allow spilled
                # keys to be moved back into memory and not mandatory for retirement.
                yield "drop", ts, {ws}

        if n_no_rec:
            logger.warning(
                f"Retiring worker {self.address}; {n_repl - n_no_rec} keys are being "
                f"moved away while {n_no_rec} keys won't be moved for now as there are "
                "no suitable workers to receive them; this typically happens when this "
                "is the only worker on the cluster or all other workers are either "
                "paused or are being shut down themselves. Trying again later..."
            )
        elif n_repl:
            logger.info(
                f"Retiring worker {self.address}; {n_repl} keys are being moved away.",
            )

    def done(self) -> bool:
        """Return True if it is safe to close the worker down, or False otherwise. True
        doesn't necessarily mean that run() won't issue any more suggestions - it could
        continue issuing ``drop`` suggestions afterwards.
        """
        ws = self.manager.scheduler.workers.get(self.address)
        if ws is None:
            return True
        # Safe only if every key on the worker has at least one other replica
        return all(len(ts.who_has) > 1 for ts in ws.has_what)