From e81b867fed407a5cbe380ccdd6edc756b804cb0b Mon Sep 17 00:00:00 2001
From: Tycho Andersen
Date: Tue, 9 Apr 2024 08:35:08 -0600
Subject: [PATCH] container: always wait for containerd's deletion API call

On a heavily loaded host, we were experiencing long container(d) deletion
times:

    containerd[8907]: time="2024-03-25T13:47:45.938479195Z" level=debug msg="event forwarded" ns=moby topic=/tasks/exit type=containerd.events.TaskExit
    # our control plane logic deletes the successfully exited container via
    # the docker API, and...
    containerd[8907]: time="2024-03-25T13:47:47.202055216Z" level=debug msg="failed to delete task" error="context deadline exceeded" id=a618057629b35e3bfea82d5ce4cbb057ba979498496428dfe6935a1322b94add

Before 4bafaa00aa81 ("Refactor libcontainerd to minimize c8d RPCs"), when
this happens, the docker API reports a 255 exit code and no error:

    0a7ddd027c0497d5a titus-executor-[900884]: Processing msg from a docker: main container exited with code 255

which is especially confusing. After 4bafaa00aa81, the behavior has changed
to report the container's real exit code, although there is still a
hard-coded timeout after which containerd will (try to) stop cleaning up.

After some discussion, it seems best to explicitly wait for as long as this
takes so we do not leak containerd resources. So let's change this 30s
timeout to wait forever.

Reported-by: Hechao Li
Signed-off-by: Tycho Andersen
---
 daemon/monitor.go | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/daemon/monitor.go b/daemon/monitor.go
index 65c82fdf8597a..ae45396e76f3d 100644
--- a/daemon/monitor.go
+++ b/daemon/monitor.go
@@ -39,9 +39,8 @@ func (daemon *Daemon) handleContainerExit(c *container.Container, e *libcontaine
 
 	tsk, ok := c.Task()
 	if ok {
-		ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+		ctx := context.Background()
 		es, err := tsk.Delete(ctx)
-		cancel()
 		if err != nil {
 			log.G(ctx).WithFields(log.Fields{
 				"error": err,