From 23160275bc95fefa90bfa58d487d7b081aa57928 Mon Sep 17 00:00:00 2001 From: Qiutong Song Date: Mon, 3 Oct 2022 12:12:40 +0000 Subject: [PATCH] Persist container and sandbox if resource cleanup fails, like teardownPodNetwork Signed-off-by: Qiutong Song --- .../container_update_resources_other.go | 24 ++- .../container_update_resources_windows.go | 24 ++- pkg/cri/server/sandbox_run.go | 160 +++++++++++------- pkg/cri/server/sandbox_run_linux.go | 9 + pkg/cri/server/sandbox_run_other.go | 3 + pkg/cri/server/sandbox_run_windows.go | 4 + pkg/cri/store/sandbox/status.go | 36 ++-- 7 files changed, 180 insertions(+), 80 deletions(-) diff --git a/pkg/cri/server/container_update_resources_other.go b/pkg/cri/server/container_update_resources_other.go index 38dc89dc70016..2eb5b81f7d677 100644 --- a/pkg/cri/server/container_update_resources_other.go +++ b/pkg/cri/server/container_update_resources_other.go @@ -20,8 +20,13 @@ package server import ( + "context" + + "github.com/containerd/containerd" + "github.com/containerd/containerd/containers" + "github.com/containerd/typeurl" + runtimespec "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" - "golang.org/x/net/context" runtime "k8s.io/cri-api/pkg/apis/runtime/v1alpha2" containerstore "github.com/containerd/containerd/pkg/cri/store/container" @@ -43,3 +48,20 @@ func (c *criService) UpdateContainerResources(ctx context.Context, r *runtime.Up } return &runtime.UpdateContainerResourcesResponse{}, nil } + +// updateContainerSpec updates container spec. +// Copied from container_update_resources_linux.go because it only builds on Linux +// updateContainerSpec updates container spec. +func updateContainerSpec(ctx context.Context, cntr containerd.Container, spec *runtimespec.Spec) error { + any, err := typeurl.MarshalAny(spec) + if err != nil { + return errors.Wrapf(err, "failed to marshal spec %+v", spec) + } + if err := cntr.Update(ctx, func(ctx context.Context, client *containerd.Client, c *containers.Container) error { + c.Spec = any + return nil + }); err != nil { + return errors.Wrap(err, "failed to update container spec") + } + return nil +} diff --git a/pkg/cri/server/container_update_resources_windows.go b/pkg/cri/server/container_update_resources_windows.go index a7cb2489b3abe..747af0084649f 100644 --- a/pkg/cri/server/container_update_resources_windows.go +++ b/pkg/cri/server/container_update_resources_windows.go @@ -20,8 +20,14 @@ package server import ( + "context" + + "github.com/containerd/containerd" + "github.com/containerd/containerd/containers" "github.com/containerd/containerd/errdefs" - "golang.org/x/net/context" + "github.com/containerd/typeurl" + runtimespec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/pkg/errors" runtime "k8s.io/cri-api/pkg/apis/runtime/v1alpha2" ) @@ -30,3 +36,19 @@ import ( func (c *criService) UpdateContainerResources(ctx context.Context, r *runtime.UpdateContainerResourcesRequest) (*runtime.UpdateContainerResourcesResponse, error) { return nil, errdefs.ErrNotImplemented } + +// updateContainerSpec updates container spec. +// Copied from container_update_resources_linux.go because it only builds on Linux +func updateContainerSpec(ctx context.Context, cntr containerd.Container, spec *runtimespec.Spec) error { + any, err := typeurl.MarshalAny(spec) + if err != nil { + return errors.Wrapf(err, "failed to marshal spec %+v", spec) + } + if err := cntr.Update(ctx, func(ctx context.Context, client *containerd.Client, c *containers.Container) error { + c.Spec = any + return nil + }); err != nil { + return errors.Wrap(err, "failed to update container spec") + } + return nil +} diff --git a/pkg/cri/server/sandbox_run.go b/pkg/cri/server/sandbox_run.go index 642cf40824833..a84078f171c4d 100644 --- a/pkg/cri/server/sandbox_run.go +++ b/pkg/cri/server/sandbox_run.go @@ -18,6 +18,7 @@ package server import ( "encoding/json" + "fmt" "math" "path/filepath" goruntime "runtime" @@ -67,15 +68,23 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox return nil, errors.New("sandbox config must include metadata") } name := makeSandboxName(metadata) - log.G(ctx).Debugf("Generated id %q for sandbox %q", id, name) + log.G(ctx).WithField("podsandboxid", id).Debugf("generated id for sandbox name %q", name) + + // cleanupErr records the last error returned by the critical cleanup operations in deferred functions, + // like CNI teardown and stopping the running sandbox task. + // If cleanup is not completed for some reason, the CRI-plugin will leave the sandbox + // in a not-ready state, which can later be cleaned up by the next execution of the kubelet's syncPod workflow. + var cleanupErr error + // Reserve the sandbox name to avoid concurrent `RunPodSandbox` request starting the // same sandbox. if err := c.sandboxNameIndex.Reserve(name, id); err != nil { return nil, errors.Wrapf(err, "failed to reserve sandbox name %q", name) } defer func() { - // Release the name if the function returns with an error. - if retErr != nil { + // Release the name if the function returns with an error and all the resource cleanup is done. + // When cleanupErr != nil, the name will be cleaned in sandbox_remove. + if retErr != nil && cleanupErr == nil { c.sandboxNameIndex.ReleaseByName(name) } }() @@ -109,61 +118,12 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox } log.G(ctx).Debugf("Use OCI %+v for sandbox %q", ociRuntime, id) - podNetwork := true - // Pod network is always needed on windows. - if goruntime.GOOS != "windows" && - config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetNetwork() == runtime.NamespaceMode_NODE { - // Pod network is not needed on linux with host network. - podNetwork = false - } - if podNetwork { - // If it is not in host network namespace then create a namespace and set the sandbox - // handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network - // namespaces. If the pod is in host network namespace then both are empty and should not - // be used. - var netnsMountDir = "/var/run/netns" - if c.config.NetNSMountsUnderStateDir { - netnsMountDir = filepath.Join(c.config.StateDir, "netns") - } - sandbox.NetNS, err = netns.NewNetNS(netnsMountDir) - if err != nil { - return nil, errors.Wrapf(err, "failed to create network namespace for sandbox %q", id) - } - sandbox.NetNSPath = sandbox.NetNS.GetPath() - defer func() { - if retErr != nil { - deferCtx, deferCancel := ctrdutil.DeferContext() - defer deferCancel() - // Teardown network if an error is returned. - if err := c.teardownPodNetwork(deferCtx, sandbox); err != nil { - log.G(ctx).WithError(err).Errorf("Failed to destroy network for sandbox %q", id) - } - - if err := sandbox.NetNS.Remove(); err != nil { - log.G(ctx).WithError(err).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id) - } - sandbox.NetNSPath = "" - } - }() - - // Setup network for sandbox. - // Certain VM based solutions like clear containers (Issue containerd/cri-containerd#524) - // rely on the assumption that CRI shim will not be querying the network namespace to check the - // network states such as IP. - // In future runtime implementation should avoid relying on CRI shim implementation details. - // In this case however caching the IP will add a subtle performance enhancement by avoiding - // calls to network namespace of the pod to query the IP of the veth interface on every - // SandboxStatus request. - if err := c.setupPodNetwork(ctx, &sandbox); err != nil { - return nil, errors.Wrapf(err, "failed to setup network for sandbox %q", id) - } - } - // Create sandbox container. // NOTE: sandboxContainerSpec SHOULD NOT have side // effect, e.g. accessing/creating files, so that we can test // it safely. - spec, err := c.sandboxContainerSpec(id, config, &image.ImageSpec.Config, sandbox.NetNSPath, ociRuntime.PodAnnotations) + // NOTE: the network namespace path will be created later and update through updateNetNamespacePath function + spec, err := c.sandboxContainerSpec(id, config, &image.ImageSpec.Config, "", ociRuntime.PodAnnotations) if err != nil { return nil, errors.Wrap(err, "failed to generate sandbox container spec") } @@ -212,12 +172,27 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox if err != nil { return nil, errors.Wrap(err, "failed to create containerd container") } + + // Add container into sandbox store in INIT state. + sandbox.Container = container + defer func() { - if retErr != nil { + // Put the sandbox into sandbox store when the some resource fails to be cleaned. + if retErr != nil && cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Errorf("encountered an error cleaning up failed sandbox %q, marking sandbox state as SANDBOX_UNKNOWN", id) + if err := c.sandboxStore.Add(sandbox); err != nil { + log.G(ctx).WithError(err).Errorf("failed to add sandbox %+v into store", sandbox) + } + } + }() + + defer func() { + // Delete container only if all the resource cleanup is done. + if retErr != nil && cleanupErr == nil { deferCtx, deferCancel := ctrdutil.DeferContext() defer deferCancel() - if err := container.Delete(deferCtx, containerd.WithSnapshotCleanup); err != nil { - log.G(ctx).WithError(err).Errorf("Failed to delete containerd container %q", id) + if cleanupErr = container.Delete(deferCtx, containerd.WithSnapshotCleanup); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Errorf("Failed to delete containerd container %q", id) } } }() @@ -271,6 +246,73 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox return nil, errors.Wrap(err, "failed to get sandbox container info") } + podNetwork := true + + if goruntime.GOOS != "windows" && + config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetNetwork() == runtime.NamespaceMode_NODE { + // Pod network is not needed on linux with host network. + podNetwork = false + } + + if podNetwork { + // If it is not in host network namespace then create a namespace and set the sandbox + // handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network + // namespaces. If the pod is in host network namespace then both are empty and should not + // be used. + var netnsMountDir = "/var/run/netns" + if c.config.NetNSMountsUnderStateDir { + netnsMountDir = filepath.Join(c.config.StateDir, "netns") + } + sandbox.NetNS, err = netns.NewNetNS(netnsMountDir) + if err != nil { + return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err) + } + sandbox.NetNSPath = sandbox.NetNS.GetPath() + + defer func() { + // Remove the network namespace only if all the resource cleanup is done. + if retErr != nil && cleanupErr == nil { + if cleanupErr = sandbox.NetNS.Remove(); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id) + return + } + sandbox.NetNSPath = "" + } + }() + + // Update network namespace in the container's spec + c.updateNetNamespacePath(spec, sandbox.NetNSPath) + if err := updateContainerSpec(ctx, container, spec); err != nil { + return nil, fmt.Errorf("failed to update the network namespace for the sandbox container %q: %w", id, err) + } + + // Define this defer to teardownPodNetwork prior to the setupPodNetwork function call. + // This is because in setupPodNetwork the resource is allocated even if it returns error, unlike other resource creation functions. + defer func() { + // Teardown the network only if all the resource cleanup is done. + if retErr != nil && cleanupErr == nil { + deferCtx, deferCancel := ctrdutil.DeferContext() + defer deferCancel() + // Teardown network if an error is returned. + if cleanupErr = c.teardownPodNetwork(deferCtx, sandbox); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Errorf("Failed to destroy network for sandbox %q", id) + } + } + }() + + // Setup network for sandbox. + // Certain VM based solutions like clear containers (Issue containerd/cri-containerd#524) + // rely on the assumption that CRI shim will not be querying the network namespace to check the + // network states such as IP. + // In future runtime implementation should avoid relying on CRI shim implementation details. + // In this case however caching the IP will add a subtle performance enhancement by avoiding + // calls to network namespace of the pod to query the IP of the veth interface on every + // SandboxStatus request. + if err := c.setupPodNetwork(ctx, &sandbox); err != nil { + return nil, fmt.Errorf("failed to setup network for sandbox %q: %w", id, err) + } + } + // Create sandbox task in containerd. log.G(ctx).Tracef("Create sandbox container (id=%q, name=%q).", id, name) @@ -288,6 +330,7 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox // Cleanup the sandbox container if an error is returned. if _, err := task.Delete(deferCtx, WithNRISandboxDelete(id), containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) { log.G(ctx).WithError(err).Errorf("Failed to delete sandbox container %q", id) + cleanupErr = err } } }() @@ -326,9 +369,6 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox return nil, errors.Wrap(err, "failed to update sandbox status") } - // Add sandbox into sandbox store in INIT state. - sandbox.Container = container - if err := c.sandboxStore.Add(sandbox); err != nil { return nil, errors.Wrapf(err, "failed to add sandbox %+v into store", sandbox) } diff --git a/pkg/cri/server/sandbox_run_linux.go b/pkg/cri/server/sandbox_run_linux.go index 85cfeb37574a7..634dc6e27e812 100644 --- a/pkg/cri/server/sandbox_run_linux.go +++ b/pkg/cri/server/sandbox_run_linux.go @@ -320,3 +320,12 @@ func (c *criService) taskOpts(runtimeType string) []containerd.NewTaskOpts { return taskOpts } + +func (c *criService) updateNetNamespacePath(spec *runtimespec.Spec, nsPath string) { + for i := range spec.Linux.Namespaces { + if spec.Linux.Namespaces[i].Type == runtimespec.NetworkNamespace { + spec.Linux.Namespaces[i].Path = nsPath + break + } + } +} diff --git a/pkg/cri/server/sandbox_run_other.go b/pkg/cri/server/sandbox_run_other.go index e4e876c012d66..59ed5ef22e821 100644 --- a/pkg/cri/server/sandbox_run_other.go +++ b/pkg/cri/server/sandbox_run_other.go @@ -54,3 +54,6 @@ func (c *criService) cleanupSandboxFiles(id string, config *runtime.PodSandboxCo func (c *criService) taskOpts(runtimeType string) []containerd.NewTaskOpts { return []containerd.NewTaskOpts{} } + +func (c *criService) updateNetNamespacePath(spec *runtimespec.Spec, nsPath string) { +} diff --git a/pkg/cri/server/sandbox_run_windows.go b/pkg/cri/server/sandbox_run_windows.go index c165344b392f4..06709ff5c6979 100644 --- a/pkg/cri/server/sandbox_run_windows.go +++ b/pkg/cri/server/sandbox_run_windows.go @@ -92,3 +92,7 @@ func (c *criService) cleanupSandboxFiles(id string, config *runtime.PodSandboxCo func (c *criService) taskOpts(runtimeType string) []containerd.NewTaskOpts { return nil } + +func (c *criService) updateNetNamespacePath(spec *runtimespec.Spec, nsPath string) { + spec.Windows.Network.NetworkNamespace = nsPath +} diff --git a/pkg/cri/store/sandbox/status.go b/pkg/cri/store/sandbox/status.go index e9198eb976a7c..787d31968ba0e 100644 --- a/pkg/cri/store/sandbox/status.go +++ b/pkg/cri/store/sandbox/status.go @@ -29,24 +29,24 @@ import ( // | | // | Create(Run) | Load // | | -// Start | | -// (failed) | | -// +------------------+ +-----------+ -// | | | | -// | | | | -// | | | | -// | | Start(Run) | | -// | | | | -// | PortForward +----v----+ | | -// | +------+ | | | -// | | | READY <---------+ | -// | +------> | | | -// | +----+----+ | | -// | | | | -// | | Stop/Exit | | -// | | | | -// | +----v----+ | | -// | | <---------+ +----v----+ +// | | +// | | Start +// | |(failed and not cleaned) +// Start |--------------|--------------+ +//(failed but cleaned)| | | +// +------------------+ |-----------+ | +// | | Start(Run) | | | +// | | | | | +// | PortForward +----v----+ | | | +// | +------+ | | | | +// | | | READY <---------+ | | +// | +------> | | | | +// | +----+----+ | | | +// | | | | | +// | | Stop/Exit | | | +// | | | | | +// | +----v----+ | | | +// | | <---------+ +----v--v-+ // | | NOTREADY| | | // | | <----------------+ UNKNOWN | // | +----+----+ Stop | |