From 61be716cdcf8cd8b880a0af73cce26eb87788e52 Mon Sep 17 00:00:00 2001 From: Wei Fu Date: Wed, 26 Jan 2022 22:26:35 +0800 Subject: [PATCH] oci: use readonly mount to read user/group info In linux kernel, the umount writable-mountpoint will try to do sync-fs to make sure that the dirty pages to the underlying filesystems. The many number of umount actions in the same time maybe introduce performance issue in IOPS limited disk. When CRI-plugin creates container, it will temp-mount rootfs to read that UID/GID info for entrypoint. Basically, the rootfs is writable snapshotter and then after read, umount will invoke sync-fs action. For example, using overlayfs on ext4 and use bcc-tools to monitor ext4_sync_fs call. ``` // uname -a Linux chaofan 5.13.0-27-generic #29~20.04.1-Ubuntu SMP Fri Jan 14 00:32:30 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux // open terminal 1 kubectl run --image=nginx --image-pull-policy=IfNotPresent nginx-pod // open terminal 2 /usr/share/bcc/tools/stackcount ext4_sync_fs -i 1 -v -P ext4_sync_fs sync_filesystem ovl_sync_fs __sync_filesystem sync_filesystem generic_shutdown_super kill_anon_super deactivate_locked_super deactivate_super cleanup_mnt __cleanup_mnt task_work_run exit_to_user_mode_prepare syscall_exit_to_user_mode do_syscall_64 entry_SYSCALL_64_after_hwframe syscall.Syscall.abi0 github.com/containerd/containerd/mount.unmount github.com/containerd/containerd/mount.UnmountAll github.com/containerd/containerd/mount.WithTempMount.func2 github.com/containerd/containerd/mount.WithTempMount github.com/containerd/containerd/oci.WithUserID.func1 github.com/containerd/containerd/oci.WithUser.func1 github.com/containerd/containerd/oci.ApplyOpts github.com/containerd/containerd.WithSpec.func1 github.com/containerd/containerd.(*Client).NewContainer github.com/containerd/containerd/pkg/cri/server.(*criService).CreateContainer github.com/containerd/containerd/pkg/cri/server.(*instrumentedService).CreateContainer k8s.io/cri-api/pkg/apis/runtime/v1._RuntimeService_CreateContainer_Handler.func1 github.com/containerd/containerd/services/server.unaryNamespaceInterceptor github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1.1 github.com/grpc-ecosystem/go-grpc-prometheus.(*ServerMetrics).UnaryServerInterceptor.func1 github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1.1 go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc.UnaryServerInterceptor.func1 github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1.1 github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1 k8s.io/cri-api/pkg/apis/runtime/v1._RuntimeService_CreateContainer_Handler google.golang.org/grpc.(*Server).processUnaryRPC google.golang.org/grpc.(*Server).handleStream google.golang.org/grpc.(*Server).serveStreams.func1.2 runtime.goexit.abi0 containerd [34771] 1 ``` If there are comming several create requestes, umount actions might bring high IO pressure on the /var/lib/containerd's underlying disk. After checkout the kernel code[1], the kernel will not call __sync_filesystem if the mount is readonly. Based on this, containerd should use readonly mount to get UID/GID information. Reference: * [1] https://elixir.bootlin.com/linux/v5.13/source/fs/sync.c#L61 Closes: #4604 Signed-off-by: Wei Fu (cherry picked from commit 813a061fe13998dfaaceec6173448d0c12434e7b) Signed-off-by: Qiutong Song --- oci/spec_opts.go | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/oci/spec_opts.go b/oci/spec_opts.go index 5a952f616645..4199a85d9330 100644 --- a/oci/spec_opts.go +++ b/oci/spec_opts.go @@ -590,6 +590,8 @@ func WithUser(userstr string) SpecOpts { if err != nil { return err } + + mounts = tryReadonlyMounts(mounts) return mount.WithTempMount(ctx, mounts, f) default: return fmt.Errorf("invalid USER value %s", userstr) @@ -643,6 +645,8 @@ func WithUserID(uid uint32) SpecOpts { if err != nil { return err } + + mounts = tryReadonlyMounts(mounts) return mount.WithTempMount(ctx, mounts, func(root string) error { user, err := UserFromPath(root, func(u user.User) bool { return u.Uid == int(uid) @@ -692,6 +696,8 @@ func WithUsername(username string) SpecOpts { if err != nil { return err } + + mounts = tryReadonlyMounts(mounts) return mount.WithTempMount(ctx, mounts, func(root string) error { user, err := UserFromPath(root, func(u user.User) bool { return u.Name == username @@ -776,6 +782,8 @@ func WithAdditionalGIDs(userstr string) SpecOpts { if err != nil { return err } + + mounts = tryReadonlyMounts(mounts) return mount.WithTempMount(ctx, mounts, setAdditionalGids) } } @@ -1264,3 +1272,21 @@ func WithDevShmSize(kb int64) SpecOpts { return ErrNoShmMount } } + +// tryReadonlyMounts is used by the options which are trying to get user/group +// information from container's rootfs. Since the option does read operation +// only, this helper will append ReadOnly mount option to prevent linux kernel +// from syncing whole filesystem in umount syscall. +// +// TODO(fuweid): +// +// Currently, it only works for overlayfs. I think we can apply it to other +// kinds of filesystem. Maybe we can return `ro` option by `snapshotter.Mount` +// API, when the caller passes that experimental annotation +// `containerd.io/snapshot/readonly.mount` something like that. +func tryReadonlyMounts(mounts []mount.Mount) []mount.Mount { + if len(mounts) == 1 && mounts[0].Type == "overlay" { + mounts[0].Options = append(mounts[0].Options, "ro") + } + return mounts +}