From 87008933ce22961c1ae8a73ebbff67106aa7303d Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Tue, 29 Mar 2022 10:24:56 +0200 Subject: [PATCH] overlay: use idmapped lower layers where supported use idmapped mounts for the overlay lower layers when the kernel supports them. For each lower directory with ID=0...N-1, it creates a idmapped mount at $GRAPHROOT/overlay/$LAYER/mapped/$ID. The final overlay mount will use these idmapped mounts instead of the original source directory. The upperdir is not idmapped, so files are created with the same IDs used by the user namespace. Signed-off-by: Giuseppe Scrivano --- drivers/overlay/check.go | 53 +++++++++ drivers/overlay/idmapped_utils.go | 184 ++++++++++++++++++++++++++++++ drivers/overlay/overlay.go | 115 ++++++++++++++++++- 3 files changed, 351 insertions(+), 1 deletion(-) create mode 100644 drivers/overlay/idmapped_utils.go diff --git a/drivers/overlay/check.go b/drivers/overlay/check.go index 44b3515a85..cb4ca57099 100644 --- a/drivers/overlay/check.go +++ b/drivers/overlay/check.go @@ -1,3 +1,4 @@ +//go:build linux // +build linux package overlay @@ -11,6 +12,7 @@ import ( "syscall" "github.com/containers/storage/pkg/archive" + "github.com/containers/storage/pkg/idtools" "github.com/containers/storage/pkg/ioutils" "github.com/containers/storage/pkg/mount" "github.com/containers/storage/pkg/system" @@ -218,3 +220,54 @@ func doesVolatile(d string) (bool, error) { }() return true, nil } + +// supportsIdmappedLowerLayers checks if the kernel supports mounting overlay on top of +// a idmapped lower layer. +func supportsIdmappedLowerLayers(home string) (bool, error) { + layerDir, err := ioutil.TempDir(home, "compat") + if err != nil { + return false, err + } + + mergedDir := filepath.Join(layerDir, "merged") + lowerDir := filepath.Join(layerDir, "lower") + lowerMappedDir := filepath.Join(layerDir, "lower-mapped") + upperDir := filepath.Join(layerDir, "upper") + workDir := filepath.Join(layerDir, "work") + + defer func() { + _ = unix.Unmount(mergedDir, unix.MNT_DETACH) + _ = os.RemoveAll(layerDir) + }() + + _ = idtools.MkdirAs(mergedDir, 0700, 0, 0) + _ = idtools.MkdirAs(lowerDir, 0700, 0, 0) + _ = idtools.MkdirAs(lowerMappedDir, 0700, 0, 0) + _ = idtools.MkdirAs(upperDir, 0700, 0, 0) + _ = idtools.MkdirAs(workDir, 0700, 0, 0) + + idmap := []idtools.IDMap{ + { + ContainerID: 0, + HostID: 0, + Size: 1, + }, + } + pid, cleanupFunc, err := createUsernsProcess(idmap, idmap) + if err != nil { + return false, err + } + defer cleanupFunc() + + if err := createIDMappedMount(lowerDir, lowerMappedDir, int(pid)); err != nil { + return false, errors.Wrapf(err, "create mapped mount") + } + defer unix.Unmount(lowerMappedDir, unix.MNT_DETACH) + + opts := fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", lowerMappedDir, upperDir, workDir) + flags := uintptr(0) + if err := unix.Mount("overlay", mergedDir, "overlay", flags, opts); err != nil { + return false, err + } + return true, nil +} diff --git a/drivers/overlay/idmapped_utils.go b/drivers/overlay/idmapped_utils.go new file mode 100644 index 0000000000..a314334139 --- /dev/null +++ b/drivers/overlay/idmapped_utils.go @@ -0,0 +1,184 @@ +//go:build linux +// +build linux + +package overlay + +import ( + "fmt" + "io/ioutil" + "os" + "syscall" + "unsafe" + + "github.com/containers/storage/pkg/idtools" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +type attr struct { + attrSet uint64 + attrClr uint64 + propagation uint64 + userNs uint64 +} + +const ( + // MOUNT_ATTR_RDONLY - Mount read-only + MOUNT_ATTR_RDONLY = 0x00000001 //nolint:golint + // MOUNT_ATTR_NOSUID - Ignore suid and sgid bits + MOUNT_ATTR_NOSUID = 0x00000002 //nolint:golint + // MOUNT_ATTR_NODEV - Disallow access to device special files + MOUNT_ATTR_NODEV = 0x00000004 //nolint:golint + // MOUNT_ATTR_NOEXEC - Disallow program execution + MOUNT_ATTR_NOEXEC = 0x00000008 //nolint:golint + // MOUNT_ATTR__ATIME - Setting on how atime should be updated + MOUNT_ATTR__ATIME = 0x00000070 //nolint:golint + // MOUNT_ATTR_RELATIME - Update atime relative to mtime/ctime + MOUNT_ATTR_RELATIME = 0x00000000 //nolint:golint + // MOUNT_ATTR_NOATIME - Do not update access times + MOUNT_ATTR_NOATIME = 0x00000010 //nolint:golint + // MOUNT_ATTR_STRICTATIME - Always perform atime updates + MOUNT_ATTR_STRICTATIME = 0x00000020 //nolint:golint + // MOUNT_ATTR_NODIRATIME - Do not update directory access times + MOUNT_ATTR_NODIRATIME = 0x00000080 //nolint:golint + // MOUNT_ATTR_IDMAP - Idmap mount to @userns_fd in struct mount_attr + MOUNT_ATTR_IDMAP = 0x00100000 //nolint:golint + + // OPEN_TREE_CLONE - Clone the source path mount + OPEN_TREE_CLONE = 0x00000001 //nolint:golint + + // MOVE_MOUNT_F_EMPTY_PATH - Move the path referenced by the fd + MOVE_MOUNT_F_EMPTY_PATH = 0x00000004 //nolint:golint + + // AT_RECURSIVE applies the operation to the entire subtree. + AT_RECURSIVE = 0x8000 //nolint:golint +) + +// openTree is a wrapper for the open_tree syscall +func openTree(path string, flags int) (fd int, err error) { + var _p0 *byte + + if _p0, err = syscall.BytePtrFromString(path); err != nil { + return 0, err + } + + r, _, e1 := syscall.Syscall6(uintptr(unix.SYS_OPEN_TREE), uintptr(0), uintptr(unsafe.Pointer(_p0)), + uintptr(flags), 0, 0, 0) + if e1 != 0 { + err = e1 + } + return int(r), nil +} + +// moveMount is a wrapper for the the move_mount syscall. +func moveMount(fdTree int, target string) (err error) { + var _p0, _p1 *byte + + empty := "" + + if _p0, err = syscall.BytePtrFromString(target); err != nil { + return err + } + if _p1, err = syscall.BytePtrFromString(empty); err != nil { + return err + } + + flags := MOVE_MOUNT_F_EMPTY_PATH + + _, _, e1 := syscall.Syscall6(uintptr(unix.SYS_MOVE_MOUNT), + uintptr(fdTree), uintptr(unsafe.Pointer(_p1)), + 0, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0) + if e1 != 0 { + err = e1 + } + return +} + +// mountSetAttr is a wrapper for the mount_setattr syscall +func mountSetAttr(dfd int, path string, flags uint, attr *attr, size uint) (err error) { + var _p0 *byte + + if _p0, err = syscall.BytePtrFromString(path); err != nil { + return err + } + + _, _, e1 := syscall.Syscall6(uintptr(unix.SYS_MOUNT_SETATTR), uintptr(dfd), uintptr(unsafe.Pointer(_p0)), + uintptr(flags), uintptr(unsafe.Pointer(attr)), uintptr(size), 0) + if e1 != 0 { + err = e1 + } + return +} + +// createIDMappedMount creates a IDMapped bind mount from SOURCE to TARGET using the user namespace +// for the PID process. +func createIDMappedMount(source, target string, pid int) error { + path := fmt.Sprintf("/proc/%d/ns/user", pid) + userNsFile, err := os.Open(path) + if err != nil { + return errors.Wrapf(err, "unable to get user ns file descriptor for %q", path) + } + + var attr attr + attr.attrSet = MOUNT_ATTR_IDMAP + attr.attrClr = 0 + attr.propagation = 0 + attr.userNs = uint64(userNsFile.Fd()) + + defer userNsFile.Close() + + targetDirFd, err := openTree(source, OPEN_TREE_CLONE|AT_RECURSIVE) + if err != nil { + return err + } + defer unix.Close(targetDirFd) + + if err := mountSetAttr(targetDirFd, "", unix.AT_EMPTY_PATH|AT_RECURSIVE, + &attr, uint(unsafe.Sizeof(attr))); err != nil { + return err + } + if err := os.Mkdir(target, 0700); err != nil && !os.IsExist(err) { + return err + } + return moveMount(targetDirFd, target) +} + +// createUsernsProcess forks the current process and creates a user namespace using the specified +// mappings. It returns the pid of the new process. +func createUsernsProcess(uidMaps []idtools.IDMap, gidMaps []idtools.IDMap) (int, func(), error) { + pid, _, err := syscall.Syscall6(uintptr(unix.SYS_CLONE), unix.CLONE_NEWUSER|uintptr(unix.SIGCHLD), 0, 0, 0, 0, 0) + if err != 0 { + return -1, nil, err + } + if pid == 0 { + _ = unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0) + // just wait for the SIGKILL + for { + syscall.Syscall6(uintptr(unix.SYS_PAUSE), 0, 0, 0, 0, 0, 0) + } + } + cleanupFunc := func() { + unix.Kill(int(pid), unix.SIGKILL) + _, _ = unix.Wait4(int(pid), nil, 0, nil) + } + writeMappings := func(fname string, idmap []idtools.IDMap) error { + mappings := "" + for _, m := range idmap { + mappings = mappings + fmt.Sprintf("%d %d %d\n", m.ContainerID, m.HostID, m.Size) + } + if err := ioutil.WriteFile(fmt.Sprintf("/proc/%d/%s", pid, fname), []byte(mappings), 0600); err != nil { + return err + } + return nil + } + if err := writeMappings("uid_map", uidMaps); err != nil { + cleanupFunc() + return -1, nil, err + } + if err := writeMappings("gid_map", gidMaps); err != nil { + cleanupFunc() + return -1, nil, err + } + + return int(pid), cleanupFunc, nil +} diff --git a/drivers/overlay/overlay.go b/drivers/overlay/overlay.go index a780ef5da3..de2e1a39e9 100644 --- a/drivers/overlay/overlay.go +++ b/drivers/overlay/overlay.go @@ -121,6 +121,8 @@ type Driver struct { supportsVolatile *bool usingMetacopy bool locker *locker.Locker + + supportsIDMappedMounts *bool } type additionalLayerStore struct { @@ -205,6 +207,29 @@ func checkSupportVolatile(home, runhome string) (bool, error) { return usingVolatile, nil } +// checkAndRecordIDMappedSupport checks and stores if the kernel supports mounting overlay on top of a +// idmapped lower layer. +func checkAndRecordIDMappedSupport(home, runhome string) (bool, error) { + feature := "idmapped-lower-dir" + overlayCacheResult, overlayCacheText, err := cachedFeatureCheck(runhome, feature) + if err == nil { + if overlayCacheResult { + logrus.Debugf("Cached value indicated that overlay is supported") + } else { + logrus.Debugf("Cached value indicated that overlay is not supported") + } + if !overlayCacheResult { + return false, errors.New(overlayCacheText) + } + return true, nil + } + supportsIDMappedMounts, err := supportsIdmappedLowerLayers(home) + if err2 := cachedFeatureRecord(runhome, feature, supportsIDMappedMounts, ""); err2 != nil { + return false, errors.Wrap(err2, "recording overlay idmapped mounts support status") + } + return supportsIDMappedMounts, err +} + func checkAndRecordOverlaySupport(fsMagic graphdriver.FsMagic, home, runhome string) (bool, error) { var supportsDType bool @@ -1485,6 +1510,51 @@ func (d *Driver) get(id string, disableShifting bool, options graphdriver.MountO } } + if d.supportsIDmappedMounts() && len(options.UidMaps) > 0 && len(options.GidMaps) > 0 { + var newAbsDir []string + mappedRoot := filepath.Join(d.home, id, "mapped") + if err := os.MkdirAll(mappedRoot, 0700); err != nil { + return "", err + } + + pid, cleanupFunc, err := createUsernsProcess(options.UidMaps, options.GidMaps) + if err != nil { + return "", err + } + defer cleanupFunc() + + idMappedMounts := make(map[string]string) + + // rewrite the lower dirs to their idmapped mount. + c := 0 + for _, absLower := range absLowers { + mappedMountSrc := getMappedMountRoot(absLower) + + root, found := idMappedMounts[mappedMountSrc] + if !found { + root = filepath.Join(mappedRoot, fmt.Sprintf("%d", c)) + c++ + if err := createIDMappedMount(mappedMountSrc, root, int(pid)); err != nil { + return "", errors.Wrapf(err, "create mapped mount for %q on %q", mappedMountSrc, root) + } + idMappedMounts[mappedMountSrc] = root + + // overlay takes a reference on the mount, so it is safe to unmount + // the mapped idmounts as soon as the final overlay file system is mounted. + defer unix.Unmount(root, unix.MNT_DETACH) + } + + // relative path to the layer through the id mapped mount + rel, err := filepath.Rel(mappedMountSrc, absLower) + if err != nil { + return "", err + } + + newAbsDir = append(newAbsDir, filepath.Join(root, rel)) + } + absLowers = newAbsDir + } + var opts string if readWrite { opts = fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", strings.Join(absLowers, ":"), diffDir, workdir) @@ -1587,6 +1657,18 @@ func (d *Driver) Put(id string) error { unmounted := false + mappedRoot := filepath.Join(d.home, id, "mapped") + // It should not happen, but cleanup any mapped mount if it was leaked. + if _, err := os.Stat(mappedRoot); err == nil { + mounts, err := ioutil.ReadDir(mappedRoot) + if err == nil { + // Go through all of the mapped mounts. + for _, m := range mounts { + _ = unix.Unmount(filepath.Join(mappedRoot, m.Name()), unix.MNT_DETACH) + } + } + } + if d.options.mountProgram != "" { // Attempt to unmount the FUSE mount using either fusermount or fusermount3. // If they fail, fallback to unix.Unmount @@ -1958,12 +2040,31 @@ func (d *Driver) UpdateLayerIDMap(id string, toContainer, toHost *idtools.IDMapp return nil } +// supportsIDmappedMounts returns whether the kernel supports using idmapped mounts with +// overlay lower layers. +func (d *Driver) supportsIDmappedMounts() bool { + if d.supportsIDMappedMounts != nil { + return *d.supportsIDMappedMounts + } + + supportsIDMappedMounts, err := checkAndRecordIDMappedSupport(d.home, d.runhome) + d.supportsIDMappedMounts = &supportsIDMappedMounts + if err == nil { + return supportsIDMappedMounts + } + logrus.Debugf("Check for idmapped mounts support %v", err) + return false +} + // SupportsShifting tells whether the driver support shifting of the UIDs/GIDs in an userNS func (d *Driver) SupportsShifting() bool { if os.Getenv("_TEST_FORCE_SUPPORT_SHIFTING") == "yes-please" { return true } - return d.options.mountProgram != "" + if d.options.mountProgram != "" { + return true + } + return d.supportsIDmappedMounts() } // dumbJoin is more or less a dumber version of filepath.Join, but one which @@ -2132,3 +2233,15 @@ func redirectDiffIfAdditionalLayer(diffPath string) (string, error) { } return diffPath, nil } + +// getMappedMountRoot is a heuristic that calculates the parent directory where +// the idmapped mount should be applied. +// It is useful to minimize the number of idmapped mounts and at the same time use +// a common path as long as possible to reduce the length of the mount data argument. +func getMappedMountRoot(path string) string { + dirName := filepath.Dir(path) + if filepath.Base(dirName) == linkDir { + return filepath.Dir(dirName) + } + return dirName +}