From 557c0510d1b47517a3db321b20dc0a6d661f287e Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Tue, 29 Mar 2022 10:24:56 +0200 Subject: [PATCH] overlay: use idmapped lower layers where supported use idmapped mounts for the overlay lower layers when the kernel supports them. For each lower directory with ID=0...N-1, it creates an idmapped mount at $GRAPHROOT/overlay/$LAYER/mapped/$ID. The final overlay mount will use these idmapped mounts instead of the original source directory. The upperdir is not idmapped, so files are created with the same IDs used by the user namespace. Signed-off-by: Giuseppe Scrivano --- drivers/overlay/check.go | 77 +++++++++++++ drivers/overlay/idmapped_utils.go | 178 ++++++++++++++++++++++++++++++ drivers/overlay/overlay.go | 44 +++++++- 3 files changed, 298 insertions(+), 1 deletion(-) create mode 100644 drivers/overlay/idmapped_utils.go diff --git a/drivers/overlay/check.go b/drivers/overlay/check.go index 44b3515a85..ad9238ae98 100644 --- a/drivers/overlay/check.go +++ b/drivers/overlay/check.go @@ -1,3 +1,4 @@ +//go:build linux // +build linux package overlay @@ -11,6 +12,7 @@ import ( "syscall" "github.com/containers/storage/pkg/archive" + "github.com/containers/storage/pkg/idtools" "github.com/containers/storage/pkg/ioutils" "github.com/containers/storage/pkg/mount" "github.com/containers/storage/pkg/system" @@ -218,3 +220,78 @@ func doesVolatile(d string) (bool, error) { }() return true, nil } + +// supportsIdmappedLowerLayers checks if the kernel supports mounting overlay on top of +// an idmapped lower layer.
+func supportsIdmappedLowerLayers(home string) (bool, error) {
+	layerDir, err := ioutil.TempDir(home, "compat")
+	if err != nil {
+		return false, err
+	}
+
+	mergedDir := filepath.Join(layerDir, "merged")
+	lowerDir := filepath.Join(layerDir, "lower")
+	lowerMappedDir := filepath.Join(layerDir, "lower-mapped")
+	upperDir := filepath.Join(layerDir, "upper")
+	workDir := filepath.Join(layerDir, "work")
+
+	// Best-effort cleanup: detach whatever got mounted, then drop the scratch tree.
+	// The idmapped mount must be detached too, or RemoveAll cannot delete its mount point.
+	defer func() {
+		_ = unix.Unmount(mergedDir, unix.MNT_DETACH)
+		_ = unix.Unmount(lowerMappedDir, unix.MNT_DETACH)
+		_ = os.RemoveAll(layerDir)
+	}()
+
+	for _, dir := range []string{mergedDir, lowerDir, lowerMappedDir, upperDir, workDir} {
+		if err := idtools.MkdirAs(dir, 0700, 0, 0); err != nil {
+			return false, err
+		}
+	}
+
+	// A one-entry identity mapping is enough to exercise the idmapped mount code path.
+	idmap := []idtools.IDMap{{ContainerID: 0, HostID: 0, Size: 1}}
+	pid, err := createUsernsProcess(idmap, idmap)
+	if err != nil {
+		return false, err
+	}
+	// The helper process is only needed for its user namespace; kill and reap it on return.
+	defer func() {
+		unix.Kill(int(pid), unix.SIGKILL)
+		_, _ = unix.Wait4(int(pid), nil, 0, nil)
+	}()
+
+	if err := getIDMappedMount(lowerDir, lowerMappedDir, int(pid)); err != nil {
+		return false, errors.Wrapf(err, "create mapped mount")
+	}
+
+	opts := fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", lowerMappedDir, upperDir, workDir)
+	flags := uintptr(0)
+	if err := unix.Mount("overlay", mergedDir, "overlay", flags, opts); err != nil {
+		return false, err
+	}
+	return true, nil
+}
+
+// checkAndRecordIdMappedSupport checks and stores if the kernel supports mounting overlay on top of
+// an idmapped lower layer.
+func checkAndRecordIdMappedSupport(home, runhome string) (bool, error) {
+	feature := "idmapped-lower-dir"
+	overlayCacheResult, overlayCacheText, err := cachedFeatureCheck(runhome, feature)
+	if err == nil {
+		// A cached answer is available: report it without probing the kernel again.
+		if overlayCacheResult {
+			logrus.Debugf("Cached value indicated that idmapped lower layers are supported")
+			return true, nil
+		}
+		logrus.Debugf("Cached value indicated that idmapped lower layers are not supported")
+		return false, errors.New(overlayCacheText)
+	}
+	// No usable cached answer: run the probe and record its result via cachedFeatureRecord.
+	supportsIdMappedMounts, err := supportsIdmappedLowerLayers(home)
+	if err2 := cachedFeatureRecord(runhome, feature, supportsIdMappedMounts, ""); err2 != nil {
+		return false, errors.Wrap(err2, "recording overlay idmapped mounts support status")
+	}
+	// The probe error, if any, is handed back to the caller alongside the result.
+	return supportsIdMappedMounts, err
+}
diff --git a/drivers/overlay/idmapped_utils.go b/drivers/overlay/idmapped_utils.go
new file mode 100644
index 0000000000..5b8d576aca
--- /dev/null
+++ b/drivers/overlay/idmapped_utils.go
@@ -0,0 +1,178 @@
+//go:build linux
+// +build linux
+
+package overlay
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+	"syscall"
+	"unsafe"
+
+	"github.com/containers/storage/pkg/idtools"
+	"github.com/pkg/errors"
+	"golang.org/x/sys/unix"
+)
+
+// Attr mirrors the kernel's struct mount_attr, the argument block of mount_setattr(2).
+type Attr struct {
+	attrSet     uint64
+	attrClr     uint64
+	propagation uint64
+	userNs      uint64 // user namespace file descriptor, used with MOUNT_ATTR_IDMAP
+}
+
+// AT_RECURSIVE - Apply the operation to the entire subtree
+const AT_RECURSIVE = 0x8000 //nolint:golint
+
+const (
+	// MOUNT_ATTR_RDONLY - Mount read-only
+	MOUNT_ATTR_RDONLY = 0x00000001 //nolint:golint
+	// MOUNT_ATTR_NOSUID - Ignore suid and sgid bits
+	MOUNT_ATTR_NOSUID = 0x00000002 //nolint:golint
+	// MOUNT_ATTR_NODEV - Disallow access to device special files
+	MOUNT_ATTR_NODEV = 0x00000004 //nolint:golint
+	// MOUNT_ATTR_NOEXEC - Disallow program execution
+	MOUNT_ATTR_NOEXEC = 0x00000008 //nolint:golint
+	// MOUNT_ATTR__ATIME - Setting on how atime should be updated
+	MOUNT_ATTR__ATIME = 0x00000070 //nolint:golint
+	// MOUNT_ATTR_RELATIME - Update atime relative to mtime/ctime
+	MOUNT_ATTR_RELATIME = 0x00000000 //nolint:golint
+	// MOUNT_ATTR_NOATIME - Do not update access times
+	MOUNT_ATTR_NOATIME = 0x00000010 //nolint:golint
+	// MOUNT_ATTR_STRICTATIME - Always perform atime updates
+	MOUNT_ATTR_STRICTATIME = 0x00000020 //nolint:golint
+	// MOUNT_ATTR_NODIRATIME - Do not update directory access times
+	MOUNT_ATTR_NODIRATIME = 0x00000080 //nolint:golint
+	// MOUNT_ATTR_IDMAP - Idmap mount to @userns_fd in struct mount_attr
+	MOUNT_ATTR_IDMAP = 0x00100000 //nolint:golint
+
+	// OPEN_TREE_CLONE - Clone the source path mount
+	OPEN_TREE_CLONE = 0x00000001 //nolint:golint
+
+	// MOVE_MOUNT_F_EMPTY_PATH - Move the path referenced by the fd
+	MOVE_MOUNT_F_EMPTY_PATH = 0x00000004 //nolint:golint
+)
+
+// cloneTree calls open_tree(2) to clone the mount tree at path (recursively) and
+// returns a file descriptor referring to the clone.
+func cloneTree(path string) (fd int, err error) {
+	_p0, err := syscall.BytePtrFromString(path)
+	if err != nil {
+		return -1, err
+	}
+
+	// A relative path must resolve against the working directory, not against fd 0.
+	dirfd := unix.AT_FDCWD
+	flags := OPEN_TREE_CLONE | AT_RECURSIVE
+
+	r, _, e1 := syscall.Syscall6(uintptr(unix.SYS_OPEN_TREE), uintptr(dirfd), uintptr(unsafe.Pointer(_p0)),
+		uintptr(flags), 0, 0, 0)
+	if e1 != 0 {
+		return -1, e1
+	}
+	return int(r), nil
+}
+
+// mountMount calls move_mount(2) to attach the mount referenced by fdTree at target.
+func mountMount(fdTree int, target string) (err error) {
+	var _p0, _p1 *byte
+
+	if _p0, err = syscall.BytePtrFromString(target); err != nil {
+		return err
+	}
+	// The source is fdTree itself, hence the empty source path and MOVE_MOUNT_F_EMPTY_PATH.
+	if _p1, err = syscall.BytePtrFromString(""); err != nil {
+		return err
+	}
+
+	toDirfd := unix.AT_FDCWD // resolve a relative target against the working directory
+	_, _, e1 := syscall.Syscall6(uintptr(unix.SYS_MOVE_MOUNT),
+		uintptr(fdTree), uintptr(unsafe.Pointer(_p1)),
+		uintptr(toDirfd), uintptr(unsafe.Pointer(_p0)), uintptr(MOVE_MOUNT_F_EMPTY_PATH), 0)
+	if e1 != 0 {
+		return e1
+	}
+	return nil
+}
+
+// mountSetAttr calls mount_setattr(2) on the mount identified by dfd and path, applying
+// the changes described by attr. size is passed to the kernel as the size of *attr.
+func mountSetAttr(dfd int, path string, flags uint, attr *Attr, size uint) (err error) {
+	_p0, err := syscall.BytePtrFromString(path)
+	if err != nil {
+		return err
+	}
+
+	_, _, e1 := syscall.Syscall6(uintptr(unix.SYS_MOUNT_SETATTR), uintptr(dfd), uintptr(unsafe.Pointer(_p0)),
+		uintptr(flags), uintptr(unsafe.Pointer(attr)), uintptr(size), 0)
+	if e1 != 0 {
+		return e1
+	}
+	return nil
+}
+
+// getIDMappedMount creates an idmapped clone of the mount tree at from and attaches it at
+// to. The ID mappings are those of the user namespace of the process pid.
+func getIDMappedMount(from, to string, pid int) error {
+	path := fmt.Sprintf("/proc/%d/ns/user", pid)
+	userNsFile, err := os.Open(path)
+	if err != nil {
+		return errors.Wrapf(err, "unable to get user ns file descriptor for %q", path)
+	}
+	defer userNsFile.Close()
+
+	targetDirFd, err := cloneTree(from)
+	if err != nil {
+		return errors.Wrapf(err, "open_tree %q", from)
+	}
+	defer unix.Close(targetDirFd)
+
+	attr := Attr{attrSet: MOUNT_ATTR_IDMAP, userNs: uint64(userNsFile.Fd())}
+	if err := mountSetAttr(targetDirFd, "", unix.AT_EMPTY_PATH|AT_RECURSIVE,
+		&attr, uint(unsafe.Sizeof(attr))); err != nil {
+		return errors.Wrapf(err, "mount_setattr %q", from)
+	}
+	// Wrapf returns nil when mountMount succeeded.
+	return errors.Wrapf(mountMount(targetDirFd, to), "move_mount %q", to)
+}
+
+// createUsernsProcess forks the current process and creates a user namespace using the specified
+// mappings. It returns the pid of the new process, which the caller must kill and reap.
+func createUsernsProcess(uidMaps []idtools.IDMap, gidMaps []idtools.IDMap) (int, error) {
+	pid, _, err := syscall.Syscall6(uintptr(unix.SYS_CLONE), unix.CLONE_NEWUSER|uintptr(unix.SIGCHLD), 0, 0, 0, 0, 0)
+	if err != 0 {
+		return -1, err
+	}
+	if pid == 0 {
+		// Child: ask to be killed when the parent dies, then never return to the caller.
+		_ = unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0)
+		// just wait for the SIGKILL
+		for {
+			syscall.Syscall6(uintptr(unix.SYS_PAUSE), 0, 0, 0, 0, 0, 0)
+		}
+	}
+
+	// writeMappings writes idmap, one "container host size" line per entry, to /proc/PID/fname.
+	writeMappings := func(fname string, idmap []idtools.IDMap) error {
+		mappings := ""
+		for _, m := range idmap {
+			mappings += fmt.Sprintf("%d %d %d\n", m.ContainerID, m.HostID, m.Size)
+		}
+		return ioutil.WriteFile(fmt.Sprintf("/proc/%d/%s", pid, fname), []byte(mappings), 0600)
+	}
+
+	mapErr := writeMappings("uid_map", uidMaps)
+	if mapErr == nil {
+		mapErr = writeMappings("gid_map", gidMaps)
+	}
+	if mapErr != nil {
+		// Do not leave the helper process behind when the mappings cannot be written.
+		unix.Kill(int(pid), unix.SIGKILL)
+		_, _ = unix.Wait4(int(pid), nil, 0, nil)
+		return -1, mapErr
+	}
+
+	return int(pid), nil
+}
diff --git
a/drivers/overlay/overlay.go b/drivers/overlay/overlay.go index a780ef5da3..b22f28ed31 100644 --- a/drivers/overlay/overlay.go +++ b/drivers/overlay/overlay.go @@ -1485,6 +1485,40 @@ func (d *Driver) get(id string, disableShifting bool, options graphdriver.MountO } } + if len(options.UidMaps) > 0 && len(options.GidMaps) > 0 { + var newAbsDir []string + mappedRoot := filepath.Join(d.home, id, "mapped") + if err := os.MkdirAll(mappedRoot, 0700); err != nil { + return "", err + } + + pid, err := createUsernsProcess(options.UidMaps, options.GidMaps) + if err != nil { + return "", err + } + defer func() { + unix.Kill(int(pid), unix.SIGKILL) + _, _ = unix.Wait4(int(pid), nil, 0, nil) + }() + + // rewrite the lower dirs to their idmapped mount. + for c, absLower := range absLowers { + to := filepath.Join(mappedRoot, fmt.Sprintf("%d", c)) + if err := os.Mkdir(to, 0700); err != nil && !os.IsExist(err) { + return "", err + } + if err := getIDMappedMount(absLower, to, int(pid)); err != nil { + return "", errors.Wrapf(err, "create mapped mount") + } + // overlay takes a reference on the mount, so it is safe to unmount + // the mapped idmounts as soon as the final overlay file system is mounted. + defer unix.Unmount(to, unix.MNT_DETACH) + + newAbsDir = append(newAbsDir, to) + } + absLowers = newAbsDir + } + var opts string if readWrite { opts = fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", strings.Join(absLowers, ":"), diffDir, workdir) @@ -1963,7 +1997,15 @@ func (d *Driver) SupportsShifting() bool { if os.Getenv("_TEST_FORCE_SUPPORT_SHIFTING") == "yes-please" { return true } - return d.options.mountProgram != "" + if d.options.mountProgram != "" { + return true + } + supportsIdMappedMounts, err := checkAndRecordIdMappedSupport(d.home, d.runhome) + if err == nil { + return supportsIdMappedMounts + } + logrus.Debugf("Check for idmapped mounts support %v", err) + return false } // dumbJoin is more or less a dumber version of filepath.Join, but one which