diff --git a/drivers/overlay/check.go b/drivers/overlay/check.go index 44b3515a85..cb4ca57099 100644 --- a/drivers/overlay/check.go +++ b/drivers/overlay/check.go @@ -1,3 +1,4 @@ +//go:build linux // +build linux package overlay @@ -11,6 +12,7 @@ import ( "syscall" "github.com/containers/storage/pkg/archive" + "github.com/containers/storage/pkg/idtools" "github.com/containers/storage/pkg/ioutils" "github.com/containers/storage/pkg/mount" "github.com/containers/storage/pkg/system" @@ -218,3 +220,54 @@ func doesVolatile(d string) (bool, error) { }() return true, nil } + +// supportsIdmappedLowerLayers checks if the kernel supports mounting overlay on top of +// a idmapped lower layer. +func supportsIdmappedLowerLayers(home string) (bool, error) { + layerDir, err := ioutil.TempDir(home, "compat") + if err != nil { + return false, err + } + + mergedDir := filepath.Join(layerDir, "merged") + lowerDir := filepath.Join(layerDir, "lower") + lowerMappedDir := filepath.Join(layerDir, "lower-mapped") + upperDir := filepath.Join(layerDir, "upper") + workDir := filepath.Join(layerDir, "work") + + defer func() { + _ = unix.Unmount(mergedDir, unix.MNT_DETACH) + _ = os.RemoveAll(layerDir) + }() + + _ = idtools.MkdirAs(mergedDir, 0700, 0, 0) + _ = idtools.MkdirAs(lowerDir, 0700, 0, 0) + _ = idtools.MkdirAs(lowerMappedDir, 0700, 0, 0) + _ = idtools.MkdirAs(upperDir, 0700, 0, 0) + _ = idtools.MkdirAs(workDir, 0700, 0, 0) + + idmap := []idtools.IDMap{ + { + ContainerID: 0, + HostID: 0, + Size: 1, + }, + } + pid, cleanupFunc, err := createUsernsProcess(idmap, idmap) + if err != nil { + return false, err + } + defer cleanupFunc() + + if err := createIDMappedMount(lowerDir, lowerMappedDir, int(pid)); err != nil { + return false, errors.Wrapf(err, "create mapped mount") + } + defer unix.Unmount(lowerMappedDir, unix.MNT_DETACH) + + opts := fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", lowerMappedDir, upperDir, workDir) + flags := uintptr(0) + if err := unix.Mount("overlay", mergedDir, "overlay", flags, opts); err != nil { + return false, err + } + return true, nil +} diff --git a/drivers/overlay/idmapped_utils.go b/drivers/overlay/idmapped_utils.go new file mode 100644 index 0000000000..a314334139 --- /dev/null +++ b/drivers/overlay/idmapped_utils.go @@ -0,0 +1,184 @@ +//go:build linux +// +build linux + +package overlay + +import ( + "fmt" + "io/ioutil" + "os" + "syscall" + "unsafe" + + "github.com/containers/storage/pkg/idtools" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +type attr struct { + attrSet uint64 + attrClr uint64 + propagation uint64 + userNs uint64 +} + +const ( + // MOUNT_ATTR_RDONLY - Mount read-only + MOUNT_ATTR_RDONLY = 0x00000001 //nolint:golint + // MOUNT_ATTR_NOSUID - Ignore suid and sgid bits + MOUNT_ATTR_NOSUID = 0x00000002 //nolint:golint + // MOUNT_ATTR_NODEV - Disallow access to device special files + MOUNT_ATTR_NODEV = 0x00000004 //nolint:golint + // MOUNT_ATTR_NOEXEC - Disallow program execution + MOUNT_ATTR_NOEXEC = 0x00000008 //nolint:golint + // MOUNT_ATTR__ATIME - Setting on how atime should be updated + MOUNT_ATTR__ATIME = 0x00000070 //nolint:golint + // MOUNT_ATTR_RELATIME - Update atime relative to mtime/ctime + MOUNT_ATTR_RELATIME = 0x00000000 //nolint:golint + // MOUNT_ATTR_NOATIME - Do not update access times + MOUNT_ATTR_NOATIME = 0x00000010 //nolint:golint + // MOUNT_ATTR_STRICTATIME - Always perform atime updates + MOUNT_ATTR_STRICTATIME = 0x00000020 //nolint:golint + // MOUNT_ATTR_NODIRATIME - Do not update directory access times + MOUNT_ATTR_NODIRATIME = 0x00000080 //nolint:golint + // MOUNT_ATTR_IDMAP - Idmap mount to @userns_fd in struct mount_attr + MOUNT_ATTR_IDMAP = 0x00100000 //nolint:golint + + // OPEN_TREE_CLONE - Clone the source path mount + OPEN_TREE_CLONE = 0x00000001 //nolint:golint + + // MOVE_MOUNT_F_EMPTY_PATH - Move the path referenced by the fd + MOVE_MOUNT_F_EMPTY_PATH = 0x00000004 //nolint:golint + + // AT_RECURSIVE applies the operation to the entire subtree. + AT_RECURSIVE = 0x8000 //nolint:golint +) + +// openTree is a wrapper for the open_tree syscall +func openTree(path string, flags int) (fd int, err error) { + var _p0 *byte + + if _p0, err = syscall.BytePtrFromString(path); err != nil { + return 0, err + } + + r, _, e1 := syscall.Syscall6(uintptr(unix.SYS_OPEN_TREE), uintptr(0), uintptr(unsafe.Pointer(_p0)), + uintptr(flags), 0, 0, 0) + if e1 != 0 { + err = e1 + } + return int(r), nil +} + +// moveMount is a wrapper for the the move_mount syscall. +func moveMount(fdTree int, target string) (err error) { + var _p0, _p1 *byte + + empty := "" + + if _p0, err = syscall.BytePtrFromString(target); err != nil { + return err + } + if _p1, err = syscall.BytePtrFromString(empty); err != nil { + return err + } + + flags := MOVE_MOUNT_F_EMPTY_PATH + + _, _, e1 := syscall.Syscall6(uintptr(unix.SYS_MOVE_MOUNT), + uintptr(fdTree), uintptr(unsafe.Pointer(_p1)), + 0, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0) + if e1 != 0 { + err = e1 + } + return +} + +// mountSetAttr is a wrapper for the mount_setattr syscall +func mountSetAttr(dfd int, path string, flags uint, attr *attr, size uint) (err error) { + var _p0 *byte + + if _p0, err = syscall.BytePtrFromString(path); err != nil { + return err + } + + _, _, e1 := syscall.Syscall6(uintptr(unix.SYS_MOUNT_SETATTR), uintptr(dfd), uintptr(unsafe.Pointer(_p0)), + uintptr(flags), uintptr(unsafe.Pointer(attr)), uintptr(size), 0) + if e1 != 0 { + err = e1 + } + return +} + +// createIDMappedMount creates a IDMapped bind mount from SOURCE to TARGET using the user namespace +// for the PID process. +func createIDMappedMount(source, target string, pid int) error { + path := fmt.Sprintf("/proc/%d/ns/user", pid) + userNsFile, err := os.Open(path) + if err != nil { + return errors.Wrapf(err, "unable to get user ns file descriptor for %q", path) + } + + var attr attr + attr.attrSet = MOUNT_ATTR_IDMAP + attr.attrClr = 0 + attr.propagation = 0 + attr.userNs = uint64(userNsFile.Fd()) + + defer userNsFile.Close() + + targetDirFd, err := openTree(source, OPEN_TREE_CLONE|AT_RECURSIVE) + if err != nil { + return err + } + defer unix.Close(targetDirFd) + + if err := mountSetAttr(targetDirFd, "", unix.AT_EMPTY_PATH|AT_RECURSIVE, + &attr, uint(unsafe.Sizeof(attr))); err != nil { + return err + } + if err := os.Mkdir(target, 0700); err != nil && !os.IsExist(err) { + return err + } + return moveMount(targetDirFd, target) +} + +// createUsernsProcess forks the current process and creates a user namespace using the specified +// mappings. It returns the pid of the new process. +func createUsernsProcess(uidMaps []idtools.IDMap, gidMaps []idtools.IDMap) (int, func(), error) { + pid, _, err := syscall.Syscall6(uintptr(unix.SYS_CLONE), unix.CLONE_NEWUSER|uintptr(unix.SIGCHLD), 0, 0, 0, 0, 0) + if err != 0 { + return -1, nil, err + } + if pid == 0 { + _ = unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0) + // just wait for the SIGKILL + for { + syscall.Syscall6(uintptr(unix.SYS_PAUSE), 0, 0, 0, 0, 0, 0) + } + } + cleanupFunc := func() { + unix.Kill(int(pid), unix.SIGKILL) + _, _ = unix.Wait4(int(pid), nil, 0, nil) + } + writeMappings := func(fname string, idmap []idtools.IDMap) error { + mappings := "" + for _, m := range idmap { + mappings = mappings + fmt.Sprintf("%d %d %d\n", m.ContainerID, m.HostID, m.Size) + } + if err := ioutil.WriteFile(fmt.Sprintf("/proc/%d/%s", pid, fname), []byte(mappings), 0600); err != nil { + return err + } + return nil + } + if err := writeMappings("uid_map", uidMaps); err != nil { + cleanupFunc() + return -1, nil, err + } + if err := writeMappings("gid_map", gidMaps); err != nil { + cleanupFunc() + return -1, nil, err + } + + return int(pid), cleanupFunc, nil +} diff --git a/drivers/overlay/overlay.go b/drivers/overlay/overlay.go index a780ef5da3..de2e1a39e9 100644 --- a/drivers/overlay/overlay.go +++ b/drivers/overlay/overlay.go @@ -121,6 +121,8 @@ type Driver struct { supportsVolatile *bool usingMetacopy bool locker *locker.Locker + + supportsIDMappedMounts *bool } type additionalLayerStore struct { @@ -205,6 +207,29 @@ func checkSupportVolatile(home, runhome string) (bool, error) { return usingVolatile, nil } +// checkAndRecordIDMappedSupport checks and stores if the kernel supports mounting overlay on top of a +// idmapped lower layer. +func checkAndRecordIDMappedSupport(home, runhome string) (bool, error) { + feature := "idmapped-lower-dir" + overlayCacheResult, overlayCacheText, err := cachedFeatureCheck(runhome, feature) + if err == nil { + if overlayCacheResult { + logrus.Debugf("Cached value indicated that overlay is supported") + } else { + logrus.Debugf("Cached value indicated that overlay is not supported") + } + if !overlayCacheResult { + return false, errors.New(overlayCacheText) + } + return true, nil + } + supportsIDMappedMounts, err := supportsIdmappedLowerLayers(home) + if err2 := cachedFeatureRecord(runhome, feature, supportsIDMappedMounts, ""); err2 != nil { + return false, errors.Wrap(err2, "recording overlay idmapped mounts support status") + } + return supportsIDMappedMounts, err +} + func checkAndRecordOverlaySupport(fsMagic graphdriver.FsMagic, home, runhome string) (bool, error) { var supportsDType bool @@ -1485,6 +1510,51 @@ func (d *Driver) get(id string, disableShifting bool, options graphdriver.MountO } } + if d.supportsIDmappedMounts() && len(options.UidMaps) > 0 && len(options.GidMaps) > 0 { + var newAbsDir []string + mappedRoot := filepath.Join(d.home, id, "mapped") + if err := os.MkdirAll(mappedRoot, 0700); err != nil { + return "", err + } + + pid, cleanupFunc, err := createUsernsProcess(options.UidMaps, options.GidMaps) + if err != nil { + return "", err + } + defer cleanupFunc() + + idMappedMounts := make(map[string]string) + + // rewrite the lower dirs to their idmapped mount. + c := 0 + for _, absLower := range absLowers { + mappedMountSrc := getMappedMountRoot(absLower) + + root, found := idMappedMounts[mappedMountSrc] + if !found { + root = filepath.Join(mappedRoot, fmt.Sprintf("%d", c)) + c++ + if err := createIDMappedMount(mappedMountSrc, root, int(pid)); err != nil { + return "", errors.Wrapf(err, "create mapped mount for %q on %q", mappedMountSrc, root) + } + idMappedMounts[mappedMountSrc] = root + + // overlay takes a reference on the mount, so it is safe to unmount + // the mapped idmounts as soon as the final overlay file system is mounted. + defer unix.Unmount(root, unix.MNT_DETACH) + } + + // relative path to the layer through the id mapped mount + rel, err := filepath.Rel(mappedMountSrc, absLower) + if err != nil { + return "", err + } + + newAbsDir = append(newAbsDir, filepath.Join(root, rel)) + } + absLowers = newAbsDir + } + var opts string if readWrite { opts = fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", strings.Join(absLowers, ":"), diffDir, workdir) @@ -1587,6 +1657,18 @@ func (d *Driver) Put(id string) error { unmounted := false + mappedRoot := filepath.Join(d.home, id, "mapped") + // It should not happen, but cleanup any mapped mount if it was leaked. + if _, err := os.Stat(mappedRoot); err == nil { + mounts, err := ioutil.ReadDir(mappedRoot) + if err == nil { + // Go through all of the mapped mounts. + for _, m := range mounts { + _ = unix.Unmount(filepath.Join(mappedRoot, m.Name()), unix.MNT_DETACH) + } + } + } + if d.options.mountProgram != "" { // Attempt to unmount the FUSE mount using either fusermount or fusermount3. // If they fail, fallback to unix.Unmount @@ -1958,12 +2040,31 @@ func (d *Driver) UpdateLayerIDMap(id string, toContainer, toHost *idtools.IDMapp return nil } +// supportsIDmappedMounts returns whether the kernel supports using idmapped mounts with +// overlay lower layers. +func (d *Driver) supportsIDmappedMounts() bool { + if d.supportsIDMappedMounts != nil { + return *d.supportsIDMappedMounts + } + + supportsIDMappedMounts, err := checkAndRecordIDMappedSupport(d.home, d.runhome) + d.supportsIDMappedMounts = &supportsIDMappedMounts + if err == nil { + return supportsIDMappedMounts + } + logrus.Debugf("Check for idmapped mounts support %v", err) + return false +} + // SupportsShifting tells whether the driver support shifting of the UIDs/GIDs in an userNS func (d *Driver) SupportsShifting() bool { if os.Getenv("_TEST_FORCE_SUPPORT_SHIFTING") == "yes-please" { return true } - return d.options.mountProgram != "" + if d.options.mountProgram != "" { + return true + } + return d.supportsIDmappedMounts() } // dumbJoin is more or less a dumber version of filepath.Join, but one which @@ -2132,3 +2233,15 @@ func redirectDiffIfAdditionalLayer(diffPath string) (string, error) { } return diffPath, nil } + +// getMappedMountRoot is a heuristic that calculates the parent directory where +// the idmapped mount should be applied. +// It is useful to minimize the number of idmapped mounts and at the same time use +// a common path as long as possible to reduce the length of the mount data argument. +func getMappedMountRoot(path string) string { + dirName := filepath.Dir(path) + if filepath.Base(dirName) == linkDir { + return filepath.Dir(dirName) + } + return dirName +}