Skip to content

Commit

Permalink
overlay: use idmapped lower layers where supported
Browse files Browse the repository at this point in the history
use idmapped mounts for the overlay lower layers when the kernel
supports them.

For each lower directory with ID=0...N-1, it creates an idmapped mount
at $GRAPHROOT/overlay/$LAYER/mapped/$ID.  The final overlay mount will
use these idmapped mounts instead of the original source directory.

The upperdir is not idmapped, so files are created with the same
IDs used by the user namespace.

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
  • Loading branch information
giuseppe committed Mar 29, 2022
1 parent d48cfc6 commit 557c051
Show file tree
Hide file tree
Showing 3 changed files with 298 additions and 1 deletion.
77 changes: 77 additions & 0 deletions drivers/overlay/check.go
@@ -1,3 +1,4 @@
//go:build linux
// +build linux

package overlay
Expand All @@ -11,6 +12,7 @@ import (
"syscall"

"github.com/containers/storage/pkg/archive"
"github.com/containers/storage/pkg/idtools"
"github.com/containers/storage/pkg/ioutils"
"github.com/containers/storage/pkg/mount"
"github.com/containers/storage/pkg/system"
Expand Down Expand Up @@ -218,3 +220,78 @@ func doesVolatile(d string) (bool, error) {
}()
return true, nil
}

// supportsIdmappedLowerLayers checks if the kernel supports mounting overlay on top of
// an idmapped lower layer.
//
// It creates a scratch layer in a temporary directory under home, creates an
// idmapped mount of the scratch lower directory using the user namespace of a
// helper process, and then attempts an overlay mount that uses the idmapped
// mount as its lowerdir.  It returns true with a nil error only when every
// step succeeds; otherwise it returns false and the error of the failing step.
// The scratch directory, its mounts and the helper process are cleaned up
// before returning.
func supportsIdmappedLowerLayers(home string) (bool, error) {
	layerDir, err := ioutil.TempDir(home, "compat")
	if err != nil {
		return false, err
	}

	mergedDir := filepath.Join(layerDir, "merged")
	lowerDir := filepath.Join(layerDir, "lower")
	lowerMappedDir := filepath.Join(layerDir, "lower-mapped")
	upperDir := filepath.Join(layerDir, "upper")
	workDir := filepath.Join(layerDir, "work")

	defer func() {
		// Cleanup is best effort: the mounts may not exist if an earlier step failed.
		_ = unix.Unmount(mergedDir, unix.MNT_DETACH)
		// The idmapped mount must be detached as well, otherwise it is leaked
		// and RemoveAll cannot delete its mountpoint.
		_ = unix.Unmount(lowerMappedDir, unix.MNT_DETACH)
		_ = os.RemoveAll(layerDir)
	}()

	for _, dir := range []string{mergedDir, lowerDir, lowerMappedDir, upperDir, workDir} {
		if err := idtools.MkdirAs(dir, 0700, 0, 0); err != nil {
			return false, errors.Wrapf(err, "create directory %q", dir)
		}
	}

	// A minimal identity mapping is enough to probe for the feature.
	idmap := []idtools.IDMap{
		{
			ContainerID: 0,
			HostID:      0,
			Size:        1,
		},
	}
	pid, err := createUsernsProcess(idmap, idmap)
	if err != nil {
		return false, err
	}
	defer func() {
		// The helper process only exists to hold the user namespace open.
		_ = unix.Kill(int(pid), unix.SIGKILL)
		_, _ = unix.Wait4(int(pid), nil, 0, nil)
	}()

	if err := getIDMappedMount(lowerDir, lowerMappedDir, int(pid)); err != nil {
		return false, errors.Wrapf(err, "create mapped mount")
	}

	opts := fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", lowerMappedDir, upperDir, workDir)
	flags := uintptr(0)
	if err := unix.Mount("overlay", mergedDir, "overlay", flags, opts); err != nil {
		return false, err
	}
	return true, nil
}

// checkAndRecordIdMappedSupport checks and stores if the kernel supports mounting overlay on top of an
// idmapped lower layer.
//
// A previously cached result for the "idmapped-lower-dir" feature under
// runhome is returned when available.  Otherwise the check is performed using
// a scratch directory under home and its outcome is recorded under runhome,
// together with the failure reason (if any) so that a later cached negative
// answer still carries a meaningful error message.
func checkAndRecordIdMappedSupport(home, runhome string) (bool, error) {
	feature := "idmapped-lower-dir"
	overlayCacheResult, overlayCacheText, err := cachedFeatureCheck(runhome, feature)
	if err == nil {
		if overlayCacheResult {
			logrus.Debugf("Cached value indicated that idmapped mounts for overlay are supported")
			return true, nil
		}
		logrus.Debugf("Cached value indicated that idmapped mounts for overlay are not supported")
		return false, errors.New(overlayCacheText)
	}

	supportsIdMappedMounts, err := supportsIdmappedLowerLayers(home)
	// Keep the reason for a failed check alongside the cached result.
	cacheText := ""
	if err != nil {
		cacheText = err.Error()
	}
	if err2 := cachedFeatureRecord(runhome, feature, supportsIdMappedMounts, cacheText); err2 != nil {
		return false, errors.Wrap(err2, "recording overlay idmapped mounts support status")
	}
	return supportsIdMappedMounts, err
}
178 changes: 178 additions & 0 deletions drivers/overlay/idmapped_utils.go
@@ -0,0 +1,178 @@
//go:build linux
// +build linux

package overlay

import (
"fmt"
"os"
"io/ioutil"
"syscall"
"unsafe"

"github.com/containers/storage/pkg/idtools"
"golang.org/x/sys/unix"
"github.com/pkg/errors"
)

// Attr is the attribute block handed by pointer to the mount_setattr syscall
// (see mountSetAttr).
// NOTE(review): the field layout presumably mirrors the kernel's
// struct mount_attr — confirm against the kernel headers before changing it.
type Attr struct {
// attrSet holds the MOUNT_ATTR_* flags to set.
attrSet uint64
// attrClr is left at 0 by getIDMappedMount; presumably the MOUNT_ATTR_* flags to clear.
attrClr uint64
// propagation is left at 0 by getIDMappedMount.
propagation uint64
// userNs is a file descriptor for a user namespace; getIDMappedMount sets it
// to an open /proc/<pid>/ns/user file when MOUNT_ATTR_IDMAP is requested.
userNs uint64
}

// AT_RECURSIVE is OR-ed into the flags of the open_tree and mount_setattr
// syscalls issued in this file.
// NOTE(review): presumably mirrors the Linux AT_RECURSIVE flag (apply the
// operation to the whole mount subtree) — confirm against the kernel headers.
const AT_RECURSIVE = 0x8000 //nolint:golint

// Flag values used with the mount_setattr, open_tree and move_mount syscalls.
const (
// MOUNT_ATTR_RDONLY - Mount read-only
MOUNT_ATTR_RDONLY = 0x00000001 //nolint:golint
// MOUNT_ATTR_NOSUID - Ignore suid and sgid bits
MOUNT_ATTR_NOSUID = 0x00000002 //nolint:golint
// MOUNT_ATTR_NODEV - Disallow access to device special files
MOUNT_ATTR_NODEV = 0x00000004 //nolint:golint
// MOUNT_ATTR_NOEXEC - Disallow program execution
MOUNT_ATTR_NOEXEC = 0x00000008 //nolint:golint
// MOUNT_ATTR__ATIME - Setting on how atime should be updated
MOUNT_ATTR__ATIME = 0x00000070 //nolint:golint
// MOUNT_ATTR_RELATIME - Update atime relative to mtime/ctime
MOUNT_ATTR_RELATIME = 0x00000000 //nolint:golint
// MOUNT_ATTR_NOATIME - Do not update access times
MOUNT_ATTR_NOATIME = 0x00000010 //nolint:golint
// MOUNT_ATTR_STRICTATIME - Always perform atime updates
MOUNT_ATTR_STRICTATIME = 0x00000020 //nolint:golint
// MOUNT_ATTR_NODIRATIME - Do not update directory access times
MOUNT_ATTR_NODIRATIME = 0x00000080 //nolint:golint
// MOUNT_ATTR_IDMAP - Idmap mount to @userns_fd in struct mount_attr
MOUNT_ATTR_IDMAP = 0x00100000 //nolint:golint

// OPEN_TREE_CLONE - Clone the source path mount
OPEN_TREE_CLONE = 0x00000001 //nolint:golint

// MOVE_MOUNT_F_EMPTY_PATH - Move the path referenced by the fd
MOVE_MOUNT_F_EMPTY_PATH = 0x00000004 //nolint:golint
)

// cloneTree invokes the open_tree syscall on path with
// OPEN_TREE_CLONE|AT_RECURSIVE and returns the resulting file descriptor.
//
// On failure it returns -1 and the error reported by the syscall (or by the
// path conversion).  The caller owns the returned descriptor and must close it.
func cloneTree(path string) (fd int, err error) {
	var _p0 *byte

	if _p0, err = syscall.BytePtrFromString(path); err != nil {
		return -1, err
	}

	flags := OPEN_TREE_CLONE | AT_RECURSIVE

	// Resolve relative paths against the current working directory rather
	// than against whatever happens to be open as file descriptor 0.
	// AT_FDCWD is negative, so it has to go through a variable to be
	// converted to uintptr.
	dirFd := unix.AT_FDCWD

	r, _, e1 := syscall.Syscall6(uintptr(unix.SYS_OPEN_TREE), uintptr(dirFd), uintptr(unsafe.Pointer(_p0)),
		uintptr(flags), 0, 0, 0)
	if e1 != 0 {
		// Propagate the errno instead of reporting success with an invalid fd.
		return -1, e1
	}
	return int(r), nil
}

func mountMount(fdTree int, target string) (err error) {
var _p0, _p1 *byte

empty := ""

if _p0, err = syscall.BytePtrFromString(target); err != nil {
return err
}
if _p1, err = syscall.BytePtrFromString(empty); err != nil {
return err
}

flags := MOVE_MOUNT_F_EMPTY_PATH

_, _, e1 := syscall.Syscall6(uintptr(unix.SYS_MOVE_MOUNT),
uintptr(fdTree), uintptr(unsafe.Pointer(_p1)),
0, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0)
if e1 != 0 {
err = e1
}
return
}

// mountSetAttr issues the mount_setattr syscall for path relative to dfd,
// passing flags, a pointer to attr and size (the size of the attr structure
// as supplied by the caller).  It returns the errno reported by the syscall,
// or nil on success.
func mountSetAttr(dfd int, path string, flags uint, attr *Attr, size uint) (err error) {
	pathPtr, err := syscall.BytePtrFromString(path)
	if err != nil {
		return err
	}

	_, _, errno := syscall.Syscall6(
		uintptr(unix.SYS_MOUNT_SETATTR),
		uintptr(dfd),
		uintptr(unsafe.Pointer(pathPtr)),
		uintptr(flags),
		uintptr(unsafe.Pointer(attr)),
		uintptr(size),
		0,
	)
	if errno != 0 {
		return errno
	}
	return nil
}

// getIDMappedMount clones the mount tree at from, sets MOUNT_ATTR_IDMAP on the
// clone using the user namespace of process pid, and attaches the result at to.
// It returns the first error encountered.
func getIDMappedMount(from, to string, pid int) error {
	nsPath := fmt.Sprintf("/proc/%d/ns/user", pid)
	nsFile, err := os.Open(nsPath)
	if err != nil {
		return errors.Wrapf(err, "unable to get user ns file descriptor for %q", nsPath)
	}
	defer nsFile.Close()

	// Only the idmap attribute is requested; the remaining fields stay zero.
	attr := Attr{
		attrSet: MOUNT_ATTR_IDMAP,
		userNs:  uint64(nsFile.Fd()),
	}

	treeFd, err := cloneTree(from)
	if err != nil {
		return err
	}
	defer unix.Close(treeFd)

	// The empty path together with AT_EMPTY_PATH makes the call operate on treeFd itself.
	err = mountSetAttr(treeFd, "", unix.AT_EMPTY_PATH|AT_RECURSIVE, &attr, uint(unsafe.Sizeof(attr)))
	if err != nil {
		return err
	}

	return mountMount(treeFd, to)
}

// createUsernsProcess clones the current process into a new user namespace
// (CLONE_NEWUSER) and writes the given mappings to the child's uid_map and
// gid_map files under /proc.  The child does nothing but wait to be killed.
// It returns the pid of the child; if writing a mapping fails the child is
// killed and reaped and -1 is returned with the error.
func createUsernsProcess(uidMaps []idtools.IDMap, gidMaps []idtools.IDMap) (int, error) {
	pid, _, err := syscall.Syscall6(uintptr(unix.SYS_CLONE), unix.CLONE_NEWUSER|uintptr(unix.SIGCHLD), 0, 0, 0, 0, 0)
	if err != 0 {
		return -1, err
	}
	if pid == 0 {
		// Child: ask to be killed when the parent dies, then idle.
		_ = unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0)
		// just wait for the SIGKILL
		for {
			syscall.Syscall6(uintptr(unix.SYS_PAUSE), 0, 0, 0, 0, 0, 0)
		}
	}

	// Parent from here on.
	child := int(pid)

	// reap kills the child and collects its exit status.
	reap := func() {
		unix.Kill(child, unix.SIGKILL)
		_, _ = unix.Wait4(child, nil, 0, nil)
	}

	// render formats a mapping list as "container host size" lines.
	render := func(idmap []idtools.IDMap) []byte {
		content := ""
		for _, entry := range idmap {
			content += fmt.Sprintf("%d %d %d\n", entry.ContainerID, entry.HostID, entry.Size)
		}
		return []byte(content)
	}

	// uid_map is written first, then gid_map.
	targets := []struct {
		file string
		ids  []idtools.IDMap
	}{
		{"uid_map", uidMaps},
		{"gid_map", gidMaps},
	}
	for _, target := range targets {
		procFile := fmt.Sprintf("/proc/%d/%s", child, target.file)
		if err := ioutil.WriteFile(procFile, render(target.ids), 0600); err != nil {
			reap()
			return -1, err
		}
	}

	return child, nil
}
44 changes: 43 additions & 1 deletion drivers/overlay/overlay.go
Expand Up @@ -1485,6 +1485,40 @@ func (d *Driver) get(id string, disableShifting bool, options graphdriver.MountO
}
}

if len(options.UidMaps) > 0 && len(options.GidMaps) > 0 {
var newAbsDir []string
mappedRoot := filepath.Join(d.home, id, "mapped")
if err := os.MkdirAll(mappedRoot, 0700); err != nil {
return "", err
}

pid, err := createUsernsProcess(options.UidMaps, options.GidMaps)
if err != nil {
return "", err
}
defer func() {
unix.Kill(int(pid), unix.SIGKILL)
_, _ = unix.Wait4(int(pid), nil, 0, nil)
}()

// rewrite the lower dirs to their idmapped mount.
for c, absLower := range absLowers {
to := filepath.Join(mappedRoot, fmt.Sprintf("%d", c))
if err := os.Mkdir(to, 0700); err != nil && !os.IsExist(err) {
return "", err
}
if err := getIDMappedMount(absLower, to, int(pid)); err != nil {
return "", errors.Wrapf(err, "create mapped mount")
}
// overlay takes a reference on the mount, so it is safe to unmount
// the mapped idmounts as soon as the final overlay file system is mounted.
defer unix.Unmount(to, unix.MNT_DETACH)

newAbsDir = append(newAbsDir, to)
}
absLowers = newAbsDir
}

var opts string
if readWrite {
opts = fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", strings.Join(absLowers, ":"), diffDir, workdir)
Expand Down Expand Up @@ -1963,7 +1997,15 @@ func (d *Driver) SupportsShifting() bool {
if os.Getenv("_TEST_FORCE_SUPPORT_SHIFTING") == "yes-please" {
return true
}
return d.options.mountProgram != ""
if d.options.mountProgram != "" {
return true
}
supportsIdMappedMounts, err := checkAndRecordIdMappedSupport(d.home, d.runhome)
if err == nil {
return supportsIdMappedMounts
}
logrus.Debugf("Check for idmapped mounts support %v", err)
return false
}

// dumbJoin is more or less a dumber version of filepath.Join, but one which
Expand Down

0 comments on commit 557c051

Please sign in to comment.