Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
overlay: use idmapped lower layers where supported
Use idmapped mounts for the overlay lower layers when the kernel supports them. For each lower directory with ID=0...N-1, it creates an idmapped mount at $GRAPHROOT/overlay/$LAYER/mapped/$ID. The final overlay mount will use these idmapped mounts instead of the original source directories. The upperdir is not idmapped, so files are created with the same IDs used by the user namespace. Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
- Loading branch information
Showing
3 changed files
with
302 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
//go:build linux | ||
// +build linux | ||
|
||
package overlay | ||
|
||
import ( | ||
"fmt" | ||
"io/ioutil" | ||
"os" | ||
"syscall" | ||
"unsafe" | ||
|
||
"github.com/containers/storage/pkg/idtools" | ||
"github.com/pkg/errors" | ||
"golang.org/x/sys/unix" | ||
) | ||
|
||
// attr is the argument block handed to the mount_setattr syscall wrapper in
// this file. A pointer to it, together with its size, is passed straight to
// the kernel, so the field order and widths must not change.
type attr struct {
	// attrSet holds the MOUNT_ATTR_* bits to set on the mount.
	attrSet uint64
	// attrClr holds the MOUNT_ATTR_* bits to clear on the mount.
	attrClr uint64
	// propagation is the mount propagation setting; 0 leaves it untouched
	// in the only use visible in this file.
	propagation uint64
	// userNs is the file descriptor of the user namespace used when
	// MOUNT_ATTR_IDMAP is set in attrSet.
	userNs uint64
}
|
||
// Flag values used by the open_tree, move_mount and mount_setattr wrappers in
// this file. The names deliberately keep their upper-case form, hence the
// golint suppressions.
const (
	// Mount attribute bits (used in attr.attrSet / attr.attrClr).

	// MOUNT_ATTR_RDONLY makes the mount read-only.
	MOUNT_ATTR_RDONLY = 0x00000001 //nolint:golint
	// MOUNT_ATTR_NOSUID makes the mount ignore suid and sgid bits.
	MOUNT_ATTR_NOSUID = 0x00000002 //nolint:golint
	// MOUNT_ATTR_NODEV disallows access to device special files.
	MOUNT_ATTR_NODEV = 0x00000004 //nolint:golint
	// MOUNT_ATTR_NOEXEC disallows program execution.
	MOUNT_ATTR_NOEXEC = 0x00000008 //nolint:golint
	// MOUNT_ATTR__ATIME is the setting on how atime should be updated; its
	// value covers the three atime-related bits below.
	MOUNT_ATTR__ATIME = 0x00000070 //nolint:golint
	// MOUNT_ATTR_RELATIME updates atime relative to mtime/ctime.
	MOUNT_ATTR_RELATIME = 0x00000000 //nolint:golint
	// MOUNT_ATTR_NOATIME disables access time updates.
	MOUNT_ATTR_NOATIME = 0x00000010 //nolint:golint
	// MOUNT_ATTR_STRICTATIME always performs atime updates.
	MOUNT_ATTR_STRICTATIME = 0x00000020 //nolint:golint
	// MOUNT_ATTR_NODIRATIME disables directory access time updates.
	MOUNT_ATTR_NODIRATIME = 0x00000080 //nolint:golint
	// MOUNT_ATTR_IDMAP idmaps the mount to the user namespace whose file
	// descriptor is stored in the attr structure.
	MOUNT_ATTR_IDMAP = 0x00100000 //nolint:golint

	// open_tree flags.

	// OPEN_TREE_CLONE clones the source path mount.
	OPEN_TREE_CLONE = 0x00000001 //nolint:golint

	// move_mount flags.

	// MOVE_MOUNT_F_EMPTY_PATH moves the path referenced by the fd.
	MOVE_MOUNT_F_EMPTY_PATH = 0x00000004 //nolint:golint

	// Flags shared between the calls above.

	// AT_RECURSIVE applies the operation to the entire subtree.
	AT_RECURSIVE = 0x8000 //nolint:golint
)
|
||
// openTree is a wrapper for the open_tree syscall | ||
func openTree(path string, flags int) (fd int, err error) { | ||
var _p0 *byte | ||
|
||
if _p0, err = syscall.BytePtrFromString(path); err != nil { | ||
return 0, err | ||
} | ||
|
||
r, _, e1 := syscall.Syscall6(uintptr(unix.SYS_OPEN_TREE), uintptr(0), uintptr(unsafe.Pointer(_p0)), | ||
uintptr(flags), 0, 0, 0) | ||
if e1 != 0 { | ||
err = e1 | ||
} | ||
return int(r), nil | ||
} | ||
|
||
// moveMount is a wrapper for the the move_mount syscall. | ||
func moveMount(fdTree int, target string) (err error) { | ||
var _p0, _p1 *byte | ||
|
||
empty := "" | ||
|
||
if _p0, err = syscall.BytePtrFromString(target); err != nil { | ||
return err | ||
} | ||
if _p1, err = syscall.BytePtrFromString(empty); err != nil { | ||
return err | ||
} | ||
|
||
flags := MOVE_MOUNT_F_EMPTY_PATH | ||
|
||
_, _, e1 := syscall.Syscall6(uintptr(unix.SYS_MOVE_MOUNT), | ||
uintptr(fdTree), uintptr(unsafe.Pointer(_p1)), | ||
0, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0) | ||
if e1 != 0 { | ||
err = e1 | ||
} | ||
return | ||
} | ||
|
||
// mountSetAttr is a wrapper for the mount_setattr syscall | ||
func mountSetAttr(dfd int, path string, flags uint, attr *attr, size uint) (err error) { | ||
var _p0 *byte | ||
|
||
if _p0, err = syscall.BytePtrFromString(path); err != nil { | ||
return err | ||
} | ||
|
||
_, _, e1 := syscall.Syscall6(uintptr(unix.SYS_MOUNT_SETATTR), uintptr(dfd), uintptr(unsafe.Pointer(_p0)), | ||
uintptr(flags), uintptr(unsafe.Pointer(attr)), uintptr(size), 0) | ||
if e1 != 0 { | ||
err = e1 | ||
} | ||
return | ||
} | ||
|
||
// getIDMappedMount creates a IDMapped bind mount from SOURCE to TARGET using the user namespace | ||
// for the PID process. | ||
func getIDMappedMount(source, target string, pid int) error { | ||
path := fmt.Sprintf("/proc/%d/ns/user", pid) | ||
userNsFile, err := os.Open(path) | ||
if err != nil { | ||
return errors.Wrapf(err, "unable to get user ns file descriptor for %q", path) | ||
} | ||
|
||
var attr attr | ||
attr.attrSet = MOUNT_ATTR_IDMAP | ||
attr.attrClr = 0 | ||
attr.propagation = 0 | ||
attr.userNs = uint64(userNsFile.Fd()) | ||
|
||
defer userNsFile.Close() | ||
|
||
targetDirFd, err := openTree(source, OPEN_TREE_CLONE|AT_RECURSIVE) | ||
if err != nil { | ||
return err | ||
} | ||
defer unix.Close(targetDirFd) | ||
|
||
if err := mountSetAttr(targetDirFd, "", unix.AT_EMPTY_PATH|AT_RECURSIVE, | ||
&attr, uint(unsafe.Sizeof(attr))); err != nil { | ||
return err | ||
} | ||
|
||
return moveMount(targetDirFd, target) | ||
} | ||
|
||
// createUsernsProcess forks the current process and creates a user namespace using the specified | ||
// mappings. It returns the pid of the new process. | ||
func createUsernsProcess(uidMaps []idtools.IDMap, gidMaps []idtools.IDMap) (int, func(), error) { | ||
pid, _, err := syscall.Syscall6(uintptr(unix.SYS_CLONE), unix.CLONE_NEWUSER|uintptr(unix.SIGCHLD), 0, 0, 0, 0, 0) | ||
if err != 0 { | ||
return -1, nil, err | ||
} | ||
if pid == 0 { | ||
_ = unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0) | ||
// just wait for the SIGKILL | ||
for { | ||
syscall.Syscall6(uintptr(unix.SYS_PAUSE), 0, 0, 0, 0, 0, 0) | ||
} | ||
} | ||
cleanupFunc := func() { | ||
unix.Kill(int(pid), unix.SIGKILL) | ||
_, _ = unix.Wait4(int(pid), nil, 0, nil) | ||
} | ||
writeMappings := func(fname string, idmap []idtools.IDMap) error { | ||
mappings := "" | ||
for _, m := range idmap { | ||
mappings = mappings + fmt.Sprintf("%d %d %d\n", m.ContainerID, m.HostID, m.Size) | ||
} | ||
if err := ioutil.WriteFile(fmt.Sprintf("/proc/%d/%s", pid, fname), []byte(mappings), 0600); err != nil { | ||
return err | ||
} | ||
return nil | ||
} | ||
if err := writeMappings("uid_map", uidMaps); err != nil { | ||
cleanupFunc() | ||
return -1, nil, err | ||
} | ||
if err := writeMappings("gid_map", gidMaps); err != nil { | ||
cleanupFunc() | ||
return -1, nil, err | ||
} | ||
|
||
return int(pid), cleanupFunc, nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters