Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cgroup2: devices filtering cleanup #2951

Merged
merged 6 commits into from
May 24, 2021
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 9 additions & 0 deletions libcontainer/cgroups/devices/devices_emulator.go
Original file line number Diff line number Diff line change
Expand Up @@ -371,3 +371,12 @@ func (source *Emulator) Transition(target *Emulator) ([]*devices.Rule, error) {
}
return transitionRules, nil
}

// Rules returns the minimum set of rules necessary to convert a *deny-all*
// cgroup to the emulated filter state (note that this is not the same as a
// default cgroupv1 cgroup -- which is allow-all). This is effectively just a
// wrapper around Transition() with the source emulator being an empty cgroup.
func (e *Emulator) Rules() ([]*devices.Rule, error) {
defaultCgroup := &Emulator{defaultAllow: false}
return defaultCgroup.Transition(e)
}
106 changes: 67 additions & 39 deletions libcontainer/cgroups/ebpf/devicefilter/devicefilter.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"strconv"

"github.com/cilium/ebpf/asm"
devicesemulator "github.com/opencontainers/runc/libcontainer/cgroups/devices"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: devicesEmulator

"github.com/opencontainers/runc/libcontainer/devices"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
Expand All @@ -22,11 +23,44 @@ const (
)

// DeviceFilter returns eBPF device filter program and its license string
func DeviceFilter(devices []*devices.Rule) (asm.Instructions, string, error) {
p := &program{}
func DeviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) {
// Generate the minimum ruleset for the device rules we are given. While we
// don't care about minimum transitions in cgroupv2, using the emulator
// gives us a guarantee that the behaviour of devices filtering is the same
// as cgroupv1, including security hardenings to avoid misconfiguration
// (such as punching holes in wildcard rules).
emu := new(devicesemulator.Emulator)
for _, rule := range rules {
if err := emu.Apply(*rule); err != nil {
return nil, "", err
}
}
cleanRules, err := emu.Rules()
if err != nil {
return nil, "", err
}

p := &program{
defaultAllow: emu.IsBlacklist(),
}
p.init()
for i := len(devices) - 1; i >= 0; i-- {
if err := p.appendDevice(devices[i]); err != nil {

for idx, rule := range cleanRules {
if rule.Type == devices.WildcardDevice {
// We can safely skip over wildcard entries because there should
// only be one (at most) at the very start to instruct cgroupv1 to
// go into allow-list mode. However we do double-check this here.
if idx != 0 || rule.Allow != emu.IsBlacklist() {
return nil, "", errors.Errorf("[internal error] emulated cgroupv2 devices ruleset had bad wildcard at idx %v (%s)", idx, rule.CgroupString())
}
continue
}
if rule.Allow == p.defaultAllow {
// There should be no rules which have an action equal to the
// default action, the emulator removes those.
return nil, "", errors.Errorf("[internal error] emulated cgroupv2 devices ruleset had no-op rule at idx %v (%s)", idx, rule.CgroupString())
}
if err := p.appendRule(rule); err != nil {
return nil, "", err
}
}
Expand All @@ -35,9 +69,9 @@ func DeviceFilter(devices []*devices.Rule) (asm.Instructions, string, error) {
}

type program struct {
insts asm.Instructions
hasWildCard bool
blockID int
insts asm.Instructions
defaultAllow bool
blockID int
}

func (p *program) init() {
Expand Down Expand Up @@ -67,39 +101,36 @@ func (p *program) init() {
asm.LoadMem(asm.R5, asm.R1, 8, asm.Word))
}

// appendDevice needs to be called from the last element of OCI linux.resources.devices to the head element.
func (p *program) appendDevice(dev *devices.Rule) error {
// appendRule rule converts an OCI rule to the relevant eBPF block and adds it
// to the in-progress filter program. In order to operate properly, it must be
// called with a "clean" rule list (generated by devices.Emulator.Rules() --
// with any "a" rules removed).
func (p *program) appendRule(rule *devices.Rule) error {
if p.blockID < 0 {
return errors.New("the program is finalized")
}
if p.hasWildCard {
// All entries after wildcard entry are ignored
return nil
}

bpfType := int32(-1)
hasType := true
switch dev.Type {
case 'c':
switch rule.Type {
case devices.CharDevice:
bpfType = int32(unix.BPF_DEVCG_DEV_CHAR)
case 'b':
case devices.BlockDevice:
bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK)
case 'a':
hasType = false
default:
// if not specified in OCI json, typ is set to DeviceTypeAll
return errors.Errorf("invalid Type %q", string(dev.Type))
// We do not permit 'a', nor any other types we don't know about.
return errors.Errorf("invalid Type %q", string(rule.Type))
}
if dev.Major > math.MaxUint32 {
return errors.Errorf("invalid major %d", dev.Major)
if rule.Major > math.MaxUint32 {
return errors.Errorf("invalid major %d", rule.Major)
}
if dev.Minor > math.MaxUint32 {
return errors.Errorf("invalid minor %d", dev.Major)
if rule.Minor > math.MaxUint32 {
return errors.Errorf("invalid minor %d", rule.Major)
}
hasMajor := dev.Major >= 0 // if not specified in OCI json, major is set to -1
hasMinor := dev.Minor >= 0
hasMajor := rule.Major >= 0 // if not specified in OCI json, major is set to -1
hasMinor := rule.Minor >= 0
bpfAccess := int32(0)
for _, r := range dev.Permissions {
for _, r := range rule.Permissions {
switch r {
case 'r':
bpfAccess |= unix.BPF_DEVCG_ACC_READ
Expand Down Expand Up @@ -136,42 +167,39 @@ func (p *program) appendDevice(dev *devices.Rule) error {
if hasMajor {
p.insts = append(p.insts,
// if (R4 != major) goto next
asm.JNE.Imm(asm.R4, int32(dev.Major), nextBlockSym),
asm.JNE.Imm(asm.R4, int32(rule.Major), nextBlockSym),
)
}
if hasMinor {
p.insts = append(p.insts,
// if (R5 != minor) goto next
asm.JNE.Imm(asm.R5, int32(dev.Minor), nextBlockSym),
asm.JNE.Imm(asm.R5, int32(rule.Minor), nextBlockSym),
)
}
if !hasType && !hasAccess && !hasMajor && !hasMinor {
p.hasWildCard = true
}
p.insts = append(p.insts, acceptBlock(dev.Allow)...)
p.insts = append(p.insts, acceptBlock(rule.Allow)...)
// set blockSym to the first instruction we added in this iteration
p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym)
p.blockID++
return nil
}

func (p *program) finalize() (asm.Instructions, error) {
if p.hasWildCard {
// acceptBlock with asm.Return() is already inserted
return p.insts, nil
var v int32
if p.defaultAllow {
v = 1
}
blockSym := "block-" + strconv.Itoa(p.blockID)
p.insts = append(p.insts,
// R0 <- 0
asm.Mov.Imm32(asm.R0, 0).Sym(blockSym),
// R0 <- v
asm.Mov.Imm32(asm.R0, v).Sym(blockSym),
asm.Return(),
)
p.blockID = -1
return p.insts, nil
}

func acceptBlock(accept bool) asm.Instructions {
v := int32(0)
var v int32
if accept {
v = 1
}
Expand Down
128 changes: 64 additions & 64 deletions libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,81 +55,81 @@ block-0:
func TestDeviceFilter_BuiltInAllowList(t *testing.T) {
expected := `
// load parameters into registers
0: LdXMemW dst: r2 src: r1 off: 0 imm: 0
1: And32Imm dst: r2 imm: 65535
2: LdXMemW dst: r3 src: r1 off: 0 imm: 0
3: RSh32Imm dst: r3 imm: 16
4: LdXMemW dst: r4 src: r1 off: 4 imm: 0
5: LdXMemW dst: r5 src: r1 off: 8 imm: 0
0: LdXMemW dst: r2 src: r1 off: 0 imm: 0
1: And32Imm dst: r2 imm: 65535
2: LdXMemW dst: r3 src: r1 off: 0 imm: 0
3: RSh32Imm dst: r3 imm: 16
4: LdXMemW dst: r4 src: r1 off: 4 imm: 0
5: LdXMemW dst: r5 src: r1 off: 8 imm: 0
block-0:
// tuntap (c, 10, 200, rwm, allow)
6: JNEImm dst: r2 off: -1 imm: 2 <block-1>
7: JNEImm dst: r4 off: -1 imm: 10 <block-1>
8: JNEImm dst: r5 off: -1 imm: 200 <block-1>
9: Mov32Imm dst: r0 imm: 1
10: Exit
// (b, wildcard, wildcard, m, true)
6: JNEImm dst: r2 off: -1 imm: 1 <block-1>
7: Mov32Reg dst: r1 src: r3
8: And32Imm dst: r1 imm: 1
9: JNEReg dst: r1 off: -1 src: r3 <block-1>
10: Mov32Imm dst: r0 imm: 1
11: Exit
block-1:
11: JNEImm dst: r2 off: -1 imm: 2 <block-2>
12: JNEImm dst: r4 off: -1 imm: 5 <block-2>
13: JNEImm dst: r5 off: -1 imm: 2 <block-2>
14: Mov32Imm dst: r0 imm: 1
15: Exit
// (c, wildcard, wildcard, m, true)
12: JNEImm dst: r2 off: -1 imm: 2 <block-2>
13: Mov32Reg dst: r1 src: r3
14: And32Imm dst: r1 imm: 1
15: JNEReg dst: r1 off: -1 src: r3 <block-2>
16: Mov32Imm dst: r0 imm: 1
17: Exit
block-2:
// /dev/pts (c, 136, wildcard, rwm, true)
16: JNEImm dst: r2 off: -1 imm: 2 <block-3>
17: JNEImm dst: r4 off: -1 imm: 136 <block-3>
18: Mov32Imm dst: r0 imm: 1
19: Exit
18: JNEImm dst: r2 off: -1 imm: 2 <block-3>
19: JNEImm dst: r4 off: -1 imm: 1 <block-3>
20: JNEImm dst: r5 off: -1 imm: 3 <block-3>
21: Mov32Imm dst: r0 imm: 1
22: Exit
block-3:
20: JNEImm dst: r2 off: -1 imm: 2 <block-4>
21: JNEImm dst: r4 off: -1 imm: 1 <block-4>
22: JNEImm dst: r5 off: -1 imm: 9 <block-4>
23: Mov32Imm dst: r0 imm: 1
24: Exit
23: JNEImm dst: r2 off: -1 imm: 2 <block-4>
24: JNEImm dst: r4 off: -1 imm: 1 <block-4>
25: JNEImm dst: r5 off: -1 imm: 5 <block-4>
26: Mov32Imm dst: r0 imm: 1
27: Exit
block-4:
25: JNEImm dst: r2 off: -1 imm: 2 <block-5>
26: JNEImm dst: r4 off: -1 imm: 1 <block-5>
27: JNEImm dst: r5 off: -1 imm: 5 <block-5>
28: Mov32Imm dst: r0 imm: 1
29: Exit
28: JNEImm dst: r2 off: -1 imm: 2 <block-5>
29: JNEImm dst: r4 off: -1 imm: 1 <block-5>
30: JNEImm dst: r5 off: -1 imm: 7 <block-5>
31: Mov32Imm dst: r0 imm: 1
32: Exit
block-5:
30: JNEImm dst: r2 off: -1 imm: 2 <block-6>
31: JNEImm dst: r4 off: -1 imm: 5 <block-6>
32: JNEImm dst: r5 off: -1 imm: 0 <block-6>
33: Mov32Imm dst: r0 imm: 1
34: Exit
33: JNEImm dst: r2 off: -1 imm: 2 <block-6>
34: JNEImm dst: r4 off: -1 imm: 1 <block-6>
35: JNEImm dst: r5 off: -1 imm: 8 <block-6>
36: Mov32Imm dst: r0 imm: 1
37: Exit
block-6:
35: JNEImm dst: r2 off: -1 imm: 2 <block-7>
36: JNEImm dst: r4 off: -1 imm: 1 <block-7>
37: JNEImm dst: r5 off: -1 imm: 7 <block-7>
38: Mov32Imm dst: r0 imm: 1
39: Exit
38: JNEImm dst: r2 off: -1 imm: 2 <block-7>
39: JNEImm dst: r4 off: -1 imm: 1 <block-7>
40: JNEImm dst: r5 off: -1 imm: 9 <block-7>
41: Mov32Imm dst: r0 imm: 1
42: Exit
block-7:
40: JNEImm dst: r2 off: -1 imm: 2 <block-8>
41: JNEImm dst: r4 off: -1 imm: 1 <block-8>
42: JNEImm dst: r5 off: -1 imm: 8 <block-8>
43: Mov32Imm dst: r0 imm: 1
44: Exit
43: JNEImm dst: r2 off: -1 imm: 2 <block-8>
44: JNEImm dst: r4 off: -1 imm: 5 <block-8>
45: JNEImm dst: r5 off: -1 imm: 0 <block-8>
46: Mov32Imm dst: r0 imm: 1
47: Exit
block-8:
45: JNEImm dst: r2 off: -1 imm: 2 <block-9>
46: JNEImm dst: r4 off: -1 imm: 1 <block-9>
47: JNEImm dst: r5 off: -1 imm: 3 <block-9>
48: Mov32Imm dst: r0 imm: 1
49: Exit
48: JNEImm dst: r2 off: -1 imm: 2 <block-9>
49: JNEImm dst: r4 off: -1 imm: 5 <block-9>
50: JNEImm dst: r5 off: -1 imm: 2 <block-9>
51: Mov32Imm dst: r0 imm: 1
52: Exit
block-9:
// (b, wildcard, wildcard, m, true)
50: JNEImm dst: r2 off: -1 imm: 1 <block-10>
51: Mov32Reg dst: r1 src: r3
52: And32Imm dst: r1 imm: 1
53: JNEReg dst: r1 off: -1 src: r3 <block-10>
54: Mov32Imm dst: r0 imm: 1
55: Exit
// tuntap (c, 10, 200, rwm, allow)
53: JNEImm dst: r2 off: -1 imm: 2 <block-10>
54: JNEImm dst: r4 off: -1 imm: 10 <block-10>
55: JNEImm dst: r5 off: -1 imm: 200 <block-10>
56: Mov32Imm dst: r0 imm: 1
57: Exit
block-10:
// (c, wildcard, wildcard, m, true)
56: JNEImm dst: r2 off: -1 imm: 2 <block-11>
57: Mov32Reg dst: r1 src: r3
58: And32Imm dst: r1 imm: 1
59: JNEReg dst: r1 off: -1 src: r3 <block-11>
// /dev/pts (c, 136, wildcard, rwm, true)
58: JNEImm dst: r2 off: -1 imm: 2 <block-11>
59: JNEImm dst: r4 off: -1 imm: 136 <block-11>
60: Mov32Imm dst: r0 imm: 1
61: Exit
block-11:
Expand Down
3 changes: 0 additions & 3 deletions libcontainer/cgroups/fs2/devices.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,6 @@ func setDevices(dirPath string, r *configs.Resources) error {
if r.SkipDevices {
return nil
}
// XXX: This is currently a white-list (but all callers pass a blacklist of
// devices). This is bad for a whole variety of reasons, but will need
// to be fixed with co-ordinated effort with downstreams.
insts, license, err := devicefilter.DeviceFilter(r.Devices)
if err != nil {
return err
Expand Down
3 changes: 1 addition & 2 deletions libcontainer/cgroups/systemd/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,7 @@ func generateDeviceProperties(rules []*devices.Rule) ([]systemdDbus.Property, er
// Now generate the set of rules we actually need to apply. Unlike the
// normal devices cgroup, in "strict" mode systemd defaults to a deny-all
// whitelist which is the default for devices.Emulator.
baseEmu := &cgroupdevices.Emulator{}
finalRules, err := baseEmu.Transition(configEmu)
finalRules, err := configEmu.Rules()
if err != nil {
return nil, errors.Wrap(err, "get simplified rules for systemd")
}
Expand Down
2 changes: 1 addition & 1 deletion libcontainer/configs/validate/validator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ func TestValidateSecurityWithoutNEWNS(t *testing.T) {

func TestValidateUsernamespace(t *testing.T) {
if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
t.Skip("userns is unsupported")
t.Skip("Test requires userns.")
}
config := &configs.Config{
Rootfs: "/var",
Expand Down
2 changes: 1 addition & 1 deletion libcontainer/integration/checkpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ func showFile(t *testing.T, fname string) error {

func TestUsernsCheckpoint(t *testing.T) {
if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
t.Skip("userns is unsupported")
t.Skip("Test requires userns.")
}
cmd := exec.Command("criu", "check", "--feature", "userns")
if err := cmd.Run(); err != nil {
Expand Down