Skip to content

Commit

Permalink
Only configure swap if available on node
Browse files Browse the repository at this point in the history
This change bypasses all logic to set swap in the linux container
resources if a swap controller is not available on node. Failing
to do so may cause errors in runc when starting a container with
a swap configuration -- even if this is set to 0.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
  • Loading branch information
elezar committed Sep 26, 2023
1 parent d3d1827 commit 394bcaf
Show file tree
Hide file tree
Showing 2 changed files with 226 additions and 26 deletions.
43 changes: 30 additions & 13 deletions pkg/kubelet/kuberuntime/kuberuntime_container_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,18 +107,8 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,

lcr.HugepageLimits = GetHugepageLimitsFromResources(container.Resources)

if swapConfigurationHelper := newSwapConfigurationHelper(*m.machineInfo); utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) {
// NOTE(ehashman): Behaviour is defined in the opencontainers runtime spec:
// https://github.com/opencontainers/runtime-spec/blob/1c3f411f041711bbeecf35ff7e93461ea6789220/config-linux.md#memory
switch m.memorySwapBehavior {
case kubelettypes.LimitedSwap:
swapConfigurationHelper.ConfigureLimitedSwap(lcr, pod, container)
default:
swapConfigurationHelper.ConfigureUnlimitedSwap(lcr)
}
} else {
swapConfigurationHelper.ConfigureNoSwap(lcr)
}
// Configure swap for the container
m.configureContainerSwapResources(lcr, pod, container)

// Set memory.min and memory.high to enforce MemoryQoS
if enforceMemoryQoS {
Expand Down Expand Up @@ -170,6 +160,30 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
return lcr
}

// configureContainerSwapResources configures the swap resources for a specified (linux) container.
// Swap is only configured if a swap cgroup controller is available and the NodeSwap feature gate is enabled.
func (m *kubeGenericRuntimeManager) configureContainerSwapResources(lcr *runtimeapi.LinuxContainerResources, pod *v1.Pod, container *v1.Container) {
if !swapControllerAvailable() {
klog.InfoS("No swap cgroup controller present", "swapBehavior", m.memorySwapBehavior, "pod", klog.KObj(pod), "containerName", container.Name)
return
}
swapConfigurationHelper := newSwapConfigurationHelper(*m.machineInfo)

if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) {
swapConfigurationHelper.ConfigureNoSwap(lcr)
return
}

// NOTE(ehashman): Behavior is defined in the opencontainers runtime spec:
// https://github.com/opencontainers/runtime-spec/blob/1c3f411f041711bbeecf35ff7e93461ea6789220/config-linux.md#memory
switch m.memorySwapBehavior {
case kubelettypes.LimitedSwap:
swapConfigurationHelper.ConfigureLimitedSwap(lcr, pod, container)
default:
swapConfigurationHelper.ConfigureUnlimitedSwap(lcr)
}
}

// generateContainerResources generates platform specific (linux) container resources config for runtime
func (m *kubeGenericRuntimeManager) generateContainerResources(pod *v1.Pod, container *v1.Container) *runtimeapi.ContainerResources {
enforceMemoryQoS := false
Expand Down Expand Up @@ -315,7 +329,10 @@ var (
swapControllerAvailabilityOnce sync.Once
)

func swapControllerAvailable() bool {
// Note: this function variable is being added here so it would be possible to mock
// the swap controller availability for unit tests by assigning a new function to it. Without it,
// the swap controller availability would solely depend on the environment running the test.
var swapControllerAvailable = func() bool {
// See https://github.com/containerd/containerd/pull/7838/
swapControllerAvailabilityOnce.Do(func() {
const warn = "Failed to detect the availability of the swap controller, assuming not available"
Expand Down
209 changes: 196 additions & 13 deletions pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,15 @@ package kuberuntime
import (
"context"
"fmt"
"k8s.io/kubernetes/pkg/kubelet/cm"
"k8s.io/kubernetes/pkg/kubelet/types"
"math"
"os"
"reflect"
"strconv"
"testing"

"k8s.io/kubernetes/pkg/kubelet/cm"
"k8s.io/kubernetes/pkg/kubelet/types"

"github.com/google/go-cmp/cmp"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/stretchr/testify/assert"
Expand Down Expand Up @@ -845,12 +846,12 @@ func TestGenerateLinuxContainerResources(t *testing.T) {
},
} {
t.Run(tc.name, func(t *testing.T) {
defer setSwapControllerAvailableDuringTest(false)()
if tc.scalingFg {
defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.InPlacePodVerticalScaling, true)()
}

setCgroupVersionDuringTest(cgroupV1)
tc.expected.MemorySwapLimitInBytes = tc.expected.MemoryLimitInBytes

pod.Spec.Containers[0].Resources = v1.ResourceRequirements{Limits: tc.limits, Requests: tc.requests}
if len(tc.cStatus) > 0 {
Expand Down Expand Up @@ -891,6 +892,19 @@ func TestGenerateLinuxContainerResourcesWithSwap(t *testing.T) {
Status: v1.PodStatus{},
}

expectSwapDisabled := func(cgroupVersion CgroupVersion, resources ...*runtimeapi.LinuxContainerResources) {
const msg = "container is expected to not have swap configured"

for _, r := range resources {
switch cgroupVersion {
case cgroupV1:
assert.Equal(t, int64(0), r.MemorySwapLimitInBytes, msg)
case cgroupV2:
assert.NotContains(t, r.Unified, cm.Cgroup2MaxSwapFilename, msg)
}
}
}

expectNoSwap := func(cgroupVersion CgroupVersion, resources ...*runtimeapi.LinuxContainerResources) {
const msg = "container is expected to not have swap access"

Expand Down Expand Up @@ -939,6 +953,7 @@ func TestGenerateLinuxContainerResourcesWithSwap(t *testing.T) {
name string
cgroupVersion CgroupVersion
qosClass v1.PodQOSClass
swapDisabledOnNode bool
nodeSwapFeatureGateEnabled bool
swapBehavior string
addContainerWithoutRequests bool
Expand Down Expand Up @@ -994,28 +1009,173 @@ func TestGenerateLinuxContainerResourcesWithSwap(t *testing.T) {

// With Guaranteed and Best-effort QoS
{
name: "Best-effort Qos, cgroups v2, LimitedSwap",
name: "Best-effort QoS, cgroups v2, LimitedSwap",
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.LimitedSwap,
},
{
name: "Best-effort QoS, cgroups v2, UnlimitedSwap",
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.UnlimitedSwap,
},
{
name: "Guaranteed QoS, cgroups v2, LimitedSwap",
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSGuaranteed,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.LimitedSwap,
},
{
name: "Guaranteed QoS, cgroups v2, UnlimitedSwap",
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSGuaranteed,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.UnlimitedSwap,
},

// With a "guaranteed" container (when memory requests equal to limits)
{
name: "Burstable QoS, cgroups v2, LimitedSwap, with a guaranteed container",
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.LimitedSwap,
addContainerWithoutRequests: false,
addGuaranteedContainer: true,
},
{
name: "Burstable QoS, cgroups v2, UnlimitedSwap, with a guaranteed container",
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.UnlimitedSwap,
addContainerWithoutRequests: false,
addGuaranteedContainer: true,
},

// Swap is expected to be allocated
{
name: "Burstable QoS, cgroups v2, LimitedSwap",
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.LimitedSwap,
addContainerWithoutRequests: false,
addGuaranteedContainer: false,
},
{
name: "Burstable QoS, cgroups v2, UnlimitedSwap",
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.UnlimitedSwap,
addContainerWithoutRequests: false,
addGuaranteedContainer: false,
},
{
name: "Burstable QoS, cgroups v2, LimitedSwap, with a container with no requests",
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.LimitedSwap,
addContainerWithoutRequests: true,
addGuaranteedContainer: false,
},
{
name: "Burstable QoS, cgroups v2, UnlimitedSwap, with a container with no requests",
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.UnlimitedSwap,
addContainerWithoutRequests: true,
addGuaranteedContainer: false,
},
// All the above examples with Swap disabled on node
{
name: "Swap disabled on node, cgroups v1, LimitedSwap, Burstable QoS",
swapDisabledOnNode: true,
cgroupVersion: cgroupV1,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.LimitedSwap,
},
{
name: "Swap disabled on node, cgroups v1, UnlimitedSwap, Burstable QoS",
swapDisabledOnNode: true,
cgroupVersion: cgroupV1,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.UnlimitedSwap,
},
{
name: "Swap disabled on node, cgroups v1, LimitedSwap, Best-effort QoS",
swapDisabledOnNode: true,
cgroupVersion: cgroupV1,
qosClass: v1.PodQOSBestEffort,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.LimitedSwap,
},

// With feature gate turned off
{
name: "Swap disabled on node, NodeSwap feature gate turned off, cgroups v2, LimitedSwap",
swapDisabledOnNode: true,
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: false,
swapBehavior: types.LimitedSwap,
},
{
name: "Swap disabled on node, NodeSwap feature gate turned off, cgroups v2, UnlimitedSwap",
swapDisabledOnNode: true,
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: false,
swapBehavior: types.UnlimitedSwap,
},

// With no swapBehavior, UnlimitedSwap should be the default
{
name: "Swap disabled on node, With no swapBehavior - UnlimitedSwap should be the default",
swapDisabledOnNode: true,
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBestEffort,
nodeSwapFeatureGateEnabled: true,
swapBehavior: "",
},

// With Guaranteed and Best-effort QoS
{
name: "Swap disabled on node, Best-effort QoS, cgroups v2, LimitedSwap",
swapDisabledOnNode: true,
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.LimitedSwap,
},
{
name: "Best-effort Qos, cgroups v2, UnlimitedSwap",
name: "Swap disabled on node, Best-effort QoS, cgroups v2, UnlimitedSwap",
swapDisabledOnNode: true,
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.UnlimitedSwap,
},
{
name: "Guaranteed Qos, cgroups v2, LimitedSwap",
name: "Swap disabled on node, Guaranteed QoS, cgroups v2, LimitedSwap",
swapDisabledOnNode: true,
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSGuaranteed,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.LimitedSwap,
},
{
name: "Guaranteed Qos, cgroups v2, UnlimitedSwap",
name: "Swap disabled on node, Guaranteed QoS, cgroups v2, UnlimitedSwap",
swapDisabledOnNode: true,
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSGuaranteed,
nodeSwapFeatureGateEnabled: true,
Expand All @@ -1024,7 +1184,8 @@ func TestGenerateLinuxContainerResourcesWithSwap(t *testing.T) {

// With a "guaranteed" container (when memory requests equal to limits)
{
name: "Burstable Qos, cgroups v2, LimitedSwap, with a guaranteed container",
name: "Swap disabled on node, Burstable QoS, cgroups v2, LimitedSwap, with a guaranteed container",
swapDisabledOnNode: true,
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
Expand All @@ -1033,7 +1194,8 @@ func TestGenerateLinuxContainerResourcesWithSwap(t *testing.T) {
addGuaranteedContainer: true,
},
{
name: "Burstable Qos, cgroups v2, UnlimitedSwap, with a guaranteed container",
name: "Swap disabled on node, Burstable QoS, cgroups v2, UnlimitedSwap, with a guaranteed container",
swapDisabledOnNode: true,
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
Expand All @@ -1044,7 +1206,8 @@ func TestGenerateLinuxContainerResourcesWithSwap(t *testing.T) {

// Swap is expected to be allocated
{
name: "Burstable Qos, cgroups v2, LimitedSwap",
name: "Swap disabled on node, Burstable QoS, cgroups v2, LimitedSwap",
swapDisabledOnNode: true,
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
Expand All @@ -1053,7 +1216,8 @@ func TestGenerateLinuxContainerResourcesWithSwap(t *testing.T) {
addGuaranteedContainer: false,
},
{
name: "Burstable Qos, cgroups v2, UnlimitedSwap",
name: "Swap disabled on node, Burstable QoS, cgroups v2, UnlimitedSwap",
swapDisabledOnNode: true,
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
Expand All @@ -1062,7 +1226,8 @@ func TestGenerateLinuxContainerResourcesWithSwap(t *testing.T) {
addGuaranteedContainer: false,
},
{
name: "Burstable Qos, cgroups v2, LimitedSwap, with a container with no requests",
name: "Swap disabled on node, Burstable QoS, cgroups v2, LimitedSwap, with a container with no requests",
swapDisabledOnNode: true,
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
Expand All @@ -1071,7 +1236,8 @@ func TestGenerateLinuxContainerResourcesWithSwap(t *testing.T) {
addGuaranteedContainer: false,
},
{
name: "Burstable Qos, cgroups v2, UnlimitedSwap, with a container with no requests",
name: "Swap disabled on node, Burstable QoS, cgroups v2, UnlimitedSwap, with a container with no requests",
swapDisabledOnNode: true,
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
Expand All @@ -1082,6 +1248,7 @@ func TestGenerateLinuxContainerResourcesWithSwap(t *testing.T) {
} {
t.Run(tc.name, func(t *testing.T) {
setCgroupVersionDuringTest(tc.cgroupVersion)
defer setSwapControllerAvailableDuringTest(!tc.swapDisabledOnNode)()
defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.NodeSwap, tc.nodeSwapFeatureGateEnabled)()
m.memorySwapBehavior = tc.swapBehavior

Expand Down Expand Up @@ -1117,6 +1284,11 @@ func TestGenerateLinuxContainerResourcesWithSwap(t *testing.T) {
resourcesC1 := m.generateLinuxContainerResources(pod, &pod.Spec.Containers[0], false)
resourcesC2 := m.generateLinuxContainerResources(pod, &pod.Spec.Containers[1], false)

if tc.swapDisabledOnNode {
expectSwapDisabled(tc.cgroupVersion, resourcesC1, resourcesC2)
return
}

if !tc.nodeSwapFeatureGateEnabled || tc.cgroupVersion == cgroupV1 || (tc.swapBehavior == types.LimitedSwap && tc.qosClass != v1.PodQOSBurstable) {
expectNoSwap(tc.cgroupVersion, resourcesC1, resourcesC2)
return
Expand Down Expand Up @@ -1151,3 +1323,14 @@ func setCgroupVersionDuringTest(version CgroupVersion) {
return version == cgroupV2
}
}

func setSwapControllerAvailableDuringTest(available bool) func() {
original := swapControllerAvailable
swapControllerAvailable = func() bool {
return available
}

return func() {
swapControllerAvailable = original
}
}

0 comments on commit 394bcaf

Please sign in to comment.