Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alpha node swap support #102823

Merged
merged 10 commits into from Jul 7, 2021
2 changes: 1 addition & 1 deletion hack/local-up-cluster.sh
Expand Up @@ -786,7 +786,7 @@ function start_kubelet {

# warn if users are running with swap allowed
if [ "${FAIL_SWAP_ON}" == "false" ]; then
echo "WARNING : The kubelet is configured to not fail even if swap is enabled; production deployments should disable swap."
echo "WARNING : The kubelet is configured to not fail even if swap is enabled; production deployments should disable swap unless testing NodeSwapEnabled feature."
fi

if [[ "${REUSE_CERTS}" != true ]]; then
Expand Down
7 changes: 7 additions & 0 deletions pkg/features/kube_features.go
Expand Up @@ -635,6 +635,12 @@ const (
// Allows user to override pod-level terminationGracePeriod for probes
ProbeTerminationGracePeriod featuregate.Feature = "ProbeTerminationGracePeriod"

// owner: @ehashman
// alpha: v1.22
//
// Permits kubelet to run with swap enabled
NodeSwapEnabled featuregate.Feature = "NodeSwapEnabled"

// owner: @ahg-g
// alpha: v1.21
// beta: v1.22
Expand Down Expand Up @@ -825,6 +831,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS
VolumeCapacityPriority: {Default: false, PreRelease: featuregate.Alpha},
PreferNominatedNode: {Default: true, PreRelease: featuregate.Beta},
ProbeTerminationGracePeriod: {Default: false, PreRelease: featuregate.Alpha},
NodeSwapEnabled: {Default: false, PreRelease: featuregate.Alpha},
PodDeletionCost: {Default: true, PreRelease: featuregate.Beta},
TopologyAwareHints: {Default: false, PreRelease: featuregate.Alpha},
PodAffinityNamespaceSelector: {Default: true, PreRelease: featuregate.Beta},
Expand Down
1 change: 1 addition & 0 deletions pkg/kubelet/apis/config/helpers_test.go
Expand Up @@ -209,6 +209,7 @@ var (
"MaxOpenFiles",
"MaxPods",
"MemoryManagerPolicy",
"MemorySwap.SwapBehavior",
"NodeLeaseDurationSeconds",
"NodeStatusMaxImages",
"NodeStatusUpdateFrequency.Duration",
Expand Down
Expand Up @@ -58,6 +58,7 @@ makeIPTablesUtilChains: true
maxOpenFiles: 1000000
maxPods: 110
memoryManagerPolicy: None
memorySwap: {}
nodeLeaseDurationSeconds: 40
nodeStatusMaxImages: 50
nodeStatusReportFrequency: 5m0s
Expand Down
Expand Up @@ -58,6 +58,7 @@ makeIPTablesUtilChains: true
maxOpenFiles: 1000000
maxPods: 110
memoryManagerPolicy: None
memorySwap: {}
nodeLeaseDurationSeconds: 40
nodeStatusMaxImages: 50
nodeStatusReportFrequency: 5m0s
Expand Down
13 changes: 13 additions & 0 deletions pkg/kubelet/apis/config/types.go
Expand Up @@ -326,6 +326,10 @@ type KubeletConfiguration struct {
FeatureGates map[string]bool
// Tells the Kubelet to fail to start if swap is enabled on the node.
FailSwapOn bool
// memorySwap configures swap memory available to container workloads.
// +featureGate=NodeSwapEnabled
// +optional
MemorySwap MemorySwapConfiguration
// A quantity defines the maximum size of the container log file before it is rotated. For example: "5Mi" or "256Ki".
ContainerLogMaxSize string
// Maximum number of container log files that can be present for a container.
Expand Down Expand Up @@ -568,3 +572,12 @@ type MemoryReservation struct {
NumaNode int32
Limits v1.ResourceList
}

// MemorySwapConfiguration groups the swap-related kubelet settings; it is the type of the
// KubeletConfiguration.MemorySwap field.
type MemorySwapConfiguration struct {
liggitt marked this conversation as resolved.
Show resolved Hide resolved
// swapBehavior configures swap memory available to container workloads. May be one of
// "", "LimitedSwap": workload combined memory and swap usage cannot exceed pod memory limit
// "UnlimitedSwap": workloads can use unlimited swap, up to the allocatable limit.
// +featureGate=NodeSwapEnabled
// +optional
SwapBehavior string
}
36 changes: 36 additions & 0 deletions pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions pkg/kubelet/apis/config/validation/validation.go
Expand Up @@ -155,6 +155,14 @@ func ValidateKubeletConfiguration(kc *kubeletconfig.KubeletConfiguration) error
if (kc.ShutdownGracePeriod.Duration > 0 || kc.ShutdownGracePeriodCriticalPods.Duration > 0) && !localFeatureGate.Enabled(features.GracefulNodeShutdown) {
allErrors = append(allErrors, fmt.Errorf("invalid configuration: Specifying ShutdownGracePeriod or ShutdownGracePeriodCriticalPods requires feature gate GracefulNodeShutdown"))
}
if localFeatureGate.Enabled(features.NodeSwapEnabled) {
if kc.MemorySwap.SwapBehavior != "" && kc.MemorySwap.SwapBehavior != kubetypes.LimitedSwap && kc.MemorySwap.SwapBehavior != kubetypes.UnlimitedSwap {
allErrors = append(allErrors, fmt.Errorf("invalid configuration: MemorySwap.SwapBehavior %v must be one of: LimitedSwap, UnlimitedSwap", kc.MemorySwap.SwapBehavior))
}
}
ehashman marked this conversation as resolved.
Show resolved Hide resolved
if !localFeatureGate.Enabled(features.NodeSwapEnabled) && kc.MemorySwap != (kubeletconfig.MemorySwapConfiguration{}) {
allErrors = append(allErrors, fmt.Errorf("invalid configuration: MemorySwap.SwapBehavior cannot be set when NodeSwapEnabled feature flag is disabled"))
}

for _, val := range kc.EnforceNodeAllocatable {
switch val {
Expand Down
10 changes: 8 additions & 2 deletions pkg/kubelet/apis/config/validation/validation_test.go
Expand Up @@ -24,6 +24,7 @@ import (
utilerrors "k8s.io/apimachinery/pkg/util/errors"
componentbaseconfig "k8s.io/component-base/config"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
)

func TestValidateKubeletConfiguration(t *testing.T) {
Expand Down Expand Up @@ -145,9 +146,11 @@ func TestValidateKubeletConfiguration(t *testing.T) {
TopologyManagerPolicy: kubeletconfig.NoneTopologyManagerPolicy,
ShutdownGracePeriod: metav1.Duration{Duration: 10 * time.Minute},
ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 0},
MemorySwap: kubeletconfig.MemorySwapConfiguration{SwapBehavior: kubetypes.UnlimitedSwap},
FeatureGates: map[string]bool{
"CustomCPUCFSQuotaPeriod": true,
"GracefulNodeShutdown": true,
"NodeSwapEnabled": true,
},
Logging: componentbaseconfig.LoggingConfiguration{
Format: "text",
Expand Down Expand Up @@ -187,8 +190,9 @@ func TestValidateKubeletConfiguration(t *testing.T) {
Logging: componentbaseconfig.LoggingConfiguration{
Format: "",
},
MemorySwap: kubeletconfig.MemorySwapConfiguration{SwapBehavior: kubetypes.UnlimitedSwap},
}
const numErrsErrorCase1 = 29
const numErrsErrorCase1 = 30
if allErrors := ValidateKubeletConfiguration(errorCase1); len(allErrors.(utilerrors.Aggregate).Errors()) != numErrsErrorCase1 {
t.Errorf("expect %d errors, got %v", numErrsErrorCase1, len(allErrors.(utilerrors.Aggregate).Errors()))
}
Expand Down Expand Up @@ -225,15 +229,17 @@ func TestValidateKubeletConfiguration(t *testing.T) {
TopologyManagerPolicy: "invalid",
ShutdownGracePeriod: metav1.Duration{Duration: 40 * time.Second},
ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 10 * time.Second},
MemorySwap: kubeletconfig.MemorySwapConfiguration{SwapBehavior: "invalid"},
FeatureGates: map[string]bool{
"CustomCPUCFSQuotaPeriod": true,
"GracefulNodeShutdown": true,
"NodeSwapEnabled": true,
},
Logging: componentbaseconfig.LoggingConfiguration{
Format: "text",
},
}
const numErrsErrorCase2 = 3
const numErrsErrorCase2 = 4
if allErrors := ValidateKubeletConfiguration(errorCase2); len(allErrors.(utilerrors.Aggregate).Errors()) != numErrsErrorCase2 {
t.Errorf("expect %d errors, got %v", numErrsErrorCase2, len(allErrors.(utilerrors.Aggregate).Errors()))
}
Expand Down
17 changes: 17 additions & 0 deletions pkg/kubelet/apis/config/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pkg/kubelet/kubelet.go
Expand Up @@ -652,6 +652,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
klet.containerLogManager,
klet.runtimeClassManager,
seccompDefault,
kubeCfg.MemorySwap.SwapBehavior,
)
if err != nil {
return nil, err
Expand Down
18 changes: 18 additions & 0 deletions pkg/kubelet/kuberuntime/kuberuntime_container_linux.go
Expand Up @@ -30,6 +30,7 @@ import (
kubefeatures "k8s.io/kubernetes/pkg/features"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/qos"
kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
)

// applyPlatformSpecificContainerConfig applies platform specific configurations to runtimeapi.ContainerConfig.
Expand Down Expand Up @@ -89,6 +90,23 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.C

lc.Resources.HugepageLimits = GetHugepageLimitsFromResources(container.Resources)

if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwapEnabled) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we need a similar change in ResourceConfigForPod for the pod-level cgroup settings created by the pod cgroup manager. I would expect them to match the container settings. I think memory-backed volumes could ultimately use swap, but would like @sjenning to confirm. Either way, the cgroup settings for memory should match at the pod and container scopes.

// NOTE(ehashman): Behaviour is defined in the opencontainers runtime spec:
// https://github.com/opencontainers/runtime-spec/blob/1c3f411f041711bbeecf35ff7e93461ea6789220/config-linux.md#memory
switch m.memorySwapBehavior {
case kubelettypes.UnlimitedSwap:
// -1 = unlimited swap
lc.Resources.MemorySwapLimitInBytes = -1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just recording this, as it swaps out of my own mental cache sometimes.

If a container has a defined memory limit X, it will still have MemoryLimitInBytes=X, but when UnlimitedSwap is configured it may now use unbounded additional swap, because MemorySwapLimitInBytes is set to -1. This is consistent with the existing behavior when --fail-swap-on was false, because no kubelet-enforced limit was written.

case kubelettypes.LimitedSwap:
fallthrough
default:
ehashman marked this conversation as resolved.
Show resolved Hide resolved
// memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit
// Some swapping is still possible.
// Note that if memory limit is 0, memory swap limit is ignored.
lc.Resources.MemorySwapLimitInBytes = lc.Resources.MemoryLimitInBytes
ehashman marked this conversation as resolved.
Show resolved Hide resolved
}
}

return lc
}

Expand Down
90 changes: 90 additions & 0 deletions pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go
Expand Up @@ -33,6 +33,7 @@ import (
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2"
"k8s.io/kubernetes/pkg/features"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
)

func makeExpectedConfig(m *kubeGenericRuntimeManager, pod *v1.Pod, containerIndex int) *runtimeapi.ContainerConfig {
Expand Down Expand Up @@ -367,3 +368,92 @@ func TestGenerateLinuxContainerConfigNamespaces(t *testing.T) {
})
}
}

// TestGenerateLinuxContainerConfigSwap checks the MemorySwapLimitInBytes value that
// generateLinuxContainerConfig produces for each memorySwapBehavior setting while the
// NodeSwapEnabled feature gate is turned on for the duration of the test.
func TestGenerateLinuxContainerConfigSwap(t *testing.T) {
	defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.NodeSwapEnabled, true)()

	_, _, m, err := createTestRuntimeManager()
	if err != nil {
		t.Fatalf("error creating test RuntimeManager: %v", err)
	}
	m.machineInfo.MemoryCapacity = 1000000

	const containerName = "test"

	// newPod returns a fresh single-container pod. A non-empty quantity is applied as
	// both the memory request and the memory limit; an empty one leaves resources unset.
	newPod := func(memory string) *v1.Pod {
		ctr := v1.Container{Name: containerName}
		if memory != "" {
			ctr.Resources = v1.ResourceRequirements{
				Limits: v1.ResourceList{
					"memory": resource.MustParse(memory),
				},
				Requests: v1.ResourceList{
					"memory": resource.MustParse(memory),
				},
			}
		}
		return &v1.Pod{
			Spec: v1.PodSpec{
				Containers: []v1.Container{ctr},
			},
		}
	}

	type swapCase struct {
		name        string
		swapSetting string
		pod         *v1.Pod
		expected    int64
	}

	cases := []swapCase{
		{
			// swapSetting deliberately left empty: no swap behavior configured.
			name:     "config unset, memory limit set",
			pod:      newPod("1000"),
			expected: 1000,
		},
		{
			// swapSetting deliberately left empty: no swap behavior configured.
			name:     "config unset, no memory limit",
			pod:      newPod(""),
			expected: 0,
		},
		{
			// Expected to behave the same as the unset cases above.
			name:        "config set to LimitedSwap, memory limit set",
			swapSetting: kubelettypes.LimitedSwap,
			pod:         newPod("1000"),
			expected:    1000,
		},
		{
			name:        "UnlimitedSwap enabled",
			swapSetting: kubelettypes.UnlimitedSwap,
			pod:         newPod(""),
			expected:    -1,
		},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			// The runtime manager is shared across subtests; set the behavior under test first.
			m.memorySwapBehavior = tc.swapSetting
			firstContainer := &tc.pod.Spec.Containers[0]
			lc := m.generateLinuxContainerConfig(firstContainer, tc.pod, nil, "", nil)
			assert.Equal(t, tc.expected, lc.Resources.MemorySwapLimitInBytes, "memory swap config for %s", tc.name)
		})
	}
}