[MCC][MCD]: Introduce in progress taint
All upgrade-candidate nodes in a MachineConfigPool (MCP) get the
`UpdateInProgress: PreferNoSchedule` taint applied.
The taint is removed by the MCD once the node's upgrade is complete.
Since kubernetes/kubernetes#104251 landed, nodes that do not carry
a PreferNoSchedule taint receive a higher scheduling score.
Before the upgrade starts, the MCC taints every node in the
cluster that is due to be upgraded. Once the upgrade is complete,
the MCD removes the taint, so no node carries the
`UpdateInProgress: PreferNoSchedule` taint and the scores of all
nodes are equal again.

Why is this needed?
It reduces pod churn while a cluster upgrade is in progress.
While the not-yet-upgraded nodes carry the `UpdateInProgress:
PreferNoSchedule` taint, they score lower and pods prefer to land
on untainted (already upgraded) nodes, reducing the chance of
landing on a node that still has to be upgraded, which would cause
one more reschedule.
ravisantoshgudimetla committed Dec 10, 2021
1 parent 182cb2e commit 2976dc6
Showing 4 changed files with 172 additions and 19 deletions.
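
For orientation before the per-file diffs, here is a minimal, self-contained sketch (not part of the commit; the helper names addTaint and removeTaint are illustrative only) of the taint shape and the add/remove idempotency that the MCC and MCD changes below implement:

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

// updateInProgressTaint mirrors the NodeUpdateInProgressTaint constant
// introduced below: a soft (PreferNoSchedule) taint keyed "UpdateInProgress".
var updateInProgressTaint = corev1.Taint{
	Key:    "UpdateInProgress",
	Effect: corev1.TaintEffectPreferNoSchedule,
}

// addTaint returns the taint list with the in-progress taint added, leaving
// the list untouched if it is already present (the same idempotency check
// the controller performs before patching the node).
func addTaint(taints []corev1.Taint) []corev1.Taint {
	for _, t := range taints {
		if t.MatchTaint(&updateInProgressTaint) {
			return taints
		}
	}
	return append(taints, updateInProgressTaint)
}

// removeTaint drops the in-progress taint, mirroring what the MCD does once
// the node has finished updating.
func removeTaint(taints []corev1.Taint) []corev1.Taint {
	var kept []corev1.Taint
	for _, t := range taints {
		if !t.MatchTaint(&updateInProgressTaint) {
			kept = append(kept, t)
		}
	}
	return kept
}

func main() {
	taints := addTaint(nil)
	fmt.Println(len(taints))              // 1: node is being updated
	fmt.Println(len(removeTaint(taints))) // 0: update complete, score parity restored
}
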
29 changes: 25 additions & 4 deletions pkg/constants/constants.go
@@ -1,5 +1,12 @@
package constants

import (
"time"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/wait"
)

// constants defines some file paths that are shared outside of the
// MCO package and are thus consumed by other users.

@@ -9,7 +16,21 @@ const (
APIServerURLFile = "/etc/kubernetes/apiserver-url.env"
)

// ConstantsByName is a map of constants for ease of templating
var ConstantsByName = map[string]string{
"APIServerURLFile": APIServerURLFile,
}
var (
// NodeUpdateBackoff is the backoff time before asking APIServer to update node
// object again.
NodeUpdateBackoff = wait.Backoff{
Steps: 5,
Duration: 100 * time.Millisecond,
Jitter: 1.0,
}
// NodeUpdateInProgressTaint is the taint applied by the MCC when the update of a node starts.
NodeUpdateInProgressTaint = &corev1.Taint{
Key: "UpdateInProgress",
Effect: corev1.TaintEffectPreferNoSchedule,
}
// ConstantsByName is a map of constants for ease of templating
ConstantsByName = map[string]string{
"APIServerURLFile": APIServerURLFile,
}
)
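
Because the effect is PreferNoSchedule, the taint is only a soft repulsion: pods can still land on a tainted node when nothing better is available, and a workload that wants the scheduler to ignore the hint entirely can tolerate it. A hypothetical illustration (not part of this change) of such a toleration:

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

func main() {
	// A pod that tolerates the soft UpdateInProgress taint is not penalized
	// by the scheduler's taint/toleration scoring, so nodes being updated
	// score the same as untainted ones for this pod.
	pod := corev1.Pod{
		Spec: corev1.PodSpec{
			Tolerations: []corev1.Toleration{{
				Key:      "UpdateInProgress",
				Operator: corev1.TolerationOpExists,
				Effect:   corev1.TaintEffectPreferNoSchedule,
			}},
		},
	}
	fmt.Printf("%+v\n", pod.Spec.Tolerations[0])
}
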
71 changes: 59 additions & 12 deletions pkg/controller/node/node_controller.go
@@ -14,6 +14,7 @@ import (
"github.com/openshift/library-go/pkg/operator/v1helpers"
"github.com/openshift/machine-config-operator/internal"
mcfgv1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1"
"github.com/openshift/machine-config-operator/pkg/constants"
ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common"
daemonconsts "github.com/openshift/machine-config-operator/pkg/daemon/constants"
mcfgclientset "github.com/openshift/machine-config-operator/pkg/generated/clientset/versioned"
@@ -65,12 +66,6 @@ const (
masterPoolName = "master"
)

var nodeUpdateBackoff = wait.Backoff{
Steps: 5,
Duration: 100 * time.Millisecond,
Jitter: 1.0,
}

// Controller defines the node controller.
type Controller struct {
client mcfgclientset.Interface
@@ -488,6 +483,10 @@ func (ctrl *Controller) updateNode(old, cur interface{}) {
ctrl.logPoolNode(pool, curNode, "changed labels")
changed = true
}
if !reflect.DeepEqual(oldNode.Spec.Taints, curNode.Spec.Taints) {
ctrl.logPoolNode(pool, curNode, "changed taints")
changed = true
}
}

if !changed {
@@ -766,7 +765,18 @@ func (ctrl *Controller) syncMachineConfigPool(key string) error {
if err := ctrl.setClusterConfigAnnotation(nodes); err != nil {
return goerrs.Wrapf(err, "error setting clusterConfig Annotation for node in pool %q, error: %v", pool.Name, err)
}

// Taint every node in the pool that is not yet at the target config, irrespective of where it is in the update process.
ctx := context.TODO()
for _, node := range nodes {
// All the nodes that need to be upgraded should have `NodeUpdateInProgressTaint` so that they're less likely
// to be chosen during the scheduling cycle.
targetConfig := pool.Spec.Configuration.Name
if node.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey] != targetConfig {
if err := ctrl.setUpdateInProgressTaint(ctx, node.Name); err != nil {
return goerrs.Wrapf(err, "failed applying %s taint for node %s", constants.NodeUpdateInProgressTaint.Key, node.Name)
}
}
}
candidates, capacity := getAllCandidateMachines(pool, nodes, maxunavail)
if len(candidates) > 0 {
ctrl.logPool(pool, "%d candidate nodes for update, capacity: %d", len(candidates), capacity)
@@ -834,7 +844,7 @@ func (ctrl *Controller) setClusterConfigAnnotation(nodes []*corev1.Node) error {
}

func (ctrl *Controller) setDesiredMachineConfigAnnotation(nodeName, currentConfig string) error {
return clientretry.RetryOnConflict(nodeUpdateBackoff, func() error {
return clientretry.RetryOnConflict(constants.NodeUpdateBackoff, func() error {
oldNode, err := ctrl.kubeClient.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
if err != nil {
return err
@@ -871,7 +881,6 @@ func (ctrl *Controller) setDesiredMachineConfigAnnotation(nodeName, currentConfi
// capacity. It is the responsibility of the caller to choose a subset of the nodes given the capacity.
func getAllCandidateMachines(pool *mcfgv1.MachineConfigPool, nodesInPool []*corev1.Node, maxUnavailable int) ([]*corev1.Node, uint) {
targetConfig := pool.Spec.Configuration.Name

unavail := getUnavailableMachines(nodesInPool)
// If we're at capacity, there's nothing to do.
if len(unavail) >= maxUnavailable {
@@ -888,18 +897,15 @@ func getAllCandidateMachines(pool *mcfgv1.MachineConfigPool, nodesInPool []*core
}
continue
}

nodes = append(nodes, node)
}

// Nodes which are failing to target this config also count against
// availability - it might be a transient issue, and if the issue
// clears we don't want multiple to update at once.
if failingThisConfig >= capacity {
return nil, 0
}
capacity -= failingThisConfig

return nodes, uint(capacity)
}

@@ -973,6 +979,47 @@ func (ctrl *Controller) updateCandidateMachines(pool *mcfgv1.MachineConfigPool,
return nil
}

// setUpdateInProgressTaint applies the in-progress taint to the given node.
// The taint on the individual node is removed by the MCD once the update of the node is complete.
// This ensures that updated nodes are preferred over not-yet-updated nodes, thereby
// reducing the number of reschedules.
func (ctrl *Controller) setUpdateInProgressTaint(ctx context.Context, nodeName string) error {
return clientretry.RetryOnConflict(constants.NodeUpdateBackoff, func() error {
oldNode, err := ctrl.kubeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
if err != nil {
return err
}
oldData, err := json.Marshal(oldNode)
if err != nil {
return err
}

newNode := oldNode.DeepCopy()
if newNode.Spec.Taints == nil {
newNode.Spec.Taints = []corev1.Taint{}
}

for _, taint := range newNode.Spec.Taints {
if taint.MatchTaint(constants.NodeUpdateInProgressTaint) {
return nil
}
}

newNode.Spec.Taints = append(newNode.Spec.Taints, *constants.NodeUpdateInProgressTaint)
newData, err := json.Marshal(newNode)
if err != nil {
return err
}

patchBytes, err := strategicpatch.CreateTwoWayMergePatch(oldData, newData, corev1.Node{})
if err != nil {
return fmt.Errorf("failed to create patch for node %q: %v", nodeName, err)
}
_, err = ctrl.kubeClient.CoreV1().Nodes().Patch(ctx, nodeName, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{})
return err
})
}

func maxUnavailable(pool *mcfgv1.MachineConfigPool, nodes []*corev1.Node) (int, error) {
intOrPercent := intstrutil.FromInt(1)
if pool.Spec.MaxUnavailable != nil {
39 changes: 36 additions & 3 deletions pkg/controller/node/node_controller_test.go
@@ -3,6 +3,7 @@ package node
import (
"encoding/json"
"fmt"
"github.com/openshift/machine-config-operator/pkg/constants"
"reflect"
"testing"
"time"
@@ -789,9 +790,22 @@ func TestShouldMakeProgress(t *testing.T) {
mcpWorker := helpers.NewMachineConfigPool("worker", nil, helpers.WorkerSelector, "v1")
mcp.Spec.MaxUnavailable = intStrPtr(intstr.FromInt(1))

// node-2 is already at the desired config. Give it the in-progress taint up front;
// this should have no effect on the expected actions.
node2 := newNodeWithLabel("node-2", "v1", "v1", map[string]string{"node-role/worker": "", "node-role/infra": ""})
node2.Spec.Taints = []corev1.Taint{*constants.NodeUpdateInProgressTaint}
node3 := newNodeWithLabel("node-3", "v0", "v0", map[string]string{"node-role/worker": "", "node-role/infra": ""})
// Give node-3 the in-progress taint up front as well; the controller will check it but not re-apply it,
// and we won't see a patch to the annotations either because maxUnavailable is set to 1.
node3.Spec.Taints = []corev1.Taint{*constants.NodeUpdateInProgressTaint}
nodes := []*corev1.Node{
// Since node-0 is at desired config, it shouldn't be tainted.
newNodeWithLabel("node-0", "v1", "v1", map[string]string{"node-role/worker": "", "node-role/infra": ""}),
newNodeWithLabel("node-1", "v0", "v0", map[string]string{"node-role/worker": "", "node-role/infra": ""}),
// This node is at the desired config and already has the taint: no patch calls expected.
node2,
// This node is not at the desired config and already has the taint: only a Get call expected.
node3,
}

f.ccLister = append(f.ccLister, cc)
@@ -801,10 +815,10 @@
for idx := range nodes {
f.kubeobjects = append(f.kubeobjects, nodes[idx])
}

// First, expect a Get on node-1 and a patch that applies the taint.
f.expectGetNodeAction(nodes[1])
expNode := nodes[1].DeepCopy()
expNode.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey] = "v1"
expNode.Spec.Taints = append(expNode.Spec.Taints, *constants.NodeUpdateInProgressTaint)
oldData, err := json.Marshal(nodes[1])
if err != nil {
t.Fatal(err)
@@ -818,11 +832,30 @@
t.Fatal(err)
}
f.expectPatchNodeAction(expNode, exppatch)

// Expect a Get on node-3 but no patch, as the taint is already present.
f.expectGetNodeAction(node3)

// Now patch the annotations on the node object. Only node-1 is updated because maxUnavailable is set to 1.
f.expectGetNodeAction(nodes[1])
oldData, err = json.Marshal(expNode)
if err != nil {
t.Fatal(err)
}
expNode.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey] = "v1"
newData, err = json.Marshal(expNode)
if err != nil {
t.Fatal(err)
}
exppatch, err = strategicpatch.CreateTwoWayMergePatch(oldData, newData, corev1.Node{})
if err != nil {
t.Fatal(err)
}
f.expectPatchNodeAction(expNode, exppatch)
expStatus := calculateStatus(mcp, nodes)
expMcp := mcp.DeepCopy()
expMcp.Status = expStatus
f.expectUpdateMachineConfigPoolStatus(expMcp)

f.run(getKey(mcp, t))
}

52 changes: 52 additions & 0 deletions pkg/daemon/daemon.go
@@ -27,7 +27,10 @@ import (
"golang.org/x/time/rate"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/strategicpatch"
"k8s.io/apimachinery/pkg/util/wait"
coreinformersv1 "k8s.io/client-go/informers/core/v1"
"k8s.io/client-go/kubernetes"
@@ -36,12 +39,14 @@
corev1lister "k8s.io/client-go/listers/core/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/record"
clientretry "k8s.io/client-go/util/retry"
"k8s.io/client-go/util/workqueue"
"k8s.io/kubectl/pkg/drain"

configv1 "github.com/openshift/api/config/v1"
"github.com/openshift/machine-config-operator/lib/resourceread"
mcfgv1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1"
commonconstants "github.com/openshift/machine-config-operator/pkg/constants"
ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common"
"github.com/openshift/machine-config-operator/pkg/daemon/constants"
mcfginformersv1 "github.com/openshift/machine-config-operator/pkg/generated/informers/externalversions/machineconfiguration.openshift.io/v1"
@@ -1290,12 +1295,59 @@ func (dn *Daemon) completeUpdate(desiredConfigName string) error {
return err
}

ctx := context.TODO()
if err := dn.removeUpdateInProgressTaint(ctx); err != nil {
return err
}

dn.logSystem("Update completed for config %s and node has been successfully uncordoned", desiredConfigName)
dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "Uncordon", fmt.Sprintf("Update completed for config %s and node has been uncordoned", desiredConfigName))

return nil
}

func (dn *Daemon) removeUpdateInProgressTaint(ctx context.Context) error {
return clientretry.RetryOnConflict(commonconstants.NodeUpdateBackoff, func() error {
oldNode, err := dn.kubeClient.CoreV1().Nodes().Get(ctx, dn.name, metav1.GetOptions{})
if err != nil {
return err
}
oldData, err := json.Marshal(oldNode)
if err != nil {
return err
}

newNode := oldNode.DeepCopy()

// Copy every existing taint except the in-progress taint.
var taintsAfterUpgrade []corev1.Taint
for _, taint := range newNode.Spec.Taints {
if taint.MatchTaint(commonconstants.NodeUpdateInProgressTaint) {
continue
}
taintsAfterUpgrade = append(taintsAfterUpgrade, taint)
}
// The UpdateInProgress taint is not present, so there is no need to patch the node object; return immediately.
if len(taintsAfterUpgrade) == len(newNode.Spec.Taints) {
return nil
}
// Remove the NodeUpdateInProgressTaint.
newNode.Spec.Taints = taintsAfterUpgrade
newData, err := json.Marshal(newNode)
if err != nil {
return err
}

patchBytes, err := strategicpatch.CreateTwoWayMergePatch(oldData, newData, corev1.Node{})
if err != nil {
return fmt.Errorf("failed to create patch for node %q: %v", dn.name, err)
}
_, err = dn.kubeClient.CoreV1().Nodes().Patch(ctx, dn.name, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{})
return err
})
}

// triggerUpdateWithMachineConfig starts the update. It queries the cluster for
// the current and desired config if they weren't passed.
func (dn *Daemon) triggerUpdateWithMachineConfig(currentConfig, desiredConfig *mcfgv1.MachineConfig) error {
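
After an upgrade completes, no node should retain the UpdateInProgress taint. A verification sketch using client-go (the kubeconfig path and the check itself are assumptions, not part of this commit):

package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Build a client from the default kubeconfig location (assumption).
	cfg, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(cfg)

	// List all nodes and report any that still carry the in-progress taint.
	nodes, err := client.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
	if err != nil {
		panic(err)
	}
	for _, node := range nodes.Items {
		for _, taint := range node.Spec.Taints {
			if taint.Key == "UpdateInProgress" {
				fmt.Printf("%s still carries %s:%s\n", node.Name, taint.Key, taint.Effect)
			}
		}
	}
}
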
