[gardenlet] Switch Shoot care controller to controller-runtime (#7206)

* Add integration test

* Drop legacy custom listers

Now that we use a cached controller-runtime client (and have for a long while, actually), we don't need these custom cached listers anymore.
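
For context, here is a minimal sketch of the pattern this refers to (the package, helper name, and namespace are made up for illustration; the snippet is not part of this commit):

```go
package example

import (
	"context"

	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/manager"

	gardencorev1beta1 "github.com/gardener/gardener/pkg/apis/core/v1beta1"
)

// listShootsFromCache illustrates the idea: the manager's default client serves
// Get/List calls from its shared informer cache, so a separate custom lister is
// no longer required.
func listShootsFromCache(ctx context.Context, mgr manager.Manager) (*gardencorev1beta1.ShootList, error) {
	shootList := &gardencorev1beta1.ShootList{}
	if err := mgr.GetClient().List(ctx, shootList, client.InNamespace("garden-local")); err != nil {
		return nil, err
	}
	return shootList, nil
}
```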

* Add documentation

* Switch care reconciler to native controller-runtime controller

* Cleanup no longer needed code

* Move shoot status updater to bootstrapping phase

* Address PR review feedback

* Address PR review feedback

* Final cleanups now that last controller was refactored
rfranzke committed Dec 18, 2022
1 parent 7305923 commit 1d9b534
Showing 30 changed files with 1,369 additions and 1,718 deletions.
116 changes: 49 additions & 67 deletions cmd/gardenlet/app/app.go
@@ -67,6 +67,7 @@ import (
gardenerhealthz "github.com/gardener/gardener/pkg/healthz"
"github.com/gardener/gardener/pkg/logger"
"github.com/gardener/gardener/pkg/utils"
"github.com/gardener/gardener/pkg/utils/flow"
gutil "github.com/gardener/gardener/pkg/utils/gardener"
kutil "github.com/gardener/gardener/pkg/utils/kubernetes"
)
@@ -156,15 +157,14 @@ func run(ctx context.Context, cancel context.CancelFunc, log logr.Logger, cfg *c
HealthProbeBindAddress: fmt.Sprintf("%s:%d", cfg.Server.HealthProbes.BindAddress, cfg.Server.HealthProbes.Port),
MetricsBindAddress: fmt.Sprintf("%s:%d", cfg.Server.Metrics.BindAddress, cfg.Server.Metrics.Port),

LeaderElection: cfg.LeaderElection.LeaderElect,
LeaderElectionResourceLock: cfg.LeaderElection.ResourceLock,
LeaderElectionID: cfg.LeaderElection.ResourceName,
LeaderElectionNamespace: cfg.LeaderElection.ResourceNamespace,
LeaseDuration: &cfg.LeaderElection.LeaseDuration.Duration,
RenewDeadline: &cfg.LeaderElection.RenewDeadline.Duration,
RetryPeriod: &cfg.LeaderElection.RetryPeriod.Duration,
// TODO: enable this once we have refactored all controllers and added them to this manager
// LeaderElectionReleaseOnCancel: true,
LeaderElection: cfg.LeaderElection.LeaderElect,
LeaderElectionResourceLock: cfg.LeaderElection.ResourceLock,
LeaderElectionID: cfg.LeaderElection.ResourceName,
LeaderElectionNamespace: cfg.LeaderElection.ResourceNamespace,
LeaderElectionReleaseOnCancel: true,
LeaseDuration: &cfg.LeaderElection.LeaseDuration.Duration,
RenewDeadline: &cfg.LeaderElection.RenewDeadline.Duration,
RetryPeriod: &cfg.LeaderElection.RetryPeriod.Duration,

ClientDisableCacheFor: []client.Object{
&corev1.Event{},
@@ -353,6 +353,11 @@ func (g *garden) Start(ctx context.Context) error {
return err
}

log.Info("Updating last operation status of processing Shoots to 'Aborted'")
if err := g.updateProcessingShootStatusToAborted(ctx, gardenCluster.GetClient()); err != nil {
return err
}

log.Info("Setting up shoot client map")
shootClientMap, err := clientmapbuilder.
NewShootClientMapBuilder().
@@ -364,65 +369,18 @@ func (g *garden) Start(ctx context.Context) error {
return fmt.Errorf("failed to build shoot ClientMap: %w", err)
}

log.Info("Fetching cluster identity and garden namespace from garden cluster")
configMap := &corev1.ConfigMap{}
if err := gardenCluster.GetClient().Get(ctx, kutil.Key(metav1.NamespaceSystem, v1beta1constants.ClusterIdentity), configMap); err != nil {
return fmt.Errorf("failed getting cluster-identity ConfigMap in garden cluster: %w", err)
}

gardenClusterIdentity, ok := configMap.Data[v1beta1constants.ClusterIdentity]
if !ok {
return fmt.Errorf("cluster-identity ConfigMap data does not have %q key", v1beta1constants.ClusterIdentity)
}

// TODO(rfranzke): Move this to the controller.AddToManager function once legacy controllers relying on
// it have been refactored.
seedClientSet, err := kubernetes.NewWithConfig(
kubernetes.WithRESTConfig(g.mgr.GetConfig()),
kubernetes.WithRuntimeAPIReader(g.mgr.GetAPIReader()),
kubernetes.WithRuntimeClient(g.mgr.GetClient()),
kubernetes.WithRuntimeCache(g.mgr.GetCache()),
)
if err != nil {
return fmt.Errorf("failed creating seed clientset: %w", err)
}

// TODO(rfranzke): Move this to the controller.AddControllersToManager function once the shoot legacy controller has
// been refactored.
identity, err := gutil.DetermineIdentity()
if err != nil {
return err
}

log.Info("Adding runnables now that bootstrapping is finished")
runnables := []manager.Runnable{
g.healthManager,
shootClientMap,
&controller.LegacyControllerFactory{
Log: log,
Config: g.config,
GardenCluster: gardenCluster,
SeedCluster: g.mgr,
SeedClientSet: seedClientSet,
ShootClientMap: shootClientMap,
GardenClusterIdentity: gardenClusterIdentity,
Identity: identity,
},
}

if g.config.GardenClientConnection.KubeconfigSecret != nil {
gardenClientSet, err := kubernetes.NewWithConfig(
kubernetes.WithRESTConfig(gardenCluster.GetConfig()),
kubernetes.WithRuntimeAPIReader(gardenCluster.GetAPIReader()),
kubernetes.WithRuntimeClient(gardenCluster.GetClient()),
kubernetes.WithRuntimeCache(gardenCluster.GetCache()),
)
certificateManager, err := certificate.NewCertificateManager(log, gardenCluster, g.mgr.GetClient(), g.config)
if err != nil {
return fmt.Errorf("failed creating garden clientset: %w", err)
return fmt.Errorf("failed to create a new certificate manager: %w", err)
}

certificateManager := certificate.NewCertificateManager(log, gardenClientSet, g.mgr.GetClient(), g.config)

runnables = append(runnables, manager.RunnableFunc(func(ctx context.Context) error {
return certificateManager.ScheduleCertificateRotation(ctx, g.cancel, g.mgr.GetEventRecorderFor("certificate-manager"))
}))
Expand All @@ -433,21 +391,13 @@ func (g *garden) Start(ctx context.Context) error {
}

log.Info("Adding controllers to manager")
gardenNamespace := &corev1.Namespace{}
if err := gardenCluster.GetClient().Get(ctx, kutil.Key(v1beta1constants.GardenNamespace), gardenNamespace); err != nil {
return fmt.Errorf("failed getting garden namespace in garden cluster: %w", err)
}

if err := controller.AddToManager(
ctx,
g.mgr,
gardenCluster,
g.mgr,
seedClientSet,
shootClientMap,
g.config,
gardenNamespace,
gardenClusterIdentity,
identity,
g.healthManager,
); err != nil {
return fmt.Errorf("failed adding controllers to manager: %w", err)
@@ -497,6 +447,38 @@ func (g *garden) registerSeed(ctx context.Context, gardenClient client.Client) e
})
}

func (g *garden) updateProcessingShootStatusToAborted(ctx context.Context, gardenClient client.Client) error {
shoots := &gardencorev1beta1.ShootList{}
if err := gardenClient.List(ctx, shoots); err != nil {
return err
}

var taskFns []flow.TaskFn

for _, shoot := range shoots.Items {
shoot := shoot // capture the range variable so each task function below patches its own shoot
if specSeedName, statusSeedName := gutil.GetShootSeedNames(&shoot); gutil.GetResponsibleSeedName(specSeedName, statusSeedName) != g.config.SeedConfig.Name {
continue
}

// Check if the status indicates that an operation is processing and mark it as "aborted".
if shoot.Status.LastOperation == nil || shoot.Status.LastOperation.State != gardencorev1beta1.LastOperationStateProcessing {
continue
}

taskFns = append(taskFns, func(ctx context.Context) error {
patch := client.MergeFrom(shoot.DeepCopy())
shoot.Status.LastOperation.State = gardencorev1beta1.LastOperationStateAborted
if err := gardenClient.Status().Patch(ctx, &shoot, patch); err != nil {
return fmt.Errorf("failed to set status to 'Aborted' for shoot %q: %w", client.ObjectKeyFromObject(&shoot), err)
}

return nil
})
}

return flow.Parallel(taskFns...)(ctx)
}

func addAllFieldIndexes(ctx context.Context, i client.FieldIndexer) error {
for _, fn := range []func(context.Context, client.FieldIndexer) error{
// core API group
44 changes: 43 additions & 1 deletion docs/concepts/gardenlet.md
@@ -342,7 +342,7 @@ If at least one `ManagedResource` is unhealthy and there is threshold configurat

- to `Progressing` if it was `True` before.
- to `Progressing` if it was `Progressing` before and the `lastUpdateTime` of the condition does not exceed the configured threshold duration yet.
- to `False` if it was `Progressing` before and the `lastUpdateTime` of the condition does exceed the configured threshold duration.
- to `False` if it was `Progressing` before and the `lastUpdateTime` of the condition exceeds the configured threshold duration.

The condition thresholds can be used to prevent reporting issues too early just because there is a rollout or a short disruption.
Only if the unhealthiness persists for at least the configured threshold duration will the issues be reported (by setting the status to `False`).
@@ -391,6 +391,48 @@ There are a few special cases that overwrite or confine how often and under whic
- In case `GardenletConfiguration.controllers.shoot.reconcileInMaintenanceOnly` is enabled (disabled by default), gardenlet performs regular shoot reconciliations only once in the respective maintenance time window (`GardenletConfiguration.controllers.shoot.syncPeriod` is ignored). Gardenlet randomly distributes shoot reconciliations over the maintenance time window to avoid high bursts of reconciliations (see [this doc](../usage/shoot_maintenance.md#cluster-reconciliation)).
- In case `Shoot.spec.maintenance.confineSpecUpdateRollout` is enabled (disabled by default), changes to the shoot specification are not rolled out immediately but only during the respective maintenance time window (see [this doc](../usage/shoot_maintenance.md)).

#### "Care" Reconciler

This reconciler performs three "care" actions related to `Shoot`s.

##### Conditions

It maintains four conditions and performs the following checks:

- `APIServerAvailable`: The `/healthz` endpoint of the shoot's `kube-apiserver` is called and considered healthy when it responds with `200 OK` (see the sketch after this list).
- `ControlPlaneHealthy`: The control plane is considered healthy when the respective `Deployment`s (for example `kube-apiserver`), `StatefulSet`s (for example `prometheus`), and `Etcd`s (for example `etcd-main`) exist and are healthy.
- `EveryNodeReady`: The conditions of the worker nodes are checked (e.g., `Ready`, `MemoryPressure`). Also, it's checked whether the Kubernetes version of the installed `kubelet` matches the desired version specified in the `Shoot` resource.
- `SystemComponentsHealthy`: The conditions of the `ManagedResource`s are checked (e.g. `ResourcesApplied`, etc.). Also, it is verified whether the VPN tunnel connection is established (which is required for `kube-apiserver` to communicate with the worker nodes).
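
To illustrate the first check, here is a minimal, hypothetical sketch of the `APIServerAvailable` probe (the function name, parameters, and plain `http.Client` are assumptions made for illustration; the actual check runs through the shoot's authenticated client set):

```go
package example

import (
	"context"
	"fmt"
	"net/http"
	"time"
)

// checkAPIServerAvailable is a simplified stand-in for the APIServerAvailable
// check: it calls the kube-apiserver's /healthz endpoint and treats HTTP 200
// as healthy.
func checkAPIServerAvailable(ctx context.Context, apiServerURL string, httpClient *http.Client) error {
	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, apiServerURL+"/healthz", nil)
	if err != nil {
		return err
	}

	resp, err := httpClient.Do(req)
	if err != nil {
		return fmt.Errorf("kube-apiserver is unreachable: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("kube-apiserver /healthz returned %d, expected 200", resp.StatusCode)
	}
	return nil
}
```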

Each condition can optionally also have error `codes` in order to indicate which type of issue was detected (see [this document](../usage/shoot_status.md) for more details).

Apart from the above, extension controllers can also contribute to the `status` or error `codes` of these conditions (see [this document](../extensions/shoot-health-status-conditions.md) for more details).

If all checks for a certain condition succeed, then its `status` will be set to `True`.
Otherwise, it will be set to `False`.

If at least one check fails and there is a threshold configuration for the conditions (in `.controllers.shootCare.conditionThresholds`), then the status will be set

- to `Progressing` if it was `True` before.
- to `Progressing` if it was `Progressing` before and the `lastUpdateTime` of the condition does not exceed the configured threshold duration yet.
- to `False` if it was `Progressing` before and the `lastUpdateTime` of the condition exceeds the configured threshold duration.

The condition thresholds can be used to prevent reporting issues too early just because there is a rollout or a short disruption.
Only if the unhealthiness persists for at least the configured threshold duration will the issues be reported (by setting the status to `False`).
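
The transition rules above can be summarized in a small decision function (a simplified sketch under assumed names; this is not the reconciler's actual code):

```go
package example

import (
	"time"

	gardencorev1beta1 "github.com/gardener/gardener/pkg/apis/core/v1beta1"
)

// nextConditionStatus sketches the threshold behaviour for a failing check:
// a previously healthy condition first degrades to "Progressing" and only
// flips to "False" once it has been progressing for longer than the threshold.
func nextConditionStatus(current gardencorev1beta1.Condition, threshold time.Duration, now time.Time) gardencorev1beta1.ConditionStatus {
	switch current.Status {
	case gardencorev1beta1.ConditionTrue:
		// Was healthy before: start the grace period.
		return gardencorev1beta1.ConditionProgressing
	case gardencorev1beta1.ConditionProgressing:
		if now.Sub(current.LastUpdateTime.Time) <= threshold {
			// Still within the grace period.
			return gardencorev1beta1.ConditionProgressing
		}
		// Grace period exceeded: report the issue.
		return gardencorev1beta1.ConditionFalse
	default:
		return gardencorev1beta1.ConditionFalse
	}
}
```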

##### Constraints And Automatic Webhook Remediation

Please see [this document](../usage/shoot_status.md#constraints) for more details.

##### Garbage Collection

Stale pods in the shoot namespace in the seed cluster and in the `kube-system` namespace in the shoot cluster are deleted.
A pod is considered stale when

- it was terminated with reason `Evicted`.
- it was terminated with reason starting with `OutOf` (e.g., `OutOfCpu`).
- it is stuck in termination (i.e., if its `deletionTimestamp` is more than `5m` ago).
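
These criteria roughly translate into the following predicate (a sketch; the helper name is made up, and the actual reconciler also issues the deletions):

```go
package example

import (
	"strings"
	"time"

	corev1 "k8s.io/api/core/v1"
)

// isStalePod mirrors the staleness criteria listed above.
func isStalePod(pod *corev1.Pod, now time.Time) bool {
	// Terminated because it was evicted or because the node ran out of a resource.
	if pod.Status.Reason == "Evicted" || strings.HasPrefix(pod.Status.Reason, "OutOf") {
		return true
	}
	// Stuck in termination for more than five minutes.
	if pod.DeletionTimestamp != nil && now.Sub(pod.DeletionTimestamp.Time) > 5*time.Minute {
		return true
	}
	return false
}
```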

#### "Migration" Reconciler

This reconciler is only active if the [`ForceRestore`](../deployment/feature_gates.md#list-of-feature-gates) feature gate is enabled in the `gardenlet` and if the `Seed` has owner checks enabled (i.e., `spec.settings.ownerChecks.enabled=true`).
14 changes: 2 additions & 12 deletions docs/usage/shoot_status.md
@@ -19,22 +19,12 @@ The Shoot status consists of a set of conditions. A [Condition](../api-reference
Currently the available Shoot condition types are:

- `APIServerAvailable`

This condition type indicates whether the Shoot's kube-apiserver is available or not. In particular, the `/healthz` endpoint of the kube-apiserver is called, and the expected response code is `HTTP 200`.

- `ControlPlaneHealthy`

This condition type indicates whether all the control plane components deployed to the Shoot's namespace in the Seed do exist and are running fine.

- `EveryNodeReady`

This condition type indicates whether at least the requested minimum number of Nodes is present per each worker pool and whether all Nodes are healthy.

- `SystemComponentsHealthy`

This condition type indicates whether all system components deployed to the `kube-system` namespace in the shoot do exist and are running fine. It also reflects whether the tunnel connection between the control plane and the Shoot networks can be established.

The Shoot conditions are maintained by the [shoot care control](https://github.com/gardener/gardener/blob/master/pkg/gardenlet/controller/shoot/shoot_care_control.go) of gardenlet.
The Shoot conditions are maintained by the [shoot care reconciler](../../pkg/gardenlet/controller/shoot/care) of gardenlet.
Find more information in [this document](../concepts/gardenlet.md#shoot-controller).

### Sync Period

15 changes: 13 additions & 2 deletions pkg/gardenlet/bootstrap/certificate/certificate_rotation.go
@@ -27,6 +27,7 @@ import (
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/tools/record"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/cluster"

gardencorev1beta1 "github.com/gardener/gardener/pkg/apis/core/v1beta1"
v1beta1constants "github.com/gardener/gardener/pkg/apis/core/v1beta1/constants"
@@ -56,7 +57,17 @@ type Manager struct {
}

// NewCertificateManager creates a certificate manager that can be used to rotate gardenlet's client certificate for the Garden cluster
func NewCertificateManager(log logr.Logger, gardenClientSet kubernetes.Interface, seedClient client.Client, config *config.GardenletConfiguration) *Manager {
func NewCertificateManager(log logr.Logger, gardenCluster cluster.Cluster, seedClient client.Client, config *config.GardenletConfiguration) (*Manager, error) {
gardenClientSet, err := kubernetes.NewWithConfig(
kubernetes.WithRESTConfig(gardenCluster.GetConfig()),
kubernetes.WithRuntimeAPIReader(gardenCluster.GetAPIReader()),
kubernetes.WithRuntimeClient(gardenCluster.GetClient()),
kubernetes.WithRuntimeCache(gardenCluster.GetCache()),
)
if err != nil {
return nil, fmt.Errorf("failed creating garden clientset: %w", err)
}

seedName := bootstraputil.GetSeedName(config.SeedConfig)

return &Manager{
@@ -65,7 +76,7 @@ func NewCertificateManager(log logr.Logger, gardenClientSet kubernetes.Interface
seedClient: seedClient,
gardenClientConnection: config.GardenClientConnection,
seedName: seedName,
}
}, nil
}

// ScheduleCertificateRotation waits until the currently used Garden cluster client certificate approaches expiration.
40 changes: 35 additions & 5 deletions pkg/gardenlet/controller/add.go
@@ -15,16 +15,18 @@
package controller

import (
"context"
"fmt"
"os"
"path/filepath"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/cluster"
"sigs.k8s.io/controller-runtime/pkg/manager"

"github.com/gardener/gardener/charts"
gardencorev1beta1 "github.com/gardener/gardener/pkg/apis/core/v1beta1"
v1beta1constants "github.com/gardener/gardener/pkg/apis/core/v1beta1/constants"
"github.com/gardener/gardener/pkg/client/kubernetes"
"github.com/gardener/gardener/pkg/client/kubernetes/clientmap"
"github.com/gardener/gardener/pkg/gardenlet/apis/config"
@@ -38,22 +40,26 @@ import (
"github.com/gardener/gardener/pkg/gardenlet/controller/shoot"
"github.com/gardener/gardener/pkg/gardenlet/controller/shootstate"
"github.com/gardener/gardener/pkg/healthz"
gutil "github.com/gardener/gardener/pkg/utils/gardener"
"github.com/gardener/gardener/pkg/utils/imagevector"
kutil "github.com/gardener/gardener/pkg/utils/kubernetes"
)

// AddToManager adds all gardenlet controllers to the given manager.
func AddToManager(
ctx context.Context,
mgr manager.Manager,
gardenCluster cluster.Cluster,
seedCluster cluster.Cluster,
seedClientSet kubernetes.Interface,
shootClientMap clientmap.ClientMap,
cfg *config.GardenletConfiguration,
gardenNamespace *corev1.Namespace,
gardenClusterIdentity string,
identity *gardencorev1beta1.Gardener,
healthManager healthz.Manager,
) error {
identity, err := gutil.DetermineIdentity()
if err != nil {
return err
}

imageVector, err := imagevector.ReadGlobalImageVectorWithEnvOverride(filepath.Join(charts.Path, "images.yaml"))
if err != nil {
return fmt.Errorf("failed reading image vector override: %w", err)
@@ -67,6 +73,30 @@ func AddToManager(
}
}

configMap := &corev1.ConfigMap{}
if err := gardenCluster.GetClient().Get(ctx, kutil.Key(metav1.NamespaceSystem, v1beta1constants.ClusterIdentity), configMap); err != nil {
return fmt.Errorf("failed getting cluster-identity ConfigMap in garden cluster: %w", err)
}
gardenClusterIdentity, ok := configMap.Data[v1beta1constants.ClusterIdentity]
if !ok {
return fmt.Errorf("cluster-identity ConfigMap data does not have %q key", v1beta1constants.ClusterIdentity)
}

gardenNamespace := &corev1.Namespace{}
if err := gardenCluster.GetClient().Get(ctx, kutil.Key(v1beta1constants.GardenNamespace), gardenNamespace); err != nil {
return fmt.Errorf("failed getting garden namespace in garden cluster: %w", err)
}

seedClientSet, err := kubernetes.NewWithConfig(
kubernetes.WithRESTConfig(seedCluster.GetConfig()),
kubernetes.WithRuntimeAPIReader(seedCluster.GetAPIReader()),
kubernetes.WithRuntimeClient(seedCluster.GetClient()),
kubernetes.WithRuntimeCache(seedCluster.GetCache()),
)
if err != nil {
return fmt.Errorf("failed creating seed clientset: %w", err)
}

if err := (&backupbucket.Reconciler{
Config: *cfg.Controllers.BackupBucket,
SeedName: cfg.SeedConfig.Name,
