Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: Merge metrics to fire singleton metrics to controller_runtime metric namespace #225

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion pkg/controllers/metrics/pod/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ func NewController(kubeClient client.Client) controller.Controller {
}

func (c *Controller) Name() string {
return "podmetrics"
return "pod_metrics"
}

// Reconcile executes a termination control loop for the resource
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/metrics/provisioner/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ func NewController(kubeClient client.Client) corecontroller.Controller {
}

func (c *Controller) Name() string {
return "provisionermetrics"
return "provisioner_metrics"
}

// Reconcile executes a termination control loop for the resource
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/metrics/state/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ func NewController(cluster *state.Cluster) *Controller {
}

func (c *Controller) Name() string {
return "metricscraper"
return "metric_scraper"
}

func (c *Controller) Builder(_ context.Context, mgr manager.Manager) controller.Builder {
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/provisioning/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ func NewController(kubeClient client.Client, provisioner *Provisioner, recorder
}

func (c *Controller) Name() string {
return "provisioning"
return "provisioner_trigger"
}

// Reconcile the resource
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/provisioning/provisioner.go
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,7 @@ func validateNodeSelectorTerm(term v1.NodeSelectorTerm) (errs error) {
var schedulingDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: metrics.Namespace,
Subsystem: "allocation_controller",
Subsystem: "provisioner",
Name: "scheduling_duration_seconds",
Help: "Duration of scheduling process in seconds. Broken down by provisioner and error.",
Buckets: metrics.DurationBuckets(),
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/state/informer/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ func NewNodeController(kubeClient client.Client, cluster *state.Cluster) corecon
}

func (c *NodeController) Name() string {
return "node-state"
return "node_state"
}

func (c *NodeController) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/state/informer/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ func NewPodController(kubeClient client.Client, cluster *state.Cluster) corecont
}

func (c *PodController) Name() string {
return "pod-state"
return "pod_state"
}

func (c *PodController) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/state/informer/provisioner.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ func NewProvisionerController(kubeClient client.Client, cluster *state.Cluster)
}

func (c *ProvisionerController) Name() string {
return "provisionerstate"
return "provisioner_state"
}

func (c *ProvisionerController) Reconcile(_ context.Context, _ *v1alpha5.Provisioner) (reconcile.Result, error) {
Expand Down
119 changes: 84 additions & 35 deletions pkg/operator/controller/singleton.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,10 @@ package controller

import (
"context"
"strings"
"errors"
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/samber/lo"
"k8s.io/client-go/util/workqueue"
"knative.dev/pkg/logging"
"sigs.k8s.io/controller-runtime/pkg/manager"
Expand All @@ -47,45 +46,28 @@ func (b SingletonBuilder) Complete(r Reconciler) error {

type Singleton struct {
Reconciler
metrics *singletonMetrics
rateLimiter ratelimiter.RateLimiter
}

type singletonMetrics struct {
reconcileDuration prometheus.Histogram
reconcileErrors prometheus.Counter
}

func newSingleton(r Reconciler) *Singleton {
return &Singleton{
s := &Singleton{
Reconciler: r,
metrics: newSingletonMetrics(r.Name()),
rateLimiter: workqueue.DefaultItemBasedRateLimiter(),
}
s.initMetrics()
return s
}

func newSingletonMetrics(name string) *singletonMetrics {
metrics := &singletonMetrics{
reconcileDuration: prometheus.NewHistogram(
prometheus.HistogramOpts{
Namespace: metrics.Namespace,
Subsystem: strings.ReplaceAll(name, ".", "_"),
Name: "reconcile_time_seconds",
Help: "Length of time per reconcile.",
Buckets: metrics.DurationBuckets(),
},
),
reconcileErrors: prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: metrics.Namespace,
Subsystem: strings.ReplaceAll(name, ".", "_"),
Name: "reconcile_errors_total",
Help: "Total number of reconcile errors.",
},
),
}
crmetrics.Registry.MustRegister(metrics.reconcileDuration, metrics.reconcileErrors)
return metrics
// initMetrics touches every per-controller series for this singleton with its starting value:
// zero active workers, a zero delta on the error counter and on each reconcile result counter,
// and a maximum concurrency of one. It follows the metrics initialization used by controller-runtime
// https://github.com/kubernetes-sigs/controller-runtime/blob/main/pkg/internal/controller/controller.go
func (s *Singleton) initMetrics() {
	name := s.Name()

	activeWorkers.WithLabelValues(name).Set(0)
	reconcileErrors.WithLabelValues(name).Add(0)
	// One reconcileTotal series per possible result, in the same order as before.
	for _, result := range []string{labelError, labelRequeueAfter, labelRequeue, labelSuccess} {
		reconcileTotal.WithLabelValues(name, result).Add(0)
	}
	workerCount.WithLabelValues(name).Set(1)
}

var singletonRequest = reconcile.Request{}
Expand All @@ -105,23 +87,90 @@ func (s *Singleton) Start(ctx context.Context) error {
}

func (s *Singleton) reconcile(ctx context.Context) time.Duration {
measureDuration := metrics.Measure(s.metrics.reconcileDuration)
activeWorkers.WithLabelValues(s.Name()).Inc()
defer activeWorkers.WithLabelValues(s.Name()).Dec()

measureDuration := metrics.Measure(reconcileDuration.WithLabelValues(s.Name()))
res, err := s.Reconcile(ctx, singletonRequest)
measureDuration() // Observe the length of time between the function creation and now

switch {
case err != nil:
s.metrics.reconcileErrors.Inc()
reconcileErrors.WithLabelValues(s.Name()).Inc()
reconcileTotal.WithLabelValues(s.Name(), labelError).Inc()
logging.FromContext(ctx).Error(err)
return s.rateLimiter.When(singletonRequest)
case res.Requeue:
reconcileTotal.WithLabelValues(s.Name(), labelRequeue).Inc()
return s.rateLimiter.When(singletonRequest)
default:
s.rateLimiter.Forget(singletonRequest)
return lo.Ternary(res.RequeueAfter > 0, res.RequeueAfter, time.Duration(0))
switch {
case res.RequeueAfter > 0:
reconcileTotal.WithLabelValues(s.Name(), labelRequeueAfter).Inc()
return res.RequeueAfter
default:
reconcileTotal.WithLabelValues(s.Name(), labelSuccess).Inc()
return time.Duration(0)
}
}
}

// NeedLeaderElection always reports true for a Singleton.
// NOTE(review): presumably read by the controller-runtime manager so that the singleton
// only runs on the elected leader — confirm against the manager's runnable handling.
func (s *Singleton) NeedLeaderElection() bool {
return true
}

// init runs mergeMetrics once at package initialization, so the package-level collectors
// are resolved against the controller-runtime registry before any Singleton is constructed.
func init() {
mergeMetrics()
}

// Values written to the "result" label of reconcileTotal, one per reconcile outcome.
const (
// labelError is recorded when Reconcile returns a non-nil error.
labelError = "error"
// labelRequeueAfter is recorded when Reconcile succeeds with RequeueAfter > 0.
labelRequeueAfter = "requeue_after"
// labelRequeue is recorded when Reconcile succeeds with Requeue set.
labelRequeue = "requeue"
// labelSuccess is recorded when Reconcile succeeds and asks for no requeue.
labelSuccess = "success"
)

// The collectors below are copies of the metrics that controller-runtime defines in its /internal
// package. They are declared here so that singleton controllers can fire metrics under the same
// controller_runtime_* names that users expect for every other controller-runtime controller.
// mergeMetrics replaces each of them with the collector already held by the controller-runtime registry.
// https://github.com/kubernetes-sigs/controller-runtime/blob/main/pkg/internal/controller/metrics/metrics.go
var (
// reconcileTotal counts reconciliations, labelled by controller name and result.
reconcileTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "controller_runtime_reconcile_total",
Help: "Total number of reconciliations per controller",
}, []string{"controller", "result"})
// reconcileDuration observes how long each reconciliation takes, labelled by controller name.
reconcileDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "controller_runtime_reconcile_time_seconds",
Help: "Length of time per reconciliation per controller",
Buckets: metrics.DurationBuckets(),
}, []string{"controller"})
// reconcileErrors counts reconciliations that returned an error, labelled by controller name.
reconcileErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "controller_runtime_reconcile_errors_total",
Help: "Total number of reconciliation errors per controller",
}, []string{"controller"})
// workerCount reports the maximum number of concurrent reconciles, labelled by controller name.
workerCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "controller_runtime_max_concurrent_reconciles",
Help: "Maximum number of concurrent reconciles per controller",
}, []string{"controller"})
// activeWorkers reports how many workers are currently reconciling, labelled by controller name.
activeWorkers = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "controller_runtime_active_workers",
Help: "Number of currently used workers per controller",
}, []string{"controller"})
)

// mergeMetrics merges the singletonMetrics with metrics already registered in the controller-runtime metrics registry
// https://github.com/kubernetes-sigs/controller-runtime/blob/main/pkg/internal/controller/metrics/metrics.go
// We know that all these metrics should be registered by controller-runtime so we should switch over
func mergeMetrics() {
jonathan-innis marked this conversation as resolved.
Show resolved Hide resolved
err := &prometheus.AlreadyRegisteredError{}
errors.As(crmetrics.Registry.Register(reconcileTotal), err)
reconcileTotal = err.ExistingCollector.(*prometheus.CounterVec)
errors.As(crmetrics.Registry.Register(reconcileDuration), err)
reconcileDuration = err.ExistingCollector.(*prometheus.HistogramVec)
errors.As(crmetrics.Registry.Register(reconcileErrors), err)
reconcileErrors = err.ExistingCollector.(*prometheus.CounterVec)
errors.As(crmetrics.Registry.Register(workerCount), err)
workerCount = err.ExistingCollector.(*prometheus.GaugeVec)
errors.As(crmetrics.Registry.Register(activeWorkers), err)
activeWorkers = err.ExistingCollector.(*prometheus.GaugeVec)
}