From e1818d290d9c22566612464c664337982c8a102d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Naveiras?= Date: Thu, 21 Mar 2024 10:49:45 +0000 Subject: [PATCH] (feat) TerminationGracePeriodSeconds configurable It makes the pod.spec TerminationGracePeriodSeconds configurable via the CRDs for prometheus and prometheusagent Fixes #3433 Closes #4681 Co-authored-by: Ben Ye Signed-off-by: Raul Navieras --- pkg/apis/monitoring/v1/prometheus_types.go | 15 +++++++++ pkg/prometheus/agent/statefulset.go | 20 ++++++------ pkg/prometheus/agent/statefulset_test.go | 36 ++++++++++++++++++++++ pkg/prometheus/server/statefulset.go | 20 ++++++------ pkg/prometheus/server/statefulset_test.go | 36 ++++++++++++++++++++++ pkg/prometheus/statefulset.go | 12 ++++++++ 6 files changed, 117 insertions(+), 22 deletions(-) diff --git a/pkg/apis/monitoring/v1/prometheus_types.go b/pkg/apis/monitoring/v1/prometheus_types.go index 06dd95ba7bf..684f82e7271 100644 --- a/pkg/apis/monitoring/v1/prometheus_types.go +++ b/pkg/apis/monitoring/v1/prometheus_types.go @@ -676,6 +676,21 @@ type CommonPrometheusFields struct { // +listType=map // +listMapKey=name ScrapeClasses []ScrapeClass `json:"scrapeClasses,omitempty"` + + // Optional duration in seconds the pod needs to terminate gracefully. May be + // decreased in delete request. Value must be non-negative integer. The value + // zero indicates stop immediately via the kill signal (no opportunity to + // shutdown). If this value is nil, the default grace period will be used + // instead. + // The grace period is the duration in seconds after the processes running in + // the pod are sent a termination signal and the time when the processes are + // forcibly halted with a kill signal. Set this value longer than the + // expected cleanup time for your process. + // Default value is set to 10 minutes because Prometheus may take quite long + // time to checkpoint existing data before shutdown. + // +optional + // +kubebuilder:default:=600 + PodTerminationGracePeriodSeconds *uint64 `json:"podTerminationGracePeriodSeconds,omitempty"` } // +kubebuilder:validation:Enum=HTTP;ProcessSignal diff --git a/pkg/prometheus/agent/statefulset.go b/pkg/prometheus/agent/statefulset.go index 36ce6858f5a..bde704cf46c 100644 --- a/pkg/prometheus/agent/statefulset.go +++ b/pkg/prometheus/agent/statefulset.go @@ -365,17 +365,15 @@ func makeStatefulSetSpec( Annotations: podAnnotations, }, Spec: v1.PodSpec{ - ShareProcessNamespace: prompkg.ShareProcessNamespace(p), - Containers: containers, - InitContainers: initContainers, - SecurityContext: cpf.SecurityContext, - ServiceAccountName: cpf.ServiceAccountName, - AutomountServiceAccountToken: ptr.To(true), - NodeSelector: cpf.NodeSelector, - PriorityClassName: cpf.PriorityClassName, - // Prometheus may take quite long to shut down to checkpoint existing data. - // Allow up to 10 minutes for clean termination. - TerminationGracePeriodSeconds: ptr.To(int64(600)), + ShareProcessNamespace: prompkg.ShareProcessNamespace(p), + Containers: containers, + InitContainers: initContainers, + SecurityContext: cpf.SecurityContext, + ServiceAccountName: cpf.ServiceAccountName, + AutomountServiceAccountToken: ptr.To(true), + NodeSelector: cpf.NodeSelector, + PriorityClassName: cpf.PriorityClassName, + TerminationGracePeriodSeconds: prompkg.GetPodTerminationGracePeriodSeconds(cpf), Volumes: volumes, Tolerations: cpf.Tolerations, Affinity: cpf.Affinity, diff --git a/pkg/prometheus/agent/statefulset_test.go b/pkg/prometheus/agent/statefulset_test.go index e7781937709..bf0e6dc66a9 100644 --- a/pkg/prometheus/agent/statefulset_test.go +++ b/pkg/prometheus/agent/statefulset_test.go @@ -393,3 +393,39 @@ func TestPodTopologySpreadConstraintWithAdditionalLabels(t *testing.T) { }) } } + +func TestPodTerminationGracePeriodSeconds(t *testing.T) { + tests := []struct { + name string + podTerminationGracePeriodSeconds *uint64 + expectedTerminationGracePeriodSeconds int64 + }{ + { + name: "default value", + podTerminationGracePeriodSeconds: nil, + expectedTerminationGracePeriodSeconds: 600, + }, + { + name: "non-default value", + podTerminationGracePeriodSeconds: ptr.To(uint64(60)), + expectedTerminationGracePeriodSeconds: 60, + }, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + sset, err := makeStatefulSetFromPrometheus(monitoringv1alpha1.PrometheusAgent{ + ObjectMeta: metav1.ObjectMeta{}, + Spec: monitoringv1alpha1.PrometheusAgentSpec{ + CommonPrometheusFields: monitoringv1.CommonPrometheusFields{ + PodTerminationGracePeriodSeconds: tc.podTerminationGracePeriodSeconds, + }, + }, + }) + require.NoError(t, err) + require.NotNil(t, sset.Spec.Template.Spec.TerminationGracePeriodSeconds) + require.Equal(t, tc.expectedTerminationGracePeriodSeconds, *sset.Spec.Template.Spec.TerminationGracePeriodSeconds) + }) + } +} diff --git a/pkg/prometheus/server/statefulset.go b/pkg/prometheus/server/statefulset.go index db9e1881c1a..c13bf73bb52 100644 --- a/pkg/prometheus/server/statefulset.go +++ b/pkg/prometheus/server/statefulset.go @@ -467,17 +467,15 @@ func makeStatefulSetSpec( Annotations: podAnnotations, }, Spec: v1.PodSpec{ - ShareProcessNamespace: prompkg.ShareProcessNamespace(p), - Containers: containers, - InitContainers: initContainers, - SecurityContext: cpf.SecurityContext, - ServiceAccountName: cpf.ServiceAccountName, - AutomountServiceAccountToken: ptr.To(true), - NodeSelector: cpf.NodeSelector, - PriorityClassName: cpf.PriorityClassName, - // Prometheus may take quite long to shut down to checkpoint existing data. - // Allow up to 10 minutes for clean termination. - TerminationGracePeriodSeconds: ptr.To(int64(600)), + ShareProcessNamespace: prompkg.ShareProcessNamespace(p), + Containers: containers, + InitContainers: initContainers, + SecurityContext: cpf.SecurityContext, + ServiceAccountName: cpf.ServiceAccountName, + AutomountServiceAccountToken: ptr.To(true), + NodeSelector: cpf.NodeSelector, + PriorityClassName: cpf.PriorityClassName, + TerminationGracePeriodSeconds: prompkg.GetPodTerminationGracePeriodSeconds(cpf), Volumes: volumes, Tolerations: cpf.Tolerations, Affinity: cpf.Affinity, diff --git a/pkg/prometheus/server/statefulset_test.go b/pkg/prometheus/server/statefulset_test.go index f67a3695947..276ce56a985 100644 --- a/pkg/prometheus/server/statefulset_test.go +++ b/pkg/prometheus/server/statefulset_test.go @@ -3127,3 +3127,39 @@ func TestStartupProbeTimeoutSeconds(t *testing.T) { require.Equal(t, test.expectedStartupFailureThreshold, sset.Spec.Template.Spec.Containers[0].StartupProbe.FailureThreshold) } } + +func TestPodTerminationGracePeriodSeconds(t *testing.T) { + tests := []struct { + name string + podTerminationGracePeriodSeconds *uint64 + expectedTerminationGracePeriodSeconds int64 + }{ + { + name: "default value", + podTerminationGracePeriodSeconds: nil, + expectedTerminationGracePeriodSeconds: 600, + }, + { + name: "non-default value", + podTerminationGracePeriodSeconds: ptr.To(uint64(60)), + expectedTerminationGracePeriodSeconds: 60, + }, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + sset, err := makeStatefulSetFromPrometheus(monitoringv1.Prometheus{ + ObjectMeta: metav1.ObjectMeta{}, + Spec: monitoringv1.PrometheusSpec{ + CommonPrometheusFields: monitoringv1.CommonPrometheusFields{ + PodTerminationGracePeriodSeconds: tc.podTerminationGracePeriodSeconds, + }, + }, + }) + require.NoError(t, err) + require.NotNil(t, sset.Spec.Template.Spec.TerminationGracePeriodSeconds) + require.Equal(t, tc.expectedTerminationGracePeriodSeconds, *sset.Spec.Template.Spec.TerminationGracePeriodSeconds) + }) + } +} diff --git a/pkg/prometheus/statefulset.go b/pkg/prometheus/statefulset.go index 71f1d81d108..f5c8d590b27 100644 --- a/pkg/prometheus/statefulset.go +++ b/pkg/prometheus/statefulset.go @@ -514,3 +514,15 @@ func GetStatupProbePeriodSecondsAndFailureThreshold(cfp monitoringv1.CommonProme return int32(startupPeriodSeconds), int32(startupFailureThreshold) } + +func GetPodTerminationGracePeriodSeconds(cfp monitoringv1.CommonPrometheusFields) *int64 { + // Prometheus may take quite long to checkpoint existing data before shutdown. + // Allow up to 10 minutes for clean termination. + var podTerminationGracePeriodSeconds int64 = 600 + + if cfp.PodTerminationGracePeriodSeconds != nil { + podTerminationGracePeriodSeconds = int64(*cfp.PodTerminationGracePeriodSeconds) + } + + return ptr.To(podTerminationGracePeriodSeconds) +}