Skip to content

Commit

Permalink
(feat) TerminationGracePeriodSeconds configurable
Browse files Browse the repository at this point in the history
Makes the pod spec's TerminationGracePeriodSeconds configurable via the
Prometheus and PrometheusAgent CRDs.

Fixes prometheus-operator#3433
Closes prometheus-operator#4681

Co-authored-by: Ben Ye <ben.ye@bytedance.com>
Signed-off-by: Raul Navieras <me@raulnaveiras.com>
  • Loading branch information
rnaveiras and Ben Ye committed Mar 21, 2024
1 parent 06d6a03 commit e1818d2
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 22 deletions.
15 changes: 15 additions & 0 deletions pkg/apis/monitoring/v1/prometheus_types.go
Expand Up @@ -676,6 +676,21 @@ type CommonPrometheusFields struct {
// +listType=map
// +listMapKey=name
ScrapeClasses []ScrapeClass `json:"scrapeClasses,omitempty"`

// Optional duration in seconds the pod needs to terminate gracefully. May be
// decreased in delete request. Value must be non-negative integer. The value
// zero indicates stop immediately via the kill signal (no opportunity to
// shutdown). If this value is nil, the default grace period will be used
// instead.
// The grace period is the duration in seconds after the processes running in
// the pod are sent a termination signal and the time when the processes are
// forcibly halted with a kill signal. Set this value longer than the
// expected cleanup time for your process.
// The default value is 10 minutes because Prometheus may take quite a long
// time to checkpoint existing data before shutdown.
// +optional
// +kubebuilder:default:=600
PodTerminationGracePeriodSeconds *uint64 `json:"podTerminationGracePeriodSeconds,omitempty"`
}

// +kubebuilder:validation:Enum=HTTP;ProcessSignal
Expand Down
20 changes: 9 additions & 11 deletions pkg/prometheus/agent/statefulset.go
Expand Up @@ -365,17 +365,15 @@ func makeStatefulSetSpec(
Annotations: podAnnotations,
},
Spec: v1.PodSpec{
ShareProcessNamespace: prompkg.ShareProcessNamespace(p),
Containers: containers,
InitContainers: initContainers,
SecurityContext: cpf.SecurityContext,
ServiceAccountName: cpf.ServiceAccountName,
AutomountServiceAccountToken: ptr.To(true),
NodeSelector: cpf.NodeSelector,
PriorityClassName: cpf.PriorityClassName,
// Prometheus may take quite long to shut down to checkpoint existing data.
// Allow up to 10 minutes for clean termination.
TerminationGracePeriodSeconds: ptr.To(int64(600)),
ShareProcessNamespace: prompkg.ShareProcessNamespace(p),
Containers: containers,
InitContainers: initContainers,
SecurityContext: cpf.SecurityContext,
ServiceAccountName: cpf.ServiceAccountName,
AutomountServiceAccountToken: ptr.To(true),
NodeSelector: cpf.NodeSelector,
PriorityClassName: cpf.PriorityClassName,
TerminationGracePeriodSeconds: prompkg.GetPodTerminationGracePeriodSeconds(cpf),
Volumes: volumes,
Tolerations: cpf.Tolerations,
Affinity: cpf.Affinity,
Expand Down
36 changes: 36 additions & 0 deletions pkg/prometheus/agent/statefulset_test.go
Expand Up @@ -393,3 +393,39 @@ func TestPodTopologySpreadConstraintWithAdditionalLabels(t *testing.T) {
})
}
}

// TestPodTerminationGracePeriodSeconds checks the termination grace period on
// the pod template of the StatefulSet generated for a PrometheusAgent: 600
// seconds when PodTerminationGracePeriodSeconds is unset, otherwise the
// configured value.
func TestPodTerminationGracePeriodSeconds(t *testing.T) {
	type testCase struct {
		name       string
		configured *uint64
		want       int64
	}

	cases := []testCase{
		{name: "default value", configured: nil, want: 600},
		{name: "non-default value", configured: ptr.To(uint64(60)), want: 60},
	}

	for _, c := range cases {
		c := c
		t.Run(c.name, func(t *testing.T) {
			// Only the grace period is set; every other field keeps its zero value.
			spec := monitoringv1alpha1.PrometheusAgentSpec{
				CommonPrometheusFields: monitoringv1.CommonPrometheusFields{
					PodTerminationGracePeriodSeconds: c.configured,
				},
			}

			sset, err := makeStatefulSetFromPrometheus(monitoringv1alpha1.PrometheusAgent{
				ObjectMeta: metav1.ObjectMeta{},
				Spec:       spec,
			})
			require.NoError(t, err)

			got := sset.Spec.Template.Spec.TerminationGracePeriodSeconds
			require.NotNil(t, got)
			require.Equal(t, c.want, *got)
		})
	}
}
20 changes: 9 additions & 11 deletions pkg/prometheus/server/statefulset.go
Expand Up @@ -467,17 +467,15 @@ func makeStatefulSetSpec(
Annotations: podAnnotations,
},
Spec: v1.PodSpec{
ShareProcessNamespace: prompkg.ShareProcessNamespace(p),
Containers: containers,
InitContainers: initContainers,
SecurityContext: cpf.SecurityContext,
ServiceAccountName: cpf.ServiceAccountName,
AutomountServiceAccountToken: ptr.To(true),
NodeSelector: cpf.NodeSelector,
PriorityClassName: cpf.PriorityClassName,
// Prometheus may take quite long to shut down to checkpoint existing data.
// Allow up to 10 minutes for clean termination.
TerminationGracePeriodSeconds: ptr.To(int64(600)),
ShareProcessNamespace: prompkg.ShareProcessNamespace(p),
Containers: containers,
InitContainers: initContainers,
SecurityContext: cpf.SecurityContext,
ServiceAccountName: cpf.ServiceAccountName,
AutomountServiceAccountToken: ptr.To(true),
NodeSelector: cpf.NodeSelector,
PriorityClassName: cpf.PriorityClassName,
TerminationGracePeriodSeconds: prompkg.GetPodTerminationGracePeriodSeconds(cpf),
Volumes: volumes,
Tolerations: cpf.Tolerations,
Affinity: cpf.Affinity,
Expand Down
36 changes: 36 additions & 0 deletions pkg/prometheus/server/statefulset_test.go
Expand Up @@ -3127,3 +3127,39 @@ func TestStartupProbeTimeoutSeconds(t *testing.T) {
require.Equal(t, test.expectedStartupFailureThreshold, sset.Spec.Template.Spec.Containers[0].StartupProbe.FailureThreshold)
}
}

// TestPodTerminationGracePeriodSeconds checks the termination grace period on
// the pod template of the StatefulSet generated for a Prometheus: 600 seconds
// when PodTerminationGracePeriodSeconds is unset, otherwise the configured
// value.
func TestPodTerminationGracePeriodSeconds(t *testing.T) {
	type testCase struct {
		name       string
		configured *uint64
		want       int64
	}

	cases := []testCase{
		{name: "default value", configured: nil, want: 600},
		{name: "non-default value", configured: ptr.To(uint64(60)), want: 60},
	}

	for _, c := range cases {
		c := c
		t.Run(c.name, func(t *testing.T) {
			// Only the grace period is set; every other field keeps its zero value.
			spec := monitoringv1.PrometheusSpec{
				CommonPrometheusFields: monitoringv1.CommonPrometheusFields{
					PodTerminationGracePeriodSeconds: c.configured,
				},
			}

			sset, err := makeStatefulSetFromPrometheus(monitoringv1.Prometheus{
				ObjectMeta: metav1.ObjectMeta{},
				Spec:       spec,
			})
			require.NoError(t, err)

			got := sset.Spec.Template.Spec.TerminationGracePeriodSeconds
			require.NotNil(t, got)
			require.Equal(t, c.want, *got)
		})
	}
}
12 changes: 12 additions & 0 deletions pkg/prometheus/statefulset.go
Expand Up @@ -514,3 +514,15 @@ func GetStatupProbePeriodSecondsAndFailureThreshold(cfp monitoringv1.CommonProme

return int32(startupPeriodSeconds), int32(startupFailureThreshold)
}

// GetPodTerminationGracePeriodSeconds returns the termination grace period, in
// seconds, to set on the pod spec.
//
// When cfp.PodTerminationGracePeriodSeconds is nil the result is 600 seconds
// (10 minutes), because Prometheus may take quite a long time to checkpoint
// existing data before shutdown. Otherwise the configured value is returned,
// capped at the largest value an int64 can hold so that the unsigned-to-signed
// conversion can never wrap around to a negative grace period.
func GetPodTerminationGracePeriodSeconds(cfp monitoringv1.CommonPrometheusFields) *int64 {
	const (
		// defaultGracePeriodSeconds allows up to 10 minutes for clean termination.
		defaultGracePeriodSeconds int64 = 600
		// maxGracePeriodSeconds is the int64 maximum expressed as a uint64.
		maxGracePeriodSeconds uint64 = 1<<63 - 1
	)

	if cfp.PodTerminationGracePeriodSeconds == nil {
		return ptr.To(defaultGracePeriodSeconds)
	}

	configured := *cfp.PodTerminationGracePeriodSeconds
	if configured > maxGracePeriodSeconds {
		// A plain int64() conversion would wrap to a negative number here.
		configured = maxGracePeriodSeconds
	}

	return ptr.To(int64(configured))
}

0 comments on commit e1818d2

Please sign in to comment.