Skip to content

Commit

Permalink
Merge pull request prometheus#5787 from cstyan/reshard-max-logging
Browse files Browse the repository at this point in the history
Add metrics for max/min/desired shards to queue manager.
  • Loading branch information
beorn7 committed Sep 9, 2019
2 parents 937cc1a + a98599b commit 3b3eaf3
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 0 deletions.
20 changes: 20 additions & 0 deletions documentation/prometheus-mixin/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,26 @@
description: 'Prometheus %(prometheusName)s remote write is {{ printf "%%.1f" $value }}s behind for queue {{$labels.queue}}.' % $._config,
},
},
{
  // Warns when the remote write shard calculation has wanted more shards than
  // the configured maximum for a sustained period (15m), i.e. the queue cannot
  // scale out far enough to keep up.
  alert: 'PrometheusRemoteWriteDesiredShards',
  expr: |||
    # Without max_over_time, failed scrapes could create false negatives, see
    # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
    #
    # Both metrics carry the same label set (including the queue label), so the
    # default one-to-one matching compares each queue with its own maximum.
    # Matching on (job, instance) only would fail with a "multiple matches"
    # error as soon as an instance runs more than one remote write queue.
    (
      max_over_time(prometheus_remote_storage_shards_desired{%(prometheusSelector)s}[5m])
    >
      max_over_time(prometheus_remote_storage_shards_max{%(prometheusSelector)s}[5m])
    )
  ||| % $._config,
  'for': '15m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    summary: 'Prometheus remote write desired shards calculation wants to run more than configured max shards.',
    // $value is rendered directly: the template function printf expects a
    // format string as its first argument, so passing the float sample value
    // there makes the annotation fail to expand.
    description: 'Prometheus %(prometheusName)s remote write desired shards calculation wants to run {{ $value }} shards for queue {{$labels.queue}}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%%s",%(prometheusSelector)s}` $labels.instance | query | first | value }}.' % $._config,
  },
},
{
alert: 'PrometheusRuleFailures',
expr: |||
Expand Down
40 changes: 40 additions & 0 deletions storage/remote/queue_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,33 @@ var (
},
[]string{queue},
)
// maxNumShards is a gauge vector, partitioned by the queue label, exposed as
// "shards_max" under the package namespace and subsystem. QueueManager.Start
// sets the per-queue value from cfg.MaxShards.
maxNumShards = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "shards_max",
Help: "The maximum number of shards that the queue is allowed to run.",
},
[]string{queue},
)
// minNumShards is a gauge vector, partitioned by the queue label, exposed as
// "shards_min" under the package namespace and subsystem. QueueManager.Start
// sets the per-queue value from cfg.MinShards.
minNumShards = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "shards_min",
Help: "The minimum number of shards that the queue is allowed to run.",
},
[]string{queue},
)
// desiredNumShards is a gauge vector, partitioned by the queue label, exposed
// as "shards_desired" under the package namespace and subsystem. It is
// initialised to cfg.MinShards in QueueManager.Start and then updated in
// calculateDesiredShards with the computed shard count before that count is
// clamped to the configured min/max, so it can exceed shards_max.
desiredNumShards = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "shards_desired",
Help: "The number of shards that the queues shard calculation wants to run based on the rate of samples in vs. samples out.",
},
[]string{queue},
)
)

// StorageClient defines an interface for sending a batch of samples to an
Expand Down Expand Up @@ -190,6 +217,9 @@ type QueueManager struct {
succeededSamplesTotal prometheus.Counter
retriedSamplesTotal prometheus.Counter
shardCapacity prometheus.Gauge
maxNumShards prometheus.Gauge
minNumShards prometheus.Gauge
desiredNumShards prometheus.Gauge
}

// NewQueueManager builds a new QueueManager.
Expand Down Expand Up @@ -291,10 +321,16 @@ func (t *QueueManager) Start() {
t.succeededSamplesTotal = succeededSamplesTotal.WithLabelValues(name)
t.retriedSamplesTotal = retriedSamplesTotal.WithLabelValues(name)
t.shardCapacity = shardCapacity.WithLabelValues(name)
t.maxNumShards = maxNumShards.WithLabelValues(name)
t.minNumShards = minNumShards.WithLabelValues(name)
t.desiredNumShards = desiredNumShards.WithLabelValues(name)

// Initialise some metrics.
t.shardCapacity.Set(float64(t.cfg.Capacity))
t.pendingSamplesMetric.Set(0)
t.maxNumShards.Set(float64(t.cfg.MaxShards))
t.minNumShards.Set(float64(t.cfg.MinShards))
t.desiredNumShards.Set(float64(t.cfg.MinShards))

t.shards.start(t.numShards)
t.watcher.Start()
Expand Down Expand Up @@ -334,6 +370,9 @@ func (t *QueueManager) Stop() {
succeededSamplesTotal.DeleteLabelValues(name)
retriedSamplesTotal.DeleteLabelValues(name)
shardCapacity.DeleteLabelValues(name)
maxNumShards.DeleteLabelValues(name)
minNumShards.DeleteLabelValues(name)
desiredNumShards.DeleteLabelValues(name)
}

// StoreSeries keeps track of which series we know about for lookups when sending samples to remote.
Expand Down Expand Up @@ -502,6 +541,7 @@ func (t *QueueManager) calculateDesiredShards() {
}

numShards := int(math.Ceil(desiredShards))
t.desiredNumShards.Set(float64(numShards))
if numShards > t.cfg.MaxShards {
numShards = t.cfg.MaxShards
} else if numShards < t.cfg.MinShards {
Expand Down

0 comments on commit 3b3eaf3

Please sign in to comment.