Skip to content

Commit

Permalink
Use GaugeHistogram for Prometheus LongTaskTimer
Browse files Browse the repository at this point in the history
The Prometheus registry used "classic" cumulative Prometheus Histogram
for LongTaskTimer. This was conceptually wrong since _count, _sum, and
_bucket are counters (monotonic) in case of "classic" Prometheus
Histogram but since LongTaskTimer measures the currently active tasks,
these values can increase or decrease so a Prometheus GaugeHistogram is
a better fit for it since there _gcount, _gsum, and _bucket are
gauges (non-monotonic).

See the OpenMetrics specs for more details:
https://github.com/OpenObservability/OpenMetrics/blob/1386544931307dff279688f332890c31b6c5de36/specification/OpenMetrics.md#gaugehistogram

This also means that the suffix of the time series change:
- From _count to _gcount
- From _sum to _gsum

So the scrape output of a LongTaskTimer (test.ltt) could look like this:
# HELP test_ltt_seconds
# TYPE test_ltt_seconds histogram
test_ltt_seconds_bucket{le="120.0"} 1
[...]
test_ltt_seconds_bucket{le="7200.0"} 1
test_ltt_seconds_bucket{le="+Inf"} 1
# HELP test_ltt_seconds_gcount
# TYPE test_ltt_seconds_gcount gauge
test_ltt_seconds_gcount 1
# HELP test_ltt_seconds_gsum
# TYPE test_ltt_seconds_gsum gauge
test_ltt_seconds_gsum 10.0
# HELP test_ltt_seconds_max
# TYPE test_ltt_seconds_max gauge
test_ltt_seconds_max 10.0

Closes gh-4988
  • Loading branch information
jonatan-ivanov committed Apr 30, 2024
1 parent b2d8dc7 commit d7b9d24
Show file tree
Hide file tree
Showing 2 changed files with 235 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
import io.micrometer.core.instrument.distribution.HistogramSnapshot;
import io.micrometer.core.instrument.distribution.*;
import io.micrometer.core.instrument.distribution.pause.PauseDetector;
import io.micrometer.core.instrument.internal.CumulativeHistogramLongTaskTimer;
import io.micrometer.core.instrument.internal.DefaultGauge;
import io.micrometer.core.instrument.internal.DefaultLongTaskTimer;
import io.micrometer.core.instrument.internal.DefaultMeter;
import io.micrometer.core.instrument.util.TimeUtils;
import io.prometheus.metrics.config.PrometheusProperties;
Expand Down Expand Up @@ -308,8 +308,8 @@ protected io.micrometer.core.instrument.Timer newTimer(Meter.Id id,
DistributionStatisticConfig distributionStatisticConfig, PauseDetector pauseDetector) {
PrometheusTimer timer = new PrometheusTimer(id, clock, distributionStatisticConfig, pauseDetector,
exemplarSamplerFactory);
applyToCollector(id,
(collector) -> addDistributionStatisticSamples(id, collector, timer, timer::exemplars, tagValues(id)));
applyToCollector(id, (collector) -> addDistributionStatisticSamples(id, collector, timer, timer::exemplars,
tagValues(id), false));
return timer;
}

Expand Down Expand Up @@ -339,10 +339,9 @@ protected <T> io.micrometer.core.instrument.Gauge newGauge(Meter.Id id, @Nullabl

@Override
protected LongTaskTimer newLongTaskTimer(Meter.Id id, DistributionStatisticConfig distributionStatisticConfig) {
LongTaskTimer ltt = new CumulativeHistogramLongTaskTimer(id, clock, getBaseTimeUnit(),
distributionStatisticConfig);
LongTaskTimer ltt = new DefaultLongTaskTimer(id, clock, getBaseTimeUnit(), distributionStatisticConfig, true);
applyToCollector(id, (collector) -> addDistributionStatisticSamples(id, collector, ltt, () -> Exemplars.EMPTY,
tagValues(id)));
tagValues(id), true));
return ltt;
}

Expand Down Expand Up @@ -449,7 +448,8 @@ public PrometheusRegistry getPrometheusRegistry() {
}

private void addDistributionStatisticSamples(Meter.Id id, MicrometerCollector collector,
HistogramSupport histogramSupport, Supplier<Exemplars> exemplarsSupplier, List<String> tagValues) {
HistogramSupport histogramSupport, Supplier<Exemplars> exemplarsSupplier, List<String> tagValues,
boolean forLongTaskTimer) {
collector.add(tagValues, (conventionName, tagKeys) -> {
Stream.Builder<MicrometerCollector.Family<?>> families = Stream.builder();

Expand Down Expand Up @@ -499,8 +499,8 @@ private void addDistributionStatisticSamples(Meter.Id id, MicrometerCollector co

Exemplars exemplars = createExemplarsWithScaledValues(exemplarsSupplier.get());
families.add(new MicrometerCollector.Family<>(conventionName,
family -> new io.prometheus.metrics.model.snapshots.HistogramSnapshot(family.metadata,
family.dataPointSnapshots),
family -> new io.prometheus.metrics.model.snapshots.HistogramSnapshot(forLongTaskTimer,
family.metadata, family.dataPointSnapshots),
getMetadata(id), new HistogramDataPointSnapshot(ClassicHistogramBuckets.of(buckets, counts),
sum, Labels.of(tagKeys, tagValues), exemplars, 0)));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
package io.micrometer.prometheusmetrics;

import io.micrometer.core.Issue;
import io.micrometer.core.instrument.LongTaskTimer.Sample;
import io.micrometer.core.instrument.Timer;
import io.micrometer.core.instrument.*;
import io.micrometer.core.instrument.binder.BaseUnits;
Expand Down Expand Up @@ -583,12 +584,231 @@ void timerSumAndMaxHaveCorrectBaseUnit_whenPercentileHistogramEnabled() {
}

@Test
void scrapeWithLongTaskTimer() {
LongTaskTimer.builder("my.long.task.timer").register(registry);
assertThat(registry.scrape()).contains("my_long_task_timer_seconds_max")
// since Prometheus client 1.x, suffix _active_count => _count
.contains("my_long_task_timer_seconds_count")
.contains("my_long_task_timer_seconds_sum");
@Issue("#4988")
void longTaskTimerRecordingsSchouldBeCorrect() {
LongTaskTimer ltt = LongTaskTimer.builder("test.ltt").publishPercentileHistogram().register(registry);

String result = registry.scrape();
// since Prometheus client 1.x, suffix _active_count => _gcount and _sum => _gsum
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 0\n");
assertThat(result).contains("test_ltt_seconds_gcount 0\n");
assertThat(result).contains("test_ltt_seconds_gsum 0.0\n");
assertThat(result).contains("test_ltt_seconds_max 0.0\n");

// A task started
Sample sample = ltt.start();
clock.add(150, TimeUnit.SECONDS);
result = registry.scrape();
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"137.438953471\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"160.345445716\"} 1\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 1\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 1\n");
assertThat(result).contains("test_ltt_seconds_gcount 1\n");
assertThat(result).contains("test_ltt_seconds_gsum 150.0\n");
assertThat(result).contains("test_ltt_seconds_max 150.0\n");

// After a while another scrape happens, the task is still in progress...
clock.add(20, TimeUnit.SECONDS);
result = registry.scrape();
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"137.438953471\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"160.345445716\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"183.251937961\"} 1\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 1\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 1\n");
assertThat(result).contains("test_ltt_seconds_gcount 1\n");
assertThat(result).contains("test_ltt_seconds_gsum 170.0\n");
assertThat(result).contains("test_ltt_seconds_max 170.0\n");

// Another task started
Sample sample2 = ltt.start();
clock.add(10, TimeUnit.SECONDS);
result = registry.scrape();
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 1\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"137.438953471\"} 1\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"160.345445716\"} 1\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"183.251937961\"} 2\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 2\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 2\n");
assertThat(result).contains("test_ltt_seconds_gcount 2\n");
assertThat(result).contains("test_ltt_seconds_gsum 190.0\n");
assertThat(result).contains("test_ltt_seconds_max 180.0\n");

sample2.stop();

// After the second task stopped
clock.add(1, TimeUnit.SECONDS);
result = registry.scrape();
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"137.438953471\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"160.345445716\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"183.251937961\"} 1\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 1\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 1\n");
assertThat(result).contains("test_ltt_seconds_gcount 1\n");
assertThat(result).contains("test_ltt_seconds_gsum 181.0\n");
assertThat(result).contains("test_ltt_seconds_max 181.0\n");

sample.stop();

// After the first task stopped
clock.add(10, TimeUnit.SECONDS);
result = registry.scrape();
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 0\n");
assertThat(result).contains("test_ltt_seconds_gcount 0\n");
assertThat(result).contains("test_ltt_seconds_gsum 0.0\n");
assertThat(result).contains("test_ltt_seconds_max 0.0\n");
}

@Test
@Issue("#4988")
void longTaskTimerInfBucketShouldBeCorrect() {
LongTaskTimer ltt = LongTaskTimer.builder("test.ltt").publishPercentileHistogram().register(registry);

String result = registry.scrape();
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 0\n");
assertThat(result).contains("test_ltt_seconds_gcount 0\n");
assertThat(result).contains("test_ltt_seconds_gsum 0.0\n");
assertThat(result).contains("test_ltt_seconds_max 0.0\n");

// A task started
Sample sample = ltt.start();
clock.add(7000, TimeUnit.SECONDS);
result = registry.scrape();
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"5864.062014805\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 1\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 1\n");
assertThat(result).contains("test_ltt_seconds_gcount 1\n");
assertThat(result).contains("test_ltt_seconds_gsum 7000.0\n");
assertThat(result).contains("test_ltt_seconds_max 7000.0\n");

// After a while another scrape happens, the task is still in progress...
// Now the task is in the +Inf bucket
clock.add(500, TimeUnit.SECONDS);
result = registry.scrape();
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"5864.062014805\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 1\n");
assertThat(result).contains("test_ltt_seconds_gcount 1\n");
assertThat(result).contains("test_ltt_seconds_gsum 7500.0\n");
assertThat(result).contains("test_ltt_seconds_max 7500.0\n");

// Another task started
Sample sample2 = ltt.start();
clock.add(500, TimeUnit.SECONDS);
result = registry.scrape();
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"458.129844906\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"549.755813887\"} 1\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"5864.062014805\"} 1\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 1\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 2\n");
assertThat(result).contains("test_ltt_seconds_gcount 2\n");
assertThat(result).contains("test_ltt_seconds_gsum 8500.0\n");
assertThat(result).contains("test_ltt_seconds_max 8000.0\n");

sample2.stop();

// After the second task stopped
clock.add(500, TimeUnit.SECONDS);
result = registry.scrape();
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"458.129844906\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"549.755813887\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"5864.062014805\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 1\n");
assertThat(result).contains("test_ltt_seconds_gcount 1\n");
assertThat(result).contains("test_ltt_seconds_gsum 8500.0\n");
assertThat(result).contains("test_ltt_seconds_max 8500.0\n");

sample.stop();

// After the first task stopped
result = registry.scrape();
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 0\n");
assertThat(result).contains("test_ltt_seconds_gcount 0\n");
assertThat(result).contains("test_ltt_seconds_gsum 0.0\n");
assertThat(result).contains("test_ltt_seconds_max 0.0\n");
}

@Test
@Issue("#4988")
void nonTelescopicLongTaskTimerRecordingsShouldBeCorrect() {
LongTaskTimer ltt = LongTaskTimer.builder("test.ltt").publishPercentileHistogram().register(registry);

String result = registry.scrape();
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 0\n");
assertThat(result).contains("test_ltt_seconds_gcount 0\n");
assertThat(result).contains("test_ltt_seconds_gsum 0.0\n");
assertThat(result).contains("test_ltt_seconds_max 0.0\n");

// A task started
Sample sample = ltt.start();
clock.add(200, TimeUnit.SECONDS);
result = registry.scrape();
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"183.251937961\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"206.158430206\"} 1\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 1\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 1\n");
assertThat(result).contains("test_ltt_seconds_gcount 1\n");
assertThat(result).contains("test_ltt_seconds_gsum 200.0\n");
assertThat(result).contains("test_ltt_seconds_max 200.0\n");

// A second task started before the first stopped
Sample sample2 = ltt.start();
// The first task stopped
sample.stop();

clock.add(100, TimeUnit.SECONDS);
result = registry.scrape();
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 1\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 1\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 1\n");
assertThat(result).contains("test_ltt_seconds_gcount 1\n");
assertThat(result).contains("test_ltt_seconds_gsum 100.0\n");
assertThat(result).contains("test_ltt_seconds_max 100.0\n");

// The second task stopped
sample2.stop();

// A third task started after the first and second stopped
Sample sample3 = ltt.start();
clock.add(300, TimeUnit.SECONDS);
result = registry.scrape();
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"274.877906944\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"366.503875925\"} 1\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 1\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 1\n");
assertThat(result).contains("test_ltt_seconds_gcount 1\n");
assertThat(result).contains("test_ltt_seconds_gsum 300.0\n");
assertThat(result).contains("test_ltt_seconds_max 300.0\n");

// The third task stopped
sample3.stop();

result = registry.scrape();
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 0\n");
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 0\n");
assertThat(result).contains("test_ltt_seconds_gcount 0\n");
assertThat(result).contains("test_ltt_seconds_gsum 0.0\n");
assertThat(result).contains("test_ltt_seconds_max 0.0\n");
}

@Issue("#2087")
Expand Down

0 comments on commit d7b9d24

Please sign in to comment.