From 31ce79b669489ba6286b81978d1d090f4e17c699 Mon Sep 17 00:00:00 2001 From: Filip Petkovski Date: Mon, 20 Jun 2022 13:49:01 +0200 Subject: [PATCH] Expose tsdb status in receiver (#5402) * Expose tsdb status in receiver This commit implements the api/v1/status/tsdb API in the Receiver. Signed-off-by: Filip Petkovski * Add docs and todo Signed-off-by: Filip Petkovski * Fix tests Signed-off-by: Filip Petkovski --- CHANGELOG.md | 1 + cmd/thanos/receive.go | 1 + docs/components/receive.md | 6 ++ pkg/api/status/v1.go | 126 +++++++++++++++++++++++++++++++++++++ pkg/receive/handler.go | 36 +++++++++-- pkg/receive/multitsdb.go | 14 +++++ 6 files changed, 180 insertions(+), 4 deletions(-) create mode 100644 pkg/api/status/v1.go diff --git a/CHANGELOG.md b/CHANGELOG.md index b144d46e19..4a52d488a5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#5352](https://github.com/thanos-io/thanos/pull/5352) Cache: Add cache metrics to groupcache. - [#5391](https://github.com/thanos-io/thanos/pull/5391) Receive: Add relabeling support. - [#5408](https://github.com/thanos-io/thanos/pull/5391) Receive: Add support for consistent hashrings. +- [#5391](https://github.com/thanos-io/thanos/pull/5391) Receive: Implement api/v1/status/tsdb. ### Changed diff --git a/cmd/thanos/receive.go b/cmd/thanos/receive.go index e6228c26fd..21e8c0d1c0 100644 --- a/cmd/thanos/receive.go +++ b/cmd/thanos/receive.go @@ -212,6 +212,7 @@ func runReceive( TLSConfig: rwTLSConfig, DialOpts: dialOpts, ForwardTimeout: time.Duration(*conf.forwardTimeout), + GetTSDBStats: dbs.Stats, }) grpcProbe := prober.NewGRPC() diff --git a/docs/components/receive.md b/docs/components/receive.md index dbc61685e1..676fbf04cb 100644 --- a/docs/components/receive.md +++ b/docs/components/receive.md @@ -12,6 +12,12 @@ For more information please check out [initial design proposal](../proposals-don > NOTE: As the block producer it's important to set correct "external labels" that will identify data block across Thanos clusters. See [external labels](../storage.md#external-labels) docs for details. +## TSDB stats + +Thanos Receive supports getting TSDB stats using the `/api/v1/status/tsdb` endpoint. Use the `THANOS-TENANT` HTTP header to get stats for individual Tenants. The output format of the endpoint is compatible with [Prometheus API](https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats). + +Note that each Thanos Receive will only expose local stats and replicated series will not be included in the response. + ## Example ```bash diff --git a/pkg/api/status/v1.go b/pkg/api/status/v1.go new file mode 100644 index 0000000000..1755cbce41 --- /dev/null +++ b/pkg/api/status/v1.go @@ -0,0 +1,126 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. + +// Copyright 2016 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package status + +import ( + "fmt" + "math" + "net/http" + + "github.com/go-kit/log" + "github.com/opentracing/opentracing-go" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/route" + "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/tsdb" + "github.com/prometheus/prometheus/tsdb/index" + v1 "github.com/prometheus/prometheus/web/api/v1" + "github.com/thanos-io/thanos/pkg/api" + extpromhttp "github.com/thanos-io/thanos/pkg/extprom/http" + "github.com/thanos-io/thanos/pkg/logging" +) + +// Stat holds the information about individual cardinality. +type Stat struct { + Name string `json:"name"` + Value uint64 `json:"value"` +} + +func convertStats(stats []index.Stat) []Stat { + result := make([]Stat, 0, len(stats)) + for _, item := range stats { + item := Stat{Name: item.Name, Value: item.Count} + result = append(result, item) + } + return result +} + +// TSDBStatus has information of cardinality statistics from postings. +type TSDBStatus struct { + HeadStats v1.HeadStats `json:"headStats"` + SeriesCountByMetricName []Stat `json:"seriesCountByMetricName"` + LabelValueCountByLabelName []Stat `json:"labelValueCountByLabelName"` + MemoryInBytesByLabelName []Stat `json:"memoryInBytesByLabelName"` + SeriesCountByLabelValuePair []Stat `json:"seriesCountByLabelValuePair"` +} + +type GetStatsFunc func(r *http.Request, statsByLabelName string) (*tsdb.Stats, error) + +type Options struct { + GetStats GetStatsFunc + Registry *prometheus.Registry +} + +// TODO(fpetkovski): replace with upstream struct after dependency update. +type StatusAPI struct { + getTSDBStats GetStatsFunc + options Options +} + +func New(opts Options) *StatusAPI { + return &StatusAPI{ + getTSDBStats: opts.GetStats, + options: opts, + } +} + +func (sapi *StatusAPI) Register(r *route.Router, tracer opentracing.Tracer, logger log.Logger, ins extpromhttp.InstrumentationMiddleware, logMiddleware *logging.HTTPServerMiddleware) { + instr := api.GetInstr(tracer, logger, ins, logMiddleware, false) + r.Get("/api/v1/status/tsdb", instr("tsdb_status", sapi.httpServeStats)) +} + +func (sapi *StatusAPI) httpServeStats(r *http.Request) (interface{}, []error, *api.ApiError) { + s, err := sapi.getTSDBStats(r, labels.MetricName) + if err != nil { + return nil, nil, &api.ApiError{Typ: api.ErrorInternal, Err: err} + } + + if s == nil { + return nil, nil, &api.ApiError{Typ: api.ErrorBadData, Err: fmt.Errorf("unknown tenant")} + } + + metrics, err := sapi.options.Registry.Gather() + if err != nil { + return nil, []error{err}, nil + } + + chunkCount := int64(math.NaN()) + for _, mF := range metrics { + if *mF.Name == "prometheus_tsdb_head_chunks" { + m := *mF.Metric[0] + if m.Gauge != nil { + chunkCount = int64(m.Gauge.GetValue()) + break + } + } + } + + return TSDBStatus{ + HeadStats: v1.HeadStats{ + NumSeries: s.NumSeries, + ChunkCount: chunkCount, + MinTime: s.MinTime, + MaxTime: s.MaxTime, + NumLabelPairs: s.IndexPostingStats.NumLabelPairs, + }, + SeriesCountByMetricName: convertStats(s.IndexPostingStats.CardinalityMetricsStats), + LabelValueCountByLabelName: convertStats(s.IndexPostingStats.CardinalityLabelStats), + MemoryInBytesByLabelName: convertStats(s.IndexPostingStats.LabelValueStats), + SeriesCountByLabelValuePair: convertStats(s.IndexPostingStats.LabelValuePairsStats), + }, nil, nil + +} diff --git a/pkg/receive/handler.go b/pkg/receive/handler.go index 0cf8509ea5..d0ebc3e83a 100644 --- a/pkg/receive/handler.go +++ b/pkg/receive/handler.go @@ -17,6 +17,9 @@ import ( "sync" "time" + statusapi "github.com/thanos-io/thanos/pkg/api/status" + "github.com/thanos-io/thanos/pkg/logging" + "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/gogo/protobuf/proto" @@ -72,7 +75,7 @@ var ( type Options struct { Writer *Writer ListenAddress string - Registry prometheus.Registerer + Registry *prometheus.Registry TenantHeader string DefaultTenantID string ReplicaHeader string @@ -84,6 +87,7 @@ type Options struct { DialOpts []grpc.DialOption ForwardTimeout time.Duration RelabelConfigs []*relabel.Config + GetTSDBStats GetStatsFunc } // Handler serves a Prometheus remote write receiving HTTP endpoint. @@ -111,6 +115,11 @@ func NewHandler(logger log.Logger, o *Options) *Handler { logger = log.NewNopLogger() } + var registerer prometheus.Registerer = nil + if o.Registry != nil { + registerer = o.Registry + } + h := &Handler{ logger: logger, writer: o.Writer, @@ -124,19 +133,19 @@ func NewHandler(logger log.Logger, o *Options) *Handler { Max: 30 * time.Second, Jitter: true, }, - forwardRequests: promauto.With(o.Registry).NewCounterVec( + forwardRequests: promauto.With(registerer).NewCounterVec( prometheus.CounterOpts{ Name: "thanos_receive_forward_requests_total", Help: "The number of forward requests.", }, []string{"result"}, ), - replications: promauto.With(o.Registry).NewCounterVec( + replications: promauto.With(registerer).NewCounterVec( prometheus.CounterOpts{ Name: "thanos_receive_replications_total", Help: "The number of replication operations done by the receiver. The success of replication is fulfilled when a quorum is met.", }, []string{"result"}, ), - replicationFactor: promauto.With(o.Registry).NewGauge( + replicationFactor: promauto.With(registerer).NewGauge( prometheus.GaugeOpts{ Name: "thanos_receive_replication_factor", Help: "The number of times to replicate incoming write requests.", @@ -173,6 +182,12 @@ func NewHandler(logger log.Logger, o *Options) *Handler { h.router.Post("/api/v1/receive", instrf("receive", readyf(middleware.RequestID(http.HandlerFunc(h.receiveHTTP))))) + statusAPI := statusapi.New(statusapi.Options{ + GetStats: h.getStats, + Registry: o.Registry, + }) + statusAPI.Register(h.router, o.Tracer, logger, ins, logging.NewHTTPServerMiddleware(logger)) + return h } @@ -214,6 +229,19 @@ func (h *Handler) testReady(f http.HandlerFunc) http.HandlerFunc { } } +func (h *Handler) getStats(r *http.Request, statsByLabelName string) (*tsdb.Stats, error) { + if !h.isReady() { + return nil, fmt.Errorf("service unavailable") + } + + tenantID := r.Header.Get(h.options.TenantHeader) + if tenantID == "" { + tenantID = h.options.DefaultTenantID + } + + return h.options.GetTSDBStats(tenantID, statsByLabelName), nil +} + // Close stops the Handler. func (h *Handler) Close() { if h.listener != nil { diff --git a/pkg/receive/multitsdb.go b/pkg/receive/multitsdb.go index 9e158776c0..7ffab5cfc3 100644 --- a/pkg/receive/multitsdb.go +++ b/pkg/receive/multitsdb.go @@ -31,6 +31,8 @@ import ( "github.com/thanos-io/thanos/pkg/store/labelpb" ) +type GetStatsFunc func(tenantID, statsByLabelName string) *tsdb.Stats + type MultiTSDB struct { dataDir string logger log.Logger @@ -289,6 +291,18 @@ func (t *MultiTSDB) TSDBExemplars() map[string]*exemplars.TSDB { return res } +func (t *MultiTSDB) Stats(tenantID, statsByLabelName string) *tsdb.Stats { + t.mtx.RLock() + defer t.mtx.RUnlock() + + tenant, ok := t.tenants[tenantID] + if !ok { + return nil + } + + return tenant.readyS.get().db.Head().Stats(statsByLabelName) +} + func (t *MultiTSDB) startTSDB(logger log.Logger, tenantID string, tenant *tenant) error { reg := prometheus.WrapRegistererWith(prometheus.Labels{"tenant": tenantID}, t.reg) lset := labelpb.ExtendSortedLabels(t.labels, labels.FromStrings(t.tenantLabelName, tenantID))