diff --git a/.bingo/promdoc.sum b/.bingo/promdoc.sum new file mode 100644 index 00000000000..7ecaa4580fa --- /dev/null +++ b/.bingo/promdoc.sum @@ -0,0 +1,18 @@ +github.com/fsnotify/fsnotify v1.5.1 h1:mZcQUHVQUQWoPXXtuf9yuEXKudkV2sx1E06UadKWpgI= +github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= +github.com/magiconair/properties v1.8.5 h1:b6kJs+EmPFMYGkow9GiUyCyOvIwYetYJ3fSaWak/Gls= +github.com/mitchellh/mapstructure v1.4.2 h1:6h7AQ0yhTcIsmFmnAwQls75jp2Gzs4iB8W7pjMO+rqo= +github.com/pelletier/go-toml v1.9.4 h1:tjENF6MfZAg8e4ZmZTeWaWiT2vXtsoO6+iuOjFhECwM= +github.com/plexsystems/promdoc v0.8.0 h1:mNAp+WQkb2yZV5m7PeybHFTPYz+4pbaMCaH8iPLOMog= +github.com/plexsystems/promdoc v0.8.0/go.mod h1:CoTbHLEVPziXN+Y4GozwsiLvgdJqdOBYywqUy40sYuI= +github.com/spf13/afero v1.6.0 h1:xoax2sJ2DT8S8xA2paPFjDCScCNeWsg75VG0DLRreiY= +github.com/spf13/cast v1.4.1 h1:s0hze+J0196ZfEMTs80N7UlFt0BDuQ7Q+JDnHiMWKdA= +github.com/spf13/cobra v1.2.1 h1:+KmjbUw1hriSNMF55oPrkZcb27aECyrj8V2ytv7kWDw= +github.com/spf13/jwalterweatherman v1.1.0 h1:ue6voC5bR5F8YxI5S67j9i582FU4Qvo2bmqnqMYADFk= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/spf13/viper v1.9.0 h1:yR6EXjTp0y0cLN8OZg1CRZmOBdI88UcGkhgyJhu6nZk= +github.com/subosito/gotenv v1.2.0 h1:Slr1R9HxAlEKefgq5jn9U+DnETlIUa6HfgEzj0g5d7s= +golang.org/x/sys v0.0.0-20210823070655-63515b42dcdf h1:2ucpDCmfkl8Bd/FsLtiD653Wf96cW37s+iGx93zsu4k= +golang.org/x/text v0.3.6 h1:aRYxNxv6iGQlyVaZmk6ZgYEDa+Jg18DxebPSrd6bg1M= +gopkg.in/ini.v1 v1.63.2 h1:tGK/CyBg7SMzb60vP1M03vNZ3VDu3wGQJwn7Sxi9r3c= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= diff --git a/CHANGELOG.md b/CHANGELOG.md index 5fe29bf5a0f..c4a78b7ddc8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re ### Added - [#5440](https://github.com/thanos-io/thanos/pull/5440) HTTP metrics: export number of in-flight HTTP requests. +- [#5402](https://github.com/thanos-io/thanos/pull/5402) Receive: Implement exposing TSDB stats for all tenants ### Changed diff --git a/cmd/thanos/receive.go b/cmd/thanos/receive.go index c01c67fb06f..445717d8ea4 100644 --- a/cmd/thanos/receive.go +++ b/cmd/thanos/receive.go @@ -213,7 +213,7 @@ func runReceive( TLSConfig: rwTLSConfig, DialOpts: dialOpts, ForwardTimeout: time.Duration(*conf.forwardTimeout), - GetTSDBStats: dbs.Stats, + TSDBStats: dbs, }) grpcProbe := prober.NewGRPC() diff --git a/pkg/api/status/v1.go b/pkg/api/status/v1.go index 1755cbce410..8843ac0da50 100644 --- a/pkg/api/status/v1.go +++ b/pkg/api/status/v1.go @@ -18,15 +18,14 @@ package status import ( "fmt" - "math" "net/http" + "github.com/prometheus/prometheus/tsdb" + "github.com/go-kit/log" "github.com/opentracing/opentracing-go" - "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/route" "github.com/prometheus/prometheus/model/labels" - "github.com/prometheus/prometheus/tsdb" "github.com/prometheus/prometheus/tsdb/index" v1 "github.com/prometheus/prometheus/web/api/v1" "github.com/thanos-io/thanos/pkg/api" @@ -49,8 +48,15 @@ func convertStats(stats []index.Stat) []Stat { return result } +type TenantStats struct { + Tenant string + Stats *tsdb.Stats +} + // TSDBStatus has information of cardinality statistics from postings. +// TODO(fpetkovski): replace with upstream struct after dependency update. type TSDBStatus struct { + Tenant string `json:"tenant"` HeadStats v1.HeadStats `json:"headStats"` SeriesCountByMetricName []Stat `json:"seriesCountByMetricName"` LabelValueCountByLabelName []Stat `json:"labelValueCountByLabelName"` @@ -58,23 +64,19 @@ type TSDBStatus struct { SeriesCountByLabelValuePair []Stat `json:"seriesCountByLabelValuePair"` } -type GetStatsFunc func(r *http.Request, statsByLabelName string) (*tsdb.Stats, error) +type GetStatsFunc func(r *http.Request, statsByLabelName string) ([]TenantStats, *api.ApiError) type Options struct { GetStats GetStatsFunc - Registry *prometheus.Registry } -// TODO(fpetkovski): replace with upstream struct after dependency update. type StatusAPI struct { getTSDBStats GetStatsFunc - options Options } func New(opts Options) *StatusAPI { return &StatusAPI{ getTSDBStats: opts.GetStats, - options: opts, } } @@ -84,43 +86,30 @@ func (sapi *StatusAPI) Register(r *route.Router, tracer opentracing.Tracer, logg } func (sapi *StatusAPI) httpServeStats(r *http.Request) (interface{}, []error, *api.ApiError) { - s, err := sapi.getTSDBStats(r, labels.MetricName) + stats, err := sapi.getTSDBStats(r, labels.MetricName) if err != nil { - return nil, nil, &api.ApiError{Typ: api.ErrorInternal, Err: err} + return nil, nil, err } - if s == nil { + if stats == nil { return nil, nil, &api.ApiError{Typ: api.ErrorBadData, Err: fmt.Errorf("unknown tenant")} } - metrics, err := sapi.options.Registry.Gather() - if err != nil { - return nil, []error{err}, nil - } - - chunkCount := int64(math.NaN()) - for _, mF := range metrics { - if *mF.Name == "prometheus_tsdb_head_chunks" { - m := *mF.Metric[0] - if m.Gauge != nil { - chunkCount = int64(m.Gauge.GetValue()) - break - } - } + result := make([]TSDBStatus, 0, len(stats)) + for _, s := range stats { + result = append(result, TSDBStatus{ + Tenant: s.Tenant, + HeadStats: v1.HeadStats{ + NumSeries: s.Stats.NumSeries, + MinTime: s.Stats.MinTime, + MaxTime: s.Stats.MaxTime, + NumLabelPairs: s.Stats.IndexPostingStats.NumLabelPairs, + }, + SeriesCountByMetricName: convertStats(s.Stats.IndexPostingStats.CardinalityMetricsStats), + LabelValueCountByLabelName: convertStats(s.Stats.IndexPostingStats.CardinalityLabelStats), + MemoryInBytesByLabelName: convertStats(s.Stats.IndexPostingStats.LabelValueStats), + SeriesCountByLabelValuePair: convertStats(s.Stats.IndexPostingStats.LabelValuePairsStats), + }) } - - return TSDBStatus{ - HeadStats: v1.HeadStats{ - NumSeries: s.NumSeries, - ChunkCount: chunkCount, - MinTime: s.MinTime, - MaxTime: s.MaxTime, - NumLabelPairs: s.IndexPostingStats.NumLabelPairs, - }, - SeriesCountByMetricName: convertStats(s.IndexPostingStats.CardinalityMetricsStats), - LabelValueCountByLabelName: convertStats(s.IndexPostingStats.CardinalityLabelStats), - MemoryInBytesByLabelName: convertStats(s.IndexPostingStats.LabelValueStats), - SeriesCountByLabelValuePair: convertStats(s.IndexPostingStats.LabelValuePairsStats), - }, nil, nil - + return result, nil, nil } diff --git a/pkg/receive/handler.go b/pkg/receive/handler.go index 8627d158d66..6732b274d80 100644 --- a/pkg/receive/handler.go +++ b/pkg/receive/handler.go @@ -17,6 +17,8 @@ import ( "sync" "time" + "github.com/thanos-io/thanos/pkg/api" + statusapi "github.com/thanos-io/thanos/pkg/api/status" "github.com/thanos-io/thanos/pkg/logging" @@ -57,6 +59,8 @@ const ( DefaultTenantLabel = "tenant_id" // DefaultReplicaHeader is the default header used to designate the replica count of a write request. DefaultReplicaHeader = "THANOS-REPLICA" + // AllTenantsQueryParam is the query parameter for getting TSDB stats for all tenants + AllTenantsQueryParam = "all_tenants" // Labels for metrics. labelSuccess = "success" labelError = "error" @@ -95,7 +99,7 @@ type Options struct { DialOpts []grpc.DialOption ForwardTimeout time.Duration RelabelConfigs []*relabel.Config - GetTSDBStats GetStatsFunc + TSDBStats TSDBStats } // Handler serves a Prometheus remote write receiving HTTP endpoint. @@ -226,7 +230,6 @@ func NewHandler(logger log.Logger, o *Options) *Handler { statusAPI := statusapi.New(statusapi.Options{ GetStats: h.getStats, - Registry: o.Registry, }) statusAPI.Register(h.router, o.Tracer, logger, ins, logging.NewHTTPServerMiddleware(logger)) @@ -271,17 +274,32 @@ func (h *Handler) testReady(f http.HandlerFunc) http.HandlerFunc { } } -func (h *Handler) getStats(r *http.Request, statsByLabelName string) (*tsdb.Stats, error) { +func (h *Handler) getStats(r *http.Request, statsByLabelName string) ([]statusapi.TenantStats, *api.ApiError) { if !h.isReady() { - return nil, fmt.Errorf("service unavailable") + return nil, &api.ApiError{Typ: api.ErrorInternal, Err: fmt.Errorf("service unavailable")} } tenantID := r.Header.Get(h.options.TenantHeader) + getAllTenantStats := r.FormValue(AllTenantsQueryParam) == "true" + if getAllTenantStats && tenantID != "" { + err := fmt.Errorf("using both the %s parameter and the %s header is not supported", AllTenantsQueryParam, h.options.TenantHeader) + return nil, &api.ApiError{Typ: api.ErrorBadData, Err: err} + } + + if getAllTenantStats { + return h.options.TSDBStats.AllTenantStats(statsByLabelName), nil + } + if tenantID == "" { tenantID = h.options.DefaultTenantID } - return h.options.GetTSDBStats(tenantID, statsByLabelName), nil + stats := h.options.TSDBStats.SingleTenantStats(tenantID, statsByLabelName) + if stats == nil { + return nil, nil + } + + return []statusapi.TenantStats{*stats}, nil } // Close stops the Handler. diff --git a/pkg/receive/multitsdb.go b/pkg/receive/multitsdb.go index e98cded8565..50deefaa349 100644 --- a/pkg/receive/multitsdb.go +++ b/pkg/receive/multitsdb.go @@ -12,6 +12,8 @@ import ( "sync" "time" + "github.com/thanos-io/thanos/pkg/api/status" + "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/pkg/errors" @@ -32,7 +34,10 @@ import ( "github.com/thanos-io/thanos/pkg/store/labelpb" ) -type GetStatsFunc func(tenantID, statsByLabelName string) *tsdb.Stats +type TSDBStats interface { + SingleTenantStats(tenantID, statsByLabelName string) *status.TenantStats + AllTenantStats(statsByLabelName string) []status.TenantStats +} type MultiTSDB struct { dataDir string @@ -375,7 +380,7 @@ func (t *MultiTSDB) TSDBExemplars() map[string]*exemplars.TSDB { return res } -func (t *MultiTSDB) Stats(tenantID, statsByLabelName string) *tsdb.Stats { +func (t *MultiTSDB) SingleTenantStats(tenantID, statsByLabelName string) *status.TenantStats { t.mtx.RLock() defer t.mtx.RUnlock() @@ -384,7 +389,39 @@ func (t *MultiTSDB) Stats(tenantID, statsByLabelName string) *tsdb.Stats { return nil } - return tenant.readyS.get().db.Head().Stats(statsByLabelName) + stats := tenant.readyS.get().db.Head().Stats(statsByLabelName) + return &status.TenantStats{ + Tenant: tenantID, + Stats: stats, + } +} + +func (t *MultiTSDB) AllTenantStats(statsByLabelName string) []status.TenantStats { + t.mtx.RLock() + defer t.mtx.RUnlock() + + var ( + mu sync.Mutex + wg sync.WaitGroup + result = make([]status.TenantStats, 0, len(t.tenants)) + ) + for tenantID, tenantInstance := range t.tenants { + wg.Add(1) + go func(tenantID string, tenantInstance *tenant) { + defer wg.Done() + stats := tenantInstance.readyS.get().db.Head().Stats(statsByLabelName) + + mu.Lock() + defer mu.Unlock() + result = append(result, status.TenantStats{ + Tenant: tenantID, + Stats: stats, + }) + }(tenantID, tenantInstance) + } + wg.Wait() + + return result } func (t *MultiTSDB) startTSDB(logger log.Logger, tenantID string, tenant *tenant) error {