Skip to content

Commit

Permalink
fix(healthcheck): log ping errors
Browse files Browse the repository at this point in the history
  • Loading branch information
karol-kokoszka committed May 13, 2024
1 parent bff2401 commit 032ea2c
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 12 deletions.
12 changes: 8 additions & 4 deletions pkg/service/healthcheck/metrics_test.go
Expand Up @@ -6,6 +6,7 @@ import (
"testing"

"github.com/prometheus/client_golang/prometheus"
"github.com/scylladb/go-log"

"github.com/scylladb/scylla-manager/v3/pkg/util/uuid"
)
Expand All @@ -27,10 +28,13 @@ func TestRemoveClusterMetricsWhenNumberOfMetricsExceedsDefaultChannelLength_2843
}
metric.With(hl).Set(1)
}
r := runner{metrics: &runnerMetrics{
status: metric,
rtt: metric,
}}
r := runner{
logger: log.NewDevelopment(),
metrics: &runnerMetrics{
status: metric,
rtt: metric,
},
}

r.removeMetricsForCluster(clusterID)
}
14 changes: 6 additions & 8 deletions pkg/service/healthcheck/runner.go
Expand Up @@ -9,6 +9,7 @@ import (

"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/scylladb/go-log"
"github.com/scylladb/go-set/strset"

"github.com/scylladb/scylla-manager/v3/pkg/scyllaclient"
Expand Down Expand Up @@ -43,6 +44,7 @@ func (r Runner) Run(ctx context.Context, clusterID, taskID, runID uuid.UUID, pro
}

type runner struct {
logger log.Logger
scyllaClient scyllaclient.ProviderFunc
timeout time.Duration
metrics *runnerMetrics
Expand Down Expand Up @@ -91,13 +93,7 @@ func (r runner) checkHosts(ctx context.Context, clusterID uuid.UUID, status []sc

rtt, err := r.ping(ctx, clusterID, status[i].Addr, r.timeout)
if err != nil {
// Set -2 for unavailable agent and -1 for unavailable Scylla
_, err := r.pingAgent(ctx, clusterID, status[i].Addr, r.timeout)
if err != nil {
r.metrics.status.With(hl).Set(-2)
} else {
r.metrics.status.With(hl).Set(-1)
}
r.metrics.status.With(hl).Set(-1)
} else {
r.metrics.status.With(hl).Set(1)
}
Expand All @@ -106,7 +102,9 @@ func (r runner) checkHosts(ctx context.Context, clusterID uuid.UUID, status []sc
return nil
}

_ = parallel.Run(len(status), parallel.NoLimit, f, parallel.NopNotify) // nolint: errcheck
// The aggregate result of parallel.Run is deliberately discarded; every
// per-host failure is reported through the notify callback below instead.
_ = parallel.Run(len(status), parallel.NoLimit, f, func(i int, err error) { // nolint: errcheck
	// Label the address with an explicit "host" key: an empty key would
	// leave the failing host's address unlabeled in the log entry.
	r.logger.Error(ctx, "Parallel hosts check failed", "host", status[i].Addr, "error", err)
})
}

func (r runner) removeMetricsForCluster(clusterID uuid.UUID) {
Expand Down
3 changes: 3 additions & 0 deletions pkg/service/healthcheck/service.go
Expand Up @@ -56,6 +56,7 @@ func NewService(config Config, scyllaClient scyllaclient.ProviderFunc, secretsSt
func (s *Service) Runner() Runner {
return Runner{
cql: runner{
logger: s.logger.Named("CQL healthcheck"),
scyllaClient: s.scyllaClient,
timeout: s.config.MaxTimeout,
metrics: &runnerMetrics{
Expand All @@ -66,6 +67,7 @@ func (s *Service) Runner() Runner {
pingAgent: s.pingAgent,
},
rest: runner{
logger: s.logger.Named("REST healthcheck"),
scyllaClient: s.scyllaClient,
timeout: s.config.MaxTimeout,
metrics: &runnerMetrics{
Expand All @@ -76,6 +78,7 @@ func (s *Service) Runner() Runner {
pingAgent: s.pingAgent,
},
alternator: runner{
logger: s.logger.Named("Alternator healthcheck"),
scyllaClient: s.scyllaClient,
timeout: s.config.MaxTimeout,
metrics: &runnerMetrics{
Expand Down

0 comments on commit 032ea2c

Please sign in to comment.