diff --git a/pkg/cmd/roachtest/overload_tpcc_olap.go b/pkg/cmd/roachtest/overload_tpcc_olap.go index 007b3ad61de5..ed72d0a94d39 100644 --- a/pkg/cmd/roachtest/overload_tpcc_olap.go +++ b/pkg/cmd/roachtest/overload_tpcc_olap.go @@ -16,6 +16,8 @@ import ( "strings" "time" + "github.com/cockroachdb/cockroach/pkg/ts/tspb" + "github.com/cockroachdb/cockroach/pkg/util/retry" "github.com/cockroachdb/cockroach/pkg/util/timeutil" ) @@ -73,12 +75,23 @@ func verifyNodeLiveness(ctx context.Context, c *cluster, t *test, runDuration ti const maxFailures = 10 adminURLs := c.ExternalAdminUIAddr(ctx, c.Node(1)) now := timeutil.Now() - response := getMetrics(t, adminURLs[0], now.Add(-runDuration), now, []tsQuery{ - { - name: "cr.node.liveness.heartbeatfailures", - queryType: total, - }, - }) + var response tspb.TimeSeriesQueryResponse + // Retry because timeseries queries can fail if the underlying inter-node + // connections are in a failed state which can happen due to overload. + // Now that the load has stopped, this should resolve itself soon. + if err := retry.WithMaxAttempts(ctx, retry.Options{ + MaxBackoff: 500 * time.Millisecond, + }, 3, func() (err error) { + response, err = getMetrics(adminURLs[0], now.Add(-runDuration), now, []tsQuery{ + { + name: "cr.node.liveness.heartbeatfailures", + queryType: total, + }, + }) + return err + }); err != nil { + t.Fatalf("failed to fetch liveness metrics: %v", err) + } if len(response.Results[0].Datapoints) <= 1 { t.Fatalf("not enough datapoints in timeseries query response: %+v", response) } diff --git a/pkg/cmd/roachtest/ts_util.go b/pkg/cmd/roachtest/ts_util.go index 454c99bca74e..e87663c1c2d8 100644 --- a/pkg/cmd/roachtest/ts_util.go +++ b/pkg/cmd/roachtest/ts_util.go @@ -40,9 +40,19 @@ type tsQuery struct { queryType tsQueryType } -func getMetrics( +func mustGetMetrics( t *test, adminURL string, start, end time.Time, tsQueries []tsQuery, ) tspb.TimeSeriesQueryResponse { + response, err := getMetrics(adminURL, start, end, tsQueries) + if err != nil { + t.Fatal(err) + } + return response +} + +func getMetrics( + adminURL string, start, end time.Time, tsQueries []tsQuery, +) (tspb.TimeSeriesQueryResponse, error) { url := "http://" + adminURL + "/ts/query" queries := make([]tspb.Query, len(tsQueries)) for i := 0; i < len(tsQueries); i++ { @@ -74,10 +84,9 @@ func getMetrics( Queries: queries, } var response tspb.TimeSeriesQueryResponse - if err := httputil.PostJSON(http.Client{}, url, &request, &response); err != nil { - t.Fatal(err) - } - return response + err := httputil.PostJSON(http.Client{Timeout: 500 * time.Millisecond}, url, &request, &response) + return response, err + } func verifyTxnPerSecond( @@ -90,7 +99,7 @@ func verifyTxnPerSecond( ) { // Query needed information over the timespan of the query. adminURL := c.ExternalAdminUIAddr(ctx, adminNode)[0] - response := getMetrics(t, adminURL, start, end, []tsQuery{ + response := mustGetMetrics(t, adminURL, start, end, []tsQuery{ {name: "cr.node.txn.commits", queryType: rate}, {name: "cr.node.txn.commits", queryType: total}, }) @@ -137,7 +146,7 @@ func verifyLookupsPerSec( ) { // Query needed information over the timespan of the query. adminURL := c.ExternalAdminUIAddr(ctx, adminNode)[0] - response := getMetrics(t, adminURL, start, end, []tsQuery{ + response := mustGetMetrics(t, adminURL, start, end, []tsQuery{ {name: "cr.node.distsender.rangelookups", queryType: rate}, })