Skip to content

Commit

Permalink
roachtest: retry fetching timeseries data in overload test
Browse files Browse the repository at this point in the history
A test failure has been observed with the following error:

```
503 Service Unavailable, content-type: application/json, body: {
                  "error": "all SubConns are in TransientFailure, latest connection error: connection error: desc = \"transport: failed to write client preface: io: read/write on closed pipe\"",
                  "message": "all SubConns are in TransientFailure, latest connection error: connection error: desc = \"transport: failed to write client preface: io: read/write on closed pipe\"",
                  "code": 14,
                  "details": [
                  ]
                }, error: <nil>
```

That error makes some sense if the DefaultClass connection has failed
for some reason. In the fullness of time we should get to the bottom
of why these gRPC connections still close when overloaded. We had
hoped that cockroachdb#39041 would be the end of that, but unfortunately
it still seems to happen sometimes. That being said, the situation
resolves itself rapidly when the load stops. This PR adds a retry
loop to make the system robust to these transient failures.

It's also possible that we should run timeseries queries as
SystemClass operations, but I'll leave that for another
change.

Release note: None
  • Loading branch information
ajwerner authored and nvanbenschoten committed Aug 26, 2019
1 parent f8ec52d commit b02970e
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 13 deletions.
25 changes: 19 additions & 6 deletions pkg/cmd/roachtest/overload_tpcc_olap.go
Expand Up @@ -16,6 +16,8 @@ import (
"strings"
"time"

"github.com/cockroachdb/cockroach/pkg/ts/tspb"
"github.com/cockroachdb/cockroach/pkg/util/retry"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
)

Expand Down Expand Up @@ -73,12 +75,23 @@ func verifyNodeLiveness(ctx context.Context, c *cluster, t *test, runDuration ti
const maxFailures = 10
adminURLs := c.ExternalAdminUIAddr(ctx, c.Node(1))
now := timeutil.Now()
response := getMetrics(t, adminURLs[0], now.Add(-runDuration), now, []tsQuery{
{
name: "cr.node.liveness.heartbeatfailures",
queryType: total,
},
})
var response tspb.TimeSeriesQueryResponse
// Retry because timeseries queries can fail if the underlying inter-node
// connections are in a failed state which can happen due to overload.
// Now that the load has stopped, this should resolve itself soon.
if err := retry.WithMaxAttempts(ctx, retry.Options{
MaxBackoff: 500 * time.Millisecond,
}, 3, func() (err error) {
response, err = getMetrics(adminURLs[0], now.Add(-runDuration), now, []tsQuery{
{
name: "cr.node.liveness.heartbeatfailures",
queryType: total,
},
})
return err
}); err != nil {
t.Fatalf("failed to fetch liveness metrics: %v", err)
}
if len(response.Results[0].Datapoints) <= 1 {
t.Fatalf("not enough datapoints in timeseries query response: %+v", response)
}
Expand Down
23 changes: 16 additions & 7 deletions pkg/cmd/roachtest/ts_util.go
Expand Up @@ -40,9 +40,19 @@ type tsQuery struct {
queryType tsQueryType
}

func getMetrics(
// mustGetMetrics is the fail-fast variant of getMetrics: it forwards
// adminURL, the [start, end] window and tsQueries to getMetrics and returns
// the response. If getMetrics reports an error, the error is passed to
// t.Fatal instead of being returned to the caller.
func mustGetMetrics(
	t *test, adminURL string, start, end time.Time, tsQueries []tsQuery,
) tspb.TimeSeriesQueryResponse {
	resp, err := getMetrics(adminURL, start, end, tsQueries)
	if err == nil {
		return resp
	}
	t.Fatal(err)
	return resp
}

func getMetrics(
adminURL string, start, end time.Time, tsQueries []tsQuery,
) (tspb.TimeSeriesQueryResponse, error) {
url := "http://" + adminURL + "/ts/query"
queries := make([]tspb.Query, len(tsQueries))
for i := 0; i < len(tsQueries); i++ {
Expand Down Expand Up @@ -74,10 +84,9 @@ func getMetrics(
Queries: queries,
}
var response tspb.TimeSeriesQueryResponse
if err := httputil.PostJSON(http.Client{}, url, &request, &response); err != nil {
t.Fatal(err)
}
return response
err := httputil.PostJSON(http.Client{Timeout: 500 * time.Millisecond}, url, &request, &response)
return response, err

}

func verifyTxnPerSecond(
Expand All @@ -90,7 +99,7 @@ func verifyTxnPerSecond(
) {
// Query needed information over the timespan of the query.
adminURL := c.ExternalAdminUIAddr(ctx, adminNode)[0]
response := getMetrics(t, adminURL, start, end, []tsQuery{
response := mustGetMetrics(t, adminURL, start, end, []tsQuery{
{name: "cr.node.txn.commits", queryType: rate},
{name: "cr.node.txn.commits", queryType: total},
})
Expand Down Expand Up @@ -137,7 +146,7 @@ func verifyLookupsPerSec(
) {
// Query needed information over the timespan of the query.
adminURL := c.ExternalAdminUIAddr(ctx, adminNode)[0]
response := getMetrics(t, adminURL, start, end, []tsQuery{
response := mustGetMetrics(t, adminURL, start, end, []tsQuery{
{name: "cr.node.distsender.rangelookups", queryType: rate},
})

Expand Down

0 comments on commit b02970e

Please sign in to comment.