From 7a6f86c1d44ab5d9577949d3915d613ff4403570 Mon Sep 17 00:00:00 2001 From: Dan Upton Date: Wed, 27 Apr 2022 17:17:31 +0100 Subject: [PATCH] Upgrade Raft to v1.3.9 for saturation metrics (#12865) --- .changelog/12865.txt | 3 +++ go.mod | 2 +- go.sum | 4 ++-- website/content/docs/agent/telemetry.mdx | 22 ++++++++++++++++++++++ 4 files changed, 28 insertions(+), 3 deletions(-) create mode 100644 .changelog/12865.txt diff --git a/.changelog/12865.txt b/.changelog/12865.txt new file mode 100644 index 000000000000..73a4bb9c3bcc --- /dev/null +++ b/.changelog/12865.txt @@ -0,0 +1,3 @@ +```release-note:improvement +telemetry: Added `consul.raft.thread.main.saturation` and `consul.raft.thread.fsm.saturation` metrics to measure approximate saturation of the Raft goroutines +``` diff --git a/go.mod b/go.mod index b3fb5807ed56..0d42d990d0ce 100644 --- a/go.mod +++ b/go.mod @@ -53,7 +53,7 @@ require ( github.com/hashicorp/hcl v1.0.0 github.com/hashicorp/hil v0.0.0-20200423225030-a18a1cd20038 github.com/hashicorp/memberlist v0.3.1 - github.com/hashicorp/raft v1.3.8 + github.com/hashicorp/raft v1.3.9 github.com/hashicorp/raft-autopilot v0.1.6 github.com/hashicorp/raft-boltdb v0.0.0-20211202195631-7d34b9fb3f42 // indirect github.com/hashicorp/raft-boltdb/v2 v2.2.2 diff --git a/go.sum b/go.sum index ca0e61768be5..c0075f2f7a5a 100644 --- a/go.sum +++ b/go.sum @@ -368,8 +368,8 @@ github.com/hashicorp/memberlist v0.3.1/go.mod h1:MS2lj3INKhZjWNqd3N0m3J+Jxf3DAOn github.com/hashicorp/raft v1.1.0/go.mod h1:4Ak7FSPnuvmb0GV6vgIAJ4vYT4bek9bb6Q+7HVbyzqM= github.com/hashicorp/raft v1.1.1/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8= github.com/hashicorp/raft v1.2.0/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8= -github.com/hashicorp/raft v1.3.8 h1:lrhx4wesQLOSv3ERX/pK4cwfzQ0J2RgzsvAkBxHe1bA= -github.com/hashicorp/raft v1.3.8/go.mod h1:4Ak7FSPnuvmb0GV6vgIAJ4vYT4bek9bb6Q+7HVbyzqM= +github.com/hashicorp/raft v1.3.9 h1:9yuo1aR0bFTr1cw7pj3S2Bk6MhJCsnr2NAxvIBrP2x4= 
+github.com/hashicorp/raft v1.3.9/go.mod h1:4Ak7FSPnuvmb0GV6vgIAJ4vYT4bek9bb6Q+7HVbyzqM= github.com/hashicorp/raft-autopilot v0.1.6 h1:C1q3RNF2FfXNZfHWbvVAu0QixaQK8K5pX4O5lh+9z4I= github.com/hashicorp/raft-autopilot v0.1.6/go.mod h1:Af4jZBwaNOI+tXfIqIdbcAnh/UyyqIMj/pOISIfhArw= github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea/go.mod h1:pNv7Wc3ycL6F5oOWn+tPGo2gWD4a5X+yp/ntwdKLjRk= diff --git a/website/content/docs/agent/telemetry.mdx b/website/content/docs/agent/telemetry.mdx index 5c40a7242b52..cd33454991e7 100644 --- a/website/content/docs/agent/telemetry.mdx +++ b/website/content/docs/agent/telemetry.mdx @@ -149,6 +149,28 @@ you will need to apply a function such as InfluxDB's [`non_negative_difference() Sudden large changes to the `consul.client.rpc` metrics (greater than 50% deviation from baseline). `consul.client.rpc.exceeded` or `consul.client.rpc.failed` count > 0, as it implies that an agent is being rate-limited or fails to make an RPC request to a Consul server +### Raft Thread Saturation + +| Metric Name | Description | Unit | Type | +| :----------------------------------- | :----------------------------------------------------------------------------------------------------------------------- | :--------- | :----- | +| `consul.raft.thread.main.saturation` | An approximate measurement of the proportion of time the main Raft goroutine is busy and unavailable to accept new work. | percentage | sample | +| `consul.raft.thread.fsm.saturation` | An approximate measurement of the proportion of time the Raft FSM goroutine is busy and unavailable to accept new work. | percentage | sample | + +**Why they're important:** These measurements are a useful proxy for how much +capacity a Consul server has to accept additional write load. High saturation +of the Raft goroutines can lead to elevated latency in the rest of the system +and cause cluster instability. 
+ +**What to look for:** Generally, a server's steady-state saturation should be +less than 50%. + +**NOTE:** These metrics are approximate and under extremely heavy load won't +give a perfect fine-grained view of how much headroom a server has available. +Instead, treat them as an early warning sign. + +**Requirements:** +* Consul 1.13.0+ + ### Raft Replication Capacity Issues | Metric Name | Description | Unit | Type |