From 9fc67b66562eb76623112e42e6c4765eb108e257 Mon Sep 17 00:00:00 2001
From: Paul Banks
Date: Thu, 28 Sep 2023 13:25:52 +0100
Subject: [PATCH] Hopefully fix flaky Consul fencing test (#23280)

* Hopefully fix flaky fencing test when run in Enterprise

* Fix typo
---
 .../consul_fencing_test.go | 52 +++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/vault/external_tests/consul_fencing_binary/consul_fencing_test.go b/vault/external_tests/consul_fencing_binary/consul_fencing_test.go
index bf69a87144e99..0efaf7407a529 100644
--- a/vault/external_tests/consul_fencing_binary/consul_fencing_test.go
+++ b/vault/external_tests/consul_fencing_binary/consul_fencing_test.go
@@ -68,6 +68,18 @@ func TestConsulFencing_PartitionedLeaderCantWrite(t *testing.T) {
 	})
 	require.NoError(t, err)
 
+	// We need to wait for the KV mount to be ready on all servers - it runs an
+	// "Upgrade" process on mount and will error until that's done. In practice
+	// that's so fast that in CE I've never seen it fail yet, but in Enterprise
+	// there is the added complexity of the upgrade running on the primary while
+	// standby nodes wait for it to complete. So in Enterprise the first PATCH
+	// sent to a standby often arrives before the standby has "seen" that the
+	// primary has completed the upgrade, and it fails, causing the whole test
+	// to error. We can prevent that by looping briefly here, waiting for an
+	// operation to succeed on the standby/non-leader (since KV v2 does the
+	// upgrade check on reads too).
+	waitForKVv2Upgrade(t, ctx, notLeader.APIClient(), "/test")
+
 	// Start two background workers that will cause writes to Consul in the
 	// background. KV v2 relies on a single active node for correctness.
 	// Specifically its patch operation does a read-modify-write under a
@@ -263,6 +275,46 @@ func TestConsulFencing_PartitionedLeaderCantWrite(t *testing.T) {
 	require.Equal(t, expect1, sets[1], "Client 1 writes lost")
 }
 
+func waitForKVv2Upgrade(t *testing.T, ctx context.Context, client *api.Client, path string) {
+	t.Helper()
+
+	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
+	defer cancel()
+
+	kv := client.KVv2(path)
+	attempts := 0
+	wait := 20 * time.Millisecond
+	for {
+		// Attempt to perform a write on the KVv2 mount. It will fail until the
+		// backend is done upgrading. If this is a performance standby in Ent then
+		// it will not complete until the primary is done upgrading AND the standby
+		// has noticed that!
+		_, err := kv.Put(ctx, "test-upgrade-done", map[string]interface{}{
+			"ok": 1,
+		})
+		if err == nil {
+			return
+		}
+		t.Logf("waitForKVv2Upgrade: write failed: %s", err)
+		select {
+		case <-ctx.Done():
+			t.Fatalf("context cancelled waiting for KVv2 (%s) upgrade to complete: %s",
+				path, ctx.Err())
+			return
+		case <-time.After(wait):
+		}
+		attempts++
+		// We don't want full exponential backoff because this should be fast;
+		// just poll more slowly to reduce log spam if it's taking a while.
+		if attempts > 4 {
+			wait = 250 * time.Millisecond
+		}
+		if attempts > 10 {
+			wait = time.Second
+		}
+	}
+}
+
 func makeSet(n int) []int {
 	a := make([]int, n)
 	for i := 0; i < n; i++ {
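
Aside: the helper added above is an instance of a general "poll until ready,
with stepped backoff" pattern. Below is a minimal standalone sketch of that
pattern, assuming only the Go standard library; the names waitUntilReady and
probe are hypothetical illustrations, not code from the patch.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// waitUntilReady polls probe until it succeeds or ctx expires. Like the
// patch's helper, it starts with a short interval and lengthens it after a
// few attempts to cut log spam, rather than backing off exponentially.
func waitUntilReady(ctx context.Context, probe func(context.Context) error) error {
	wait := 20 * time.Millisecond
	for attempts := 0; ; attempts++ {
		if err := probe(ctx); err == nil {
			return nil
		}
		select {
		case <-ctx.Done():
			return fmt.Errorf("gave up waiting for readiness: %w", ctx.Err())
		case <-time.After(wait):
		}
		// Stepped, not exponential: stay fast early, then poll more slowly.
		switch {
		case attempts > 10:
			wait = time.Second
		case attempts > 4:
			wait = 250 * time.Millisecond
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	start := time.Now()
	err := waitUntilReady(ctx, func(context.Context) error {
		// Simulated dependency that becomes ready after two seconds.
		if time.Since(start) < 2*time.Second {
			return errors.New("still upgrading")
		}
		return nil
	})
	fmt.Println("ready:", err == nil)
}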