Skip to content

Commit

Permalink
Hopefully fix flaky Consul fencing test (#23280)
Browse files Browse the repository at this point in the history
* Hopefully fix flaky fencing test when run in Enterprise

* Fix typo
  • Loading branch information
banks committed Sep 28, 2023
1 parent 98e9d0c commit 9fc67b6
Showing 1 changed file with 52 additions and 0 deletions.
52 changes: 52 additions & 0 deletions vault/external_tests/consul_fencing_binary/consul_fencing_test.go
Expand Up @@ -68,6 +68,18 @@ func TestConsulFencing_PartitionedLeaderCantWrite(t *testing.T) {
})
require.NoError(t, err)

// We need to wait for the KV mount to be ready on all servers - it runs an
// "Upgrade" process on mount and will error until that's done. In practice
// that's so fast that in CE I've never seen it fail yet, but in Enterprise
// there is the added complexity of the upgrade running on the primary while
// standby nodes wait for it to complete. So in Enterprise the first PATCH
// sent to a standby often comes in before the standby has "seen" that the
// primary has completed the upgrade and fails causing the whole test to
// error. We can prevent that by taking a short loop here to wait for a read
// to succeed on the standby/non-leader (since KV-v2 does upgrade check on
// read too).
waitForKVv2Upgrade(t, ctx, notLeader.APIClient(), "/test")

// Start two background workers that will cause writes to Consul in the
// background. KV v2 relies on a single active node for correctness.
// Specifically its patch operation does a read-modify-write under a
Expand Down Expand Up @@ -263,6 +275,46 @@ func TestConsulFencing_PartitionedLeaderCantWrite(t *testing.T) {
require.Equal(t, expect1, sets[1], "Client 1 writes lost")
}

func waitForKVv2Upgrade(t *testing.T, ctx context.Context, client *api.Client, path string) {
t.Helper()

ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()

kv := client.KVv2(path)
attempts := 0
wait := 20 * time.Millisecond
for {
// Attempt to perform a write on the KVv2 mount. It will fail until the
// backend is done upgrading. If this is a performance standby in Ent then
// it will not complete until the primary is done upgrading AND the standby
// has noticed that!
_, err := kv.Put(ctx, "test-upgrade-done", map[string]interface{}{
"ok": 1,
})
if err == nil {
return
}
t.Logf("waitForKVv2Upgrade: write failed: %s", err)
select {
case <-ctx.Done():
t.Fatalf("context cancelled waiting for KVv2 (%s) upgrade to complete: %s",
path, ctx.Err())
return
case <-time.After(wait):
}
attempts++
// We don't quite want exponential backoff because it really should be fast,
// but just reduce log spam on failures if it's taking a while.
if attempts > 4 {
wait = 250 * time.Millisecond
}
if attempts > 10 {
wait = time.Second
}
}
}

func makeSet(n int) []int {
a := make([]int, n)
for i := 0; i < n; i++ {
Expand Down

0 comments on commit 9fc67b6

Please sign in to comment.