Vault 6773/raft rejoin nonvoter (#16324)
* raft: Ensure init before setting suffrage

As reported in https://hashicorp.atlassian.net/browse/VAULT-6773:

	The /sys/storage/raft/join endpoint is intended to be unauthenticated. We rely
	on the seal to manage trust.

	It’s possible to use multiple join requests to switch nodes from voter to
	non-voter. The screenshot attached to the ticket shows a 3-node cluster where
	vault_2 is the leader, and vault_3 and vault_4 are followers with non-voter
	status set to false. We sent two requests to the raft join endpoint to have
	vault_3 and vault_4 rejoin the cluster with non_voter:true.

This commit fixes the issue by delaying the call to SetDesiredSuffrage until after
the initialization check, preventing unauthenticated mangling of voter status.
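For illustration, here is a minimal self-contained sketch of that ordering using stub types (the names and fields are stand-ins, not Vault's actual implementation): once a node reports initialized, the join path returns before any suffrage change can occur.

```go
package main

import (
	"errors"
	"fmt"
)

// backend is a stand-in for Vault's raft backend; illustrative only.
type backend struct {
	initialized bool
	nonVoter    bool
}

func (b *backend) SetDesiredSuffrage(nonVoter bool) error {
	b.nonVoter = nonVoter
	return nil
}

// join mirrors the corrected ordering: check initialization first, and only
// then honor the requested suffrage.
func (b *backend) join(nonVoter bool) error {
	if b.initialized {
		// Already part of a cluster: bail out before touching suffrage, so an
		// unauthenticated re-join cannot demote a voter.
		return errors.New("node is initialized; nothing to join")
	}
	return b.SetDesiredSuffrage(nonVoter)
}

func main() {
	b := &backend{initialized: true}
	fmt.Println(b.join(true))            // join refused
	fmt.Println("nonVoter:", b.nonVoter) // still false
}
```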

Tested locally using
https://github.com/hashicorp/vault-tools/blob/main/users/ncabatoff/cluster/raft.sh
and the reproducer outlined in VAULT-6773.
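For reference, a rough sketch of the reproducer's shape, assuming a local cluster with hypothetical listener addresses (the join endpoint deliberately requires no token; `non_voter` is the request field the endpoint accepts):

```go
package main

import (
	"bytes"
	"fmt"
	"net/http"
)

func main() {
	// Ask two already-joined followers to "join" again, this time as non-voters.
	// Addresses are hypothetical stand-ins for a local 3-node cluster.
	body := []byte(`{"leader_api_addr": "http://127.0.0.1:8200", "non_voter": true}`)
	for _, follower := range []string{"http://127.0.0.1:8202", "http://127.0.0.1:8204"} {
		resp, err := http.Post(follower+"/v1/sys/storage/raft/join", "application/json", bytes.NewReader(body))
		if err != nil {
			fmt.Println("join request failed:", err)
			continue
		}
		resp.Body.Close()
		// Before this fix, these unauthenticated calls flipped the nodes'
		// desired suffrage to non-voter even though they were already members.
		fmt.Println(follower, "->", resp.Status)
	}
}
```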

* raft: Return join err on failure

This is necessary to correctly distinguish errors returned from the Join
workflow. Previously, the join path always returned a wrapped ctx.Err(), so
real join failures were masked as timeouts.
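Simplified, the corrected pattern records whichever error actually occurred in the select and returns it, rather than unconditionally wrapping ctx.Err() (which is nil until the context is done). A minimal sketch, with a plain error channel standing in for the real challenge-answering flow:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// join returns the error that actually happened: a context error on timeout
// or cancellation, otherwise the result of the join attempt itself.
func join(ctx context.Context, challengeCh <-chan error) error {
	var err error
	select {
	case <-ctx.Done():
		err = ctx.Err() // a genuine timeout or cancellation
	case joinErr := <-challengeCh:
		err = joinErr // the real join failure, no longer masked as a timeout
	}
	return err
}

func main() {
	ch := make(chan error, 1)
	ch <- errors.New("leader rejected bootstrap challenge") // hypothetical failure
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	fmt.Println(join(ctx, ch)) // prints the join error, not a timeout
}
```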

* raft: Default autopilot parameters in teststorage

Change some defaults so we don't have to pass in parameters or set them
in the originating tests. These storage types are only used in two
places:

1) Raft HA testing
2) Seal migration testing

Both consumers have been tested and pass with this change.

* changelog: Unauthn voter status change bugfix
mpalmi committed Jul 18, 2022
1 parent 228a26c commit cc64092
Showing 4 changed files with 27 additions and 14 deletions.
3 changes: 3 additions & 0 deletions changelog/16324.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+storage/raft (enterprise): Prevent unauthenticated voter status change with rejoin
+```
8 changes: 5 additions & 3 deletions helper/testhelpers/teststorage/teststorage.go
@@ -137,9 +137,11 @@ func RaftHAFactory(f PhysicalBackendBundler) func(t testing.T, coreIdx int, logg
 
 		nodeID := fmt.Sprintf("core-%d", coreIdx)
 		backendConf := map[string]string{
-			"path":                   raftDir,
-			"node_id":                nodeID,
-			"performance_multiplier": "8",
+			"path":                         raftDir,
+			"node_id":                      nodeID,
+			"performance_multiplier":       "8",
+			"autopilot_reconcile_interval": "300ms",
+			"autopilot_update_interval":    "100ms",
 		}
 
 		// Create and set the HA Backend
9 changes: 5 additions & 4 deletions helper/testhelpers/teststorage/teststorage_reusable.go
@@ -18,7 +18,6 @@ import (
 // seal migration, wherein a given physical backend must be re-used as several
 // test clusters are sequentially created, tested, and discarded.
 type ReusableStorage struct {
-
 	// IsRaft specifies whether the storage is using a raft backend.
 	IsRaft bool
 
@@ -169,9 +168,11 @@ func makeRaftDir(t testing.T) string {
 func makeReusableRaftBackend(t testing.T, coreIdx int, logger hclog.Logger, raftDir string, addressProvider raftlib.ServerAddressProvider, ha bool) *vault.PhysicalBackendBundle {
 	nodeID := fmt.Sprintf("core-%d", coreIdx)
 	conf := map[string]string{
-		"path":                   raftDir,
-		"node_id":                nodeID,
-		"performance_multiplier": "8",
+		"path":                         raftDir,
+		"node_id":                      nodeID,
+		"performance_multiplier":       "8",
+		"autopilot_reconcile_interval": "300ms",
+		"autopilot_update_interval":    "100ms",
 	}
 
 	backend, err := raft.NewRaftBackend(conf, logger)
21 changes: 14 additions & 7 deletions vault/raft.go
@@ -835,11 +835,6 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
 		return false, errors.New("raft backend not in use")
 	}
 
-	if err := raftBackend.SetDesiredSuffrage(nonVoter); err != nil {
-		c.logger.Error("failed to set desired suffrage for this node", "error", err)
-		return false, nil
-	}
-
 	init, err := c.InitializedLocally(ctx)
 	if err != nil {
 		return false, fmt.Errorf("failed to check if core is initialized: %w", err)
@@ -963,10 +958,21 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
 		if err != nil {
 			return fmt.Errorf("failed to check if core is initialized: %w", err)
 		}
-		if init && !isRaftHAOnly {
+
+		// InitializedLocally will return non-nil before HA backends are
+		// initialized. c.Initialized(ctx) checks InitializedLocally first, so
+		// we can't use that generically for both cases. Instead check
+		// raftBackend.Initialized() directly for the HA-Only case.
+		if (!isRaftHAOnly && init) || (isRaftHAOnly && raftBackend.Initialized()) {
 			c.logger.Info("returning from raft join as the node is initialized")
 			return nil
 		}
+
+		if err := raftBackend.SetDesiredSuffrage(nonVoter); err != nil {
+			c.logger.Error("failed to set desired suffrage for this node", "error", err)
+			return nil
+		}
+
 		challengeCh := make(chan *raftInformation)
 		var expandedJoinInfos []*raft.LeaderJoinInfo
 		for _, leaderInfo := range leaderInfos {
@@ -1001,6 +1007,7 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
 
 			select {
 			case <-ctx.Done():
+				err = ctx.Err()
 			case raftInfo := <-challengeCh:
 				if raftInfo != nil {
 					err = answerChallenge(ctx, raftInfo)
@@ -1009,7 +1016,7 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
 				}
 			}
 		}
-		return fmt.Errorf("timed out on raft join: %w", ctx.Err())
+		return err
 	}
 
 	switch retryFailures {
