Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

check if server is in configuration when receiving a voteRequest #526

Merged
merged 5 commits into from Oct 3, 2022
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
11 changes: 11 additions & 0 deletions configuration.go
Expand Up @@ -177,6 +177,17 @@ func hasVote(configuration Configuration, id ServerID) bool {
return false
}

// inConfiguration returns true if the server identified by 'id' is in in the
// provided Configuration.
func inConfiguration(configuration Configuration, id ServerID) bool {
for _, server := range configuration.Servers {
if server.ID == id {
return true
}
}
return false
}

// checkConfiguration tests a cluster membership configuration for common
// errors.
func checkConfiguration(configuration Configuration) error {
Expand Down
14 changes: 12 additions & 2 deletions raft.go
Expand Up @@ -1566,8 +1566,8 @@ func (r *Raft) requestVote(rpc RPC, req *RequestVoteRequest) {
candidateID := ServerID(req.ID)
// if the Servers list is empty that mean the cluster is very likely trying to bootstrap,
// Grant the vote
if len(r.configurations.latest.Servers) > 0 && !hasVote(r.configurations.latest, candidateID) {
r.logger.Warn("rejecting vote request since node is not a voter",
if len(r.configurations.latest.Servers) > 0 && !inConfiguration(r.configurations.latest, candidateID) {
r.logger.Warn("rejecting vote request since node is not in configuration",
"from", candidate)
return
}
Expand All @@ -1594,6 +1594,16 @@ func (r *Raft) requestVote(rpc RPC, req *RequestVoteRequest) {
resp.Term = req.Term
}

// if we get a request for vote from a nonVoter and the request term is higher,
// step down and update term, but reject the vote request
// This could happen when a node, previously voter, is converted to non-voter
Comment on lines +1597 to +1599
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think it's worth a note or link back to this issue here? I suspect reading this I'd be puzzled about why we allow a non-voter to "disrupt" leaders at all because the nuance involved in this issue is high! Ideally people would view blame before changing this back to ignore votes from non-voters but we could potentially use this comment to flag the non-intuitive behaviour is for a good reason?

if len(req.ID) > 0 {
candidateID := ServerID(req.ID)
if len(r.configurations.latest.Servers) > 0 && !hasVote(r.configurations.latest, candidateID) {
r.logger.Warn("rejecting vote request since node is not a voter", "from", candidate)
return
}
}
// Check if we have voted yet
lastVoteTerm, err := r.stable.GetUint64(keyLastVoteTerm)
if err != nil && err.Error() != "not found" {
Expand Down
90 changes: 90 additions & 0 deletions raft_test.go
Expand Up @@ -2644,6 +2644,96 @@ func TestRaft_VoteNotGranted_WhenNodeNotInCluster(t *testing.T) {
}
}

func TestRaft_ClusterCanRegainStability_WhenNonVoterWithHigherTermJoin(t *testing.T) {
// Make a cluster
c := MakeCluster(3, t, nil)

defer c.Close()

// Get the leader
leader := c.Leader()

// Wait until we have 2 followers
limit := time.Now().Add(c.longstopTimeout)
var followers []*Raft
for time.Now().Before(limit) && len(followers) != 2 {
c.WaitEvent(nil, c.conf.CommitTimeout)
followers = c.GetInState(Follower)
}
if len(followers) != 2 {
t.Fatalf("expected two followers: %v", followers)
}
dhiaayachi marked this conversation as resolved.
Show resolved Hide resolved

// Remove a follower
followerRemoved := followers[0]
c.Disconnect(followerRemoved.localAddr)
time.Sleep(c.propagateTimeout)

future := leader.RemoveServer(followerRemoved.localID, 0, 0)
if err := future.Error(); err != nil {
t.Fatalf("err: %v", err)
}

//set that follower term to higher term to faster simulate a partitioning
newTerm := leader.getCurrentTerm() + 20
followerRemoved.setCurrentTerm(newTerm)
//Add the node back as NonVoter
future = leader.AddNonvoter(followerRemoved.localID, followerRemoved.localAddr, 0, 0)
if err := future.Error(); err != nil {
t.Fatalf("err: %v", err)
}

c.FullyConnect()

// Wait a while
time.Sleep(c.propagateTimeout)
// Check the term is now a new term
leader = c.Leader()
currentTerm := leader.getCurrentTerm()
if newTerm > currentTerm {
t.Fatalf("term should have changed,%d < %d", newTerm, currentTerm)
}

// check nonVoter is not elected
if leader.localID == followerRemoved.localID {
t.Fatalf("Should not be leader %s", followerRemoved.localID)
}

//Write some logs to ensure they replicate
for i := 0; i < 100; i++ {
future := leader.Apply([]byte(fmt.Sprintf("test%d", i)), 0)
if err := future.Error(); err != nil {
t.Fatalf("[ERR] apply err: %v", err)
}
}
c.WaitForReplication(100)

//Remove the server and add it back as Voter
future = leader.RemoveServer(followerRemoved.localID, 0, 0)
if err := future.Error(); err != nil {
t.Fatalf("err: %v", err)
}
leader.AddVoter(followerRemoved.localID, followerRemoved.localAddr, 0, 0)

// Wait a while
time.Sleep(c.propagateTimeout * 10)

//Write some logs to ensure they replicate
for i := 100; i < 200; i++ {
future := leader.Apply([]byte(fmt.Sprintf("test%d", i)), 0)
if err := future.Error(); err != nil {
t.Fatalf("[ERR] apply err: %v", err)
}
}
c.WaitForReplication(200)

// Check leader stable
newLeader := c.Leader()
if newLeader.leaderID != leader.leaderID {
t.Fatalf("leader changed")
}
}

// TestRaft_FollowerRemovalNoElection ensures that a leader election is not
// started when a standby is shut down and restarted.
func TestRaft_FollowerRemovalNoElection(t *testing.T) {
Expand Down