From 55d6b2d16a485845c05e05a64a40eacd055ba337 Mon Sep 17 00:00:00 2001 From: Dhia Ayachi Date: Tue, 21 Sep 2021 23:48:48 -0400 Subject: [PATCH 1/9] modify `TestRaft_RemoveFollower` to check removed node state --- raft_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/raft_test.go b/raft_test.go index 51db44a82..fd4f34dbd 100644 --- a/raft_test.go +++ b/raft_test.go @@ -697,6 +697,7 @@ func TestRaft_RemoveFollower(t *testing.T) { if configuration := c.getConfiguration(followers[1]); len(configuration.Servers) != 2 { t.Fatalf("too many peers") } + require.Equal(t, Follower, follower.getState()) } func TestRaft_RemoveLeader(t *testing.T) { From 8d797b1694419703b26eec4c3702936831e7d033 Mon Sep 17 00:00:00 2001 From: Dhia Ayachi Date: Tue, 21 Sep 2021 23:49:22 -0400 Subject: [PATCH 2/9] add test to verify that a removed node is not able to vote --- raft_test.go | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/raft_test.go b/raft_test.go index fd4f34dbd..f34b9bd28 100644 --- a/raft_test.go +++ b/raft_test.go @@ -700,6 +700,72 @@ func TestRaft_RemoveFollower(t *testing.T) { require.Equal(t, Follower, follower.getState()) } +func TestRaft_RemovedFollower_Vote(t *testing.T) { + // Make a cluster + c := MakeCluster(3, t, nil) + defer c.Close() + + // Get the leader + leader := c.Leader() + + // Wait until we have 2 followers + limit := time.Now().Add(c.longstopTimeout) + var followers []*Raft + for time.Now().Before(limit) && len(followers) != 2 { + c.WaitEvent(nil, c.conf.CommitTimeout) + followers = c.GetInState(Follower) + } + if len(followers) != 2 { + t.Fatalf("expected two followers: %v", followers) + } + + // Remove a followerRemoved + followerRemoved := followers[0] + future := leader.RemoveServer(followerRemoved.localID, 0, 0) + if err := future.Error(); err != nil { + t.Fatalf("err: %v", err) + } + + // Wait a while + time.Sleep(c.propagateTimeout) + + // Other nodes should have fewer peers + if configuration := c.getConfiguration(leader); len(configuration.Servers) != 2 { + t.Fatalf("too many peers") + } + if configuration := c.getConfiguration(followers[1]); len(configuration.Servers) != 2 { + t.Fatalf("too many peers") + } + require.Equal(t, Follower, followerRemoved.getState()) + + follower := followers[1] + + followerRemovedT := c.trans[c.IndexOf(followerRemoved)] + reqVote := RequestVoteRequest{ + RPCHeader: followerRemoved.getRPCHeader(), + Term: followerRemoved.getCurrentTerm() + 10, + Candidate: followerRemovedT.EncodePeer(followerRemoved.localID, followerRemoved.localAddr), + LastLogIndex: followerRemoved.LastIndex(), + LastLogTerm: followerRemoved.getCurrentTerm(), + LeadershipTransfer: false, + } + // a follower that thinks there's a leader should vote for that leader. + var resp RequestVoteResponse + + // + c.Partition([]ServerAddress{leader.localAddr}) + + time.Sleep(c.propagateTimeout) + require.Equal(t, Candidate, follower.getState()) + if err := followerRemovedT.RequestVote(follower.localID, follower.localAddr, &reqVote, &resp); err != nil { + t.Fatalf("RequestVote RPC failed %v", err) + } + + if resp.Granted { + t.Fatalf("expected vote to not be granted, but it was %+v", resp) + } +} + func TestRaft_RemoveLeader(t *testing.T) { // Make a cluster c := MakeCluster(3, t, nil) From b40a18c7f0df74486a8838fe4801e02900879a9d Mon Sep 17 00:00:00 2001 From: Dhia Ayachi Date: Tue, 21 Sep 2021 23:58:26 -0400 Subject: [PATCH 3/9] do not transition to `Candidate` state if not part of stable configuration --- raft.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/raft.go b/raft.go index 99f357585..bab792715 100644 --- a/raft.go +++ b/raft.go @@ -213,9 +213,13 @@ func (r *Raft) runFollower() { didWarn = true } } else { - r.logger.Warn("heartbeat timeout reached, starting election", "last-leader", lastLeader) metrics.IncrCounter([]string{"raft", "transition", "heartbeat_timeout"}, 1) - r.setState(Candidate) + if hasVote(r.configurations.latest, r.localID) { + r.logger.Warn("heartbeat timeout reached, starting election", "last-leader", lastLeader) + r.setState(Candidate) + } else { + r.logger.Warn("heartbeat timeout reached, not part of stable configuration, aborting election") + } return } From 79b86827e81d592d29455e660c8097ea2626a657 Mon Sep 17 00:00:00 2001 From: Dhia Ayachi Date: Wed, 22 Sep 2021 08:34:58 -0400 Subject: [PATCH 4/9] add some comments to the tests --- raft_test.go | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/raft_test.go b/raft_test.go index f34b9bd28..1e7a2452d 100644 --- a/raft_test.go +++ b/raft_test.go @@ -697,6 +697,8 @@ func TestRaft_RemoveFollower(t *testing.T) { if configuration := c.getConfiguration(followers[1]); len(configuration.Servers) != 2 { t.Fatalf("too many peers") } + + // The removed node should remain in a follower state require.Equal(t, Follower, follower.getState()) } @@ -719,7 +721,7 @@ func TestRaft_RemovedFollower_Vote(t *testing.T) { t.Fatalf("expected two followers: %v", followers) } - // Remove a followerRemoved + // Remove a follower followerRemoved := followers[0] future := leader.RemoveServer(followerRemoved.localID, 0, 0) if err := future.Error(); err != nil { @@ -736,10 +738,11 @@ func TestRaft_RemovedFollower_Vote(t *testing.T) { if configuration := c.getConfiguration(followers[1]); len(configuration.Servers) != 2 { t.Fatalf("too many peers") } + // The removed node should be still in Follower state require.Equal(t, Follower, followerRemoved.getState()) + // Prepare a Vote request from the removed follower follower := followers[1] - followerRemovedT := c.trans[c.IndexOf(followerRemoved)] reqVote := RequestVoteRequest{ RPCHeader: followerRemoved.getRPCHeader(), @@ -752,15 +755,19 @@ func TestRaft_RemovedFollower_Vote(t *testing.T) { // a follower that thinks there's a leader should vote for that leader. var resp RequestVoteResponse - // + // partiton the leader to simulate an unstable cluster c.Partition([]ServerAddress{leader.localAddr}) - time.Sleep(c.propagateTimeout) + + // wait for the remaining follower to trigger an election require.Equal(t, Candidate, follower.getState()) + + // send a vote request from the removed follower to the Candidate follower if err := followerRemovedT.RequestVote(follower.localID, follower.localAddr, &reqVote, &resp); err != nil { t.Fatalf("RequestVote RPC failed %v", err) } + // the vote request should not be granted, because the voter is not part of the cluster anymore if resp.Granted { t.Fatalf("expected vote to not be granted, but it was %+v", resp) } From 702e2861e0e91d95ded2fe70e8e18aaf23f5127f Mon Sep 17 00:00:00 2001 From: Dhia Ayachi Date: Wed, 22 Sep 2021 10:18:14 -0400 Subject: [PATCH 5/9] do not return if not transitioning state --- raft.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/raft.go b/raft.go index bab792715..c5f465591 100644 --- a/raft.go +++ b/raft.go @@ -217,10 +217,13 @@ func (r *Raft) runFollower() { if hasVote(r.configurations.latest, r.localID) { r.logger.Warn("heartbeat timeout reached, starting election", "last-leader", lastLeader) r.setState(Candidate) + return } else { - r.logger.Warn("heartbeat timeout reached, not part of stable configuration, aborting election") + if !didWarn { + r.logger.Warn("heartbeat timeout reached, not part of stable configuration, aborting election") + didWarn = true + } } - return } case <-r.shutdownCh: From 9f012d8ba53e4527beee81deb95eac89b245e098 Mon Sep 17 00:00:00 2001 From: Dhia Ayachi Date: Wed, 22 Sep 2021 11:36:27 -0400 Subject: [PATCH 6/9] add wait loop to test --- raft_test.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/raft_test.go b/raft_test.go index 1e7a2452d..1fee1adb7 100644 --- a/raft_test.go +++ b/raft_test.go @@ -760,6 +760,11 @@ func TestRaft_RemovedFollower_Vote(t *testing.T) { time.Sleep(c.propagateTimeout) // wait for the remaining follower to trigger an election + count := 0 + for follower.getState() != Candidate && count < 1000 { + count++ + time.Sleep(1 * time.Millisecond) + } require.Equal(t, Candidate, follower.getState()) // send a vote request from the removed follower to the Candidate follower From ee892aad2573065ef4f76a3554984e542c813e9b Mon Sep 17 00:00:00 2001 From: Dhia Ayachi Date: Thu, 23 Sep 2021 15:28:57 -0400 Subject: [PATCH 7/9] remove test related to removed node voting --- raft_test.go | 76 ---------------------------------------------------- 1 file changed, 76 deletions(-) diff --git a/raft_test.go b/raft_test.go index 1fee1adb7..bf4bfcbd4 100644 --- a/raft_test.go +++ b/raft_test.go @@ -702,82 +702,6 @@ func TestRaft_RemoveFollower(t *testing.T) { require.Equal(t, Follower, follower.getState()) } -func TestRaft_RemovedFollower_Vote(t *testing.T) { - // Make a cluster - c := MakeCluster(3, t, nil) - defer c.Close() - - // Get the leader - leader := c.Leader() - - // Wait until we have 2 followers - limit := time.Now().Add(c.longstopTimeout) - var followers []*Raft - for time.Now().Before(limit) && len(followers) != 2 { - c.WaitEvent(nil, c.conf.CommitTimeout) - followers = c.GetInState(Follower) - } - if len(followers) != 2 { - t.Fatalf("expected two followers: %v", followers) - } - - // Remove a follower - followerRemoved := followers[0] - future := leader.RemoveServer(followerRemoved.localID, 0, 0) - if err := future.Error(); err != nil { - t.Fatalf("err: %v", err) - } - - // Wait a while - time.Sleep(c.propagateTimeout) - - // Other nodes should have fewer peers - if configuration := c.getConfiguration(leader); len(configuration.Servers) != 2 { - t.Fatalf("too many peers") - } - if configuration := c.getConfiguration(followers[1]); len(configuration.Servers) != 2 { - t.Fatalf("too many peers") - } - // The removed node should be still in Follower state - require.Equal(t, Follower, followerRemoved.getState()) - - // Prepare a Vote request from the removed follower - follower := followers[1] - followerRemovedT := c.trans[c.IndexOf(followerRemoved)] - reqVote := RequestVoteRequest{ - RPCHeader: followerRemoved.getRPCHeader(), - Term: followerRemoved.getCurrentTerm() + 10, - Candidate: followerRemovedT.EncodePeer(followerRemoved.localID, followerRemoved.localAddr), - LastLogIndex: followerRemoved.LastIndex(), - LastLogTerm: followerRemoved.getCurrentTerm(), - LeadershipTransfer: false, - } - // a follower that thinks there's a leader should vote for that leader. - var resp RequestVoteResponse - - // partiton the leader to simulate an unstable cluster - c.Partition([]ServerAddress{leader.localAddr}) - time.Sleep(c.propagateTimeout) - - // wait for the remaining follower to trigger an election - count := 0 - for follower.getState() != Candidate && count < 1000 { - count++ - time.Sleep(1 * time.Millisecond) - } - require.Equal(t, Candidate, follower.getState()) - - // send a vote request from the removed follower to the Candidate follower - if err := followerRemovedT.RequestVote(follower.localID, follower.localAddr, &reqVote, &resp); err != nil { - t.Fatalf("RequestVote RPC failed %v", err) - } - - // the vote request should not be granted, because the voter is not part of the cluster anymore - if resp.Granted { - t.Fatalf("expected vote to not be granted, but it was %+v", resp) - } -} - func TestRaft_RemoveLeader(t *testing.T) { // Make a cluster c := MakeCluster(3, t, nil) From 21e3294c3edc3345397f27291fcde629efc64b4d Mon Sep 17 00:00:00 2001 From: Dhia Ayachi Date: Mon, 27 Sep 2021 11:03:33 -0400 Subject: [PATCH 8/9] check `inConfig` instead of `hasVote` --- configuration.go | 11 +++++++++++ raft.go | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/configuration.go b/configuration.go index 5c6636058..1ac92fdc0 100644 --- a/configuration.go +++ b/configuration.go @@ -173,6 +173,17 @@ func hasVote(configuration Configuration, id ServerID) bool { return false } +// hasVote returns true if the server identified by 'id' is a Voter in the +// provided Configuration. +func inConfig(configuration Configuration, id ServerID) bool { + for _, server := range configuration.Servers { + if server.ID == id { + return true + } + } + return false +} + // checkConfiguration tests a cluster membership configuration for common // errors. func checkConfiguration(configuration Configuration) error { diff --git a/raft.go b/raft.go index c5f465591..155a89df4 100644 --- a/raft.go +++ b/raft.go @@ -214,7 +214,7 @@ func (r *Raft) runFollower() { } } else { metrics.IncrCounter([]string{"raft", "transition", "heartbeat_timeout"}, 1) - if hasVote(r.configurations.latest, r.localID) { + if inConfig(r.configurations.latest, r.localID) { r.logger.Warn("heartbeat timeout reached, starting election", "last-leader", lastLeader) r.setState(Candidate) return From 26f8d55ca1cfafd0783441c576872513cd531069 Mon Sep 17 00:00:00 2001 From: Dhia Ayachi Date: Tue, 5 Oct 2021 13:46:34 -0400 Subject: [PATCH 9/9] updating warn log message to be more accurate --- raft.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/raft.go b/raft.go index 155a89df4..a53492bd4 100644 --- a/raft.go +++ b/raft.go @@ -220,7 +220,7 @@ func (r *Raft) runFollower() { return } else { if !didWarn { - r.logger.Warn("heartbeat timeout reached, not part of stable configuration, aborting election") + r.logger.Warn("heartbeat timeout reached, not part of stable configuration, not triggering a leader election") didWarn = true } }