merge: #9824
9824: fix: don't replicate snapshot if member already has the latest snapshot r=oleschoenburg a=oleschoenburg

## Description

This fixes an issue where a leader becomes stuck in a snapshot replication loop when its log starts right after the snapshot index. After a member successfully installs a snapshot, the leader tries to seek back to the log entry preceding the snapshot, which fails if the log starts after the snapshot. In that case the member context has no current entry until the next append request is sent, so the member's current entry must not be used to decide whether to send a snapshot. Instead, we bail out and skip snapshot replication when the member already has the latest snapshot.
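
To make the decision order concrete, here is a minimal, self-contained sketch of the check sequence that `shouldReplicateSnapshot` follows after this change. The `LeaderState`/`MemberState` records, the `main` demo, and the final fallback branch are illustrative stand-ins rather than the real Zeebe/Atomix types or logic; only the three guards mirror the code in the diff below.

```java
// Sketch only: simplified stand-ins for the leader/member state, not the real raft classes.
final class SnapshotReplicationDecision {

  /** Hypothetical, simplified per-member state as tracked by the leader. */
  record MemberState(long snapshotIndex, long currentIndex) {}

  /** Hypothetical, simplified leader-local state; persistedSnapshotIndex is null if no snapshot exists. */
  record LeaderState(Long persistedSnapshotIndex, long logFirstIndex) {}

  static boolean shouldReplicateSnapshot(final LeaderState leader, final MemberState member) {
    if (leader.persistedSnapshotIndex() == null) {
      // No snapshot exists, nothing to replicate.
      return false;
    }
    if (member.snapshotIndex() >= leader.persistedSnapshotIndex()) {
      // New guard from this PR: the member already has the latest snapshot, so sending it
      // again cannot help -- this breaks the replication loop from issue #9820.
      return false;
    }
    if (leader.logFirstIndex() > member.currentIndex()) {
      // The entries the member needs are compacted away; only the snapshot can catch it up.
      return true;
    }
    // Remaining heuristics (e.g. a member-lag threshold) are omitted in this sketch.
    return false;
  }

  public static void main(final String[] args) {
    // A member that just installed snapshot 5 while the leader's log starts at index 6:
    // with the guard, no further snapshot is sent even though the member's current index
    // is still unknown (modelled here as 0).
    System.out.println(
        shouldReplicateSnapshot(new LeaderState(5L, 6L), new MemberState(5L, 0L))); // false
  }
}
```

With the early-out in place, a member whose current index is still unknown right after installing the latest snapshot is no longer sent the same snapshot again.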

## Related issues


closes #9820 



Co-authored-by: Ole Schönburg <ole.schoenburg@gmail.com>
zeebe-bors-camunda[bot] and oleschoenburg committed Jul 18, 2022
2 parents 2ae0d47 + df0d417 commit 145c697
Showing 2 changed files with 52 additions and 0 deletions.
@@ -799,6 +799,12 @@ private boolean shouldReplicateSnapshot(final RaftMemberContext member) {
    if (persistedSnapshot == null) {
      return false;
    }
    if (member.getSnapshotIndex() >= persistedSnapshot.getIndex()) {
      // Member has the latest snapshot, replicating the snapshot again wouldn't help.
      // WARNING! This is load-bearing and not just an optimization. See
      // https://github.com/camunda/zeebe/issues/9820 for context.
      return false;
    }
    if (raft.getLog().getFirstIndex() > member.getCurrentIndex()) {
      // Necessary events are not available anymore, we have to use the snapshot
      return true;

@@ -17,6 +17,9 @@

import static org.assertj.core.api.Assertions.assertThat;

import io.atomix.raft.RaftServer.Role;
import java.util.stream.Collectors;
import org.awaitility.Awaitility;
import org.junit.Rule;
import org.junit.Test;
import org.junit.runner.RunWith;

@@ -112,4 +115,47 @@ public void shouldReplicateSnapshotIfMemberLagAboveThreshold() throws Exception
    assertThat(follower.getContext().getPersistedSnapshotStore().getCurrentSnapshotIndex())
        .isEqualTo(snapshotIndex);
  }

  @Test
  // Regression test for https://github.com/camunda/zeebe/issues/9820
  public void shouldNotGetStuckInSnapshotReplicationLoop() throws Exception {
    // given -- a cluster where follower's log starts at snapshot index
    final var initialLeader = raftRule.getLeader().orElseThrow();

    final var snapshotIndex = raftRule.appendEntries(5);
    raftRule.doSnapshotOnMember(initialLeader, snapshotIndex, 3);
    raftRule.appendEntries(5);

    raftRule.getServers().stream()
        .filter(s -> s.getRole() == Role.FOLLOWER)
        .toList()
        .forEach(
            (follower) -> {
              try {
                raftRule.shutdownServer(follower);
                // force data loss so that follower must receive the snapshot & has no preceding log
                raftRule.triggerDataLossOnNode(follower.name());
                raftRule.joinCluster(follower.name());
              } catch (final Exception e) {
                throw new RuntimeException(e);
              }
            });

    // when -- letting the initial leader re-join after data loss
    raftRule.shutdownServer(initialLeader);
    raftRule.awaitNewLeader();
    raftRule.triggerDataLossOnNode(initialLeader.name());
    raftRule.joinCluster(initialLeader.name());

    // then -- all members should have snapshot and latest log
    raftRule.allNodesHaveSnapshotWithIndex(snapshotIndex);
    Awaitility.await("All members should have the latest log")
        .until(
            () ->
                raftRule.getServers().stream()
                        .map(s -> s.getContext().getLog().getLastIndex())
                        .collect(Collectors.toSet())
                        .size()
                    == 1);
  }
}
