From 307891f90b0efb495ae0bd3a7db85d535db092d8 Mon Sep 17 00:00:00 2001 From: petrpleshachkov <50169481+petrpleshachkov@users.noreply.github.com> Date: Tue, 1 Dec 2020 12:29:13 +0100 Subject: [PATCH] Fixed a data loss issue on lite member promotion (#17644) (#17757) Fixed a data loss issue on lite member promotion (#17644) Notify cluster members on lite member promotion to update memberGroupSize. Otherwise, a data member might be not aware of other promoted data members, and it may cause backup operations not being issued to other data members. Closes https://github.com/hazelcast/hazelcast/issues/17621 --- .../cluster/impl/MembershipManager.java | 10 +++ .../impl/InternalPartitionServiceImpl.java | 4 + .../cluster/impl/PromoteLiteMemberTest.java | 83 +++++++++++++++++++ 3 files changed, 97 insertions(+) diff --git a/hazelcast/src/main/java/com/hazelcast/internal/cluster/impl/MembershipManager.java b/hazelcast/src/main/java/com/hazelcast/internal/cluster/impl/MembershipManager.java index d9b36b8c9103..f881acace49c 100644 --- a/hazelcast/src/main/java/com/hazelcast/internal/cluster/impl/MembershipManager.java +++ b/hazelcast/src/main/java/com/hazelcast/internal/cluster/impl/MembershipManager.java @@ -290,11 +290,17 @@ void updateMembers(MembersView membersView) { MemberImpl[] members = new MemberImpl[membersView.size()]; int memberIndex = 0; + // Indicates whether we received a notification on lite member membership change + // (e.g. its promotion to a data member) + boolean updatedLiteMember = false; for (MemberInfo memberInfo : membersView.getMembers()) { Address address = memberInfo.getAddress(); MemberImpl member = currentMemberMap.getMember(address); if (member != null && member.getUuid().equals(memberInfo.getUuid())) { + if (member.isLiteMember()) { + updatedLiteMember = true; + } member = createNewMemberImplIfChanged(memberInfo, member); members[memberIndex++] = member; continue; @@ -327,6 +333,10 @@ void updateMembers(MembersView membersView) { setMembers(MemberMap.createNew(membersView.getVersion(), members)); + if (updatedLiteMember) { + node.partitionService.updateMemberGroupSize(); + } + for (MemberImpl member : removedMembers) { closeConnection(member.getAddress(), "Member left event received from master"); handleMemberRemove(memberMapRef.get(), member); diff --git a/hazelcast/src/main/java/com/hazelcast/internal/partition/impl/InternalPartitionServiceImpl.java b/hazelcast/src/main/java/com/hazelcast/internal/partition/impl/InternalPartitionServiceImpl.java index 65654ea29012..377c79799031 100644 --- a/hazelcast/src/main/java/com/hazelcast/internal/partition/impl/InternalPartitionServiceImpl.java +++ b/hazelcast/src/main/java/com/hazelcast/internal/partition/impl/InternalPartitionServiceImpl.java @@ -336,6 +336,10 @@ public int getMaxAllowedBackupCount() { return max(min(getMemberGroupsSize() - 1, InternalPartition.MAX_BACKUP_COUNT), 0); } + public void updateMemberGroupSize() { + partitionStateManager.updateMemberGroupsSize(); + } + @Override public void memberAdded(Member member) { logger.fine("Adding " + member); diff --git a/hazelcast/src/test/java/com/hazelcast/internal/cluster/impl/PromoteLiteMemberTest.java b/hazelcast/src/test/java/com/hazelcast/internal/cluster/impl/PromoteLiteMemberTest.java index 2b435c3cac9d..a2f67ede74bc 100644 --- a/hazelcast/src/test/java/com/hazelcast/internal/cluster/impl/PromoteLiteMemberTest.java +++ b/hazelcast/src/test/java/com/hazelcast/internal/cluster/impl/PromoteLiteMemberTest.java @@ -27,6 +27,7 @@ import com.hazelcast.internal.partition.InternalPartition; import com.hazelcast.internal.util.RootCauseMatcher; import com.hazelcast.internal.util.UuidUtil; +import com.hazelcast.map.IMap; import com.hazelcast.spi.impl.InternalCompletableFuture; import com.hazelcast.spi.impl.operationservice.impl.Invocation; import com.hazelcast.spi.impl.operationservice.impl.InvocationRegistry; @@ -62,9 +63,11 @@ import static java.util.Arrays.asList; import static java.util.Collections.singletonList; import static org.hamcrest.Matchers.greaterThan; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertThat; +import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @RunWith(HazelcastParallelClassRunner.class) @@ -329,6 +332,86 @@ public void run() { } } + @Test + public void test_lite_member_promotion_causes_no_data_loss_on_three_members() throws InterruptedException { + int entryCount = 1000; + + TestHazelcastInstanceFactory factory = createHazelcastInstanceFactory(); + Config config = new Config().setLiteMember(true); + + // start first hazelcast instance as a lite member + HazelcastInstance firstHazelcastInstance = factory.newHazelcastInstance(config); + + // start second and third hazelcast instances as a lite member + HazelcastInstance secondHazelcastInstance = factory.newHazelcastInstance(config); + HazelcastInstance thirdHazelcastInstance = factory.newHazelcastInstance(config); + + // promote all instances to data members + firstHazelcastInstance.getCluster().promoteLocalLiteMember(); + secondHazelcastInstance.getCluster().promoteLocalLiteMember(); + thirdHazelcastInstance.getCluster().promoteLocalLiteMember(); + + // check if cluster is in a good shape + assertTrueEventually(() -> assertTrue(firstHazelcastInstance.getPartitionService().isClusterSafe())); + + // insert some dummy data into the testing map + String mapName = randomMapName(); + IMap testMap = firstHazelcastInstance.getMap(mapName); + for (int i = 0; i < entryCount; ++i) { + testMap.put("key" + i, "value" + i); + } + + // check all data is correctly inserted + assertEquals(entryCount, testMap.size()); + + // kill second instance + secondHazelcastInstance.getLifecycleService().terminate(); + + // backup count for the map is set to 1 + // even with 1 node down, no data loss is expected + assertTrueEventually(() -> assertEquals(entryCount, firstHazelcastInstance.getMap(mapName).size())); + assertTrueEventually(() -> assertEquals(entryCount, thirdHazelcastInstance.getMap(mapName).size())); + } + + @Test + public void test_lite_member_promotion_causes_no_data_loss_on_two_members() throws InterruptedException { + int entryCount = 1000; + + TestHazelcastInstanceFactory factory = createHazelcastInstanceFactory(); + Config config = new Config().setLiteMember(true); + + // start first hazelcast instance as a lite member + HazelcastInstance firstHazelcastInstance = factory.newHazelcastInstance(config); + + // start second hazelcast instance as a lite member + HazelcastInstance secondHazelcastInstance = factory.newHazelcastInstance(config); + + // promote all instances to data members + firstHazelcastInstance.getCluster().promoteLocalLiteMember(); + + secondHazelcastInstance.getCluster().promoteLocalLiteMember(); + + // check if cluster is in a good shape + assertTrueEventually(() -> assertTrue(firstHazelcastInstance.getPartitionService().isClusterSafe())); + + // insert some dummy data into the testing map + String mapName = randomMapName(); + IMap testMap = firstHazelcastInstance.getMap(mapName); + for (int i = 0; i < entryCount; ++i) { + testMap.put("key" + i, "value" + i); + } + + // check all data is correctly inserted + assertEquals(entryCount, testMap.size()); + + // kill second instance + secondHazelcastInstance.getLifecycleService().terminate(); + + // backup count for the map is set to 1 + // even with 1 node down, no data loss is expected + assertTrueEventually(() -> assertEquals(entryCount, firstHazelcastInstance.getMap(mapName).size())); + } + private void assertPromotionInvocationStarted(HazelcastInstance instance) { final OperationServiceImpl operationService = (OperationServiceImpl) getNode(instance).getNodeEngine().getOperationService();