From 3362c711db0de10598187c347162ac59cb779705 Mon Sep 17 00:00:00 2001 From: petrpleshachkov <50169481+petrpleshachkov@users.noreply.github.com> Date: Wed, 30 Sep 2020 11:16:12 +0200 Subject: [PATCH 1/4] Fixed a data loss issue on lite member promotion Notify cluster members on lite member promotion to update memberGroupSize. Otherwise, a data member might be not aware of other promoted data members, and it may cause backup operations not being issued to other data members. Closes https://github.com/hazelcast/hazelcast/issues/17621 --- .../cluster/impl/MembershipManager.java | 1 + .../cluster/impl/PromoteLiteMemberTest.java | 44 +++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/hazelcast/src/main/java/com/hazelcast/internal/cluster/impl/MembershipManager.java b/hazelcast/src/main/java/com/hazelcast/internal/cluster/impl/MembershipManager.java index ac70ba95ae7c..d42d8f3e6728 100644 --- a/hazelcast/src/main/java/com/hazelcast/internal/cluster/impl/MembershipManager.java +++ b/hazelcast/src/main/java/com/hazelcast/internal/cluster/impl/MembershipManager.java @@ -367,6 +367,7 @@ private MemberImpl createNewMemberImplIfChanged(MemberInfo newMemberInfo, Member member = clusterService.promoteAndGetLocalMember(); } else { member = createMember(newMemberInfo, member.getAttributes()); + node.partitionService.memberAdded(member); } } else if (member.getMemberListJoinVersion() != newMemberInfo.getMemberListJoinVersion()) { if (member.getMemberListJoinVersion() != NA_MEMBER_LIST_JOIN_VERSION) { diff --git a/hazelcast/src/test/java/com/hazelcast/internal/cluster/impl/PromoteLiteMemberTest.java b/hazelcast/src/test/java/com/hazelcast/internal/cluster/impl/PromoteLiteMemberTest.java index 9d18d8ead4dc..8566a691e7ba 100644 --- a/hazelcast/src/test/java/com/hazelcast/internal/cluster/impl/PromoteLiteMemberTest.java +++ b/hazelcast/src/test/java/com/hazelcast/internal/cluster/impl/PromoteLiteMemberTest.java @@ -27,6 +27,7 @@ import com.hazelcast.internal.partition.InternalPartition; import com.hazelcast.internal.util.RootCauseMatcher; import com.hazelcast.internal.util.UuidUtil; +import com.hazelcast.map.IMap; import com.hazelcast.spi.impl.InternalCompletableFuture; import com.hazelcast.spi.impl.operationservice.impl.Invocation; import com.hazelcast.spi.impl.operationservice.impl.InvocationRegistry; @@ -64,9 +65,11 @@ import static java.util.Arrays.asList; import static java.util.Collections.singletonList; import static org.hamcrest.Matchers.greaterThan; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertThat; +import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @RunWith(HazelcastParallelClassRunner.class) @@ -323,6 +326,47 @@ public void promotion_shouldFail_whenMasterIsSuspected_duringPromotion() throws } } + @Test + public void lite_member_promotion_data_loss() throws InterruptedException { + int entryCount = 1000; + + TestHazelcastInstanceFactory factory = createHazelcastInstanceFactory(); + Config config = new Config().setLiteMember(true); + + // start first hazelcast instance as a lite member + HazelcastInstance firstHazelcastInstance = factory.newHazelcastInstance(config); + + // start second and third hazelcast instances as a lite member + HazelcastInstance secondHazelcastInstance = factory.newHazelcastInstance(config); + HazelcastInstance thirdHazelcastInstance = factory.newHazelcastInstance(config); + + // promote all instances to data members + firstHazelcastInstance.getCluster().promoteLocalLiteMember(); + secondHazelcastInstance.getCluster().promoteLocalLiteMember(); + thirdHazelcastInstance.getCluster().promoteLocalLiteMember(); + + // check if cluster is in a good shape + assertTrueEventually(() -> assertTrue(firstHazelcastInstance.getPartitionService().isClusterSafe())); + + // insert some dummy data into the testing map + String mapName = randomMapName(); + IMap testMap = firstHazelcastInstance.getMap(mapName); + for (int i = 0; i < entryCount; ++i) { + testMap.put("key" + i, "value" + i); + } + + // check all data is correctly inserted + assertEquals(entryCount, testMap.size()); + + // kill second instance + secondHazelcastInstance.getLifecycleService().terminate(); + + // backup count for the map is set to 1 + // even with 1 node down, no data loss is expected + assertTrueEventually(() -> assertEquals(entryCount, firstHazelcastInstance.getMap(mapName).size())); + assertTrueEventually(() -> assertEquals(entryCount, thirdHazelcastInstance.getMap(mapName).size())); + } + private void assertPromotionInvocationStarted(HazelcastInstance instance) { OperationServiceImpl operationService = getNode(instance).getNodeEngine().getOperationService(); InvocationRegistry invocationRegistry = operationService.getInvocationRegistry(); From 336f25081a7becba81d896423a6a950ddc46daeb Mon Sep 17 00:00:00 2001 From: petrpleshachkov <50169481+petrpleshachkov@users.noreply.github.com> Date: Mon, 5 Oct 2020 12:53:34 +0200 Subject: [PATCH 2/4] Update memberGroupSize on every lite membership change --- .../cluster/impl/MembershipManager.java | 14 ++++++- .../impl/InternalPartitionServiceImpl.java | 4 ++ .../cluster/impl/PromoteLiteMemberTest.java | 39 +++++++++++++++++++ 3 files changed, 56 insertions(+), 1 deletion(-) diff --git a/hazelcast/src/main/java/com/hazelcast/internal/cluster/impl/MembershipManager.java b/hazelcast/src/main/java/com/hazelcast/internal/cluster/impl/MembershipManager.java index d42d8f3e6728..cba9f4fc96d9 100644 --- a/hazelcast/src/main/java/com/hazelcast/internal/cluster/impl/MembershipManager.java +++ b/hazelcast/src/main/java/com/hazelcast/internal/cluster/impl/MembershipManager.java @@ -305,11 +305,15 @@ void updateMembers(MembersView membersView) { MemberImpl[] members = new MemberImpl[membersView.size()]; int memberIndex = 0; + boolean updatedLiteMember = false; for (MemberInfo memberInfo : membersView.getMembers()) { Address address = memberInfo.getAddress(); MemberImpl member = currentMemberMap.getMember(address); if (member != null && member.getUuid().equals(memberInfo.getUuid())) { + if (member.isLiteMember()) { + updatedLiteMember = true; + } member = createNewMemberImplIfChanged(memberInfo, member); members[memberIndex++] = member; continue; @@ -342,6 +346,10 @@ void updateMembers(MembersView membersView) { setMembers(MemberMap.createNew(membersView.getVersion(), members)); + if (updatedLiteMember) { + updateMembersGroupSize(); + } + for (MemberImpl member : removedMembers) { closeConnection(member.getAddress(), "Member left event received from master"); handleMemberRemove(memberMapRef.get(), member); @@ -367,8 +375,8 @@ private MemberImpl createNewMemberImplIfChanged(MemberInfo newMemberInfo, Member member = clusterService.promoteAndGetLocalMember(); } else { member = createMember(newMemberInfo, member.getAttributes()); - node.partitionService.memberAdded(member); } + node.partitionService.memberAdded(member); } else if (member.getMemberListJoinVersion() != newMemberInfo.getMemberListJoinVersion()) { if (member.getMemberListJoinVersion() != NA_MEMBER_LIST_JOIN_VERSION) { if (logger.isFineEnabled()) { @@ -411,6 +419,10 @@ private MemberImpl createMember(MemberInfo memberInfo, Map attri .build(); } + private void updateMembersGroupSize() { + node.partitionService.updateMemberGroupSize(); + } + private void repairPartitionTableIfReturningMember(MemberImpl member) { if (!clusterService.isMaster()) { return; diff --git a/hazelcast/src/main/java/com/hazelcast/internal/partition/impl/InternalPartitionServiceImpl.java b/hazelcast/src/main/java/com/hazelcast/internal/partition/impl/InternalPartitionServiceImpl.java index 6ee05dbbb113..ba88a95857e9 100644 --- a/hazelcast/src/main/java/com/hazelcast/internal/partition/impl/InternalPartitionServiceImpl.java +++ b/hazelcast/src/main/java/com/hazelcast/internal/partition/impl/InternalPartitionServiceImpl.java @@ -346,6 +346,10 @@ public int getMaxAllowedBackupCount() { return max(min(getMemberGroupsSize() - 1, InternalPartition.MAX_BACKUP_COUNT), 0); } + public void updateMemberGroupSize() { + partitionStateManager.updateMemberGroupsSize(); + } + @Override public void memberAdded(Member member) { logger.fine("Adding " + member); diff --git a/hazelcast/src/test/java/com/hazelcast/internal/cluster/impl/PromoteLiteMemberTest.java b/hazelcast/src/test/java/com/hazelcast/internal/cluster/impl/PromoteLiteMemberTest.java index 8566a691e7ba..b1dda34d83de 100644 --- a/hazelcast/src/test/java/com/hazelcast/internal/cluster/impl/PromoteLiteMemberTest.java +++ b/hazelcast/src/test/java/com/hazelcast/internal/cluster/impl/PromoteLiteMemberTest.java @@ -367,6 +367,45 @@ public void lite_member_promotion_data_loss() throws InterruptedException { assertTrueEventually(() -> assertEquals(entryCount, thirdHazelcastInstance.getMap(mapName).size())); } + @Test + public void lite_member_promotion_data_loss2() throws InterruptedException { + int entryCount = 1000; + + TestHazelcastInstanceFactory factory = createHazelcastInstanceFactory(); + Config config = new Config().setLiteMember(true); + + // start first hazelcast instance as a lite member + HazelcastInstance firstHazelcastInstance = factory.newHazelcastInstance(config); + + // start second hazelcast instance as a lite member + HazelcastInstance secondHazelcastInstance = factory.newHazelcastInstance(config); + + // promote all instances to data members + firstHazelcastInstance.getCluster().promoteLocalLiteMember(); + + secondHazelcastInstance.getCluster().promoteLocalLiteMember(); + + // check if cluster is in a good shape + assertTrueEventually(() -> assertTrue(firstHazelcastInstance.getPartitionService().isClusterSafe())); + + // insert some dummy data into the testing map + String mapName = randomMapName(); + IMap testMap = firstHazelcastInstance.getMap(mapName); + for (int i = 0; i < entryCount; ++i) { + testMap.put("key" + i, "value" + i); + } + + // check all data is correctly inserted + assertEquals(entryCount, testMap.size()); + + // kill second instance + secondHazelcastInstance.getLifecycleService().terminate(); + + // backup count for the map is set to 1 + // even with 1 node down, no data loss is expected + assertTrueEventually(() -> assertEquals(entryCount, firstHazelcastInstance.getMap(mapName).size())); + } + private void assertPromotionInvocationStarted(HazelcastInstance instance) { OperationServiceImpl operationService = getNode(instance).getNodeEngine().getOperationService(); InvocationRegistry invocationRegistry = operationService.getInvocationRegistry(); From a0509105f07b4a645459cb82b5156481ad884604 Mon Sep 17 00:00:00 2001 From: petrpleshachkov <50169481+petrpleshachkov@users.noreply.github.com> Date: Mon, 5 Oct 2020 13:01:15 +0200 Subject: [PATCH 3/4] Removed unnecessary code --- .../com/hazelcast/internal/cluster/impl/MembershipManager.java | 1 - 1 file changed, 1 deletion(-) diff --git a/hazelcast/src/main/java/com/hazelcast/internal/cluster/impl/MembershipManager.java b/hazelcast/src/main/java/com/hazelcast/internal/cluster/impl/MembershipManager.java index cba9f4fc96d9..8bfbbdc593cd 100644 --- a/hazelcast/src/main/java/com/hazelcast/internal/cluster/impl/MembershipManager.java +++ b/hazelcast/src/main/java/com/hazelcast/internal/cluster/impl/MembershipManager.java @@ -376,7 +376,6 @@ private MemberImpl createNewMemberImplIfChanged(MemberInfo newMemberInfo, Member } else { member = createMember(newMemberInfo, member.getAttributes()); } - node.partitionService.memberAdded(member); } else if (member.getMemberListJoinVersion() != newMemberInfo.getMemberListJoinVersion()) { if (member.getMemberListJoinVersion() != NA_MEMBER_LIST_JOIN_VERSION) { if (logger.isFineEnabled()) { From 7ac4040c70f840bae22175794e658ce2993fa203 Mon Sep 17 00:00:00 2001 From: petrpleshachkov <50169481+petrpleshachkov@users.noreply.github.com> Date: Thu, 8 Oct 2020 12:46:45 +0200 Subject: [PATCH 4/4] Addressed reviewer's comment --- .../internal/cluster/impl/MembershipManager.java | 8 +++----- .../internal/cluster/impl/PromoteLiteMemberTest.java | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/hazelcast/src/main/java/com/hazelcast/internal/cluster/impl/MembershipManager.java b/hazelcast/src/main/java/com/hazelcast/internal/cluster/impl/MembershipManager.java index 8bfbbdc593cd..8036f4de236e 100644 --- a/hazelcast/src/main/java/com/hazelcast/internal/cluster/impl/MembershipManager.java +++ b/hazelcast/src/main/java/com/hazelcast/internal/cluster/impl/MembershipManager.java @@ -305,6 +305,8 @@ void updateMembers(MembersView membersView) { MemberImpl[] members = new MemberImpl[membersView.size()]; int memberIndex = 0; + // Indicates whether we received a notification on lite member membership change + // (e.g. its promotion to a data member) boolean updatedLiteMember = false; for (MemberInfo memberInfo : membersView.getMembers()) { Address address = memberInfo.getAddress(); @@ -347,7 +349,7 @@ void updateMembers(MembersView membersView) { setMembers(MemberMap.createNew(membersView.getVersion(), members)); if (updatedLiteMember) { - updateMembersGroupSize(); + node.partitionService.updateMemberGroupSize(); } for (MemberImpl member : removedMembers) { @@ -418,10 +420,6 @@ private MemberImpl createMember(MemberInfo memberInfo, Map attri .build(); } - private void updateMembersGroupSize() { - node.partitionService.updateMemberGroupSize(); - } - private void repairPartitionTableIfReturningMember(MemberImpl member) { if (!clusterService.isMaster()) { return; diff --git a/hazelcast/src/test/java/com/hazelcast/internal/cluster/impl/PromoteLiteMemberTest.java b/hazelcast/src/test/java/com/hazelcast/internal/cluster/impl/PromoteLiteMemberTest.java index b1dda34d83de..643bd19a2069 100644 --- a/hazelcast/src/test/java/com/hazelcast/internal/cluster/impl/PromoteLiteMemberTest.java +++ b/hazelcast/src/test/java/com/hazelcast/internal/cluster/impl/PromoteLiteMemberTest.java @@ -327,7 +327,7 @@ public void promotion_shouldFail_whenMasterIsSuspected_duringPromotion() throws } @Test - public void lite_member_promotion_data_loss() throws InterruptedException { + public void test_lite_member_promotion_causes_no_data_loss_on_three_members() throws InterruptedException { int entryCount = 1000; TestHazelcastInstanceFactory factory = createHazelcastInstanceFactory(); @@ -368,7 +368,7 @@ public void lite_member_promotion_data_loss() throws InterruptedException { } @Test - public void lite_member_promotion_data_loss2() throws InterruptedException { + public void test_lite_member_promotion_causes_no_data_loss_on_two_members() throws InterruptedException { int entryCount = 1000; TestHazelcastInstanceFactory factory = createHazelcastInstanceFactory();