Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BACKPORT] Fixed a data loss issue on lite member promotion (#17644) #17758

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -285,6 +285,7 @@ String memberListString() {
}

// handles both new and left members
@SuppressWarnings("checkstyle:npathcomplexity")
void updateMembers(MembersView membersView) {
MemberMap currentMemberMap = memberMapRef.get();

Expand All @@ -294,11 +295,17 @@ void updateMembers(MembersView membersView) {

MemberImpl[] members = new MemberImpl[membersView.size()];
int memberIndex = 0;
// Indicates whether we received a notification on lite member membership change
// (e.g. its promotion to a data member)
boolean updatedLiteMember = false;
for (MemberInfo memberInfo : membersView.getMembers()) {
Address address = memberInfo.getAddress();
MemberImpl member = currentMemberMap.getMember(address);

if (member != null && member.getUuid().equals(memberInfo.getUuid())) {
if (member.isLiteMember()) {
updatedLiteMember = true;
}
member = createNewMemberImplIfChanged(memberInfo, member);
members[memberIndex++] = member;
continue;
Expand Down Expand Up @@ -331,6 +338,10 @@ void updateMembers(MembersView membersView) {

setMembers(MemberMap.createNew(membersView.getVersion(), members));

if (updatedLiteMember) {
node.partitionService.updateMemberGroupSize();
}

for (MemberImpl member : removedMembers) {
closeConnection(member.getAddress(), "Member left event received from master");
handleMemberRemove(memberMapRef.get(), member);
Expand Down
Expand Up @@ -358,6 +358,10 @@ public int getMaxAllowedBackupCount() {
return max(min(getMemberGroupsSize() - 1, InternalPartition.MAX_BACKUP_COUNT), 0);
}

/**
 * Asks the partition state manager to update its member-groups size by
 * delegating to {@code partitionStateManager.updateMemberGroupsSize()}.
 * <p>
 * NOTE(review): presumably this refreshes the value that
 * {@code getMemberGroupsSize()} (and therefore
 * {@code getMaxAllowedBackupCount()}) is derived from, so that a promoted
 * lite member is counted when backups are planned -- confirm against
 * {@code PartitionStateManager}.
 */
public void updateMemberGroupSize() {
partitionStateManager.updateMemberGroupsSize();
}

@Override
public void memberAdded(Member member) {
logger.fine("Adding " + member);
Expand Down
Expand Up @@ -20,6 +20,7 @@
import com.hazelcast.config.Config;
import com.hazelcast.core.Cluster;
import com.hazelcast.core.HazelcastInstance;
import com.hazelcast.core.IMap;
import com.hazelcast.core.Member;
import com.hazelcast.core.MemberLeftException;
import com.hazelcast.internal.cluster.impl.operations.PromoteLiteMemberOp;
Expand Down Expand Up @@ -64,6 +65,7 @@
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

@RunWith(HazelcastParallelClassRunner.class)
Expand Down Expand Up @@ -374,6 +376,110 @@ private void memberAttributes_arePreserved_afterPromotion(boolean isMaster) thro
}
}

/**
 * Regression test for data loss after lite member promotion in a
 * three-member cluster.
 * <p>
 * Scenario: start three instances as lite members, promote each of them to a
 * data member, wait until the first instance reports the cluster as safe,
 * put {@code entryCount} entries into a randomly named map, then terminate
 * the second instance. Both surviving instances must eventually report the
 * full entry count for the map.
 * <p>
 * No map configuration is set here, so the test relies on the map's default
 * backup settings to survive the loss of a single member.
 */
@Test
public void test_lite_member_promotion_causes_no_data_loss_on_three_members() throws InterruptedException {
final int entryCount = 1000;

TestHazelcastInstanceFactory factory = createHazelcastInstanceFactory();
Config config = new Config().setLiteMember(true);

// start first hazelcast instance as a lite member
final HazelcastInstance firstHazelcastInstance = factory.newHazelcastInstance(config);

// start second and third hazelcast instances as lite members
final HazelcastInstance secondHazelcastInstance = factory.newHazelcastInstance(config);
final HazelcastInstance thirdHazelcastInstance = factory.newHazelcastInstance(config);

// promote all instances to data members
firstHazelcastInstance.getCluster().promoteLocalLiteMember();
secondHazelcastInstance.getCluster().promoteLocalLiteMember();
thirdHazelcastInstance.getCluster().promoteLocalLiteMember();

// check if cluster is in a good shape
assertTrueEventually(new AssertTask() {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: could use waitClusterForSafeState here and assertEqualsEventually in subsequent assertTrueEventually usages below.

@Override
public void run() throws Exception {
assertTrue(firstHazelcastInstance.getPartitionService().isClusterSafe());
}
});

// insert some dummy data into the testing map
final String mapName = randomMapName();
IMap<String, String> testMap = firstHazelcastInstance.getMap(mapName);
for (int i = 0; i < entryCount; ++i) {
testMap.put("key" + i, "value" + i);
}

// check all data is correctly inserted
assertEquals(entryCount, testMap.size());

// kill second instance
secondHazelcastInstance.getLifecycleService().terminate();

// backup count for the map is set to 1
// even with 1 node down, no data loss is expected
assertTrueEventually(new AssertTask() {
@Override
public void run() throws Exception {
assertEquals(entryCount, firstHazelcastInstance.getMap(mapName).size());
}
});
assertTrueEventually(new AssertTask() {
@Override
public void run() throws Exception {
assertEquals(entryCount, thirdHazelcastInstance.getMap(mapName).size());
}
});
}

/**
 * Regression test for data loss after lite member promotion in a
 * two-member cluster.
 * <p>
 * Scenario: start two instances as lite members, promote both to data
 * members, wait until the first instance reports the cluster as safe, put
 * {@code entryCount} entries into a randomly named map, then terminate the
 * second instance. The surviving instance must eventually report the full
 * entry count for the map.
 * <p>
 * No map configuration is set here, so the test relies on the map's default
 * backup settings to survive the loss of a single member.
 *
 * @throws InterruptedException declared for compatibility with the existing
 *                              test signature
 */
@Test
public void test_lite_member_promotion_causes_no_data_loss_on_two_members() throws InterruptedException {
    final int entryCount = 1000;

    TestHazelcastInstanceFactory factory = createHazelcastInstanceFactory();
    Config config = new Config().setLiteMember(true);

    // start first hazelcast instance as a lite member
    final HazelcastInstance firstHazelcastInstance = factory.newHazelcastInstance(config);
    // start second hazelcast instance as a lite member
    final HazelcastInstance secondHazelcastInstance = factory.newHazelcastInstance(config);

    // promote all instances to data members
    firstHazelcastInstance.getCluster().promoteLocalLiteMember();
    secondHazelcastInstance.getCluster().promoteLocalLiteMember();

    // check if cluster is in a good shape
    assertTrueEventually(new AssertTask() {
        @Override
        public void run() throws Exception {
            // The result must be asserted: a bare isClusterSafe() call never fails,
            // so the wait would return immediately and data could be inserted
            // before backups are in place.
            assertTrue(firstHazelcastInstance.getPartitionService().isClusterSafe());
        }
    });

    // insert some dummy data into the testing map
    final String mapName = randomMapName();
    IMap<String, String> testMap = firstHazelcastInstance.getMap(mapName);
    for (int i = 0; i < entryCount; ++i) {
        testMap.put("key" + i, "value" + i);
    }

    // check all data is correctly inserted
    assertEquals(entryCount, testMap.size());

    // kill second instance
    secondHazelcastInstance.getLifecycleService().terminate();

    // with one backup of each partition, losing 1 node must not lose data
    assertTrueEventually(new AssertTask() {
        @Override
        public void run() throws Exception {
            assertEquals(entryCount, firstHazelcastInstance.getMap(mapName).size());
        }
    });
}

private void assertPromotionInvocationStarted(HazelcastInstance instance) {
final OperationServiceImpl operationService =
(OperationServiceImpl) getNode(instance).getNodeEngine().getOperationService();
Expand Down