Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed a data loss issue on lite member promotion #17644

Merged
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -305,11 +305,15 @@ void updateMembers(MembersView membersView) {

MemberImpl[] members = new MemberImpl[membersView.size()];
int memberIndex = 0;
boolean updatedLiteMember = false;
for (MemberInfo memberInfo : membersView.getMembers()) {
Address address = memberInfo.getAddress();
MemberImpl member = currentMemberMap.getMember(address);

if (member != null && member.getUuid().equals(memberInfo.getUuid())) {
if (member.isLiteMember()) {
updatedLiteMember = true;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you please add a short comment to the code to explain the meaning of updatedLiteMember?

}
member = createNewMemberImplIfChanged(memberInfo, member);
members[memberIndex++] = member;
continue;
Expand Down Expand Up @@ -342,6 +346,10 @@ void updateMembers(MembersView membersView) {

setMembers(MemberMap.createNew(membersView.getVersion(), members));

if (updatedLiteMember) {
updateMembersGroupSize();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it is hard to to follow similar named methods maybe we can make updateMemberGroupsSize() method public and can call it here: node.partitionService.getPartitionStateManager().updateMemberGroupsSize()

}

for (MemberImpl member : removedMembers) {
closeConnection(member.getAddress(), "Member left event received from master");
handleMemberRemove(memberMapRef.get(), member);
Expand Down Expand Up @@ -410,6 +418,10 @@ private MemberImpl createMember(MemberInfo memberInfo, Map<String, String> attri
.build();
}

private void updateMembersGroupSize() {
node.partitionService.updateMemberGroupSize();
}

private void repairPartitionTableIfReturningMember(MemberImpl member) {
if (!clusterService.isMaster()) {
return;
Expand Down
Expand Up @@ -346,6 +346,10 @@ public int getMaxAllowedBackupCount() {
return max(min(getMemberGroupsSize() - 1, InternalPartition.MAX_BACKUP_COUNT), 0);
}

public void updateMemberGroupSize() {
partitionStateManager.updateMemberGroupsSize();
}

@Override
public void memberAdded(Member member) {
logger.fine("Adding " + member);
Expand Down
Expand Up @@ -27,6 +27,7 @@
import com.hazelcast.internal.partition.InternalPartition;
import com.hazelcast.internal.util.RootCauseMatcher;
import com.hazelcast.internal.util.UuidUtil;
import com.hazelcast.map.IMap;
import com.hazelcast.spi.impl.InternalCompletableFuture;
import com.hazelcast.spi.impl.operationservice.impl.Invocation;
import com.hazelcast.spi.impl.operationservice.impl.InvocationRegistry;
Expand Down Expand Up @@ -64,9 +65,11 @@
import static java.util.Arrays.asList;
import static java.util.Collections.singletonList;
import static org.hamcrest.Matchers.greaterThan;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

@RunWith(HazelcastParallelClassRunner.class)
Expand Down Expand Up @@ -323,6 +326,86 @@ public void promotion_shouldFail_whenMasterIsSuspected_duringPromotion() throws
}
}

@Test
public void lite_member_promotion_data_loss() throws InterruptedException {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we give a better name to test? Seeing the state tested against and expected behavior in test method name would be helpful for the future readers.

int entryCount = 1000;

TestHazelcastInstanceFactory factory = createHazelcastInstanceFactory();
Config config = new Config().setLiteMember(true);

// start first hazelcast instance as a lite member
HazelcastInstance firstHazelcastInstance = factory.newHazelcastInstance(config);

// start second and third hazelcast instances as a lite member
HazelcastInstance secondHazelcastInstance = factory.newHazelcastInstance(config);
HazelcastInstance thirdHazelcastInstance = factory.newHazelcastInstance(config);

// promote all instances to data members
firstHazelcastInstance.getCluster().promoteLocalLiteMember();
secondHazelcastInstance.getCluster().promoteLocalLiteMember();
thirdHazelcastInstance.getCluster().promoteLocalLiteMember();

// check if cluster is in a good shape
assertTrueEventually(() -> assertTrue(firstHazelcastInstance.getPartitionService().isClusterSafe()));

// insert some dummy data into the testing map
String mapName = randomMapName();
IMap<String, String> testMap = firstHazelcastInstance.getMap(mapName);
for (int i = 0; i < entryCount; ++i) {
testMap.put("key" + i, "value" + i);
}

// check all data is correctly inserted
assertEquals(entryCount, testMap.size());

// kill second instance
secondHazelcastInstance.getLifecycleService().terminate();

// backup count for the map is set to 1
// even with 1 node down, no data loss is expected
assertTrueEventually(() -> assertEquals(entryCount, firstHazelcastInstance.getMap(mapName).size()));
assertTrueEventually(() -> assertEquals(entryCount, thirdHazelcastInstance.getMap(mapName).size()));
}

@Test
public void lite_member_promotion_data_loss2() throws InterruptedException {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here.

int entryCount = 1000;

TestHazelcastInstanceFactory factory = createHazelcastInstanceFactory();
Config config = new Config().setLiteMember(true);

// start first hazelcast instance as a lite member
HazelcastInstance firstHazelcastInstance = factory.newHazelcastInstance(config);

// start second hazelcast instance as a lite member
HazelcastInstance secondHazelcastInstance = factory.newHazelcastInstance(config);

// promote all instances to data members
firstHazelcastInstance.getCluster().promoteLocalLiteMember();

secondHazelcastInstance.getCluster().promoteLocalLiteMember();

// check if cluster is in a good shape
assertTrueEventually(() -> assertTrue(firstHazelcastInstance.getPartitionService().isClusterSafe()));

// insert some dummy data into the testing map
String mapName = randomMapName();
IMap<String, String> testMap = firstHazelcastInstance.getMap(mapName);
for (int i = 0; i < entryCount; ++i) {
testMap.put("key" + i, "value" + i);
}

// check all data is correctly inserted
assertEquals(entryCount, testMap.size());

// kill second instance
secondHazelcastInstance.getLifecycleService().terminate();

// backup count for the map is set to 1
// even with 1 node down, no data loss is expected
assertTrueEventually(() -> assertEquals(entryCount, firstHazelcastInstance.getMap(mapName).size()));
}

private void assertPromotionInvocationStarted(HazelcastInstance instance) {
OperationServiceImpl operationService = getNode(instance).getNodeEngine().getOperationService();
InvocationRegistry invocationRegistry = operationService.getInvocationRegistry();
Expand Down