Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixing the conditions for fetching remote master history #89472

Merged
merged 5 commits into from
Aug 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/changelog/89472.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 89472
summary: Fixing the conditions for fetching remote master history
area: Health
type: bug
issues:
- 89431
Original file line number Diff line number Diff line change
Expand Up @@ -430,28 +430,24 @@ public void testRepeatedNullMasterRecognizedAsGreenIfMasterDoesNotKnowItIsUnstab
.put(CoordinationDiagnosticsService.NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.getKey(), 1)
.build()
);
int nullTransitionsThreshold = 1;
final List<String> dataNodes = internalCluster().startDataOnlyNodes(
2,
Settings.builder()
.put(LeaderChecker.LEADER_CHECK_TIMEOUT_SETTING.getKey(), "1s")
.put(Coordinator.PUBLISH_TIMEOUT_SETTING.getKey(), "1s")
.put(CoordinationDiagnosticsService.NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.getKey(), 1)
.put(CoordinationDiagnosticsService.NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.getKey(), nullTransitionsThreshold)
.put(CoordinationDiagnosticsService.NODE_HAS_MASTER_LOOKUP_TIMEFRAME_SETTING.getKey(), new TimeValue(60, TimeUnit.SECONDS))
.build()
);
ensureStableCluster(3);
for (int i = 0; i < 2; i++) {
for (int i = 0; i < nullTransitionsThreshold + 1; i++) {
final String masterNode = masterNodes.get(0);

// Simulating a painful gc by suspending all threads for a long time on the current elected master node.
SingleNodeDisruption masterNodeDisruption = new LongGCDisruption(random(), masterNode);

final CountDownLatch dataNodeMasterSteppedDown = new CountDownLatch(2);
internalCluster().getInstance(ClusterService.class, masterNode).addListener(event -> {
if (event.state().nodes().getMasterNodeId() == null) {
dataNodeMasterSteppedDown.countDown();
}
});
internalCluster().getInstance(ClusterService.class, dataNodes.get(0)).addListener(event -> {
if (event.state().nodes().getMasterNodeId() == null) {
dataNodeMasterSteppedDown.countDown();
Expand All @@ -470,7 +466,7 @@ public void testRepeatedNullMasterRecognizedAsGreenIfMasterDoesNotKnowItIsUnstab
// Stop disruption
logger.info("--> unfreezing node [{}]", masterNode);
masterNodeDisruption.stopDisrupting();
ensureStableCluster(3);
ensureStableCluster(3, TimeValue.timeValueSeconds(30), false, randomFrom(dataNodes));
}
assertGreenMasterStability(internalCluster().client(randomFrom(dataNodes)));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -741,8 +741,16 @@ private boolean hasSeenMasterInHasMasterLookupTimeframe() {
public void clusterChanged(ClusterChangedEvent event) {
DiscoveryNode currentMaster = event.state().nodes().getMasterNode();
DiscoveryNode previousMaster = event.previousState().nodes().getMasterNode();
if (currentMaster == null && previousMaster != null) {
if ((currentMaster == null && previousMaster != null) || (currentMaster != null && previousMaster == null)) {
if (masterHistoryService.getLocalMasterHistory().hasMasterGoneNullAtLeastNTimes(unacceptableNullTransitions)) {
/*
* If the master node has been going to null repeatedly, we want to make a remote request to it to see what it thinks of
* master stability. We want to query the most recent master whether the current master has just transitioned to null or
* just transitioned from null to not null. The reason that we make the latter request is that sometimes when the elected
* master goes to null the most recent master is not responsive for the duration of the request timeout (for example if
* that node is in the middle of a long GC pause which would be both the reason for it not being master and the reason it
* does not respond quickly to transport requests).
*/
DiscoveryNode master = masterHistoryService.getLocalMasterHistory().getMostRecentNonNullMaster();
/*
* If the most recent master was this box, there is no point in making a transport request -- we already know what this
Expand Down