Skip to content

Commit

Permalink
Fixing the conditions for fetching remote master history (elastic#89472)
Browse files Browse the repository at this point in the history
Fixing the conditions that the health API uses to determine when to check with a master node for its view
of master history if the master appears to have gone null repeatedly.
  • Loading branch information
masseyke committed Sep 13, 2022
1 parent 70585a8 commit c6833b5
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 9 deletions.
6 changes: 6 additions & 0 deletions docs/changelog/89472.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 89472
summary: Fixing the conditions for fetching remote master history
area: Health
type: bug
issues:
- 89431
Original file line number Diff line number Diff line change
Expand Up @@ -426,28 +426,24 @@ public void testRepeatedNullMasterRecognizedAsGreenIfMasterDoesNotKnowItIsUnstab
.put(CoordinationDiagnosticsService.NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.getKey(), 1)
.build()
);
int nullTransitionsThreshold = 1;
final List<String> dataNodes = internalCluster().startDataOnlyNodes(
2,
Settings.builder()
.put(LeaderChecker.LEADER_CHECK_TIMEOUT_SETTING.getKey(), "1s")
.put(Coordinator.PUBLISH_TIMEOUT_SETTING.getKey(), "1s")
.put(CoordinationDiagnosticsService.NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.getKey(), 1)
.put(CoordinationDiagnosticsService.NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.getKey(), nullTransitionsThreshold)
.put(CoordinationDiagnosticsService.NODE_HAS_MASTER_LOOKUP_TIMEFRAME_SETTING.getKey(), new TimeValue(60, TimeUnit.SECONDS))
.build()
);
ensureStableCluster(3);
for (int i = 0; i < 2; i++) {
for (int i = 0; i < nullTransitionsThreshold + 1; i++) {
final String masterNode = masterNodes.get(0);

// Simulating a painful gc by suspending all threads for a long time on the current elected master node.
SingleNodeDisruption masterNodeDisruption = new LongGCDisruption(random(), masterNode);

final CountDownLatch dataNodeMasterSteppedDown = new CountDownLatch(2);
internalCluster().getInstance(ClusterService.class, masterNode).addListener(event -> {
if (event.state().nodes().getMasterNodeId() == null) {
dataNodeMasterSteppedDown.countDown();
}
});
internalCluster().getInstance(ClusterService.class, dataNodes.get(0)).addListener(event -> {
if (event.state().nodes().getMasterNodeId() == null) {
dataNodeMasterSteppedDown.countDown();
Expand All @@ -466,7 +462,7 @@ public void testRepeatedNullMasterRecognizedAsGreenIfMasterDoesNotKnowItIsUnstab
// Stop disruption
logger.info("--> unfreezing node [{}]", masterNode);
masterNodeDisruption.stopDisrupting();
ensureStableCluster(3);
ensureStableCluster(3, TimeValue.timeValueSeconds(30), false, randomFrom(dataNodes));
}
assertGreenMasterStability(internalCluster().client(randomFrom(dataNodes)));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -442,8 +442,16 @@ private boolean hasSeenMasterInHasMasterLookupTimeframe() {
public void clusterChanged(ClusterChangedEvent event) {
DiscoveryNode currentMaster = event.state().nodes().getMasterNode();
DiscoveryNode previousMaster = event.previousState().nodes().getMasterNode();
if (currentMaster == null && previousMaster != null) {
if ((currentMaster == null && previousMaster != null) || (currentMaster != null && previousMaster == null)) {
if (masterHistoryService.getLocalMasterHistory().hasMasterGoneNullAtLeastNTimes(unacceptableNullTransitions)) {
/*
* If the master node has been going to null repeatedly, we want to make a remote request to it to see what it thinks of
* master stability. We want to query the most recent master whether the current master has just transitioned to null or
* just transitioned from null to not null. The reason that we make the latter request is that sometimes when the elected
* master goes to null the most recent master is not responsive for the duration of the request timeout (for example if
* that node is in the middle of a long GC pause which would be both the reason for it not being master and the reason it
* does not respond quickly to transport requests).
*/
DiscoveryNode master = masterHistoryService.getLocalMasterHistory().getMostRecentNonNullMaster();
/*
* If the most recent master was this box, there is no point in making a transport request -- we already know what this
Expand Down

0 comments on commit c6833b5

Please sign in to comment.