From c8c420094f55d2a4c078ee8e29fec43bf23fa4da Mon Sep 17 00:00:00 2001
From: Benjamin Trent
Date: Thu, 1 Jul 2021 08:34:14 -0400
Subject: [PATCH] [ML] autoscaling context current capacity could be null, this commit handles that (#74822)

The autoscaling context's current capacity may be null.

This should only really happen early in a cluster's life cycle or if a node
was just recently brought online, mainly because the current node sizes have
not yet been discovered and cached.

This change should really have been part of #74691
---
 .../xpack/ml/integration/AutoscalingIT.java         |  7 ++++---
 .../ml/autoscaling/MlAutoscalingDeciderService.java | 13 ++++++++++---
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/AutoscalingIT.java b/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/AutoscalingIT.java
index 6a554af5edf31..42382960ee769 100644
--- a/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/AutoscalingIT.java
+++ b/x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/AutoscalingIT.java
@@ -46,7 +46,7 @@ public class AutoscalingIT extends MlNativeAutodetectIntegTestCase {
 
     // This test assumes that xpack.ml.max_machine_memory_percent is 30
     // and that xpack.ml.use_auto_machine_memory_percent is false
-    public void testMLAutoscalingCapacity() {
+    public void testMLAutoscalingCapacity() throws Exception {
         SortedMap<String, Settings> deciders = new TreeMap<>();
         deciders.put(MlAutoscalingDeciderService.NAME,
             Settings.builder().put(MlAutoscalingDeciderService.DOWN_SCALE_DELAY.getKey(), TimeValue.ZERO).build());
@@ -57,14 +57,15 @@ public void testMLAutoscalingCapacity() {
         );
         assertAcked(client().execute(PutAutoscalingPolicyAction.INSTANCE, request).actionGet());
 
-        assertMlCapacity(
+        assertBusy(() -> assertMlCapacity(
             client().execute(
                 GetAutoscalingCapacityAction.INSTANCE,
                 new GetAutoscalingCapacityAction.Request()
             ).actionGet(),
             "Requesting scale down as tier and/or node size could be smaller",
             0L,
-            0L);
+            0L)
+        );
 
         putJob("job1", 100);
         putJob("job2", 200);
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/autoscaling/MlAutoscalingDeciderService.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/autoscaling/MlAutoscalingDeciderService.java
index 0e7aa8f51b8ae..097a8621e2692 100644
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/autoscaling/MlAutoscalingDeciderService.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/autoscaling/MlAutoscalingDeciderService.java
@@ -534,9 +534,13 @@ public AutoscalingDeciderResult scale(Settings configuration, AutoscalingDecider
             // Due to weird rounding errors, it may be that a scale down result COULD cause a scale up
             // Ensuring the scaleDown here forces the scale down result to always be lower than the current capacity.
             // This is safe as we know that ALL jobs are assigned at the current capacity
-            .map(result -> new AutoscalingDeciderResult(
-                ensureScaleDown(result.requiredCapacity(), context.currentCapacity()), result.reason()
-            ));
+            .map(result -> {
+                AutoscalingCapacity capacity = ensureScaleDown(result.requiredCapacity(), context.currentCapacity());
+                if (capacity == null) {
+                    return null;
+                }
+                return new AutoscalingDeciderResult(capacity, result.reason());
+            });
 
         if (maybeScaleDown.isPresent()) {
             final AutoscalingDeciderResult scaleDownDecisionResult = maybeScaleDown.get();
@@ -599,6 +603,9 @@ public AutoscalingDeciderResult scale(Settings configuration, AutoscalingDecider
     }
 
     static AutoscalingCapacity ensureScaleDown(AutoscalingCapacity scaleDownResult, AutoscalingCapacity currentCapacity) {
+        if (scaleDownResult == null || currentCapacity == null) {
+            return null;
+        }
         AutoscalingCapacity newCapacity = new AutoscalingCapacity(
             new AutoscalingCapacity.AutoscalingResources(
                 currentCapacity.total().storage(),