Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[JENKINS-23792] Do not terminate stopped EC2 nodes that were just booted #632

Merged
merged 8 commits into from
Jun 18, 2021
11 changes: 10 additions & 1 deletion src/main/java/hudson/plugins/ec2/EC2Computer.java
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ public String getDecodedConsoleOutput() throws AmazonClientException {
GetConsoleOutputRequest request = new GetConsoleOutputRequest(getInstanceId());
return ec2.getConsoleOutput(request).getDecodedOutput();
}

/**
* Obtains the instance state description in EC2.
*
Expand Down Expand Up @@ -154,6 +154,15 @@ public String getUptimeString() throws AmazonClientException, InterruptedExcepti
return Util.getTimeSpanString(getUptime());
}

/**
* Return the time this instance was launched in ms since the epoch.
*
* @return Time this instance was launched, in ms since the epoch.
*/
public long launchedAtMs() throws InterruptedException {
unicolet marked this conversation as resolved.
Show resolved Hide resolved
return this.describeInstance().getLaunchTime().getTime();
}

/**
* When the agent is deleted, terminate the instance.
*/
Expand Down
4 changes: 3 additions & 1 deletion src/main/java/hudson/plugins/ec2/EC2RetentionStrategy.java
Original file line number Diff line number Diff line change
Expand Up @@ -140,11 +140,13 @@ private long internalCheck(EC2Computer computer) {

if (computer.isIdle() && !DISABLED) {
final long uptime;
final long launchedAtMs;
InstanceState state;

try {
state = computer.getState(); //Get State before Uptime because getState will refresh the cached EC2 info
uptime = computer.getUptime();
launchedAtMs = computer.launchedAtMs();
} catch (AmazonClientException | InterruptedException e) {
// We'll just retry next time we test for idleness.
LOGGER.fine("Exception while checking host uptime for " + computer.getName()
Expand Down Expand Up @@ -190,7 +192,7 @@ private long internalCheck(EC2Computer computer) {
}
}

final long idleMilliseconds = this.clock.millis() - Math.max(computer.getIdleStartMilliseconds(), uptime);
final long idleMilliseconds = this.clock.millis() - Math.max(computer.getIdleStartMilliseconds(), launchedAtMs);


if (idleTerminationMinutes > 0) {
Expand Down
41 changes: 34 additions & 7 deletions src/test/java/hudson/plugins/ec2/EC2RetentionStrategyTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,6 @@
import hudson.plugins.ec2.util.SSHCredentialHelper;
import hudson.slaves.NodeProperty;
import hudson.slaves.OfflineCause;
import jenkins.util.NonLocalizable;
import org.junit.Rule;
import org.junit.Test;
import org.jvnet.hudson.test.JenkinsRule;
import org.jvnet.hudson.test.LoggerRule;

import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
Expand All @@ -26,18 +20,23 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.logging.Level;
import java.util.stream.Collectors;

import jenkins.util.NonLocalizable;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasItem;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import org.junit.Rule;
import org.junit.Test;
import org.jvnet.hudson.test.JenkinsRule;
import org.jvnet.hudson.test.LoggerRule;

public class EC2RetentionStrategyTest {

Expand Down Expand Up @@ -103,6 +102,7 @@ void idleTimeout() {
}
};
EC2Computer computer = new EC2Computer(slave) {
private final long launchedAtMs = new Date().getTime();

@Override
public EC2AbstractSlave getNode() {
Expand All @@ -114,6 +114,11 @@ public long getUptime() throws AmazonClientException, InterruptedException {
return ((minutes * 60L) + seconds) * 1000L;
}

@Override
public long launchedAtMs() throws InterruptedException {
return this.launchedAtMs;
}

@Override
public boolean isOffline() {
return isOffline == null ? super.isOffline() : isOffline;
Expand Down Expand Up @@ -238,6 +243,28 @@ public void testTerminateOfflineComputerIfNotConnecting() throws Exception {
assertThat(logging.getMessages(), hasItem(containsString("offline but not connecting, will check if it should be terminated because of the idle time configured")));
}


/**
* Do not terminate na instance if a computer just launched, and the
* retention strategy timeout has not expired yet. The idle timeout
* is correctly accounted for nodes that have been stopped and started
* again (i.e termination policy is stop, rather than terminate).
*/
@Test
public void testDoNotTerminateInstancesJustBooted() throws Exception {
unicolet marked this conversation as resolved.
Show resolved Hide resolved
logging.record(hudson.plugins.ec2.EC2RetentionStrategy.class, Level.FINE);
logging.capture(5);
final Instant inOneMinute = Instant.now().plus(Duration.ofSeconds(60));
// We terminate this one
new EC2RetentionStrategy(
"5",
Clock.fixed(inOneMinute.plusSeconds(1), zoneId),
inOneMinute.toEpochMilli()
).check(computerWithIdleTime(0, 0, null, false));
assertThat("The computer is terminated, but should not be", idleTimeoutCalled.get(), equalTo(false));
assertThat(logging.getMessages(), hasItem(containsString("offline but not connecting, will check if it should be terminated because of the idle time configured")));
}

@Test
public void testInternalCheckRespectsWait() throws Exception {
List<Boolean> expected = new ArrayList<>();
Expand Down