Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[JENKINS-23792] Do not terminate stopped EC2 nodes that were just booted #632

Merged
merged 8 commits into from
Jun 18, 2021
11 changes: 10 additions & 1 deletion src/main/java/hudson/plugins/ec2/EC2Computer.java
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ public String getDecodedConsoleOutput() throws AmazonClientException {
GetConsoleOutputRequest request = new GetConsoleOutputRequest(getInstanceId());
return ec2.getConsoleOutput(request).getDecodedOutput();
}

/**
* Obtains the instance state description in EC2.
*
Expand Down Expand Up @@ -154,6 +154,15 @@ public String getUptimeString() throws AmazonClientException, InterruptedExcepti
return Util.getTimeSpanString(getUptime());
}

/**
* Return the time this instance was launched in ms since the epoch.
*
* @return Time this instance was launched, in ms since the epoch.
*/
public long getLaunchTime() throws InterruptedException {
return this.describeInstance().getLaunchTime().getTime();
}

/**
* When the agent is deleted, terminate the instance.
*/
Expand Down
4 changes: 3 additions & 1 deletion src/main/java/hudson/plugins/ec2/EC2RetentionStrategy.java
Original file line number Diff line number Diff line change
Expand Up @@ -140,11 +140,13 @@ private long internalCheck(EC2Computer computer) {

if (computer.isIdle() && !DISABLED) {
final long uptime;
final long launchedAtMs;
InstanceState state;

try {
state = computer.getState(); //Get State before Uptime because getState will refresh the cached EC2 info
uptime = computer.getUptime();
launchedAtMs = computer.getLaunchTime();
} catch (AmazonClientException | InterruptedException e) {
// We'll just retry next time we test for idleness.
LOGGER.fine("Exception while checking host uptime for " + computer.getName()
Expand Down Expand Up @@ -190,7 +192,7 @@ private long internalCheck(EC2Computer computer) {
}
}

final long idleMilliseconds = this.clock.millis() - computer.getIdleStartMilliseconds();
final long idleMilliseconds = this.clock.millis() - Math.max(computer.getIdleStartMilliseconds(), launchedAtMs);


if (idleTerminationMinutes > 0) {
Expand Down
95 changes: 81 additions & 14 deletions src/test/java/hudson/plugins/ec2/EC2RetentionStrategyTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,6 @@
import hudson.plugins.ec2.util.SSHCredentialHelper;
import hudson.slaves.NodeProperty;
import hudson.slaves.OfflineCause;
import jenkins.util.NonLocalizable;
import org.junit.Rule;
import org.junit.Test;
import org.jvnet.hudson.test.JenkinsRule;
import org.jvnet.hudson.test.LoggerRule;

import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
Expand All @@ -31,13 +25,17 @@
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.logging.Level;
import java.util.stream.Collectors;

import jenkins.util.NonLocalizable;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasItem;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import org.junit.Rule;
import org.junit.Test;
import org.jvnet.hudson.test.JenkinsRule;
import org.jvnet.hudson.test.LoggerRule;

public class EC2RetentionStrategyTest {

Expand Down Expand Up @@ -68,7 +66,7 @@ public void testOnBillingHourRetention() throws Exception {

for (int i = 0; i < upTime.size(); i++) {
int[] t = upTime.get(i);
EC2Computer computer = computerWithIdleTime(t[0], t[1]);
EC2Computer computer = computerWithUpTime(t[0], t[1]);
EC2RetentionStrategy rs = new EC2RetentionStrategy("-2");
checkRetentionStrategy(rs, computer);
assertEquals("Expected " + t[0] + "m" + t[1] + "s to be " + expected.get(i), (boolean) expected.get(i), idleTimeoutCalled.get());
Expand All @@ -77,15 +75,16 @@ public void testOnBillingHourRetention() throws Exception {
}
}

private EC2Computer computerWithIdleTime(final int minutes, final int seconds) throws Exception {
return computerWithIdleTime(minutes, seconds, false, null);
private EC2Computer computerWithUpTime(final int minutes, final int seconds) throws Exception {
return computerWithUpTime(minutes, seconds, false, null);
}

/*
* Creates a computer with the params passed. If isOnline is null, the computer returns the real value, otherwise,
* the computer returns the value established.
*/
private EC2Computer computerWithIdleTime(final int minutes, final int seconds, final Boolean isOffline, final Boolean isConnecting) throws Exception {
private EC2Computer computerWithUpTime(final int minutes, final int seconds, final Boolean isOffline, final Boolean isConnecting) throws Exception {
idleTimeoutCalled.set(false);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

required, since the state is otherwise never reset across tests. Admittedly, this could be fixed better, but I wanted to keep the PR scope as small as possible to make it easy to review :-)

final EC2AbstractSlave slave = new EC2AbstractSlave("name", "id", "description", "fs", 1, null, "label", null, null, "init", "tmpDir", new ArrayList<NodeProperty<?>>(), "remote", "jvm", false, "idle", null, "cloud", false, Integer.MAX_VALUE, null, ConnectionStrategy.PRIVATE_IP, -1) {
@Override
public void terminate() {
Expand All @@ -102,6 +101,7 @@ void idleTimeout() {
}
};
EC2Computer computer = new EC2Computer(slave) {
private final long launchedAtMs = Instant.now().minus(Duration.ofSeconds(minutes * 60L + seconds)).toEpochMilli();

@Override
public EC2AbstractSlave getNode() {
Expand All @@ -113,6 +113,11 @@ public long getUptime() throws AmazonClientException, InterruptedException {
return ((minutes * 60L) + seconds) * 1000L;
}

@Override
public long getLaunchTime() throws InterruptedException {
return this.launchedAtMs;
}

@Override
public boolean isOffline() {
return isOffline == null ? super.isOffline() : isOffline;
Expand Down Expand Up @@ -206,7 +211,7 @@ public void testTerminateOfflineComputerIfNotConnecting() throws Exception {
OfflineCause cause = OfflineCause.create(new NonLocalizable("Testing terminate on offline computer"));

// A computer returning the real isOffline value and still connecting
EC2Computer computer = computerWithIdleTime(0, 0, null, true);
EC2Computer computer = computerWithUpTime(0, 0, null, true);
computer.setTemporarilyOffline(true, cause);
// We don't terminate this one
rs.check(computer);
Expand All @@ -215,18 +220,80 @@ public void testTerminateOfflineComputerIfNotConnecting() throws Exception {

// A computer returning the real isOffline value and not connecting
rs = new EC2RetentionStrategy("1", clock, nextCheckAfter);
EC2Computer computer2 = computerWithIdleTime(0, 0, null, false);
EC2Computer computer2 = computerWithUpTime(0, 0, null, false);
computer.setTemporarilyOffline(true, cause);
// We terminate this one
rs.check(computer2);
assertThat("The computer is terminated, it should not accept more tasks", idleTimeoutCalled.get(), equalTo(true));
assertThat(logging.getMessages(), hasItem(containsString("offline but not connecting, will check if it should be terminated because of the idle time configured")));
}


/**
* Do not terminate an instance if a computer just launched, and the
* retention strategy timeout has not expired yet. The quirks with idle timeout
* are now correctly accounted for nodes that have been stopped and started
* again (i.e termination policy is stop, rather than terminate).
*
* How does the test below work: for our "mock" EC2Computer, idle start time
* is always the time when the object has been created, and we cannot
* easily control idle start time in a test suite as the relevant method to override
* is declared final.
*
* We can achieve the same result where idle start time is < launch time
* by manipulating the node launch time variables. Since, as we said, idle
* start time is always now, what we end up doing is tricking the node
* into returning a launch time in the future. To do this we pass a negative
* value for minutes to {@link #computerWithUpTime(int, int, Boolean, Boolean)}.
*
* Now if we set retention time to something bigger than the computer uptime
* we can reproduce the issue, and prove its resolution.
*
* The time at which we perform the check must be < than computer uptime
* plus the retention interval.
*/
@Test
public void testDoNotTerminateInstancesJustBooted() throws Exception {
unicolet marked this conversation as resolved.
Show resolved Hide resolved
logging.record(hudson.plugins.ec2.EC2RetentionStrategy.class, Level.FINE);
logging.capture(5);
final int COMPUTER_UPTIME_MINUTES=5;
final int RETENTION_MINUTES=5;
final int CHECK_TIME_MINUTES=COMPUTER_UPTIME_MINUTES+RETENTION_MINUTES-1;
final Instant checkTime = Instant.now().plus(Duration.ofMinutes(CHECK_TIME_MINUTES));
new EC2RetentionStrategy(
String.format("%d", RETENTION_MINUTES),
Clock.fixed(checkTime.plusSeconds(1), zoneId),
checkTime.toEpochMilli()
).check(computerWithUpTime(-COMPUTER_UPTIME_MINUTES, 0, null, false));
assertThat("The computer is terminated, but should not be", idleTimeoutCalled.get(), equalTo(false));
assertThat(logging.getMessages(), hasItem(containsString("offline but not connecting, will check if it should be terminated because of the idle time configured")));
}

/**
* Ensure that we terminate instances that stay unconnected, as soon as
* termination time expires.
*/
@Test
public void testCleanupUnconnectedInstanceAfterTerminationTime() throws Exception {
logging.record(hudson.plugins.ec2.EC2RetentionStrategy.class, Level.FINE);
logging.capture(5);
final int COMPUTER_UPTIME_MINUTES=5;
final int RETENTION_MINUTES=5;
final int CHECK_TIME_MINUTES=COMPUTER_UPTIME_MINUTES+RETENTION_MINUTES+1;
final Instant checkTime = Instant.now().plus(Duration.ofMinutes(CHECK_TIME_MINUTES));
new EC2RetentionStrategy(
String.format("%d", RETENTION_MINUTES),
Clock.fixed(checkTime.plusSeconds(1), zoneId),
checkTime.toEpochMilli()
).check(computerWithUpTime(-COMPUTER_UPTIME_MINUTES, 0, null, false));
assertThat("The computer is not terminated, but should be", idleTimeoutCalled.get(), equalTo(true));
assertThat(logging.getMessages(), hasItem(containsString("offline but not connecting, will check if it should be terminated because of the idle time configured")));
}

@Test
public void testInternalCheckRespectsWait() throws Exception {
List<Boolean> expected = new ArrayList<>();
EC2Computer computer = computerWithIdleTime(0, 0);
EC2Computer computer = computerWithUpTime(0, 0);
List<int[]> upTimeAndCheckAfter = new ArrayList<>();

upTimeAndCheckAfter.add(new int[] { 0, -1 });
Expand Down