Skip to content

Commit

Permalink
[JENKINS-23792] Do not terminate stopped EC2 nodes that were just boo…
Browse files Browse the repository at this point in the history
…ted (#632)

* regression, do not terminate stopped EC2 nodes that were just booted again

when a node just booted it is not connected, and simultaneously its idle start time
could still be set to the last time the node was booted. If the instance is checked by the plugin while in this phase, the plugin will erroneously decide to terminate it again. The change in this PR adds a safety check to ensure that
idle time is calculated from the greatest of idle time as returned by jenkins or the node uptime, as returned by EC2.

Fixes: https://issues.jenkins.io/browse/JENKINS-23792

* get launch time the right way

* move test into its own test case

* remove old test case, which was moved into its own method a few lines below

* rename launchedAtMs to getLaunchTime

* provide valid test case for issue fixed by this PR https://issues.jenkins.io/browse/JENKINS-23792

* wording

* fix reason for assertion
  • Loading branch information
unicolet committed Jun 18, 2021
1 parent 97d3dfa commit 64ab73b
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 16 deletions.
11 changes: 10 additions & 1 deletion src/main/java/hudson/plugins/ec2/EC2Computer.java
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ public String getDecodedConsoleOutput() throws AmazonClientException {
GetConsoleOutputRequest request = new GetConsoleOutputRequest(getInstanceId());
return ec2.getConsoleOutput(request).getDecodedOutput();
}

/**
* Obtains the instance state description in EC2.
*
Expand Down Expand Up @@ -154,6 +154,15 @@ public String getUptimeString() throws AmazonClientException, InterruptedExcepti
return Util.getTimeSpanString(getUptime());
}

/**
* Return the time this instance was launched in ms since the epoch.
*
* @return Time this instance was launched, in ms since the epoch.
*/
public long getLaunchTime() throws InterruptedException {
return this.describeInstance().getLaunchTime().getTime();
}

/**
* When the agent is deleted, terminate the instance.
*/
Expand Down
4 changes: 3 additions & 1 deletion src/main/java/hudson/plugins/ec2/EC2RetentionStrategy.java
Original file line number Diff line number Diff line change
Expand Up @@ -140,11 +140,13 @@ private long internalCheck(EC2Computer computer) {

if (computer.isIdle() && !DISABLED) {
final long uptime;
final long launchedAtMs;
InstanceState state;

try {
state = computer.getState(); //Get State before Uptime because getState will refresh the cached EC2 info
uptime = computer.getUptime();
launchedAtMs = computer.getLaunchTime();
} catch (AmazonClientException | InterruptedException e) {
// We'll just retry next time we test for idleness.
LOGGER.fine("Exception while checking host uptime for " + computer.getName()
Expand Down Expand Up @@ -190,7 +192,7 @@ private long internalCheck(EC2Computer computer) {
}
}

final long idleMilliseconds = this.clock.millis() - computer.getIdleStartMilliseconds();
final long idleMilliseconds = this.clock.millis() - Math.max(computer.getIdleStartMilliseconds(), launchedAtMs);


if (idleTerminationMinutes > 0) {
Expand Down
95 changes: 81 additions & 14 deletions src/test/java/hudson/plugins/ec2/EC2RetentionStrategyTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,6 @@
import hudson.plugins.ec2.util.SSHCredentialHelper;
import hudson.slaves.NodeProperty;
import hudson.slaves.OfflineCause;
import jenkins.util.NonLocalizable;
import org.junit.Rule;
import org.junit.Test;
import org.jvnet.hudson.test.JenkinsRule;
import org.jvnet.hudson.test.LoggerRule;

import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
Expand All @@ -31,13 +25,17 @@
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.logging.Level;
import java.util.stream.Collectors;

import jenkins.util.NonLocalizable;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasItem;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import org.junit.Rule;
import org.junit.Test;
import org.jvnet.hudson.test.JenkinsRule;
import org.jvnet.hudson.test.LoggerRule;

public class EC2RetentionStrategyTest {

Expand Down Expand Up @@ -68,7 +66,7 @@ public void testOnBillingHourRetention() throws Exception {

for (int i = 0; i < upTime.size(); i++) {
int[] t = upTime.get(i);
EC2Computer computer = computerWithIdleTime(t[0], t[1]);
EC2Computer computer = computerWithUpTime(t[0], t[1]);
EC2RetentionStrategy rs = new EC2RetentionStrategy("-2");
checkRetentionStrategy(rs, computer);
assertEquals("Expected " + t[0] + "m" + t[1] + "s to be " + expected.get(i), (boolean) expected.get(i), idleTimeoutCalled.get());
Expand All @@ -77,15 +75,16 @@ public void testOnBillingHourRetention() throws Exception {
}
}

private EC2Computer computerWithIdleTime(final int minutes, final int seconds) throws Exception {
return computerWithIdleTime(minutes, seconds, false, null);
private EC2Computer computerWithUpTime(final int minutes, final int seconds) throws Exception {
return computerWithUpTime(minutes, seconds, false, null);
}

/*
* Creates a computer with the params passed. If isOnline is null, the computer returns the real value, otherwise,
* the computer returns the value established.
*/
private EC2Computer computerWithIdleTime(final int minutes, final int seconds, final Boolean isOffline, final Boolean isConnecting) throws Exception {
private EC2Computer computerWithUpTime(final int minutes, final int seconds, final Boolean isOffline, final Boolean isConnecting) throws Exception {
idleTimeoutCalled.set(false);
final EC2AbstractSlave slave = new EC2AbstractSlave("name", "id", "description", "fs", 1, null, "label", null, null, "init", "tmpDir", new ArrayList<NodeProperty<?>>(), "remote", "jvm", false, "idle", null, "cloud", false, Integer.MAX_VALUE, null, ConnectionStrategy.PRIVATE_IP, -1) {
@Override
public void terminate() {
Expand All @@ -102,6 +101,7 @@ void idleTimeout() {
}
};
EC2Computer computer = new EC2Computer(slave) {
private final long launchedAtMs = Instant.now().minus(Duration.ofSeconds(minutes * 60L + seconds)).toEpochMilli();

@Override
public EC2AbstractSlave getNode() {
Expand All @@ -113,6 +113,11 @@ public long getUptime() throws AmazonClientException, InterruptedException {
return ((minutes * 60L) + seconds) * 1000L;
}

@Override
public long getLaunchTime() throws InterruptedException {
return this.launchedAtMs;
}

@Override
public boolean isOffline() {
return isOffline == null ? super.isOffline() : isOffline;
Expand Down Expand Up @@ -206,7 +211,7 @@ public void testTerminateOfflineComputerIfNotConnecting() throws Exception {
OfflineCause cause = OfflineCause.create(new NonLocalizable("Testing terminate on offline computer"));

// A computer returning the real isOffline value and still connecting
EC2Computer computer = computerWithIdleTime(0, 0, null, true);
EC2Computer computer = computerWithUpTime(0, 0, null, true);
computer.setTemporarilyOffline(true, cause);
// We don't terminate this one
rs.check(computer);
Expand All @@ -215,18 +220,80 @@ public void testTerminateOfflineComputerIfNotConnecting() throws Exception {

// A computer returning the real isOffline value and not connecting
rs = new EC2RetentionStrategy("1", clock, nextCheckAfter);
EC2Computer computer2 = computerWithIdleTime(0, 0, null, false);
EC2Computer computer2 = computerWithUpTime(0, 0, null, false);
computer.setTemporarilyOffline(true, cause);
// We terminate this one
rs.check(computer2);
assertThat("The computer is terminated, it should not accept more tasks", idleTimeoutCalled.get(), equalTo(true));
assertThat(logging.getMessages(), hasItem(containsString("offline but not connecting, will check if it should be terminated because of the idle time configured")));
}


/**
* Do not terminate an instance if a computer just launched, and the
* retention strategy timeout has not expired yet. The quirks with idle timeout
* are now correctly accounted for nodes that have been stopped and started
* again (i.e termination policy is stop, rather than terminate).
*
* How does the test below work: for our "mock" EC2Computer, idle start time
* is always the time when the object has been created, and we cannot
* easily control idle start time in a test suite as the relevant method to override
* is declared final.
*
* We can achieve the same result where idle start time is < launch time
* by manipulating the node launch time variables. Since, as we said, idle
* start time is always now, what we end up doing is tricking the node
* into returning a launch time in the future. To do this we pass a negative
* value for minutes to {@link #computerWithUpTime(int, int, Boolean, Boolean)}.
*
* Now if we set retention time to something bigger than the computer uptime
* we can reproduce the issue, and prove its resolution.
*
* The time at which we perform the check must be < than computer uptime
* plus the retention interval.
*/
@Test
public void testDoNotTerminateInstancesJustBooted() throws Exception {
logging.record(hudson.plugins.ec2.EC2RetentionStrategy.class, Level.FINE);
logging.capture(5);
final int COMPUTER_UPTIME_MINUTES=5;
final int RETENTION_MINUTES=5;
final int CHECK_TIME_MINUTES=COMPUTER_UPTIME_MINUTES+RETENTION_MINUTES-1;
final Instant checkTime = Instant.now().plus(Duration.ofMinutes(CHECK_TIME_MINUTES));
new EC2RetentionStrategy(
String.format("%d", RETENTION_MINUTES),
Clock.fixed(checkTime.plusSeconds(1), zoneId),
checkTime.toEpochMilli()
).check(computerWithUpTime(-COMPUTER_UPTIME_MINUTES, 0, null, false));
assertThat("The computer is terminated, but should not be", idleTimeoutCalled.get(), equalTo(false));
assertThat(logging.getMessages(), hasItem(containsString("offline but not connecting, will check if it should be terminated because of the idle time configured")));
}

/**
* Ensure that we terminate instances that stay unconnected, as soon as
* termination time expires.
*/
@Test
public void testCleanupUnconnectedInstanceAfterTerminationTime() throws Exception {
logging.record(hudson.plugins.ec2.EC2RetentionStrategy.class, Level.FINE);
logging.capture(5);
final int COMPUTER_UPTIME_MINUTES=5;
final int RETENTION_MINUTES=5;
final int CHECK_TIME_MINUTES=COMPUTER_UPTIME_MINUTES+RETENTION_MINUTES+1;
final Instant checkTime = Instant.now().plus(Duration.ofMinutes(CHECK_TIME_MINUTES));
new EC2RetentionStrategy(
String.format("%d", RETENTION_MINUTES),
Clock.fixed(checkTime.plusSeconds(1), zoneId),
checkTime.toEpochMilli()
).check(computerWithUpTime(-COMPUTER_UPTIME_MINUTES, 0, null, false));
assertThat("The computer is not terminated, but should be", idleTimeoutCalled.get(), equalTo(true));
assertThat(logging.getMessages(), hasItem(containsString("offline but not connecting, will check if it should be terminated because of the idle time configured")));
}

@Test
public void testInternalCheckRespectsWait() throws Exception {
List<Boolean> expected = new ArrayList<>();
EC2Computer computer = computerWithIdleTime(0, 0);
EC2Computer computer = computerWithUpTime(0, 0);
List<int[]> upTimeAndCheckAfter = new ArrayList<>();

upTimeAndCheckAfter.add(new int[] { 0, -1 });
Expand Down

0 comments on commit 64ab73b

Please sign in to comment.