Skip to content

Commit

Permalink
sonobuoy: re-set the assume role credentials if they expire
Browse files Browse the repository at this point in the history
We use IAM role chaining on ephemeral testing infrastructure. Role chaining
limits an AWS CLI or AWS API role session to a maximum of one hour. When we assume
the k8s new account role using role chaining and provide a DurationSeconds parameter
value greater than one hour, the operation fails. The Testsys sonobuoy agent
needs more than one hour, so we need to refresh the credentials when the
client asks us to provide credentials.
  • Loading branch information
gthao313 committed Mar 18, 2024
1 parent f1b58de commit 1605299
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 9 deletions.
10 changes: 9 additions & 1 deletion bottlerocket/agents/src/bin/k8s-workload-agent/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ where
&self.config,
&self.results_dir,
info_client,
&self.aws_secret_name.as_ref(),
)
.await
}
Expand All @@ -115,7 +116,14 @@ where
.await?;
info!("Stored kubeconfig in {}", TEST_CLUSTER_KUBECONFIG_PATH);

rerun_failed_workload(TEST_CLUSTER_KUBECONFIG_PATH, &self.results_dir, info_client).await
rerun_failed_workload(
TEST_CLUSTER_KUBECONFIG_PATH,
&self.results_dir,
info_client,
&self.config,
&self.aws_secret_name.as_ref(),
)
.await
}

async fn terminate(&mut self) -> Result<(), Self::E> {
Expand Down
6 changes: 4 additions & 2 deletions bottlerocket/agents/src/bin/sonobuoy-test-agent/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ use test_agent::{
};
use testsys_model::{SecretName, TestResults};

// Default Sonobuoy agents assume role duration to 4 hours.
const DEFAULT_ASSUME_ROLE_SESSION_DURATION: i32 = 14400;
// Default assume role session duration for Sonobuoy agents: 1 hour.
const DEFAULT_ASSUME_ROLE_SESSION_DURATION: i32 = 3600;

struct SonobuoyTestRunner {
config: SonobuoyConfig,
Expand Down Expand Up @@ -111,6 +111,7 @@ where
&self.config,
&self.results_dir,
info_client,
&self.aws_secret_name.as_ref(),
)
.await
}
Expand Down Expand Up @@ -153,6 +154,7 @@ where
&self.config,
&self.results_dir,
info_client,
&self.aws_secret_name.as_ref(),
)
.await
}
Expand Down
36 changes: 33 additions & 3 deletions bottlerocket/agents/src/sonobuoy.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use crate::error;
use agent_utils::aws::aws_config;
use bottlerocket_types::agent_config::{SonobuoyConfig, SONOBUOY_RESULTS_FILENAME};
use log::{error, info, trace};
use serde_json::Value;
Expand All @@ -9,7 +10,7 @@ use std::path::Path;
use std::process::Command;
use std::time::Duration;
use test_agent::InfoClient;
use testsys_model::{Outcome, TestResults};
use testsys_model::{Outcome, SecretName, TestResults};

/// Timeout for sonobuoy status to become available (seconds)
const SONOBUOY_STATUS_TIMEOUT: u64 = 900;
Expand All @@ -22,6 +23,7 @@ pub async fn run_sonobuoy<I>(
sonobuoy_config: &SonobuoyConfig,
results_dir: &Path,
info_client: &I,
aws_secret_name: &Option<&SecretName>,
) -> Result<TestResults, error::Error>
where
I: InfoClient,
Expand Down Expand Up @@ -102,7 +104,14 @@ where
.await
.context(error::SonobuoyTimeoutSnafu)??;
info!("Sonobuoy status is available, waiting for test to complete");
wait_for_sonobuoy_results(kubeconfig_path, None, info_client).await?;
wait_for_sonobuoy_results(
kubeconfig_path,
None,
info_client,
&sonobuoy_config.assume_role,
aws_secret_name,
)
.await?;
info!("Sonobuoy testing has completed, checking results");

results_sonobuoy(kubeconfig_path, results_dir)
Expand All @@ -115,6 +124,7 @@ pub async fn rerun_failed_sonobuoy<I>(
sonobuoy_config: &SonobuoyConfig,
results_dir: &Path,
info_client: &I,
aws_secret_name: &Option<&SecretName>,
) -> Result<TestResults, error::Error>
where
I: InfoClient,
Expand Down Expand Up @@ -175,7 +185,14 @@ where
.await
.context(error::SonobuoyTimeoutSnafu)??;
info!("Sonobuoy status is available, waiting for test to complete");
wait_for_sonobuoy_results(kubeconfig_path, None, info_client).await?;
wait_for_sonobuoy_results(
kubeconfig_path,
None,
info_client,
&sonobuoy_config.assume_role,
aws_secret_name,
)
.await?;
info!("Sonobuoy testing has completed, checking results");

results_sonobuoy(kubeconfig_path, results_dir)
Expand Down Expand Up @@ -220,6 +237,8 @@ pub async fn wait_for_sonobuoy_results<I>(
kubeconfig_path: &str,
namespace: Option<&str>,
info_client: &I,
assume_role: &Option<String>,
aws_secret_name: &Option<&SecretName>,
) -> Result<(), error::Error>
where
I: InfoClient,
Expand All @@ -230,10 +249,20 @@ where
..Default::default()
};
let mut retries = 0;
// Max duration for the assume role credentials is 3600 seconds, so we refresh them every 110 loops * 30 seconds (loop sleep time) = 3300 seconds, before they expire.
let mut credential_refresh_countdown = 110;

loop {
if retries > 5 {
return Err(error::Error::SonobuoyStatus { retries });
}

// Refresh the credentials if the countdown is 0
if credential_refresh_countdown == 0 {
aws_config(&aws_secret_name, assume_role, &None, &None, &None, true).await?;
credential_refresh_countdown = 110;
}

let kubeconfig_arg = vec!["--kubeconfig", kubeconfig_path];
let namespace_arg = namespace
.map(|namespace| vec!["--namespace", namespace])
Expand Down Expand Up @@ -300,6 +329,7 @@ where
.for_each(|e| error!("Unable to send test update: {}", e));

tokio::time::sleep(Duration::from_secs(30)).await;
credential_refresh_countdown -= 1;
}
}

Expand Down
23 changes: 20 additions & 3 deletions bottlerocket/agents/src/workload.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use std::path::{Path, PathBuf};
use std::process::Command;
use std::time::Duration;
use test_agent::InfoClient;
use testsys_model::TestResults;
use testsys_model::{SecretName, TestResults};

/// Timeout for sonobuoy status to become available (seconds)
const SONOBUOY_STATUS_TIMEOUT: u64 = 900;
Expand All @@ -24,6 +24,7 @@ pub async fn run_workload<I>(
workload_config: &WorkloadConfig,
results_dir: &Path,
info_client: &I,
aws_secret_name: &Option<&SecretName>,
) -> Result<TestResults, error::Error>
where
I: InfoClient,
Expand Down Expand Up @@ -92,7 +93,14 @@ where
.await
.context(error::SonobuoyTimeoutSnafu)??;
info!("Workload status is available, waiting for test to complete");
wait_for_sonobuoy_results(kubeconfig_path, Some("testsys-workload"), info_client).await?;
wait_for_sonobuoy_results(
kubeconfig_path,
Some("testsys-workload"),
info_client,
&workload_config.assume_role,
aws_secret_name,
)
.await?;
info!("Workload testing has completed, checking results");

results_workload(kubeconfig_path, results_dir)
Expand All @@ -103,6 +111,8 @@ pub async fn rerun_failed_workload<I>(
kubeconfig_path: &str,
results_dir: &Path,
info_client: &I,
workload_config: &WorkloadConfig,
aws_secret_name: &Option<&SecretName>,
) -> Result<TestResults, error::Error>
where
I: InfoClient,
Expand Down Expand Up @@ -138,7 +148,14 @@ where
.await
.context(error::SonobuoyTimeoutSnafu)??;
info!("Workload status is available, waiting for test to complete");
wait_for_sonobuoy_results(kubeconfig_path, Some("testsys-workload"), info_client).await?;
wait_for_sonobuoy_results(
kubeconfig_path,
Some("testsys-workload"),
info_client,
&workload_config.assume_role,
aws_secret_name,
)
.await?;
info!("Workload testing has completed, checking results");

results_workload(kubeconfig_path, results_dir)
Expand Down

0 comments on commit 1605299

Please sign in to comment.