Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add logs when pod failed to start when using WaitUntilPodAvailableE #1252

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
32 changes: 31 additions & 1 deletion modules/k8s/errors.go
Expand Up @@ -2,6 +2,7 @@ package k8s

import (
"fmt"
"strings"

batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -68,7 +69,36 @@ type PodNotAvailable struct {

// Error is a simple function to return a formatted error message as a string
func (err PodNotAvailable) Error() string {
return fmt.Sprintf("Pod %s is not available, reason: %s, message: %s", err.pod.Name, err.pod.Status.Reason, err.pod.Status.Message)
msg := fmt.Sprintf("Pod %s is not available, phase:%s", err.pod.Name, err.pod.Status.Phase)
if err.pod.Status.Reason != "" {
msg += fmt.Sprintf(", reason:%s", err.pod.Status.Reason)
}
if err.pod.Status.Message != "" {
msg += fmt.Sprintf(", message:'%s'", err.pod.Status.Message)
}
var res []string
for _, s := range err.pod.Status.Conditions {
if s.Status == "True" {
continue
}
res = append(res, fmt.Sprintf("'%s:%s->%s'", s.Type, s.Reason, s.Message))
}
if len(res) > 0 {
msg += fmt.Sprintf(", failingConditions:[%s]", strings.Join(res, ","))
}
res = []string{}
for _, s := range err.pod.Status.ContainerStatuses {
if s.State.Waiting != nil {
res = append(res, fmt.Sprintf("'waiting(%s):%s->%s'", s.Name, s.State.Waiting.Reason, s.State.Waiting.Message))
}
if s.State.Terminated != nil {
res = append(res, fmt.Sprintf("'terminated(%s):%s->%s'", s.Name, s.State.Terminated.Reason, s.State.Terminated.Message))
}
}
if len(res) > 0 {
msg += fmt.Sprintf(", nonRunningContainerState:[%s]", strings.Join(res, ","))
}
return msg
}

// NewPodNotAvailableError returns a PodNotAvailable struct when Kubernetes deems a pod is not available
Expand Down
44 changes: 42 additions & 2 deletions modules/k8s/pod.go
Expand Up @@ -2,6 +2,7 @@ package k8s

import (
"context"
"errors"
"fmt"
"time"

Expand Down Expand Up @@ -111,6 +112,18 @@ func WaitUntilPodAvailable(t testing.TestingT, options *KubectlOptions, podName
// WaitUntilPodAvailableE waits until all of the containers within the pod are ready and started, retrying the check for the specified amount of times, sleeping
// for the provided duration between each try.
// It is equivalent to WaitUntilPodConsistentlyAvailableE with successes=0: the wait ends on the first successful check.
func WaitUntilPodAvailableE(t testing.TestingT, options *KubectlOptions, podName string, retries int, sleepBetweenRetries time.Duration) error {
	return WaitUntilPodConsistentlyAvailableE(t, options, podName, retries, sleepBetweenRetries, 0)
}

// WaitUntilPodConsistentlyAvailable is similar to WaitUntilPodAvailable, but once the pod is available it re-checks availability `successes`
// more times (sleeping between checks) and fails the test if the pod ever becomes unavailable.
// This is useful to make sure a pod doesn't become unhealthy shortly after starting.
func WaitUntilPodConsistentlyAvailable(t testing.TestingT, options *KubectlOptions, podName string, retries int, sleepBetweenRetries time.Duration, successes int) {
	require.NoError(t, WaitUntilPodConsistentlyAvailableE(t, options, podName, retries, sleepBetweenRetries, successes))
}

// WaitUntilPodConsistentlyAvailableE similar to WaitUntilPodAvailable but one the pod is available we will retry `successes` occurrences and fail if ever the pod becomes unavailable.
// This is useful to make sure a pod doesn't become unhealthy shortly after starting
func WaitUntilPodConsistentlyAvailableE(t testing.TestingT, options *KubectlOptions, podName string, retries int, sleepBetweenRetries time.Duration, successes int) error {
statusMsg := fmt.Sprintf("Wait for pod %s to be provisioned.", podName)
message, err := retry.DoWithRetryE(
t,
Expand All @@ -128,11 +141,38 @@ func WaitUntilPodAvailableE(t testing.TestingT, options *KubectlOptions, podName
return "Pod is now available", nil
},
)
if err == nil {
for i := successes; ; i-- {
var pod *corev1.Pod
pod, err = GetPodE(t, options, podName)
if err != nil {
break
}
if !IsPodAvailable(pod) {
err = NewPodNotAvailableError(pod)
break
}
if i == 0 {
break
}
logger.Logf(t, "Wait %s before checking if pod %s is still available", sleepBetweenRetries, podName)
time.Sleep(sleepBetweenRetries)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This logic looks to be for a quite specific case: pod becomes ready and after a while changes status to not ready - not sure if should be included in PR for adding additional logs when pod fail to start

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Up to you I can either do this as a separate issue or update PR title. The reason why this was added here is that in the case of a failing startup the pod may become ready and very quickly switch to not ready.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should go as a separate feature

}
}
if err != nil {
logger.Logf(t, "Timedout waiting for Pod to be provisioned: %s", err)
logger.Log(t, "Failed waiting for pod to be provisioned", err)
notAvailableError := PodNotAvailable{}
if errors.As(err, &notAvailableError) {
if notAvailableError.pod.Status.Phase == corev1.PodRunning {
_, logsError := GetPodLogsE(t, options, notAvailableError.pod, "")
if logsError != nil {
logger.Log(t, "Failed to retrieve logs", err)
}
}
}
return err
}
logger.Logf(t, message)
logger.Log(t, message)
return nil
}

Expand Down
16 changes: 13 additions & 3 deletions modules/retry/retry.go
Expand Up @@ -2,6 +2,7 @@
package retry

import (
"errors"
"fmt"
"regexp"
"time"
Expand Down Expand Up @@ -87,15 +88,15 @@ func DoWithRetryInterfaceE(t testing.TestingT, actionDescription string, maxRetr
var output interface{}
var err error

logger.Log(t, actionDescription)
for i := 0; i <= maxRetries; i++ {
logger.Log(t, actionDescription)

output, err = action()
if err == nil {
return output, nil
}

if _, isFatalErr := err.(FatalError); isFatalErr {
if errors.Is(err, FatalError{}) {
logger.Logf(t, "Returning due to fatal error: %v", err)
return output, err
}
Expand All @@ -104,7 +105,7 @@ func DoWithRetryInterfaceE(t testing.TestingT, actionDescription string, maxRetr
time.Sleep(sleepBetweenRetries)
}

return output, MaxRetriesExceeded{Description: actionDescription, MaxRetries: maxRetries}
return output, MaxRetriesExceeded{Description: actionDescription, MaxRetries: maxRetries, LastError: err}
}

// DoWithRetryableErrors runs the specified action. If it returns a value, return that value. If it returns an error,
Expand Down Expand Up @@ -202,12 +203,17 @@ func (err TimeoutExceeded) Error() string {
// MaxRetriesExceeded is an error that occurs when the maximum amount of retries is exceeded.
// LastError records the error returned by the final attempt so callers can reach the
// underlying cause through Unwrap (errors.Is / errors.As).
type MaxRetriesExceeded struct {
	Description string
	MaxRetries  int
	LastError   error
}

// Error returns a formatted message describing which action failed and how many
// retries were attempted. The underlying LastError is not embedded in this
// string; it is exposed separately via Unwrap.
func (err MaxRetriesExceeded) Error() string {
	return fmt.Sprintf("'%s' unsuccessful after %d retries", err.Description, err.MaxRetries)
}

// Unwrap returns the error from the last attempt, allowing errors.Is and
// errors.As to match against the underlying cause of the retry failure.
func (err MaxRetriesExceeded) Unwrap() error {
	return err.LastError
}

// FatalError is a marker interface for errors that should not be retried.
type FatalError struct {
Underlying error
Expand All @@ -216,3 +222,7 @@ type FatalError struct {
// Error returns a formatted string that includes the underlying error, if any.
func (err FatalError) Error() string {
	return fmt.Sprintf("FatalError{Underlying: %v}", err.Underlying)
}

// Unwrap returns the wrapped underlying error so errors.Is and errors.As can
// see through the FatalError marker.
func (err FatalError) Unwrap() error {
	return err.Underlying
}
@@ -0,0 +1,21 @@
---
# Test fixture: a single-replica Deployment whose busybox container echoes for
# ~2 seconds and then exits. Presumably used to exercise waiting on a pod whose
# container terminates shortly after starting — TODO confirm against the test.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-app
spec:
  selector:
    matchLabels:
      app: test-app
  replicas: 1
  template:
    metadata:
      labels:
        app: test-app
    spec:
      containers:
        - name: app
          image: busybox
          command:
            ["/bin/sh", "-c", "for i in `seq 1 2`; do echo ${i} sleeping 1 sec; sleep 1; done; echo Finished!"]

---
# Test fixture: the container creates /tmp/healthy, waits ~3 seconds, then
# removes it and sleeps. The readiness probe cats /tmp/healthy, so the pod
# becomes ready and then flips back to not-ready — presumably the case the
# "consistently available" wait is meant to catch. TODO confirm against the test.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-app
spec:
  selector:
    matchLabels:
      app: test-app
  replicas: 1
  template:
    metadata:
      labels:
        app: test-app
    spec:
      containers:
        - name: app
          image: busybox
          command:
            ["/bin/sh", "-c", "touch /tmp/healthy;for i in `seq 1 3`; do echo ${i} sleeping 1 sec; sleep 1; done; rm /tmp/healthy; sleep 1000"]
          readinessProbe:
            exec:
              command:
                - cat
                - /tmp/healthy
            periodSeconds: 1
            initialDelaySeconds: 1

---
# Test fixture: the container only creates /tmp/healthy after ~10 seconds, and
# the startup probe cats that file every second. The pod therefore stays in a
# failing-startup-probe state for roughly 10 seconds before becoming healthy.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-app
spec:
  selector:
    matchLabels:
      app: test-app
  replicas: 1
  template:
    metadata:
      labels:
        app: test-app
    spec:
      containers:
        - name: app
          image: busybox
          command:
            ["/bin/sh", "-c", "for i in `seq 1 10`; do echo ${i} sleeping 1 sec; sleep 1; done; touch /tmp/healthy"]
          startupProbe:
            exec:
              command:
                - cat
                - /tmp/healthy
            periodSeconds: 1
            initialDelaySeconds: 1

---
# Test fixture: nginx Deployment requesting `cpu: 100` — that is 100 whole
# cores, not 100 millicores. NOTE(review): presumably intentional, to make the
# pod unschedulable (Pending) for the failure-logging test; if a real request
# was intended it should be "100m" — confirm against the test that uses it.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-app
spec:
  selector:
    matchLabels:
      app: test-app
  replicas: 1
  template:
    metadata:
      labels:
        app: test-app
    spec:
      containers:
        - name: nginx
          image: nginx:1.15.7
          ports:
            - containerPort: 80
          resources:
            requests:
              cpu: 100
---
# Test fixture: the image reference "not-an-place/not-an-app" does not exist,
# so the container stays in a waiting state (image pull failure) — presumably
# used to verify the enriched "pod not available" error output. TODO confirm.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-app
spec:
  selector:
    matchLabels:
      app: test-app
  replicas: 1
  template:
    metadata:
      labels:
        app: test-app
    spec:
      containers:
        - name: app
          image: not-an-place/not-an-app
          ports:
            - containerPort: 5000