Skip to content

Commit

Permalink
add logs when pod failed to start when using WaitUntilPodAvailableE
Browse files Browse the repository at this point in the history
Fix #1251

Signed-off-by: Charly Molter <charly.molter@konghq.com>
  • Loading branch information
lahabana committed Mar 10, 2023
1 parent 81b5820 commit c37568a
Show file tree
Hide file tree
Showing 9 changed files with 324 additions and 6 deletions.
32 changes: 31 additions & 1 deletion modules/k8s/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package k8s

import (
"fmt"
"strings"

batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -68,7 +69,36 @@ type PodNotAvailable struct {

// Error is a simple function to return a formatted error message as a string
func (err PodNotAvailable) Error() string {
return fmt.Sprintf("Pod %s is not available, reason: %s, message: %s", err.pod.Name, err.pod.Status.Reason, err.pod.Status.Message)
msg := fmt.Sprintf("Pod %s is not available, phase:%s", err.pod.Name, err.pod.Status.Phase)
if err.pod.Status.Reason != "" {
msg += fmt.Sprintf(", reason:%s", err.pod.Status.Reason)
}
if err.pod.Status.Message != "" {
msg += fmt.Sprintf(", message:'%s'", err.pod.Status.Message)
}
var res []string
for _, s := range err.pod.Status.Conditions {
if s.Status == "True" {
continue
}
res = append(res, fmt.Sprintf("'%s:%s->%s'", s.Type, s.Reason, s.Message))
}
if len(res) > 0 {
msg += fmt.Sprintf(", failingConditions:[%s]", strings.Join(res, ","))
}
res = []string{}
for _, s := range err.pod.Status.ContainerStatuses {
if s.State.Waiting != nil {
res = append(res, fmt.Sprintf("'waiting(%s):%s->%s'", s.Name, s.State.Waiting.Reason, s.State.Waiting.Message))
}
if s.State.Terminated != nil {
res = append(res, fmt.Sprintf("'terminated(%s):%s->%s'", s.Name, s.State.Terminated.Reason, s.State.Terminated.Message))
}
}
if len(res) > 0 {
msg += fmt.Sprintf(", nonRunningContainerState:[%s]", strings.Join(res, ","))
}
return msg
}

// NewPodNotAvailableError returnes a PodNotAvailable struct when Kubernetes deems a pod is not available
Expand Down
44 changes: 42 additions & 2 deletions modules/k8s/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package k8s

import (
"context"
"errors"
"fmt"
"time"

Expand Down Expand Up @@ -111,6 +112,18 @@ func WaitUntilPodAvailable(t testing.TestingT, options *KubectlOptions, podName
// WaitUntilPodAvailableE waits until all of the containers within the pod are ready and started, retrying the check for the specified amount of times, sleeping
// for the provided duration between each try.
func WaitUntilPodAvailableE(t testing.TestingT, options *KubectlOptions, podName string, retries int, sleepBetweenRetries time.Duration) error {
return WaitUntilPodConsistentlyAvailableE(t, options, podName, retries, sleepBetweenRetries, 0)
}

// WaitUntilPodConsistentlyAvailable similar to WaitUntilPodAvailable but one the pod is available we will retry `successes` occurrences and fail if ever the pod becomes unavailable.
// This is useful to make sure a pod doesn't become unhealthy shortly after starting
func WaitUntilPodConsistentlyAvailable(t testing.TestingT, options *KubectlOptions, podName string, retries int, sleepBetweenRetries time.Duration, successes int) {
require.NoError(t, WaitUntilPodConsistentlyAvailableE(t, options, podName, retries, sleepBetweenRetries, successes))
}

// WaitUntilPodConsistentlyAvailableE similar to WaitUntilPodAvailable but one the pod is available we will retry `successes` occurrences and fail if ever the pod becomes unavailable.
// This is useful to make sure a pod doesn't become unhealthy shortly after starting
func WaitUntilPodConsistentlyAvailableE(t testing.TestingT, options *KubectlOptions, podName string, retries int, sleepBetweenRetries time.Duration, successes int) error {
statusMsg := fmt.Sprintf("Wait for pod %s to be provisioned.", podName)
message, err := retry.DoWithRetryE(
t,
Expand All @@ -128,11 +141,38 @@ func WaitUntilPodAvailableE(t testing.TestingT, options *KubectlOptions, podName
return "Pod is now available", nil
},
)
if err == nil {
for i := successes; ; i-- {
var pod *corev1.Pod
pod, err = GetPodE(t, options, podName)
if err != nil {
break
}
if !IsPodAvailable(pod) {
err = NewPodNotAvailableError(pod)
break
}
if i == 0 {
break
}
logger.Logf(t, "Wait %s before checking if pod %s is still available", sleepBetweenRetries, podName)
time.Sleep(sleepBetweenRetries)
}
}
if err != nil {
logger.Logf(t, "Timedout waiting for Pod to be provisioned: %s", err)
logger.Log(t, "Failed waiting for pod to be provisioned", err)
notAvailableError := PodNotAvailable{}
if errors.As(err, &notAvailableError) {
if notAvailableError.pod.Status.Phase == corev1.PodRunning {
_, logsError := GetPodLogsE(t, options, notAvailableError.pod, "")
if logsError != nil {
logger.Log(t, "Failed to retrieve logs", err)
}
}
}
return err
}
logger.Logf(t, message)
logger.Log(t, message)
return nil
}

Expand Down
16 changes: 13 additions & 3 deletions modules/retry/retry.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
package retry

import (
"errors"
"fmt"
"regexp"
"time"
Expand Down Expand Up @@ -87,15 +88,15 @@ func DoWithRetryInterfaceE(t testing.TestingT, actionDescription string, maxRetr
var output interface{}
var err error

logger.Log(t, actionDescription)
for i := 0; i <= maxRetries; i++ {
logger.Log(t, actionDescription)

output, err = action()
if err == nil {
return output, nil
}

if _, isFatalErr := err.(FatalError); isFatalErr {
if errors.Is(err, FatalError{}) {
logger.Logf(t, "Returning due to fatal error: %v", err)
return output, err
}
Expand All @@ -104,7 +105,7 @@ func DoWithRetryInterfaceE(t testing.TestingT, actionDescription string, maxRetr
time.Sleep(sleepBetweenRetries)
}

return output, MaxRetriesExceeded{Description: actionDescription, MaxRetries: maxRetries}
return output, MaxRetriesExceeded{Description: actionDescription, MaxRetries: maxRetries, LastError: err}
}

// DoWithRetryableErrors runs the specified action. If it returns a value, return that value. If it returns an error,
Expand Down Expand Up @@ -202,12 +203,17 @@ func (err TimeoutExceeded) Error() string {
type MaxRetriesExceeded struct {
Description string
MaxRetries int
LastError error
}

func (err MaxRetriesExceeded) Error() string {
return fmt.Sprintf("'%s' unsuccessful after %d retries", err.Description, err.MaxRetries)
}

func (err MaxRetriesExceeded) Unwrap() error {
return err.LastError
}

// FatalError is a marker interface for errors that should not be retried.
type FatalError struct {
Underlying error
Expand All @@ -216,3 +222,7 @@ type FatalError struct {
func (err FatalError) Error() string {
return fmt.Sprintf("FatalError{Underlying: %v}", err.Underlying)
}

func (err FatalError) Unwrap() error {
return err.Underlying
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: test-app
spec:
selector:
matchLabels:
app: test-app
replicas: 1
template:
metadata:
labels:
app: test-app
spec:
containers:
- name: app
image: busybox
command:
["/bin/sh", "-c", "for i in `seq 1 2`; do echo ${i} sleeping 1 sec; sleep 1; done; echo Finished!"]

Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: test-app
spec:
selector:
matchLabels:
app: test-app
replicas: 1
template:
metadata:
labels:
app: test-app
spec:
containers:
- name: app
image: busybox
command:
["/bin/sh", "-c", "touch /tmp/healthy;for i in `seq 1 3`; do echo ${i} sleeping 1 sec; sleep 1; done; rm /tmp/healthy; sleep 1000"]
readinessProbe:
exec:
command:
- cat
- /tmp/healthy
periodSeconds: 1
initialDelaySeconds: 1

Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: test-app
spec:
selector:
matchLabels:
app: test-app
replicas: 1
template:
metadata:
labels:
app: test-app
spec:
containers:
- name: app
image: busybox
command:
["/bin/sh", "-c", "for i in `seq 1 10`; do echo ${i} sleeping 1 sec; sleep 1; done; touch /tmp/healthy"]
startupProbe:
exec:
command:
- cat
- /tmp/healthy
periodSeconds: 1
initialDelaySeconds: 1

Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: test-app
spec:
selector:
matchLabels:
app: test-app
replicas: 1
template:
metadata:
labels:
app: test-app
spec:
containers:
- name: nginx
image: nginx:1.15.7
ports:
- containerPort: 80
resources:
requests:
cpu: 100
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: test-app
spec:
selector:
matchLabels:
app: test-app
replicas: 1
template:
metadata:
labels:
app: test-app
spec:
containers:
- name: app
image: not-an-place/not-an-app
ports:
- containerPort: 5000

0 comments on commit c37568a

Please sign in to comment.