Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add logs when pod failed to start when using WaitUntilPodAvailableE #1252

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
32 changes: 31 additions & 1 deletion modules/k8s/errors.go
Expand Up @@ -2,6 +2,7 @@ package k8s

import (
"fmt"
"strings"

batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -68,7 +69,36 @@ type PodNotAvailable struct {

// Error is a simple function to return a formatted error message as a string
func (err PodNotAvailable) Error() string {
return fmt.Sprintf("Pod %s is not available, reason: %s, message: %s", err.pod.Name, err.pod.Status.Reason, err.pod.Status.Message)
msg := fmt.Sprintf("Pod %s is not available, phase:%s", err.pod.Name, err.pod.Status.Phase)
if err.pod.Status.Reason != "" {
msg += fmt.Sprintf(", reason:%s", err.pod.Status.Reason)
}
if err.pod.Status.Message != "" {
msg += fmt.Sprintf(", message:'%s'", err.pod.Status.Message)
}
var res []string
for _, s := range err.pod.Status.Conditions {
if s.Status == "True" {
continue
}
res = append(res, fmt.Sprintf("'%s:%s->%s'", s.Type, s.Reason, s.Message))
}
if len(res) > 0 {
msg += fmt.Sprintf(", failingConditions:[%s]", strings.Join(res, ","))
}
res = []string{}
for _, s := range err.pod.Status.ContainerStatuses {
if s.State.Waiting != nil {
res = append(res, fmt.Sprintf("'waiting(%s):%s->%s'", s.Name, s.State.Waiting.Reason, s.State.Waiting.Message))
}
if s.State.Terminated != nil {
res = append(res, fmt.Sprintf("'terminated(%s):%s->%s'", s.Name, s.State.Terminated.Reason, s.State.Terminated.Message))
}
}
if len(res) > 0 {
msg += fmt.Sprintf(", nonRunningContainerState:[%s]", strings.Join(res, ","))
}
return msg
}

// NewPodNotAvailableError returns a PodNotAvailable struct when Kubernetes deems a pod is not available
Expand Down
44 changes: 42 additions & 2 deletions modules/k8s/pod.go
Expand Up @@ -2,6 +2,7 @@ package k8s

import (
"context"
"errors"
"fmt"
"time"

Expand Down Expand Up @@ -111,6 +112,18 @@ func WaitUntilPodAvailable(t testing.TestingT, options *KubectlOptions, podName
// WaitUntilPodAvailableE waits until all of the containers within the pod are ready and started, retrying the check for the specified amount of times, sleeping
// for the provided duration between each try.
// It is equivalent to WaitUntilPodConsistentlyAvailableE with successes=0: the wait ends on the first successful check.
func WaitUntilPodAvailableE(t testing.TestingT, options *KubectlOptions, podName string, retries int, sleepBetweenRetries time.Duration) error {
	return WaitUntilPodConsistentlyAvailableE(t, options, podName, retries, sleepBetweenRetries, 0)
}

// WaitUntilPodConsistentlyAvailable is similar to WaitUntilPodAvailable, but once the pod is available it re-checks availability `successes`
// more times (sleeping between checks) and fails the test if the pod ever becomes unavailable.
// This is useful to make sure a pod doesn't become unhealthy shortly after starting.
func WaitUntilPodConsistentlyAvailable(t testing.TestingT, options *KubectlOptions, podName string, retries int, sleepBetweenRetries time.Duration, successes int) {
	require.NoError(t, WaitUntilPodConsistentlyAvailableE(t, options, podName, retries, sleepBetweenRetries, successes))
}

// WaitUntilPodConsistentlyAvailableE similar to WaitUntilPodAvailable but one the pod is available we will retry `successes` occurrences and fail if ever the pod becomes unavailable.
// This is useful to make sure a pod doesn't become unhealthy shortly after starting
func WaitUntilPodConsistentlyAvailableE(t testing.TestingT, options *KubectlOptions, podName string, retries int, sleepBetweenRetries time.Duration, successes int) error {
statusMsg := fmt.Sprintf("Wait for pod %s to be provisioned.", podName)
message, err := retry.DoWithRetryE(
t,
Expand All @@ -128,11 +141,38 @@ func WaitUntilPodAvailableE(t testing.TestingT, options *KubectlOptions, podName
return "Pod is now available", nil
},
)
if err == nil {
for i := successes; ; i-- {
var pod *corev1.Pod
pod, err = GetPodE(t, options, podName)
if err != nil {
break
}
if !IsPodAvailable(pod) {
err = NewPodNotAvailableError(pod)
break
}
if i == 0 {
break
}
logger.Logf(t, "Wait %s before checking if pod %s is still available", sleepBetweenRetries, podName)
time.Sleep(sleepBetweenRetries)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This logic looks to be for a quite specific case: pod becomes ready and after a while changes status to not ready - not sure if should be included in PR for adding additional logs when pod fail to start

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Up to you I can either do this as a separate issue or update PR title. The reason why this was added here is that in the case of a failing startup the pod may become ready and very quickly switch to not ready.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should go as a separate feature

}
}
if err != nil {
logger.Logf(t, "Timedout waiting for Pod to be provisioned: %s", err)
logger.Log(t, "Failed waiting for pod to be provisioned", err)
notAvailableError := PodNotAvailable{}
if errors.As(err, &notAvailableError) {
if notAvailableError.pod.Status.Phase == corev1.PodRunning {
_, logsError := GetPodLogsE(t, options, notAvailableError.pod, "")
if logsError != nil {
logger.Log(t, "Failed to retrieve logs", err)
}
}
}
return err
}
logger.Logf(t, message)
logger.Log(t, message)
return nil
}

Expand Down
16 changes: 13 additions & 3 deletions modules/retry/retry.go
Expand Up @@ -2,6 +2,7 @@
package retry

import (
"errors"
"fmt"
"regexp"
"time"
Expand Down Expand Up @@ -87,15 +88,15 @@ func DoWithRetryInterfaceE(t testing.TestingT, actionDescription string, maxRetr
var output interface{}
var err error

logger.Log(t, actionDescription)
for i := 0; i <= maxRetries; i++ {
logger.Log(t, actionDescription)

output, err = action()
if err == nil {
return output, nil
}

if _, isFatalErr := err.(FatalError); isFatalErr {
if errors.Is(err, FatalError{}) {
logger.Logf(t, "Returning due to fatal error: %v", err)
return output, err
}
Expand All @@ -104,7 +105,7 @@ func DoWithRetryInterfaceE(t testing.TestingT, actionDescription string, maxRetr
time.Sleep(sleepBetweenRetries)
}

return output, MaxRetriesExceeded{Description: actionDescription, MaxRetries: maxRetries}
return output, MaxRetriesExceeded{Description: actionDescription, MaxRetries: maxRetries, LastError: err}
}

// DoWithRetryableErrors runs the specified action. If it returns a value, return that value. If it returns an error,
Expand Down Expand Up @@ -202,12 +203,17 @@ func (err TimeoutExceeded) Error() string {
// MaxRetriesExceeded is an error that occurs when the maximum amount of retries is exceeded.
// LastError records the error returned by the final attempt so callers can reach the
// underlying cause through Unwrap (errors.Is / errors.As).
type MaxRetriesExceeded struct {
	Description string
	MaxRetries  int
	LastError   error
}

// Error returns a formatted message describing which action failed and how many
// retries were attempted. The underlying LastError is not embedded in this
// string; it is exposed separately via Unwrap.
func (err MaxRetriesExceeded) Error() string {
	return fmt.Sprintf("'%s' unsuccessful after %d retries", err.Description, err.MaxRetries)
}

// Unwrap returns the error from the last attempt, allowing errors.Is and
// errors.As to match against the underlying cause of the retry failure.
func (err MaxRetriesExceeded) Unwrap() error {
	return err.LastError
}

// FatalError is a marker interface for errors that should not be retried.
type FatalError struct {
Underlying error
Expand All @@ -216,3 +222,7 @@ type FatalError struct {
// Error returns a formatted string that includes the underlying error, if any.
func (err FatalError) Error() string {
	return fmt.Sprintf("FatalError{Underlying: %v}", err.Underlying)
}

// Unwrap returns the wrapped underlying error so errors.Is and errors.As can
// see through the FatalError marker.
func (err FatalError) Unwrap() error {
	return err.Underlying
}
@@ -0,0 +1,21 @@
---
# Test fixture: a single-replica Deployment whose busybox container echoes for
# ~2 seconds and then exits. Presumably used to exercise waiting on a pod whose
# container terminates shortly after starting — TODO confirm against the test.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-app
spec:
  selector:
    matchLabels:
      app: test-app
  replicas: 1
  template:
    metadata:
      labels:
        app: test-app
    spec:
      containers:
        - name: app
          image: busybox
          command:
            ["/bin/sh", "-c", "for i in `seq 1 2`; do echo ${i} sleeping 1 sec; sleep 1; done; echo Finished!"]

---
# Test fixture: the container creates /tmp/healthy, waits ~3 seconds, then
# removes it and sleeps. The readiness probe cats /tmp/healthy, so the pod
# becomes ready and then flips back to not-ready — presumably the case the
# "consistently available" wait is meant to catch. TODO confirm against the test.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-app
spec:
  selector:
    matchLabels:
      app: test-app
  replicas: 1
  template:
    metadata:
      labels:
        app: test-app
    spec:
      containers:
        - name: app
          image: busybox
          command:
            ["/bin/sh", "-c", "touch /tmp/healthy;for i in `seq 1 3`; do echo ${i} sleeping 1 sec; sleep 1; done; rm /tmp/healthy; sleep 1000"]
          readinessProbe:
            exec:
              command:
                - cat
                - /tmp/healthy
            periodSeconds: 1
            initialDelaySeconds: 1

---
# Test fixture: the container only creates /tmp/healthy after ~10 seconds, and
# the startup probe cats that file every second. The pod therefore stays in a
# failing-startup-probe state for roughly 10 seconds before becoming healthy.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-app
spec:
  selector:
    matchLabels:
      app: test-app
  replicas: 1
  template:
    metadata:
      labels:
        app: test-app
    spec:
      containers:
        - name: app
          image: busybox
          command:
            ["/bin/sh", "-c", "for i in `seq 1 10`; do echo ${i} sleeping 1 sec; sleep 1; done; touch /tmp/healthy"]
          startupProbe:
            exec:
              command:
                - cat
                - /tmp/healthy
            periodSeconds: 1
            initialDelaySeconds: 1

---
# Test fixture: nginx Deployment requesting `cpu: 100` — that is 100 whole
# cores, not 100 millicores. NOTE(review): presumably intentional, to make the
# pod unschedulable (Pending) for the failure-logging test; if a real request
# was intended it should be "100m" — confirm against the test that uses it.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-app
spec:
  selector:
    matchLabels:
      app: test-app
  replicas: 1
  template:
    metadata:
      labels:
        app: test-app
    spec:
      containers:
        - name: nginx
          image: nginx:1.15.7
          ports:
            - containerPort: 80
          resources:
            requests:
              cpu: 100
---
# Test fixture: the image reference "not-an-place/not-an-app" does not exist,
# so the container stays in a waiting state (image pull failure) — presumably
# used to verify the enriched "pod not available" error output. TODO confirm.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-app
spec:
  selector:
    matchLabels:
      app: test-app
  replicas: 1
  template:
    metadata:
      labels:
        app: test-app
    spec:
      containers:
        - name: app
          image: not-an-place/not-an-app
          ports:
            - containerPort: 5000