From 80e12246abbbf58117153cc9f136aecacd4735d5 Mon Sep 17 00:00:00 2001
From: Yuan Tang
Date: Wed, 16 Nov 2022 20:10:41 -0500
Subject: [PATCH] fix(operator): Workflow stuck at running when init container failed. Fixes #10045

Signed-off-by: Yuan Tang
---
 workflow/controller/operator.go      |  8 ++++++++
 workflow/controller/operator_test.go | 26 ++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/workflow/controller/operator.go b/workflow/controller/operator.go
index a17b2768ea55..17907e824ad6 100644
--- a/workflow/controller/operator.go
+++ b/workflow/controller/operator.go
@@ -1337,6 +1337,14 @@ func (woc *wfOperationCtx) assessNodeStatus(pod *apiv1.Pod, old *wfv1.NodeStatus
 			new.Phase = old.Phase
 		}
 	}
+	// If the init container failed, we should mark the node as failed.
+	for _, c := range pod.Status.InitContainerStatuses {
+		if c.State.Terminated != nil && int(c.State.Terminated.ExitCode) != 0 {
+			new.Phase = wfv1.NodeFailed
+			woc.log.WithField("new.phase", new.Phase).Info("marking node as failed since init container has non-zero exit code")
+			break
+		}
+	}
 
 	// if we are transitioning from Pending to a different state, clear out unchanged message
 	if old.Phase == wfv1.NodePending && new.Phase != wfv1.NodePending && old.Message == new.Message {
diff --git a/workflow/controller/operator_test.go b/workflow/controller/operator_test.go
index 92e0a0525e80..a0cc3d7ad983 100644
--- a/workflow/controller/operator_test.go
+++ b/workflow/controller/operator_test.go
@@ -1331,6 +1331,32 @@ func TestAssessNodeStatus(t *testing.T) {
 		},
 		node: &wfv1.NodeStatus{TemplateName: templateName},
 		want: wfv1.NodeFailed,
+	}, {
+		name: "pod failed - init container failed",
+		pod: &apiv1.Pod{
+			Status: apiv1.PodStatus{
+				InitContainerStatuses: []apiv1.ContainerStatus{
+					{
+						Name:  common.InitContainerName,
+						State: apiv1.ContainerState{Terminated: &apiv1.ContainerStateTerminated{ExitCode: 1}},
+					},
+				},
+				ContainerStatuses: []apiv1.ContainerStatus{
+					{
+						Name:  common.WaitContainerName,
+						State: apiv1.ContainerState{Terminated: nil},
+					},
+					{
+						Name:  common.MainContainerName,
+						State: apiv1.ContainerState{Terminated: &apiv1.ContainerStateTerminated{ExitCode: 0}},
+					},
+				},
+				Message: "failed since init container failed",
+				Phase:   apiv1.PodFailed,
+			},
+		},
+		node: &wfv1.NodeStatus{TemplateName: templateName},
+		want: wfv1.NodeFailed,
 	}, {
 		name: "pod running",
 		pod: &apiv1.Pod{