From bfe78ccd5dfd4405f84e328a567ef2a5aa376b50 Mon Sep 17 00:00:00 2001
From: Nikola Jokic
Date: Fri, 19 Dec 2025 15:49:42 +0100
Subject: [PATCH] Make restart pod more flexible to different failure scenarios (#4340)
---
 .../ephemeralrunner_controller.go      |  52 ++--
 .../ephemeralrunner_controller_test.go | 232 ++++++++++++++++++
 2 files changed, 260 insertions(+), 24 deletions(-)

diff --git a/controllers/actions.github.com/ephemeralrunner_controller.go b/controllers/actions.github.com/ephemeralrunner_controller.go
index 40d25ad3..4d0d978f 100644
--- a/controllers/actions.github.com/ephemeralrunner_controller.go
+++ b/controllers/actions.github.com/ephemeralrunner_controller.go
@@ -313,37 +313,38 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
 	cs := runnerContainerStatus(pod)
 	switch {
 	case pod.Status.Phase == corev1.PodFailed: // All containers are stopped
-		switch {
-		case pod.Status.Reason == "Evicted":
-			log.Info("Pod evicted; Deleting ephemeral runner or pod",
-				"podPhase", pod.Status.Phase,
-				"podReason", pod.Status.Reason,
-				"podMessage", pod.Status.Message,
-			)
-
+		log.Info("Pod is in failed phase, inspecting runner container status",
+			"podReason", pod.Status.Reason,
+			"podMessage", pod.Status.Message,
+			"podConditions", pod.Status.Conditions,
+		)
+		// If the runner pod did not have a chance to start, the terminated state may not be set.
+		// Therefore, we should try to restart it.
+		if cs == nil || cs.State.Terminated == nil {
+			log.Info("Runner container does not have state set, deleting pod as failed so it can be restarted")
 			return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log)
+		}
 
-		case strings.HasPrefix(pod.Status.Reason, "OutOf"): // most likely a transient issue.
-			log.Info("Pod failed with reason starting with OutOf. Deleting ephemeral runner or pod",
-				"podPhase", pod.Status.Phase,
-				"podReason", pod.Status.Reason,
-				"podMessage", pod.Status.Message,
-			)
-			return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log)
-
-		default:
-			log.Info("Pod is in failed phase; updating ephemeral runner status",
-				"podPhase", pod.Status.Phase,
-				"podReason", pod.Status.Reason,
-				"podMessage", pod.Status.Message,
-			)
-			if err := r.updateRunStatusFromPod(ctx, ephemeralRunner, pod, log); err != nil {
-				log.Info("Failed to update ephemeral runner status. Requeue to not miss this event")
+		if cs.State.Terminated.ExitCode == 0 {
+			log.Info("Runner container has succeeded but pod is in failed phase; assuming successful exit")
+			// If the pod is in a failed state, at least one container exited with a non-zero exit code.
+			// If the runner container exits with 0, we assume that the runner has finished successfully.
+			// If a side-car container exits with a non-zero code, it shouldn't affect the runner. The runner's
+			// exit code drives the controller's inference of whether the job has succeeded or failed.
+ if err := r.Delete(ctx, ephemeralRunner); err != nil { + log.Error(err, "Failed to delete ephemeral runner after successful completion") return ctrl.Result{}, err } return ctrl.Result{}, nil } + log.Error( + errors.New("ephemeral runner container has failed, with runner container exit code non-zero"), + "Ephemeral runner container has failed, and runner container termination exit code is non-zero", + "containerTerminatedState", cs.State.Terminated, + ) + return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log) + case cs == nil: // starting, no container state yet log.Info("Waiting for runner container status to be available") @@ -397,6 +398,7 @@ func (r *EphemeralRunnerReconciler) deleteEphemeralRunnerOrPod(ctx context.Conte log.Info("Removed the runner from the service") return nil } + if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil { log.Error(err, "Failed to delete runner pod on failure") return err @@ -570,6 +572,8 @@ func (r *EphemeralRunnerReconciler) markAsFailed(ctx context.Context, ephemeralR // deletePodAsFailed is responsible for deleting the pod and updating the .Status.Failures for tracking failure count. // It should not be responsible for setting the status to Failed. +// +// It should be called by deleteEphemeralRunnerOrPod which is responsible for deciding whether to delete the EphemeralRunner or just the Pod. func (r *EphemeralRunnerReconciler) deletePodAsFailed(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, pod *corev1.Pod, log logr.Logger) error { if pod.DeletionTimestamp.IsZero() { log.Info("Deleting the ephemeral runner pod", "podId", pod.UID) diff --git a/controllers/actions.github.com/ephemeralrunner_controller_test.go b/controllers/actions.github.com/ephemeralrunner_controller_test.go index 129dcd17..5ee0df03 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller_test.go +++ b/controllers/actions.github.com/ephemeralrunner_controller_test.go @@ -261,6 +261,238 @@ var _ = Describe("EphemeralRunner", func() { ).Should(BeTrue(), "Ephemeral runner should eventually be deleted") }) + It("It should delete ephemeral runner when pod failed before runner state is recorded and job assigned", func() { + er := new(v1alpha1.EphemeralRunner) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner") + + er.Status.JobID = "1" + err := k8sClient.Status().Update(ctx, er) + Expect(err).To(BeNil(), "failed to update ephemeral runner status") + + Eventually(func() (string, error) { + current := new(v1alpha1.EphemeralRunner) + if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current); err != nil { + return "", err + } + return current.Status.JobID, nil + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo("1")) + + pod := new(corev1.Pod) + Eventually(func() (bool, error) { + if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil { + return false, err + } + return true, nil + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(true)) + + pod.Status.Phase = corev1.PodFailed + pod.Status.ContainerStatuses = nil + err = k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "Failed to update pod status") + + Eventually(func() bool 
{ + check := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check) + return kerrors.IsNotFound(err) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "Ephemeral runner should eventually be deleted") + }) + + It("It should delete ephemeral runner when pod failed before runner state is recorded and job not assigned", func() { + pod := new(corev1.Pod) + Eventually(func() (bool, error) { + if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil { + return false, err + } + return true, nil + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(true)) + + oldPodUID := pod.UID + + pod.Status.Phase = corev1.PodFailed + pod.Status.ContainerStatuses = nil + err := k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "Failed to update pod status") + + Eventually( + func() (int, error) { + updated := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get( + ctx, + client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, + updated, + ) + if err != nil { + return 0, err + } + return len(updated.Status.Failures), nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(BeEquivalentTo(1)) + + Eventually( + func() (bool, error) { + newPod := new(corev1.Pod) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, newPod) + if err != nil { + return false, err + } + return newPod.UID != oldPodUID, nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(BeTrue(), "Pod should be re-created") + }) + + It("It should treat pod failed with runner container exit 0 as success with job id", func() { + er := new(v1alpha1.EphemeralRunner) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er) + }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner") + + er.Status.JobID = "1" + err := k8sClient.Status().Update(ctx, er) + Expect(err).To(BeNil(), "failed to update ephemeral runner status") + + pod := new(corev1.Pod) + Eventually( + func() error { + if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil { + return err + } + return nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(Succeed(), "failed to get pod") + + pod.Status.Phase = corev1.PodFailed + pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{ + Name: v1alpha1.EphemeralRunnerContainerName, + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + ExitCode: 0, + }, + }, + }) + err = k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "Failed to update pod status") + + Eventually( + func() bool { + check := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check) + return kerrors.IsNotFound(err) + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(BeTrue(), "Ephemeral runner should eventually be deleted") + }) + + It("It should treat pod failed with runner container exit 0 as success with no job id", func() { + pod := new(corev1.Pod) + Eventually( + func() error { + if err := k8sClient.Get(ctx, client.ObjectKey{Name: 
ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil { + return err + } + return nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(Succeed(), "failed to get pod") + + pod.Status.Phase = corev1.PodFailed + pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{ + Name: v1alpha1.EphemeralRunnerContainerName, + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + ExitCode: 0, + }, + }, + }) + err := k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "Failed to update pod status") + + Eventually( + func() bool { + check := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check) + return kerrors.IsNotFound(err) + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(BeTrue(), "Ephemeral runner should eventually be deleted") + }) + + It("It should mark as failed when job is not assigned and pod is failed", func() { + er := new(v1alpha1.EphemeralRunner) + Eventually(func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er) + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(Succeed(), "failed to get ephemeral runner") + + pod := new(corev1.Pod) + Eventually( + func() error { + if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil { + return err + } + return nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(Succeed(), "failed to get pod") + + pod.Status.Phase = corev1.PodFailed + oldPodUID := pod.UID + pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{ + Name: v1alpha1.EphemeralRunnerContainerName, + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + ExitCode: 1, + }, + }, + }) + + err := k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "Failed to update pod status") + + Eventually( + func() (int, error) { + updated := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get( + ctx, + client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, + updated, + ) + if err != nil { + return 0, err + } + return len(updated.Status.Failures), nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(BeEquivalentTo(1)) + + Eventually( + func() (bool, error) { + newPod := new(corev1.Pod) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, newPod) + if err != nil { + return false, err + } + return newPod.UID != oldPodUID, nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(BeTrue(), "Pod should be re-created") + }) + It("It should failed if a pod template is invalid", func() { invalideEphemeralRunner := newExampleRunner("invalid-ephemeral-runner", autoscalingNS.Name, configSecret.Name) invalideEphemeralRunner.Spec.Spec.PriorityClassName = "notexist"
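
Note for reviewers: below is a minimal, self-contained sketch of the decision flow this patch introduces for pods in the Failed phase. The names outcome, classifyFailedPod, and containerStatusByName are illustrative only and are not part of the controller (the real code uses runnerContainerStatus, deleteEphemeralRunnerOrPod, and r.Delete as shown in the hunks above); the sketch assumes only the k8s.io/api/core/v1 types and a go.mod that includes k8s.io/api.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

// outcome is an illustrative label for what the controller does once a pod
// reaches the Failed phase; these names do not appear in the controller code.
type outcome string

const (
	restartPod     outcome = "restart pod"      // no terminated state recorded: delete the pod (or runner) and retry
	treatAsSuccess outcome = "treat as success" // runner exited 0: only a side-car failed, the job finished
	recordFailure  outcome = "record failure"   // runner exited non-zero: count a failure and retry
)

// containerStatusByName is a hypothetical stand-in for the package's
// runnerContainerStatus helper; the real helper presumably keys on
// v1alpha1.EphemeralRunnerContainerName.
func containerStatusByName(pod *corev1.Pod, name string) *corev1.ContainerStatus {
	for i := range pod.Status.ContainerStatuses {
		if pod.Status.ContainerStatuses[i].Name == name {
			return &pod.Status.ContainerStatuses[i]
		}
	}
	return nil
}

// classifyFailedPod mirrors the new PodFailed branch in Reconcile.
func classifyFailedPod(cs *corev1.ContainerStatus) outcome {
	if cs == nil || cs.State.Terminated == nil {
		return restartPod
	}
	if cs.State.Terminated.ExitCode == 0 {
		return treatAsSuccess
	}
	return recordFailure
}

func main() {
	terminated := func(code int32) corev1.ContainerState {
		return corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{ExitCode: code}}
	}

	evicted := &corev1.Pod{} // e.g. evicted before the runner container ever started
	sidecarOnly := &corev1.Pod{Status: corev1.PodStatus{ContainerStatuses: []corev1.ContainerStatus{{Name: "runner", State: terminated(0)}}}}
	runnerFailed := &corev1.Pod{Status: corev1.PodStatus{ContainerStatuses: []corev1.ContainerStatus{{Name: "runner", State: terminated(1)}}}}

	for _, pod := range []*corev1.Pod{evicted, sidecarOnly, runnerFailed} {
		fmt.Println(classifyFailedPod(containerStatusByName(pod, "runner")))
	}
	// Prints: "restart pod", "treat as success", "record failure" (one per line).
}

The previous special cases (reason "Evicted" and reasons starting with "OutOf") are covered by the first branch whenever the runner container never recorded a terminated state; otherwise the runner container's exit code, not the pod's reason, decides between success and failure, which is what the new tests above exercise.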