Mirror of https://github.com/actions/actions-runner-controller.git, synced 2025-12-25 02:59:14 +08:00
Make restart pod more flexible to different failure scenarios (#4340)
@@ -313,37 +313,38 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
     cs := runnerContainerStatus(pod)
     switch {
     case pod.Status.Phase == corev1.PodFailed: // All containers are stopped
-        switch {
-        case pod.Status.Reason == "Evicted":
-            log.Info("Pod evicted; Deleting ephemeral runner or pod",
-                "podPhase", pod.Status.Phase,
-                "podReason", pod.Status.Reason,
-                "podMessage", pod.Status.Message,
-            )
-
+        log.Info("Pod is in failed phase, inspecting runner container status",
+            "podReason", pod.Status.Reason,
+            "podMessage", pod.Status.Message,
+            "podConditions", pod.Status.Conditions,
+        )
+        // If the runner pod did not have chance to start, terminated state may not be set.
+        // Therefore, we should try to restart it.
+        if cs == nil || cs.State.Terminated == nil {
+            log.Info("Runner container does not have state set, deleting pod as failed so it can be restarted")
+            return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log)
+        }
+
-        case strings.HasPrefix(pod.Status.Reason, "OutOf"): // most likely a transient issue.
-            log.Info("Pod failed with reason starting with OutOf. Deleting ephemeral runner or pod",
-                "podPhase", pod.Status.Phase,
-                "podReason", pod.Status.Reason,
-                "podMessage", pod.Status.Message,
-            )
-            return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log)
-
-        default:
-            log.Info("Pod is in failed phase; updating ephemeral runner status",
-                "podPhase", pod.Status.Phase,
-                "podReason", pod.Status.Reason,
-                "podMessage", pod.Status.Message,
-            )
-            if err := r.updateRunStatusFromPod(ctx, ephemeralRunner, pod, log); err != nil {
-                log.Info("Failed to update ephemeral runner status. Requeue to not miss this event")
+        if cs.State.Terminated.ExitCode == 0 {
+            log.Info("Runner container has succeeded but pod is in failed phase; Assume successful exit")
+            // If the pod is in a failed state, that means that at least one container exited with non-zero exit code.
+            // If the runner container exits with 0, we assume that the runner has finished successfully.
+            // If side-car container exits with non-zero, it shouldn't affect the runner. Runner exit code
+            // drives the controller's inference of whether the job has succeeded or failed.
+            if err := r.Delete(ctx, ephemeralRunner); err != nil {
+                log.Error(err, "Failed to delete ephemeral runner after successful completion")
+                return ctrl.Result{}, err
+            }
+            return ctrl.Result{}, nil
+        }
+
         log.Error(
             errors.New("ephemeral runner container has failed, with runner container exit code non-zero"),
             "Ephemeral runner container has failed, and runner container termination exit code is non-zero",
             "containerTerminatedState", cs.State.Terminated,
         )
         return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log)

     case cs == nil:
         // starting, no container state yet
         log.Info("Waiting for runner container status to be available")
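The branch logic above hinges on the runnerContainerStatus helper, which this diff does not touch. As a point of reference, here is a minimal sketch of the shape its call sites imply; this is an assumption, not the committed code (it presumably looks the runner container up by name and returns nil until the kubelet reports a status for it):

    // Assumed shape of the helper, inferred from its call sites in the hunk
    // above: find the runner container's status in the pod by name; nil means
    // the container has not reported any status yet.
    func runnerContainerStatus(pod *corev1.Pod) *corev1.ContainerStatus {
        for i := range pod.Status.ContainerStatuses {
            if pod.Status.ContainerStatuses[i].Name == v1alpha1.EphemeralRunnerContainerName {
                return &pod.Status.ContainerStatuses[i]
            }
        }
        return nil
    }

Under that reading, the new cs == nil || cs.State.Terminated == nil guard covers both a pod whose runner container never appeared and one whose container never reached a terminated state, rather than enumerating specific failure reasons such as Evicted or OutOf*.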
@@ -397,6 +398,7 @@ func (r *EphemeralRunnerReconciler) deleteEphemeralRunnerOrPod(ctx context.Conte
         log.Info("Removed the runner from the service")
         return nil
     }

     if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil {
         log.Error(err, "Failed to delete runner pod on failure")
         return err
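The fragment above is the tail of deleteEphemeralRunnerOrPod; most of its decision rule sits outside this hunk. A sketch inferred from the log lines here and the tests below; the JobID guard and the placement of the service-removal step are assumptions, not the committed implementation:

    // Inferred sketch only. Once a job has been assigned (Status.JobID is
    // set), restarting the pod cannot recover the job, so the EphemeralRunner
    // itself is deleted; otherwise only the pod is deleted so the controller
    // can recreate it and count the failure.
    func (r *EphemeralRunnerReconciler) deleteEphemeralRunnerOrPod(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, pod *corev1.Pod, log logr.Logger) error {
        if ephemeralRunner.Status.JobID != "" {
            if err := r.Delete(ctx, ephemeralRunner); err != nil {
                return err
            }
            log.Info("Removed the runner from the service")
            return nil
        }

        if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil {
            log.Error(err, "Failed to delete runner pod on failure")
            return err
        }
        return nil
    }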
@@ -570,6 +572,8 @@ func (r *EphemeralRunnerReconciler) markAsFailed(ctx context.Context, ephemeralR

// deletePodAsFailed is responsible for deleting the pod and updating the .Status.Failures for tracking failure count.
// It should not be responsible for setting the status to Failed.
//
// It should be called by deleteEphemeralRunnerOrPod which is responsible for deciding whether to delete the EphemeralRunner or just the Pod.
func (r *EphemeralRunnerReconciler) deletePodAsFailed(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, pod *corev1.Pod, log logr.Logger) error {
    if pod.DeletionTimestamp.IsZero() {
        log.Info("Deleting the ephemeral runner pod", "podId", pod.UID)
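The doc comment pins the contract down; a continuation sketch of what might follow the lines shown, under stated assumptions (Failures is a map keyed by pod UID and the status write is a plain Status().Update; the committed code may use a patch helper and a different field shape):

        // Continuation sketch under the assumptions stated above.
        if err := r.Delete(ctx, pod); err != nil && !kerrors.IsNotFound(err) {
            return fmt.Errorf("failed to delete pod as failed: %w", err)
        }
    }
    // Record the failure keyed by pod UID so restarts can be counted
    // (assumed field shape; the tests below assert len(Status.Failures) == 1).
    if ephemeralRunner.Status.Failures == nil {
        ephemeralRunner.Status.Failures = make(map[string]bool)
    }
    ephemeralRunner.Status.Failures[string(pod.UID)] = true
    return r.Status().Update(ctx, ephemeralRunner)
}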
@@ -261,6 +261,238 @@ var _ = Describe("EphemeralRunner", func() {
    ).Should(BeTrue(), "Ephemeral runner should eventually be deleted")
})

It("It should delete ephemeral runner when pod failed before runner state is recorded and job assigned", func() {
|
||||
er := new(v1alpha1.EphemeralRunner)
|
||||
Eventually(func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")
|
||||
|
||||
er.Status.JobID = "1"
|
||||
err := k8sClient.Status().Update(ctx, er)
|
||||
Expect(err).To(BeNil(), "failed to update ephemeral runner status")
|
||||
|
||||
Eventually(func() (string, error) {
|
||||
current := new(v1alpha1.EphemeralRunner)
|
||||
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, current); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return current.Status.JobID, nil
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo("1"))
|
||||
|
||||
pod := new(corev1.Pod)
|
||||
Eventually(func() (bool, error) {
|
||||
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
|
||||
return false, err
|
||||
}
|
||||
return true, nil
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(true))
|
||||
|
||||
pod.Status.Phase = corev1.PodFailed
|
||||
pod.Status.ContainerStatuses = nil
|
||||
err = k8sClient.Status().Update(ctx, pod)
|
||||
Expect(err).To(BeNil(), "Failed to update pod status")
|
||||
|
||||
Eventually(func() bool {
|
||||
check := new(v1alpha1.EphemeralRunner)
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
|
||||
return kerrors.IsNotFound(err)
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "Ephemeral runner should eventually be deleted")
|
||||
})
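Note what this test pins down: because Status.JobID was set before the pod failed, the controller tears the EphemeralRunner down entirely instead of retrying the pod; an already-assigned job cannot be recovered by a restart.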

It("It should delete ephemeral runner when pod failed before runner state is recorded and job not assigned", func() {
    pod := new(corev1.Pod)
    Eventually(func() (bool, error) {
        if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
            return false, err
        }
        return true, nil
    }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(true))

    oldPodUID := pod.UID

    pod.Status.Phase = corev1.PodFailed
    pod.Status.ContainerStatuses = nil
    err := k8sClient.Status().Update(ctx, pod)
    Expect(err).To(BeNil(), "Failed to update pod status")

    Eventually(
        func() (int, error) {
            updated := new(v1alpha1.EphemeralRunner)
            err := k8sClient.Get(
                ctx,
                client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace},
                updated,
            )
            if err != nil {
                return 0, err
            }
            return len(updated.Status.Failures), nil
        },
        ephemeralRunnerTimeout,
        ephemeralRunnerInterval,
    ).Should(BeEquivalentTo(1))

    Eventually(
        func() (bool, error) {
            newPod := new(corev1.Pod)
            err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, newPod)
            if err != nil {
                return false, err
            }
            return newPod.UID != oldPodUID, nil
        },
        ephemeralRunnerTimeout,
        ephemeralRunnerInterval,
    ).Should(BeTrue(), "Pod should be re-created")
})
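The contrast with the previous test: with no job assigned, the same failure is instead recorded (len(Status.Failures) reaches 1) and the pod is recreated under a new UID, i.e. a retry rather than a teardown.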

It("It should treat pod failed with runner container exit 0 as success with job id", func() {
    er := new(v1alpha1.EphemeralRunner)
    Eventually(func() error {
        return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
    }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")

    er.Status.JobID = "1"
    err := k8sClient.Status().Update(ctx, er)
    Expect(err).To(BeNil(), "failed to update ephemeral runner status")

    pod := new(corev1.Pod)
    Eventually(
        func() error {
            if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
                return err
            }
            return nil
        },
        ephemeralRunnerTimeout,
        ephemeralRunnerInterval,
    ).Should(Succeed(), "failed to get pod")

    pod.Status.Phase = corev1.PodFailed
    pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
        Name: v1alpha1.EphemeralRunnerContainerName,
        State: corev1.ContainerState{
            Terminated: &corev1.ContainerStateTerminated{
                ExitCode: 0,
            },
        },
    })
    err = k8sClient.Status().Update(ctx, pod)
    Expect(err).To(BeNil(), "Failed to update pod status")

    Eventually(
        func() bool {
            check := new(v1alpha1.EphemeralRunner)
            err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
            return kerrors.IsNotFound(err)
        },
        ephemeralRunnerTimeout,
        ephemeralRunnerInterval,
    ).Should(BeTrue(), "Ephemeral runner should eventually be deleted")
})

It("It should treat pod failed with runner container exit 0 as success with no job id", func() {
    pod := new(corev1.Pod)
    Eventually(
        func() error {
            if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
                return err
            }
            return nil
        },
        ephemeralRunnerTimeout,
        ephemeralRunnerInterval,
    ).Should(Succeed(), "failed to get pod")

    pod.Status.Phase = corev1.PodFailed
    pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
        Name: v1alpha1.EphemeralRunnerContainerName,
        State: corev1.ContainerState{
            Terminated: &corev1.ContainerStateTerminated{
                ExitCode: 0,
            },
        },
    })
    err := k8sClient.Status().Update(ctx, pod)
    Expect(err).To(BeNil(), "Failed to update pod status")

    Eventually(
        func() bool {
            check := new(v1alpha1.EphemeralRunner)
            err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, check)
            return kerrors.IsNotFound(err)
        },
        ephemeralRunnerTimeout,
        ephemeralRunnerInterval,
    ).Should(BeTrue(), "Ephemeral runner should eventually be deleted")
})

It("It should mark as failed when job is not assigned and pod is failed", func() {
    er := new(v1alpha1.EphemeralRunner)
    Eventually(func() error {
        return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
    },
        ephemeralRunnerTimeout,
        ephemeralRunnerInterval,
    ).Should(Succeed(), "failed to get ephemeral runner")

    pod := new(corev1.Pod)
    Eventually(
        func() error {
            if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
                return err
            }
            return nil
        },
        ephemeralRunnerTimeout,
        ephemeralRunnerInterval,
    ).Should(Succeed(), "failed to get pod")

    pod.Status.Phase = corev1.PodFailed
    oldPodUID := pod.UID
    pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
        Name: v1alpha1.EphemeralRunnerContainerName,
        State: corev1.ContainerState{
            Terminated: &corev1.ContainerStateTerminated{
                ExitCode: 1,
            },
        },
    })

    err := k8sClient.Status().Update(ctx, pod)
    Expect(err).To(BeNil(), "Failed to update pod status")

    Eventually(
        func() (int, error) {
            updated := new(v1alpha1.EphemeralRunner)
            err := k8sClient.Get(
                ctx,
                client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace},
                updated,
            )
            if err != nil {
                return 0, err
            }
            return len(updated.Status.Failures), nil
        },
        ephemeralRunnerTimeout,
        ephemeralRunnerInterval,
    ).Should(BeEquivalentTo(1))

    Eventually(
        func() (bool, error) {
            newPod := new(corev1.Pod)
            err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, newPod)
            if err != nil {
                return false, err
            }
            return newPod.UID != oldPodUID, nil
        },
        ephemeralRunnerTimeout,
        ephemeralRunnerInterval,
    ).Should(BeTrue(), "Pod should be re-created")
})

It("It should fail if a pod template is invalid", func() {
    invalidEphemeralRunner := newExampleRunner("invalid-ephemeral-runner", autoscalingNS.Name, configSecret.Name)
    invalidEphemeralRunner.Spec.Spec.PriorityClassName = "notexist"