mirror of
https://github.com/actions/actions-runner-controller.git
synced 2025-12-20 06:56:51 +00:00
Re-schedule if the failed reason starts with OutOf (#4336)
This commit is contained in:
@@ -312,26 +312,45 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
|
|||||||
|
|
||||||
cs := runnerContainerStatus(pod)
|
cs := runnerContainerStatus(pod)
|
||||||
switch {
|
switch {
|
||||||
case cs == nil:
|
case pod.Status.Phase == corev1.PodFailed: // All containers are stopped
|
||||||
// starting, no container state yet
|
switch {
|
||||||
log.Info("Waiting for runner container status to be available")
|
case pod.Status.Reason == "Evicted":
|
||||||
return ctrl.Result{}, nil
|
log.Info("Pod evicted; Deleting ephemeral runner or pod",
|
||||||
case cs.State.Terminated == nil: // still running or evicted
|
"podPhase", pod.Status.Phase,
|
||||||
if pod.Status.Phase == corev1.PodFailed && pod.Status.Reason == "Evicted" {
|
"podReason", pod.Status.Reason,
|
||||||
log.Info("Pod set the termination phase, but container state is not terminated. Deleting pod",
|
"podMessage", pod.Status.Message,
|
||||||
"PodPhase", pod.Status.Phase,
|
|
||||||
"PodReason", pod.Status.Reason,
|
|
||||||
"PodMessage", pod.Status.Message,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil {
|
return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log)
|
||||||
log.Error(err, "failed to delete pod as failed on pod.Status.Phase: Failed")
|
|
||||||
|
case strings.HasPrefix(pod.Status.Reason, "OutOf"): // most likely a transient issue.
|
||||||
|
log.Info("Pod failed with reason starting with OutOf. Deleting ephemeral runner or pod",
|
||||||
|
"podPhase", pod.Status.Phase,
|
||||||
|
"podReason", pod.Status.Reason,
|
||||||
|
"podMessage", pod.Status.Message,
|
||||||
|
)
|
||||||
|
return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log)
|
||||||
|
|
||||||
|
default:
|
||||||
|
log.Info("Pod is in failed phase; updating ephemeral runner status",
|
||||||
|
"podPhase", pod.Status.Phase,
|
||||||
|
"podReason", pod.Status.Reason,
|
||||||
|
"podMessage", pod.Status.Message,
|
||||||
|
)
|
||||||
|
if err := r.updateRunStatusFromPod(ctx, ephemeralRunner, pod, log); err != nil {
|
||||||
|
log.Info("Failed to update ephemeral runner status. Requeue to not miss this event")
|
||||||
return ctrl.Result{}, err
|
return ctrl.Result{}, err
|
||||||
}
|
}
|
||||||
return ctrl.Result{}, nil
|
return ctrl.Result{}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Info("Ephemeral runner container is still running")
|
case cs == nil:
|
||||||
|
// starting, no container state yet
|
||||||
|
log.Info("Waiting for runner container status to be available")
|
||||||
|
return ctrl.Result{}, nil
|
||||||
|
|
||||||
|
case cs.State.Terminated == nil: // container is not terminated and pod phase is not failed, so runner is still running
|
||||||
|
log.Info("Runner container is still running; updating ephemeral runner status")
|
||||||
if err := r.updateRunStatusFromPod(ctx, ephemeralRunner, pod, log); err != nil {
|
if err := r.updateRunStatusFromPod(ctx, ephemeralRunner, pod, log); err != nil {
|
||||||
log.Info("Failed to update ephemeral runner status. Requeue to not miss this event")
|
log.Info("Failed to update ephemeral runner status. Requeue to not miss this event")
|
||||||
return ctrl.Result{}, err
|
return ctrl.Result{}, err
|
||||||
@@ -340,36 +359,7 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
|
|||||||
|
|
||||||
case cs.State.Terminated.ExitCode != 0: // failed
|
case cs.State.Terminated.ExitCode != 0: // failed
|
||||||
log.Info("Ephemeral runner container failed", "exitCode", cs.State.Terminated.ExitCode)
|
log.Info("Ephemeral runner container failed", "exitCode", cs.State.Terminated.ExitCode)
|
||||||
if ephemeralRunner.HasJob() {
|
return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log)
|
||||||
log.Error(
|
|
||||||
errors.New("ephemeral runner has a job assigned, but the pod has failed"),
|
|
||||||
"Ephemeral runner either has faulty entrypoint or something external killing the runner",
|
|
||||||
)
|
|
||||||
log.Info("Deleting the ephemeral runner that has a job assigned but the pod has failed")
|
|
||||||
if err := r.Delete(ctx, ephemeralRunner); err != nil {
|
|
||||||
log.Error(err, "Failed to delete the ephemeral runner that has a job assigned but the pod has failed")
|
|
||||||
return ctrl.Result{}, err
|
|
||||||
}
|
|
||||||
|
|
||||||
log.Info("Deleted the ephemeral runner that has a job assigned but the pod has failed")
|
|
||||||
log.Info("Trying to remove the runner from the service")
|
|
||||||
actionsClient, err := r.GetActionsService(ctx, ephemeralRunner)
|
|
||||||
if err != nil {
|
|
||||||
log.Error(err, "Failed to get actions client for removing the runner from the service")
|
|
||||||
return ctrl.Result{}, nil
|
|
||||||
}
|
|
||||||
if err := actionsClient.RemoveRunner(ctx, int64(ephemeralRunner.Status.RunnerId)); err != nil {
|
|
||||||
log.Error(err, "Failed to remove the runner from the service")
|
|
||||||
return ctrl.Result{}, nil
|
|
||||||
}
|
|
||||||
log.Info("Removed the runner from the service")
|
|
||||||
return ctrl.Result{}, nil
|
|
||||||
}
|
|
||||||
if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil {
|
|
||||||
log.Error(err, "Failed to delete runner pod on failure")
|
|
||||||
return ctrl.Result{}, err
|
|
||||||
}
|
|
||||||
return ctrl.Result{}, nil
|
|
||||||
|
|
||||||
default: // succeeded
|
default: // succeeded
|
||||||
log.Info("Ephemeral runner has finished successfully, deleting ephemeral runner", "exitCode", cs.State.Terminated.ExitCode)
|
log.Info("Ephemeral runner has finished successfully, deleting ephemeral runner", "exitCode", cs.State.Terminated.ExitCode)
|
||||||
@@ -381,6 +371,40 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (r *EphemeralRunnerReconciler) deleteEphemeralRunnerOrPod(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, pod *corev1.Pod, log logr.Logger) error {
|
||||||
|
if ephemeralRunner.HasJob() {
|
||||||
|
log.Error(
|
||||||
|
errors.New("ephemeral runner has a job assigned, but the pod has failed"),
|
||||||
|
"Ephemeral runner either has faulty entrypoint or something external killing the runner",
|
||||||
|
)
|
||||||
|
log.Info("Deleting the ephemeral runner that has a job assigned but the pod has failed")
|
||||||
|
if err := r.Delete(ctx, ephemeralRunner); err != nil {
|
||||||
|
log.Error(err, "Failed to delete the ephemeral runner that has a job assigned but the pod has failed")
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Info("Deleted the ephemeral runner that has a job assigned but the pod has failed")
|
||||||
|
log.Info("Trying to remove the runner from the service")
|
||||||
|
actionsClient, err := r.GetActionsService(ctx, ephemeralRunner)
|
||||||
|
if err != nil {
|
||||||
|
log.Error(err, "Failed to get actions client for removing the runner from the service")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if err := actionsClient.RemoveRunner(ctx, int64(ephemeralRunner.Status.RunnerId)); err != nil {
|
||||||
|
log.Error(err, "Failed to remove the runner from the service")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
log.Info("Removed the runner from the service")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil {
|
||||||
|
log.Error(err, "Failed to delete runner pod on failure")
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (r *EphemeralRunnerReconciler) cleanupRunnerFromService(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) (ok bool, err error) {
|
func (r *EphemeralRunnerReconciler) cleanupRunnerFromService(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) (ok bool, err error) {
|
||||||
if err := r.deleteRunnerFromService(ctx, ephemeralRunner, log); err != nil {
|
if err := r.deleteRunnerFromService(ctx, ephemeralRunner, log); err != nil {
|
||||||
actionsError := &actions.ActionsError{}
|
actionsError := &actions.ActionsError{}
|
||||||
|
|||||||
@@ -745,6 +745,52 @@ var _ = Describe("EphemeralRunner", func() {
|
|||||||
).Should(BeEquivalentTo(true))
|
).Should(BeEquivalentTo(true))
|
||||||
})
|
})
|
||||||
|
|
||||||
|
// Marks the runner pod as failed with a reason that starts with "OutOf",
// then expects one failure to be recorded on the ephemeral runner and a pod
// with the runner's name to be fetchable afterwards.
It("It should re-create pod on reason starting with OutOf", func() {
	// The pod and the ephemeral runner are looked up under the same name and namespace.
	runnerKey := func() client.ObjectKey {
		return client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}
	}

	// Wait until the runner pod can be fetched.
	pod := new(corev1.Pod)
	Eventually(
		func() (bool, error) {
			getErr := k8sClient.Get(ctx, runnerKey(), pod)
			return getErr == nil, getErr
		},
		ephemeralRunnerTimeout,
		ephemeralRunnerInterval,
	).Should(BeEquivalentTo(true))

	// Put the pod into the failed phase with an OutOf* reason; the runner
	// container status is appended with an empty (non-terminated) state.
	runnerContainer := corev1.ContainerStatus{
		Name:  v1alpha1.EphemeralRunnerContainerName,
		State: corev1.ContainerState{},
	}
	pod.Status.Phase = corev1.PodFailed
	pod.Status.Reason = "OutOfpods"
	pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, runnerContainer)
	Expect(k8sClient.Status().Update(ctx, pod)).To(BeNil(), "failed to patch pod status")

	// Exactly one failure should show up in the ephemeral runner status.
	updated := new(v1alpha1.EphemeralRunner)
	Eventually(func() (bool, error) {
		if getErr := k8sClient.Get(ctx, runnerKey(), updated); getErr != nil {
			return false, getErr
		}
		return len(updated.Status.Failures) == 1, nil
	}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(true))

	// should re-create after failure
	Eventually(
		func() (bool, error) {
			recreated := new(corev1.Pod)
			getErr := k8sClient.Get(ctx, runnerKey(), recreated)
			return getErr == nil, getErr
		},
		ephemeralRunnerTimeout,
		ephemeralRunnerInterval,
	).Should(BeEquivalentTo(true))
})
|
||||||
|
|
||||||
It("It should not set the phase to succeeded without pod termination status", func() {
|
It("It should not set the phase to succeeded without pod termination status", func() {
|
||||||
pod := new(corev1.Pod)
|
pod := new(corev1.Pod)
|
||||||
Eventually(
|
Eventually(
|
||||||
|
|||||||
Reference in New Issue
Block a user