From 9f9409a4c1514001eac6bdfe47fd8106fd3cd14a Mon Sep 17 00:00:00 2001 From: Nikola Jokic Date: Mon, 10 Nov 2025 13:58:25 +0100 Subject: [PATCH] Handle resource quota on status forbidden by retrying (#4305) Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../ephemeralrunner_controller.go | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/controllers/actions.github.com/ephemeralrunner_controller.go b/controllers/actions.github.com/ephemeralrunner_controller.go index d33d381e..48185320 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller.go +++ b/controllers/actions.github.com/ephemeralrunner_controller.go @@ -22,6 +22,7 @@ import ( "fmt" "net/http" "strconv" + "strings" "time" "github.com/actions/actions-runner-controller/apis/actions.github.com/v1alpha1" @@ -282,7 +283,34 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ case kerrors.IsAlreadyExists(err): log.Info("Runner pod already exists. Waiting for the pod event to be received") return ctrl.Result{Requeue: true, RequeueAfter: 5 * time.Second}, nil - case kerrors.IsInvalid(err) || kerrors.IsForbidden(err): + case kerrors.IsInvalid(err): + log.Error(err, "Failed to create a pod due to unrecoverable failure") + errMessage := fmt.Sprintf("Failed to create the pod: %v", err) + if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonInvalidPodFailure, log); err != nil { + log.Error(err, "Failed to set ephemeral runner to phase Failed") + return ctrl.Result{}, err + } + return ctrl.Result{}, nil + case kerrors.IsForbidden(err): + if status, ok := err.(kerrors.APIStatus); ok || errors.As(err, &status) { + isResourceQuotaExceeded := strings.Contains(status.Status().Message, "exceeded quota:") + isAboutToExpire := ephemeralRunner.CreationTimestamp.Time.Add(10 * time.Minute).Before(time.Now()) + switch { + case isResourceQuotaExceeded && isAboutToExpire: + log.Error(err, "Failed to create a pod due to resource quota exceeded and the ephemeral runner is about to expire; re-creating the ephemeral runner") + if err := r.Delete(ctx, ephemeralRunner); err != nil { + log.Error(err, "Failed to delete the ephemeral runner") + return ctrl.Result{}, err + } + return ctrl.Result{}, nil + case isResourceQuotaExceeded: + log.Error(err, "Resource quota is exceeded; requeue in 30s to retry pod creation") + return ctrl.Result{RequeueAfter: 30 * time.Second}, nil + default: + // other forbidden errors + // fallthrough to the default handling below + } + } log.Error(err, "Failed to create a pod due to unrecoverable failure") errMessage := fmt.Sprintf("Failed to create the pod: %v", err) if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonInvalidPodFailure, log); err != nil {