Create backoff mechanism for failed runners and allow re-creation of failed ephemeral runners (#4059)

This commit is contained in:
Nikola Jokic
2025-05-14 15:38:50 +02:00
committed by GitHub
parent d6e2790db5
commit cae7efa2c6
8 changed files with 167 additions and 81 deletions

View File

@@ -28,6 +28,7 @@ import (
"github.com/go-logr/logr"
corev1 "k8s.io/api/core/v1"
kerrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
@@ -50,6 +51,19 @@ type EphemeralRunnerReconciler struct {
ResourceBuilder
}
// precompute backoff durations for failed ephemeral runners
// the len(failedRunnerBackoff) must be equal to maxFailures + 1
var failedRunnerBackoff = []time.Duration{
0,
5 * time.Second,
10 * time.Second,
20 * time.Second,
40 * time.Second,
80 * time.Second,
}
const maxFailures = 5
// +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners/finalizers,verbs=get;list;watch;create;update;patch;delete
@@ -173,6 +187,29 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
}
}
if len(ephemeralRunner.Status.Failures) > maxFailures {
log.Info(fmt.Sprintf("EphemeralRunner has failed more than %d times. Deleting ephemeral runner so it can be re-created", maxFailures))
if err := r.Delete(ctx, ephemeralRunner); err != nil {
log.Error(fmt.Errorf("failed to delete ephemeral runner after %d failures: %w", maxFailures, err), "Failed to delete ephemeral runner")
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}
now := metav1.Now()
lastFailure := ephemeralRunner.Status.LastFailure()
backoffDuration := failedRunnerBackoff[len(ephemeralRunner.Status.Failures)]
nextReconciliation := lastFailure.Add(backoffDuration)
if !lastFailure.IsZero() && now.Before(&metav1.Time{Time: nextReconciliation}) {
log.Info("Backing off the next reconciliation due to failure",
"lastFailure", lastFailure,
"nextReconciliation", nextReconciliation,
"requeueAfter", nextReconciliation.Sub(now.Time),
)
return ctrl.Result{RequeueAfter: now.Sub(nextReconciliation)}, nil
}
secret := new(corev1.Secret)
if err := r.Get(ctx, req.NamespacedName, secret); err != nil {
if !kerrors.IsNotFound(err) {
@@ -196,39 +233,28 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
pod := new(corev1.Pod)
if err := r.Get(ctx, req.NamespacedName, pod); err != nil {
switch {
case !kerrors.IsNotFound(err):
if !kerrors.IsNotFound(err) {
log.Error(err, "Failed to fetch the pod")
return ctrl.Result{}, err
}
case len(ephemeralRunner.Status.Failures) > 5:
log.Info("EphemeralRunner has failed more than 5 times. Marking it as failed")
errMessage := fmt.Sprintf("Pod has failed to start more than 5 times: %s", pod.Status.Message)
if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonTooManyPodFailures, log); err != nil {
// Pod was not found. Create if the pod has never been created
log.Info("Creating new EphemeralRunner pod.")
result, err := r.createPod(ctx, ephemeralRunner, secret, log)
switch {
case err == nil:
return result, nil
case kerrors.IsInvalid(err) || kerrors.IsForbidden(err):
log.Error(err, "Failed to create a pod due to unrecoverable failure")
errMessage := fmt.Sprintf("Failed to create the pod: %v", err)
if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonInvalidPodFailure, log); err != nil {
log.Error(err, "Failed to set ephemeral runner to phase Failed")
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
default:
// Pod was not found. Create if the pod has never been created
log.Info("Creating new EphemeralRunner pod.")
result, err := r.createPod(ctx, ephemeralRunner, secret, log)
switch {
case err == nil:
return result, nil
case kerrors.IsInvalid(err) || kerrors.IsForbidden(err):
log.Error(err, "Failed to create a pod due to unrecoverable failure")
errMessage := fmt.Sprintf("Failed to create the pod: %v", err)
if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonInvalidPodFailure, log); err != nil {
log.Error(err, "Failed to set ephemeral runner to phase Failed")
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
default:
log.Error(err, "Failed to create the pod")
return ctrl.Result{}, err
}
log.Error(err, "Failed to create the pod")
return ctrl.Result{}, err
}
}
@@ -484,9 +510,9 @@ func (r *EphemeralRunnerReconciler) deletePodAsFailed(ctx context.Context, ephem
log.Info("Updating ephemeral runner status to track the failure count")
if err := patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) {
if obj.Status.Failures == nil {
obj.Status.Failures = make(map[string]bool)
obj.Status.Failures = make(map[string]metav1.Time)
}
obj.Status.Failures[string(pod.UID)] = true
obj.Status.Failures[string(pod.UID)] = metav1.Now()
obj.Status.Ready = false
obj.Status.Reason = pod.Status.Reason
obj.Status.Message = pod.Status.Message