refactor: Make RunnerReplicaSet and Runner backed by the same logic that backs RunnerSet

Yusuke Kuoka
2022-03-05 12:13:22 +00:00
parent c95e84a528
commit 14a878bfae
12 changed files with 278 additions and 903 deletions
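The gist of the refactor: rather than Runner and RunnerReplicaSet each carrying their own pod-lifecycle code, both are rebased onto the runner-pod management logic that RunnerSet already uses. As a minimal sketch of that shape (the helper name ensureRunnerPod is illustrative, not from this commit):

package controllers

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// ensureRunnerPod sketches "one shared code path": create the runner pod if it
// is missing, otherwise hand over to the shared pod logic (registration checks,
// unregistration, restarts), regardless of which CRD owns the pod.
func ensureRunnerPod(ctx context.Context, c client.Client, desired *corev1.Pod) (ctrl.Result, error) {
	var current corev1.Pod
	if err := c.Get(ctx, client.ObjectKeyFromObject(desired), &current); err != nil {
		if client.IgnoreNotFound(err) != nil {
			return ctrl.Result{}, err
		}
		// The pod does not exist yet; create it and requeue to observe its status.
		if err := c.Create(ctx, desired); err != nil {
			return ctrl.Result{}, err
		}
		return ctrl.Result{Requeue: true}, nil
	}
	// The pod exists; the shared runner-pod logic takes over from here.
	return ctrl.Result{}, nil
}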


@@ -18,15 +18,12 @@ package controllers
import (
"context"
"errors"
"fmt"
"strings"
"time"
"github.com/actions-runner-controller/actions-runner-controller/hash"
"github.com/go-logr/logr"
gogithub "github.com/google/go-github/v39/github"
"k8s.io/apimachinery/pkg/util/wait"
kerrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
@@ -92,12 +89,6 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
return ctrl.Result{}, client.IgnoreNotFound(err)
}
err := runner.Validate()
if err != nil {
log.Info("Failed to validate runner spec", "error", err.Error())
return ctrl.Result{}, nil
}
if runner.ObjectMeta.DeletionTimestamp.IsZero() {
finalizers, added := addFinalizer(runner.ObjectMeta.Finalizers, finalizerName)
@@ -125,34 +116,6 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
return r.processRunnerDeletion(runner, ctx, log, &pod)
}
registrationOnly := metav1.HasAnnotation(runner.ObjectMeta, annotationKeyRegistrationOnly)
if registrationOnly && runner.Status.Phase != "" {
// At this point we are sure that the registration-only runner has successfully configured and
// is of `offline` status, because we set runner.Status.Phase to that of the runner pod only after
// successful registration.
var pod corev1.Pod
if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
if !kerrors.IsNotFound(err) {
log.Info(fmt.Sprintf("Retrying soon as we failed to get registration-only runner pod: %v", err))
return ctrl.Result{Requeue: true}, nil
}
} else if err := r.Delete(ctx, &pod); err != nil {
if !kerrors.IsNotFound(err) {
log.Info(fmt.Sprintf("Retrying soon as we failed to delete registration-only runner pod: %v", err))
return ctrl.Result{Requeue: true}, nil
}
}
log.Info("Successfully deleted registration-only runner pod to free node and cluster resource")
// Return here to not recreate the deleted pod, because recreating it is a waste of cluster and node resources,
// and also defeats the original purpose of scale-from/to-zero we're trying to implement by using the registration-only runner.
return ctrl.Result{}, nil
}
var pod corev1.Pod
if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
if !kerrors.IsNotFound(err) {
@@ -162,281 +125,31 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
return r.processRunnerCreation(ctx, runner, log)
}
// Pod already exists
if !pod.ObjectMeta.DeletionTimestamp.IsZero() {
return r.processRunnerPodDeletion(ctx, runner, log, pod)
}
phase := string(pod.Status.Phase)
if phase == "" {
phase = "Created"
}
// If the pod has ended up Succeeded, we need to restart it.
// This happens e.g. when dind runs inside the runner and the job run completes.
stopped := runnerPodOrContainerIsStopped(&pod)
ephemeral := runner.Spec.Ephemeral == nil || *runner.Spec.Ephemeral
if stopped && ephemeral {
log.V(1).Info("Ephemeral runner has been stopped successfully. Marking this runner for deletion.")
// This is the key to making ephemeral runners work reliably with webhook-based autoscaling.
// See https://github.com/actions-runner-controller/actions-runner-controller/issues/911#issuecomment-1046161384 for more context.
//
// In the next reconciliation loop, this triggers a runner unregistration.
// (Note that the unregistration can fail safely because an ephemeral runner usually unregisters itself from GitHub but we do it just for confirmation)
//
// See the code path above that is executed when `runner.ObjectMeta.DeletionTimestamp.IsZero()` isn't true,
// which handles the unregistration, the removal of the completed pod, and so on.
if err := r.Delete(ctx, &runner); err != nil {
log.V(1).Error(err, "Retrying to mark this runner for deletion in 10 seconds.")
return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
}
return ctrl.Result{}, nil
}
if runner.Status.Phase != phase {
if pod.Status.Phase == corev1.PodRunning {
// Seeing this message, you can expect the runner to become `Running` soon.
log.V(1).Info(
"Runner appears to have been registered and running.",
"podCreationTimestamp", pod.CreationTimestamp,
)
}
return ctrl.Result{Requeue: true}, nil
}
updated := runner.DeepCopy()
updated.Status.Phase = phase
updated.Status.Reason = pod.Status.Reason
updated.Status.Message = pod.Status.Message
restart := stopped
if registrationOnly && stopped {
restart = false
log.Info(
"Observed that registration-only runner for scaling-from-zero has successfully stopped. " +
"Unlike other pods, this one will be recreated only when runner spec changes.",
)
}
if updated, err := r.updateRegistrationToken(ctx, runner); err != nil {
return ctrl.Result{}, err
} else if updated {
return ctrl.Result{Requeue: true}, nil
}
newPod, err := r.newPod(runner)
if err != nil {
log.Error(err, "Could not create pod")
return ctrl.Result{}, err
}
if registrationOnly {
newPod.Spec.Containers[0].Env = append(
newPod.Spec.Containers[0].Env,
corev1.EnvVar{
Name: "RUNNER_REGISTRATION_ONLY",
Value: "true",
},
)
}
var registrationRecheckDelay time.Duration
// all checks done below only decide whether a restart is needed
// if a restart was already decided before, there is no need for the checks
// saving API calls and scary log messages
if !restart {
registrationCheckInterval := time.Minute
if r.RegistrationRecheckInterval > 0 {
registrationCheckInterval = r.RegistrationRecheckInterval
}
// We want to call ListRunners GitHub Actions API only once per runner per minute.
// This if block, in conjunction with:
// return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil
// achieves that.
if lastCheckTime := runner.Status.LastRegistrationCheckTime; lastCheckTime != nil {
nextCheckTime := lastCheckTime.Add(registrationCheckInterval)
now := time.Now()
// A requeue scheduled by RequeueAfter can fire a bit early (by dozens of milliseconds),
// so to avoid excessive, ineffective retries, we heuristically ignore the remaining delay when it is
// shorter than 1s
requeueAfter := nextCheckTime.Sub(now) - time.Second
if requeueAfter > 0 {
log.Info(
fmt.Sprintf("Skipped registration check because it's deferred until %s. Retrying in %s at latest", nextCheckTime, requeueAfter),
"lastRegistrationCheckTime", lastCheckTime,
"registrationCheckInterval", registrationCheckInterval,
)
// Without RequeueAfter, the controller may not retry on schedule. Instead, it must wait until the
// next sync period passes, which can be much later than nextCheckTime.
//
// We need to requeue on this reconciliation even though we have already scheduled the initial
// requeue previously with `return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil`.
// Apparently, the workqueue used by controller-runtime deduplicates and resets the delay on
// other requeues, so the initially scheduled requeue may have been reset by a requeue on a
// spec/status change.
return ctrl.Result{RequeueAfter: requeueAfter}, nil
}
}
notFound := false
offline := false
runnerBusy, err := r.GitHubClient.IsRunnerBusy(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
currentTime := time.Now()
if err != nil {
var notFoundException *github.RunnerNotFound
var offlineException *github.RunnerOffline
if errors.As(err, &notFoundException) {
notFound = true
} else if errors.As(err, &offlineException) {
offline = true
} else {
var e *gogithub.RateLimitError
if errors.As(err, &e) {
// We log the underlying error when we fail calling the GitHub API to list or unregister runners,
// or when the runner is still busy.
log.Error(
err,
fmt.Sprintf(
"Failed to check if runner is busy due to Github API rate limit. Retrying in %s to avoid excessive GitHub API calls",
retryDelayOnGitHubAPIRateLimitError,
),
)
return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
}
return ctrl.Result{}, err
}
}
// See the `newPod` function called above for more information
// about when this hash changes.
curHash := pod.Labels[LabelKeyPodTemplateHash]
newHash := newPod.Labels[LabelKeyPodTemplateHash]
if !runnerBusy && curHash != newHash {
restart = true
}
registrationTimeout := 10 * time.Minute
durationAfterRegistrationTimeout := currentTime.Sub(pod.CreationTimestamp.Add(registrationTimeout))
registrationDidTimeout := durationAfterRegistrationTimeout > 0
if notFound {
if registrationDidTimeout {
log.Info(
"Runner failed to register itself to GitHub in timely manner. "+
"Recreating the pod to see if it resolves the issue. "+
"CAUTION: If you see this a lot, you should investigate the root cause. "+
"See https://github.com/actions-runner-controller/actions-runner-controller/issues/288",
"podCreationTimestamp", pod.CreationTimestamp,
"currentTime", currentTime,
"configuredRegistrationTimeout", registrationTimeout,
)
restart = true
} else {
log.V(1).Info(
"Runner pod exists but we failed to check if runner is busy. Apparently it still needs more time.",
"runnerName", runner.Name,
)
}
} else if offline {
if registrationOnly {
log.Info(
"Observed that registration-only runner for scaling-from-zero has successfully been registered.",
"podCreationTimestamp", pod.CreationTimestamp,
"currentTime", currentTime,
"configuredRegistrationTimeout", registrationTimeout,
)
} else if registrationDidTimeout {
if runnerBusy {
log.Info(
"Timeout out while waiting for the runner to be online, but observed that it's busy at the same time."+
"This is a known (unintuitive) behaviour of a runner that is already running a job. Please see https://github.com/actions-runner-controller/actions-runner-controller/issues/911",
"podCreationTimestamp", pod.CreationTimestamp,
"currentTime", currentTime,
"configuredRegistrationTimeout", registrationTimeout,
)
} else {
log.Info(
"Already existing GitHub runner still appears offline . "+
"Recreating the pod to see if it resolves the issue. "+
"CAUTION: If you see this a lot, you should investigate the root cause. ",
"podCreationTimestamp", pod.CreationTimestamp,
"currentTime", currentTime,
"configuredRegistrationTimeout", registrationTimeout,
)
restart = true
}
} else {
log.V(1).Info(
"Runner pod exists but the GitHub runner appears to be still offline. Waiting for runner to get online ...",
"runnerName", runner.Name,
)
}
}
if (notFound || (offline && !registrationOnly)) && !registrationDidTimeout {
registrationRecheckJitter := 10 * time.Second
if r.RegistrationRecheckJitter > 0 {
registrationRecheckJitter = r.RegistrationRecheckJitter
}
registrationRecheckDelay = registrationCheckInterval + wait.Jitter(registrationRecheckJitter, 0.1)
if err := r.Status().Patch(ctx, updated, client.MergeFrom(&runner)); err != nil {
log.Error(err, "Failed to update runner status for Phase/Reason/Message")
return ctrl.Result{}, err
}
}
// Don't do anything if there's no need to restart the runner
if !restart {
// This guard enables us to update runner.Status.Phase to `Running` only after
// the runner is registered to GitHub.
if registrationRecheckDelay > 0 {
log.V(1).Info(fmt.Sprintf("Rechecking the runner registration in %s", registrationRecheckDelay))
updated := runner.DeepCopy()
updated.Status.LastRegistrationCheckTime = &metav1.Time{Time: time.Now()}
if err := r.Status().Patch(ctx, updated, client.MergeFrom(&runner)); err != nil {
log.Error(err, "Failed to update runner status for LastRegistrationCheckTime")
return ctrl.Result{}, err
}
return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil
}
if runner.Status.Phase != string(pod.Status.Phase) {
if pod.Status.Phase == corev1.PodRunning {
// Seeing this message, you can expect the runner to become `Running` soon.
log.Info(
"Runner appears to have registered and running.",
"podCreationTimestamp", pod.CreationTimestamp,
)
}
updated := runner.DeepCopy()
updated.Status.Phase = string(pod.Status.Phase)
updated.Status.Reason = pod.Status.Reason
updated.Status.Message = pod.Status.Message
if err := r.Status().Patch(ctx, updated, client.MergeFrom(&runner)); err != nil {
log.Error(err, "Failed to update runner status for Phase/Reason/Message")
return ctrl.Result{}, err
}
}
return ctrl.Result{}, nil
}
updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, &pod)
if res != nil {
return *res, err
}
// Only delete the pod if we successfully unregistered the runner or the runner is already deleted from the service.
// This should help us avoid a race condition where the runner picks up another job after we've decided it isn't busy.
if err := r.Delete(ctx, updatedPod); err != nil {
log.Error(err, "Failed to delete pod resource")
return ctrl.Result{}, err
}
r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", newPod.Name))
log.Info("Deleted runner pod", "repository", runner.Spec.Repository)
return ctrl.Result{}, nil
}
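The registration-recheck throttling above (at most one ListRunners call per runner per interval, with sub-second remainders ignored) boils down to the following standalone sketch; the function name is illustrative, not from this commit:

package controllers

import (
	"time"

	ctrl "sigs.k8s.io/controller-runtime"
)

// nextRegistrationCheck defers the next GitHub ListRunners call until at least
// interval has passed since lastCheck, ignoring remainders shorter than 1s to
// avoid excessive, ineffective retries.
func nextRegistrationCheck(lastCheck time.Time, interval time.Duration) ctrl.Result {
	requeueAfter := lastCheck.Add(interval).Sub(time.Now()) - time.Second
	if requeueAfter > 0 {
		return ctrl.Result{RequeueAfter: requeueAfter}
	}
	return ctrl.Result{} // due now; run the check within this reconciliation
}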
@@ -480,11 +193,6 @@ func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx con
finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers, finalizerName)
if removed {
_, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, pod)
if res != nil {
return *res, err
}
newRunner := runner.DeepCopy()
newRunner.ObjectMeta.Finalizers = finalizers
@@ -499,60 +207,6 @@ func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx con
return ctrl.Result{}, nil
}
func (r *RunnerReconciler) unregistrationTimeout() time.Duration {
unregistrationTimeout := DefaultUnregistrationTimeout
if r.UnregistrationTimeout > 0 {
unregistrationTimeout = r.UnregistrationTimeout
}
return unregistrationTimeout
}
func (r *RunnerReconciler) unregistrationRetryDelay() time.Duration {
retryDelay := DefaultUnregistrationRetryDelay
if r.UnregistrationRetryDelay > 0 {
retryDelay = r.UnregistrationRetryDelay
}
return retryDelay
}
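Both helpers follow the same zero-means-default pattern: an unset (zero) field on the reconciler falls back to the package default. A hypothetical test-style illustration (not part of this commit):

package controllers

import (
	"testing"
	"time"
)

func TestUnregistrationTimeoutDefaulting(t *testing.T) {
	r := &RunnerReconciler{} // UnregistrationTimeout left at its zero value
	if got := r.unregistrationTimeout(); got != DefaultUnregistrationTimeout {
		t.Errorf("expected default, got %s", got)
	}
	r.UnregistrationTimeout = 30 * time.Second // an explicit override wins
	if got := r.unregistrationTimeout(); got != 30*time.Second {
		t.Errorf("expected 30s override, got %s", got)
	}
}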
func (r *RunnerReconciler) processRunnerPodDeletion(ctx context.Context, runner v1alpha1.Runner, log logr.Logger, pod corev1.Pod) (reconcile.Result, error) {
deletionTimeout := 1 * time.Minute
currentTime := time.Now()
deletionDidTimeout := currentTime.Sub(pod.DeletionTimestamp.Add(deletionTimeout)) > 0
if deletionDidTimeout {
log.Info(
fmt.Sprintf("Failed to delete pod within %s. ", deletionTimeout)+
"This is typically the case when a Kubernetes node became unreachable "+
"and the kube controller started evicting nodes. Forcefully deleting the pod to not get stuck.",
"podDeletionTimestamp", pod.DeletionTimestamp,
"currentTime", currentTime,
"configuredDeletionTimeout", deletionTimeout,
)
var force int64 = 0
// forcefully delete runner as we would otherwise get stuck if the node stays unreachable
if err := r.Delete(ctx, &pod, &client.DeleteOptions{GracePeriodSeconds: &force}); err != nil {
// probably the pod is already gone
if !kerrors.IsNotFound(err) {
log.Error(err, "Failed to forcefully delete pod resource ...")
return ctrl.Result{}, err
}
// forceful deletion finally succeeded
return ctrl.Result{Requeue: true}, nil
}
r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Forcefully deleted pod '%s'", pod.Name))
log.Info("Forcefully deleted runner pod", "repository", runner.Spec.Repository)
// give kube manager a little time to forcefully delete the stuck pod
return ctrl.Result{RequeueAfter: 3 * time.Second}, nil
} else {
return ctrl.Result{}, nil
}
}
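The forced deletion above hinges on a zero grace period, which asks the API server to remove the Pod object immediately instead of waiting for a graceful termination that an unreachable node can never complete. As a standalone sketch (forceDeletePod is an illustrative name):

package controllers

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	kerrors "k8s.io/apimachinery/pkg/api/errors"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

func forceDeletePod(ctx context.Context, c client.Client, pod *corev1.Pod) error {
	var zero int64 // GracePeriodSeconds = 0 skips graceful termination
	err := c.Delete(ctx, pod, &client.DeleteOptions{GracePeriodSeconds: &zero})
	if kerrors.IsNotFound(err) {
		return nil // already gone; treat as success
	}
	return err
}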
func (r *RunnerReconciler) processRunnerCreation(ctx context.Context, runner v1alpha1.Runner, log logr.Logger) (reconcile.Result, error) {
if updated, err := r.updateRegistrationToken(ctx, runner); err != nil {
return ctrl.Result{}, err
@@ -584,6 +238,7 @@ func (r *RunnerReconciler) processRunnerCreation(ctx context.Context, runner v1a
r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodCreated", fmt.Sprintf("Created pod '%s'", newPod.Name))
log.Info("Created runner pod", "repository", runner.Spec.Repository)
return ctrl.Result{}, nil
}
@@ -696,7 +351,7 @@ func (r *RunnerReconciler) newPod(runner v1alpha1.Runner) (corev1.Pod, error) {
registrationOnly := metav1.HasAnnotation(runner.ObjectMeta, annotationKeyRegistrationOnly)
pod, err := newRunnerPod(template, runner.Spec.RunnerConfig, r.RunnerImage, r.RunnerImagePullSecrets, r.DockerImage, r.DockerRegistryMirror, r.GitHubClient.GithubBaseURL, registrationOnly)
pod, err := newRunnerPod(runner.Name, template, runner.Spec.RunnerConfig, r.RunnerImage, r.RunnerImagePullSecrets, r.DockerImage, r.DockerRegistryMirror, r.GitHubClient.GithubBaseURL, registrationOnly)
if err != nil {
return pod, err
}
@@ -813,7 +468,7 @@ func mutatePod(pod *corev1.Pod, token string) *corev1.Pod {
return updated
}
func newRunnerPod(template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, defaultRunnerImage string, defaultRunnerImagePullSecrets []string, defaultDockerImage, defaultDockerRegistryMirror string, githubBaseURL string, registrationOnly bool) (corev1.Pod, error) {
func newRunnerPod(runnerName string, template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, defaultRunnerImage string, defaultRunnerImagePullSecrets []string, defaultDockerImage, defaultDockerRegistryMirror string, githubBaseURL string, registrationOnly bool) (corev1.Pod, error) {
var (
privileged bool = true
dockerdInRunner bool = runnerSpec.DockerdWithinRunnerContainer != nil && *runnerSpec.DockerdWithinRunnerContainer
@@ -822,6 +477,12 @@ func newRunnerPod(template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, default
dockerdInRunnerPrivileged bool = dockerdInRunner
)
template = *template.DeepCopy()
// This label selector is used by default when rd.Spec.Selector is empty.
template.ObjectMeta.Labels = CloneAndAddLabel(template.ObjectMeta.Labels, LabelKeyRunnerSetName, runnerName)
template.ObjectMeta.Labels = CloneAndAddLabel(template.ObjectMeta.Labels, LabelKeyPodMutation, LabelValuePodMutation)
workDir := runnerSpec.WorkDir
if workDir == "" {
workDir = "/runner/_work"