refactor: Make RunnerReplicaSet and Runner backed by the same logic that backs RunnerSet

Yusuke Kuoka
2022-03-05 12:13:22 +00:00
parent c95e84a528
commit 14a878bfae
12 changed files with 278 additions and 903 deletions
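
For context before the diff: the change makes the RunnerReplicaSet reconciler convert its live Runner resources into generic client.Object values and hand replica management to a shared helper (syncRunnerPodsOwners in the diff below), the same routine that backs RunnerSet. The snippet below is a minimal, self-contained sketch of that shape only; apart from the role attributed to syncRunnerPodsOwners, every name and type in it is invented for illustration and is not the actual ARC code.

package main

import "fmt"

// object stands in for controller-runtime's client.Object in this sketch.
type object interface{ GetName() string }

type runner struct{ name string }

func (r runner) GetName() string { return r.name }

// syncOwners plays the role of the shared helper (syncRunnerPodsOwners in the
// diff): given the live children, the desired replica count, and a factory for
// new children, it decides what to create and what to delete, without caring
// which owner kind (RunnerReplicaSet, RunnerSet, ...) invoked it.
func syncOwners(live []object, desired int, newObj func() object) (create, del []object) {
	for len(live)+len(create) < desired {
		create = append(create, newObj())
	}
	for i := desired; i < len(live); i++ {
		del = append(del, live[i])
	}
	return create, del
}

func main() {
	live := []object{runner{name: "runner-a"}, runner{name: "runner-b"}}
	toCreate, toDelete := syncOwners(live, 3, func() object { return runner{name: "runner-c"} })
	fmt.Printf("create %d, delete %d\n", len(toCreate), len(toDelete))
}

The point of the indirection, as the commit title suggests, is that create/delete decisions are written once against a generic interface so that RunnerReplicaSet, Runner and RunnerSet can reuse them.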


@@ -18,13 +18,10 @@ package controllers
import (
"context"
"errors"
"fmt"
"reflect"
"time"
"github.com/go-logr/logr"
gogithub "github.com/google/go-github/v39/github"
kerrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
@@ -32,7 +29,6 @@ import (
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1"
@@ -72,15 +68,35 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
return ctrl.Result{}, nil
}
if rs.ObjectMeta.Labels == nil {
rs.ObjectMeta.Labels = map[string]string{}
}
// The template hash is usually set by the upstream controller (the RunnerDeployment controller) when authoring the
// RunnerReplicaSet resource, but it may be missing when the user created the RunnerReplicaSet directly.
// As the template hash is required by the runner replica management, we dynamically add it here without ever persisting it.
if rs.ObjectMeta.Labels[LabelKeyRunnerTemplateHash] == "" {
template := rs.Spec.DeepCopy()
template.Replicas = nil
template.EffectiveTime = nil
templateHash := ComputeHash(template)
log.Info("Using auto-generated template hash", "value", templateHash)
rs.ObjectMeta.Labels = CloneAndAddLabel(rs.ObjectMeta.Labels, LabelKeyRunnerTemplateHash, templateHash)
rs.Spec.Template.ObjectMeta.Labels = CloneAndAddLabel(rs.Spec.Template.ObjectMeta.Labels, LabelKeyRunnerTemplateHash, templateHash)
}
selector, err := metav1.LabelSelectorAsSelector(rs.Spec.Selector)
if err != nil {
return ctrl.Result{}, err
}
// Get the Runners managed by the target RunnerReplicaSet
var allRunners v1alpha1.RunnerList
var runnerList v1alpha1.RunnerList
if err := r.List(
ctx,
&allRunners,
&runnerList,
client.InNamespace(req.Namespace),
client.MatchingLabelsSelector{Selector: selector},
); err != nil {
@@ -89,218 +105,43 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
}
}
var (
current int
ready int
available int
lastSyncTime *time.Time
)
for _, r := range allRunners.Items {
// This guard is required so that a RunnerReplicaSet created by controller v0.17.0 or earlier
// does not treat all the runners in the namespace as its children.
if metav1.IsControlledBy(&r, &rs) && !metav1.HasAnnotation(r.ObjectMeta, annotationKeyRegistrationOnly) {
// If the runner is already marked for deletion (i.e. has a non-zero deletion timestamp) by the runner controller (which can be caused by an ephemeral runner completion)
// or by the runnerreplicaset controller (in case it was deleted in the previous reconciliation loop),
// we don't need to bother calling the GitHub API to re-mark the runner for deletion.
// Just hold on, and the runners will disappear as long as the runner controller is up and running.
if !r.DeletionTimestamp.IsZero() {
continue
}
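// Track the most recent sync-time annotation across the runners; it is compared against EffectiveTime
// further below to decide whether missing ephemeral runners should be recreated yet.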
if r.Annotations != nil {
if a, ok := r.Annotations[SyncTimeAnnotationKey]; ok {
t, err := time.Parse(time.RFC3339, a)
if err == nil {
if lastSyncTime == nil || lastSyncTime.Before(t) {
lastSyncTime = &t
}
}
}
}
current += 1
if r.Status.Phase == string(corev1.PodRunning) {
ready += 1
// available is currently the same as ready, as we don't yet have minReadySeconds for runners
available += 1
}
}
}
var desired int
replicas := 1
if rs.Spec.Replicas != nil {
desired = *rs.Spec.Replicas
} else {
desired = 1
}
// TODO: remove this registration runner cleanup later (v0.23.0 or v0.24.0)
//
// We previously had to have a registration-only runner to support scale-from-zero.
// But since the Sep 2021 Actions update on GitHub Cloud and GHES 3.3, it is unnecessary.
// See the below issues for more context:
// https://github.com/actions-runner-controller/actions-runner-controller/issues/516
// https://github.com/actions-runner-controller/actions-runner-controller/issues/859
//
// In the below block, we have logic to remove existing registration-only runners as unnecessary.
// This logic was introduced in actions-runner-controller 0.21.0 and will probably last only one or two more minor releases,
// so that the actions-runner-controller instance in everyone's cluster won't leave dangling registration-only runners.
registrationOnlyRunnerNsName := req.NamespacedName
registrationOnlyRunnerNsName.Name = registrationOnlyRunnerNameFor(rs.Name)
registrationOnlyRunner := v1alpha1.Runner{}
registrationOnlyRunnerExists := false
if err := r.Get(
ctx,
registrationOnlyRunnerNsName,
&registrationOnlyRunner,
); err != nil {
if !kerrors.IsNotFound(err) {
return ctrl.Result{}, err
}
} else {
registrationOnlyRunnerExists = true
}
if registrationOnlyRunnerExists {
if err := r.Client.Delete(ctx, &registrationOnlyRunner); err != nil {
log.Error(err, "Retrying soon because we failed to delete registration-only runner")
return ctrl.Result{Requeue: true}, nil
}
replicas = *rs.Spec.Replicas
}
effectiveTime := rs.Spec.EffectiveTime
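// Runners are treated as ephemeral by default, i.e. unless Ephemeral is explicitly set to false in the template.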
ephemeral := rs.Spec.Template.Spec.Ephemeral == nil || *rs.Spec.Template.Spec.Ephemeral
if current < desired && ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) {
log.V(1).Info("Detected that some ephemeral runners have disappeared. Usually this is due to that ephemeral runner completions so ARC does not create new runners until EffectiveTime is updated.", "lastSyncTime", metav1.Time{Time: *lastSyncTime}, "effectiveTime", *effectiveTime, "desired", desired, "available", current, "ready", ready)
} else if current > desired {
// If you use ephemeral runners with the webhook-based autoscaler and the runner controller is working normally,
// you're unlikely to fall into this branch.
//
// That's because all the stakeholders work like this:
//
// 1. A runner pod completes with the runner container exiting with code 0
// 2. ARC runner controller detects the pod completion, marks the runner resource on k8s for deletion (=Runner.DeletionTimestamp becomes non-zero)
// 3. GitHub triggers a corresponding workflow_job "complete" webhook event
// 4. ARC github-webhook-server (webhook-based autoscaler) receives the webhook event and updates the HRA by removing the oldest capacity reservation
// 5. ARC horizontalrunnerautoscaler updates RunnerDeployment's desired replicas based on capacity reservations
// 6. ARC runnerdeployment controller updates RunnerReplicaSet's desired replicas
// 7. (We're here) ARC runnerreplicaset controller (this controller) starts reconciling the RunnerReplicaSet
//
// In a normally working ARC installation, the runner that was used to run the workflow job should already have been
// marked for deletion by the runner controller.
// This runnerreplicaset controller doesn't count runners marked for deletion toward the `current` value, hence you're unlikely to
// fall into this branch when you're using ephemeral runners with the webhook-based autoscaler.
desired, err := r.newRunner(rs)
if err != nil {
log.Error(err, "Could not create runner")
n := current - desired
log.V(0).Info(fmt.Sprintf("Deleting %d runners from RunnerReplicaSet %s", n, req.NamespacedName), "desired", desired, "current", current, "ready", ready)
// get runners that are currently offline, not busy, or timed out while registering
var deletionCandidates []v1alpha1.Runner
for _, runner := range allRunners.Items {
busy, err := r.GitHubClient.IsRunnerBusy(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
if err != nil {
notRegistered := false
offline := false
var notFoundException *github.RunnerNotFound
var offlineException *github.RunnerOffline
if errors.As(err, &notFoundException) {
log.V(1).Info("Failed to check if runner is busy. Either this runner has never been successfully registered to GitHub or it still needs more time.", "runnerName", runner.Name)
notRegistered = true
} else if errors.As(err, &offlineException) {
offline = true
} else {
var e *gogithub.RateLimitError
if errors.As(err, &e) {
// We log the underlying error when we fail calling the GitHub API to list or unregister runners,
// or when the runner is still busy.
log.Error(
err,
fmt.Sprintf(
"Failed to check if runner is busy due to GitHub API rate limit. Retrying in %s to avoid excessive GitHub API calls",
retryDelayOnGitHubAPIRateLimitError,
),
)
return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
}
return ctrl.Result{}, err
}
registrationTimeout := 15 * time.Minute
currentTime := time.Now()
registrationDidTimeout := currentTime.Sub(runner.CreationTimestamp.Add(registrationTimeout)) > 0
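// i.e. more than registrationTimeout has passed since this runner resource was created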
if notRegistered && registrationDidTimeout {
log.Info(
"Runner failed to register itself to GitHub in timely manner. "+
"Marking the runner for scale down. "+
"CAUTION: If you see this a lot, you should investigate the root cause. "+
"See https://github.com/actions-runner-controller/actions-runner-controller/issues/288",
"runnerCreationTimestamp", runner.CreationTimestamp,
"currentTime", currentTime,
"configuredRegistrationTimeout", registrationTimeout,
)
deletionCandidates = append(deletionCandidates, runner)
}
// offline runners should always be a great target for scale down
if offline {
deletionCandidates = append(deletionCandidates, runner)
}
} else if !busy {
deletionCandidates = append(deletionCandidates, runner)
}
}
if len(deletionCandidates) < n {
n = len(deletionCandidates)
}
log.V(0).Info(fmt.Sprintf("Deleting %d runner(s)", n), "desired", desired, "current", current, "ready", ready)
for i := 0; i < n; i++ {
if err := r.Client.Delete(ctx, &deletionCandidates[i]); client.IgnoreNotFound(err) != nil {
log.Error(err, "Failed to delete runner resource")
return ctrl.Result{}, err
}
r.Recorder.Event(&rs, corev1.EventTypeNormal, "RunnerDeleted", fmt.Sprintf("Deleted runner '%s'", deletionCandidates[i].Name))
log.Info(fmt.Sprintf("Deleted runner %s", deletionCandidates[i].Name))
}
} else if desired > current {
n := desired - current
log.V(0).Info(fmt.Sprintf("Creating %d runner(s)", n), "desired", desired, "available", current, "ready", ready)
for i := 0; i < n; i++ {
newRunner, err := r.newRunner(rs)
if err != nil {
log.Error(err, "Could not create runner")
return ctrl.Result{}, err
}
if err := r.Client.Create(ctx, &newRunner); err != nil {
log.Error(err, "Failed to create runner resource")
return ctrl.Result{}, err
}
}
return ctrl.Result{}, err
}
var status v1alpha1.RunnerReplicaSetStatus
var live []client.Object
for _, r := range runnerList.Items {
r := r
live = append(live, &r)
}
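// Delegate the actual scale up/down to the shared sync helper that also backs RunnerSet,
// passing the live Runners as generic client.Objects plus a factory for new ones.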
res, err := syncRunnerPodsOwners(ctx, r.Client, log, effectiveTime, replicas, func() client.Object { return desired.DeepCopy() }, ephemeral, live)
if err != nil || res == nil {
return ctrl.Result{}, err
}
var (
status v1alpha1.RunnerReplicaSetStatus
current, available, ready int
)
for _, o := range res.currentObjects {
current += o.total
available += o.running
ready += o.running
}
status.Replicas = &current
status.AvailableReplicas = &available
@@ -322,6 +163,8 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
}
func (r *RunnerReplicaSetReconciler) newRunner(rs v1alpha1.RunnerReplicaSet) (v1alpha1.Runner, error) {
// Note that the upstream controller (runnerdeployment) is expected to add
// the "runner template hash" label to the template.meta, which is necessary for this controller to work correctly
objectMeta := rs.Spec.Template.ObjectMeta.DeepCopy()
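// GenerateName makes the API server append a random suffix on creation, so every Runner stamped out of this template gets a unique name.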
objectMeta.GenerateName = rs.ObjectMeta.Name + "-"