feat: Support for scaling from/to zero (#465)

This is an attempt to support scaling from/to zero. The basic idea is that we create a one-off "registration-only" runner pod when a RunnerReplicaSet is scaled to zero, so that there is one "offline" runner, which enables GitHub Actions to queue jobs instead of discarding them. GitHub Actions seems to immediately throw away a new job when there are no runners at all. Generally, having runners of any status, `busy`, `idle`, or `offline`, would prevent GitHub Actions from failing jobs. But retaining `busy` or `idle` runners means that we need to keep runner pods running, which conflicts with our desire to scale to/from zero, hence we retain `offline` runners. In this change, I enhanced the runnerreplicaset controller to create a registration-only runner at the very beginning of its reconciliation logic, only when a runnerreplicaset is scaled to zero. The runner controller creates the registration-only runner pod, waits for it to become "offline", and then removes the runner pod. The runner on GitHub stays `offline` until the runner resource on K8s is deleted. As we remove the registration-only runner pod as soon as it registers, this doesn't block cluster-autoscaler. Related to #447
2025-12-11 03:57:01 +00:00 · 2021-05-02 16:11:36 +09:00
parent 7e766282aa
commit dbd7b486d2
10 changed files with 302 additions and 50 deletions
--- a/controllers/integration_test.go
+++ b/controllers/integration_test.go
@@ -446,9 +446,6 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {
 				ExpectCreate(ctx, rd, "test RunnerDeployment")
 				ExpectRunnerSetsCountEventuallyEquals(ctx, ns.Name, 1)
 				ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 1)
-			}
-
-			{
 				env.ExpectRegisteredNumberCountEventuallyEquals(1, "count of fake list runners")
 			}

@@ -554,9 +551,6 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {
 				ExpectCreate(ctx, rd, "test RunnerDeployment")
 				ExpectRunnerSetsCountEventuallyEquals(ctx, ns.Name, 1)
 				ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 1)
-			}
-
-			{
 				env.ExpectRegisteredNumberCountEventuallyEquals(1, "count of fake list runners")
 			}

@@ -595,9 +589,6 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {

 				ExpectRunnerSetsCountEventuallyEquals(ctx, ns.Name, 1)
 				ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 1)
-			}
-
-			{
 				env.ExpectRegisteredNumberCountEventuallyEquals(1, "count of fake list runners")
 			}

@@ -606,9 +597,6 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {
 				env.SendOrgCheckRunEvent("test", "valid", "pending", "created")
 				ExpectRunnerSetsCountEventuallyEquals(ctx, ns.Name, 1, "runner sets after webhook")
 				ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 2, "runners after first webhook event")
-			}
-
-			{
 				env.ExpectRegisteredNumberCountEventuallyEquals(2, "count of fake list runners")
 			}

@@ -616,9 +604,8 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {
 			{
 				env.SendOrgCheckRunEvent("test", "valid", "pending", "created")
 				ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 3, "runners after second webhook event")
+				env.ExpectRegisteredNumberCountEventuallyEquals(3, "count of fake list runners")
 			}
-
-			env.ExpectRegisteredNumberCountEventuallyEquals(3, "count of fake list runners")
 		})

 		It("should create and scale user's repository runners on pull_request event", func() {
@@ -884,9 +871,6 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {
 				ExpectCreate(ctx, rd, "test RunnerDeployment")
 				ExpectRunnerSetsCountEventuallyEquals(ctx, ns.Name, 1)
 				ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 1)
-			}
-
-			{
 				env.ExpectRegisteredNumberCountEventuallyEquals(1, "count of fake list runners")
 			}

@@ -930,9 +914,6 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {

 				ExpectRunnerSetsCountEventuallyEquals(ctx, ns.Name, 1)
 				ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 3)
-			}
-
-			{
 				env.ExpectRegisteredNumberCountEventuallyEquals(3, "count of fake list runners")
 			}

@@ -941,9 +922,6 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {
 				env.SendUserCheckRunEvent("test", "valid", "pending", "created")
 				ExpectRunnerSetsCountEventuallyEquals(ctx, ns.Name, 1, "runner sets after webhook")
 				ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 4, "runners after first webhook event")
-			}
-
-			{
 				env.ExpectRegisteredNumberCountEventuallyEquals(4, "count of fake list runners")
 			}

@@ -951,9 +929,8 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {
 			{
 				env.SendUserCheckRunEvent("test", "valid", "pending", "created")
 				ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 5, "runners after second webhook event")
+				env.ExpectRegisteredNumberCountEventuallyEquals(5, "count of fake list runners")
 			}
-
-			env.ExpectRegisteredNumberCountEventuallyEquals(5, "count of fake list runners")
 		})

 		It("should create and scale user's repository runners only on check_run event", func() {
@@ -1045,9 +1022,6 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {
 				env.SendUserCheckRunEvent("test", "valid", "pending", "created")
 				ExpectRunnerSetsCountEventuallyEquals(ctx, ns.Name, 1, "runner sets after webhook")
 				ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 2, "runners after first webhook event")
-			}
-
-			{
 				env.ExpectRegisteredNumberCountEventuallyEquals(2, "count of fake list runners")
 			}

@@ -1055,9 +1029,8 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {
 			{
 				env.SendUserCheckRunEvent("test", "valid", "pending", "created")
 				ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 3, "runners after second webhook event")
+				env.ExpectRegisteredNumberCountEventuallyEquals(3, "count of fake list runners")
 			}
-
-			env.ExpectRegisteredNumberCountEventuallyEquals(3, "count of fake list runners")
 		})

 	})
--- a/controllers/runner_controller.go
+++ b/controllers/runner_controller.go
@@ -48,6 +48,9 @@ const (
 	LabelKeyPodTemplateHash = "pod-template-hash"

 	retryDelayOnGitHubAPIRateLimitError = 30 * time.Second
+
+	// This is an annotation internal to actions-runner-controller and can change in backward-incompatible ways
+	annotationKeyRegistrationOnly = "actions-runner-controller/registration-only"
 )

 // RunnerReconciler reconciles a Runner object
@@ -145,6 +148,34 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
 		return ctrl.Result{}, nil
 	}

+	registrationOnly := metav1.HasAnnotation(runner.ObjectMeta, annotationKeyRegistrationOnly)
+	if registrationOnly && runner.Status.Phase != "" {
+		// At this point we are sure that the registration-only runner has successfully configured and
+		// is of `offline` status, because we set runner.Status.Phase to that of the runner pod only after
+		// successful registration.
+
+		var pod corev1.Pod
+		if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
+			if !kerrors.IsNotFound(err) {
+				log.Info(fmt.Sprintf("Retrying soon as we failed to get registration-only runner pod: %v", err))
+
+				return ctrl.Result{Requeue: true}, nil
+			}
+		} else if err := r.Delete(ctx, &pod); err != nil {
+			if !kerrors.IsNotFound(err) {
+				log.Info(fmt.Sprintf("Retrying soon as we failed to delete registration-only runner pod: %v", err))
+
+				return ctrl.Result{Requeue: true}, nil
+			}
+		}
+
+		log.Info("Successfully deleted egistration-only runner pod to free node and cluster resource")
+
+		// Return here to not recreate the deleted pod, because recreating it is the waste of cluster and node resource,
+		// and also defeats the original purpose of scale-from/to-zero we're trying to implement by using the registration-only runner.
+		return ctrl.Result{}, nil
+	}
+
 	var pod corev1.Pod
 	if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
 		if !kerrors.IsNotFound(err) {
@@ -221,20 +252,33 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {

 		// If pod has ended up succeeded we need to restart it
 		// Happens e.g. when dind is in runner and run completes
-		restart := pod.Status.Phase == corev1.PodSucceeded
+		stopped := pod.Status.Phase == corev1.PodSucceeded

-		if pod.Status.Phase == corev1.PodRunning {
-			for _, status := range pod.Status.ContainerStatuses {
-				if status.Name != containerName {
-					continue
-				}
+		if !stopped {
+			if pod.Status.Phase == corev1.PodRunning {
+				for _, status := range pod.Status.ContainerStatuses {
+					if status.Name != containerName {
+						continue
+					}

-				if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 {
-					restart = true
+					if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 {
+						stopped = true
+					}
 				}
 			}
 		}

+		restart := stopped
+
+		if registrationOnly && stopped {
+			restart = false
+
+			log.Info(
+				"Observed that registration-only runner for scaling-from-zero has successfully stopped. " +
+					"Unlike other pods, this one will be recreated only when runner spec changes.",
+			)
+		}
+
 		if updated, err := r.updateRegistrationToken(ctx, runner); err != nil {
 			return ctrl.Result{}, err
 		} else if updated {
@@ -247,11 +291,21 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
 			return ctrl.Result{}, err
 		}

+		if registrationOnly {
+			newPod.Spec.Containers[0].Env = append(
+				newPod.Spec.Containers[0].Env,
+				corev1.EnvVar{
+					Name:  "RUNNER_REGISTRATION_ONLY",
+					Value: "true",
+				},
+			)
+		}
+
 		var registrationRecheckDelay time.Duration

 		// all checks done below only decide whether a restart is needed
 		// if a restart was already decided before, there is no need for the checks
-		// saving API calls and scary{ log messages
+		// saving API calls and scary log messages
 		if !restart {
 			registrationCheckInterval := time.Minute
 			if r.RegistrationRecheckInterval > 0 {
@@ -356,7 +410,14 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
 					)
 				}
 			} else if offline {
-				if registrationDidTimeout {
+				if registrationOnly {
+					log.Info(
+						"Observed that registration-only runner for scaling-from-zero has successfully been registered.",
+						"podCreationTimestamp", pod.CreationTimestamp,
+						"currentTime", currentTime,
+						"configuredRegistrationTimeout", registrationTimeout,
+					)
+				} else if registrationDidTimeout {
 					log.Info(
 						"Already existing GitHub runner still appears offline . "+
 							"Recreating the pod to see if it resolves the issue. "+
@@ -375,7 +436,7 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
 				}
 			}

-			if (notFound || offline) && !registrationDidTimeout {
+			if (notFound || (offline && !registrationOnly)) && !registrationDidTimeout {
 				registrationRecheckJitter := 10 * time.Second
 				if r.RegistrationRecheckJitter > 0 {
 					registrationRecheckJitter = r.RegistrationRecheckJitter
@@ -566,6 +627,14 @@ func (r *RunnerReconciler) newPod(runner v1alpha1.Runner) (corev1.Pod, error) {
 		},
 	}

+	if metav1.HasAnnotation(runner.ObjectMeta, annotationKeyRegistrationOnly) {
+		env = append(env, corev1.EnvVar{
+			Name:  "RUNNER_REGISTRATION_ONLY",
+			Value: "true",
+		},
+		)
+	}
+
 	env = append(env, runner.Spec.Env...)

 	labels := map[string]string{}
--- a/controllers/runnerdeployment_controller.go
+++ b/controllers/runnerdeployment_controller.go
@@ -188,7 +188,7 @@ func (r *RunnerDeploymentReconciler) Reconcile(req ctrl.Request) (ctrl.Result, e
 		return ctrl.Result{}, err
 	}

-	// Do we old runner replica sets that should eventually deleted?
+	// Do we have old runner replica sets that should eventually deleted?
 	if len(oldSets) > 0 {
 		readyReplicas := newestSet.Status.ReadyReplicas

--- a/controllers/runnerreplicaset_controller.go
+++ b/controllers/runnerreplicaset_controller.go
@@ -68,6 +68,65 @@ func (r *RunnerReplicaSetReconciler) Reconcile(req ctrl.Request) (ctrl.Result, e
 		return ctrl.Result{}, nil
 	}

+	registrationOnlyRunnerNeeded := rs.Spec.Replicas != nil && *rs.Spec.Replicas == 0
+	registrationOnlyRunner := v1alpha1.Runner{}
+	registrationOnlyRunnerNsName := req.NamespacedName
+	registrationOnlyRunnerNsName.Name = registrationOnlyRunnerNameFor(rs.Name)
+
+	registrationOnlyRunnerExists := false
+	if err := r.Get(
+		ctx,
+		registrationOnlyRunnerNsName,
+		&registrationOnlyRunner,
+	); err != nil {
+		if !kerrors.IsNotFound(err) {
+			return ctrl.Result{}, err
+		}
+	} else {
+		registrationOnlyRunnerExists = true
+	}
+
+	if registrationOnlyRunnerNeeded {
+		if registrationOnlyRunnerExists {
+			if registrationOnlyRunner.Status.Phase == "" {
+				log.Info("Still waiting for the registration-only runner to be registered")
+
+				return ctrl.Result{}, nil
+			}
+		} else {
+			// A registration-only runner does not exist and is needed, hence create it.
+
+			runnerForScaleFromToZero, err := r.newRunner(rs)
+			if err != nil {
+				return ctrl.Result{}, fmt.Errorf("failed to create runner for scale from/to zero: %v", err)
+			}
+
+			runnerForScaleFromToZero.ObjectMeta.Name = registrationOnlyRunnerNsName.Name
+			runnerForScaleFromToZero.ObjectMeta.GenerateName = ""
+			runnerForScaleFromToZero.ObjectMeta.Labels = nil
+			metav1.SetMetaDataAnnotation(&runnerForScaleFromToZero.ObjectMeta, annotationKeyRegistrationOnly, "true")
+
+			if err := r.Client.Create(ctx, &runnerForScaleFromToZero); err != nil {
+				log.Error(err, "Failed to create runner for scale from/to zero")
+
+				return ctrl.Result{}, err
+			}
+
+			// We can continue to deleting runner pods only after the
+			// registration-only runner gets registered.
+			return ctrl.Result{}, nil
+		}
+	} else {
+		// A registration-only runner exists and is not needed, hence delete it.
+		if registrationOnlyRunnerExists {
+			if err := r.Client.Delete(ctx, &registrationOnlyRunner); err != nil {
+				log.Error(err, "Retrying soon because we failed to delete registration-only runner")
+
+				return ctrl.Result{Requeue: true}, nil
+			}
+		}
+	}
+
 	selector, err := metav1.LabelSelectorAsSelector(rs.Spec.Selector)
 	if err != nil {
 		return ctrl.Result{}, err
@@ -95,7 +154,7 @@ func (r *RunnerReplicaSetReconciler) Reconcile(req ctrl.Request) (ctrl.Result, e
 	for _, r := range allRunners.Items {
 		// This guard is required to avoid the RunnerReplicaSet created by the controller v0.17.0 or before
 		// to not treat all the runners in the namespace as its children.
-		if metav1.IsControlledBy(&r, &rs) {
+		if metav1.IsControlledBy(&r, &rs) && !metav1.HasAnnotation(r.ObjectMeta, annotationKeyRegistrationOnly) {
 			myRunners = append(myRunners, r)

 			available += 1
@@ -265,3 +324,7 @@ func (r *RunnerReplicaSetReconciler) SetupWithManager(mgr ctrl.Manager) error {
 		Named(name).
 		Complete(r)
 }
+
+func registrationOnlyRunnerNameFor(rsName string) string {
+	return rsName + "-registration-only"
+}
--- a/controllers/runnerreplicaset_controller_test.go
+++ b/controllers/runnerreplicaset_controller_test.go
@@ -2,6 +2,7 @@ package controllers

 import (
 	"context"
+	"fmt"
 	"math/rand"
 	"net/http/httptest"
 	"time"
@@ -262,8 +263,36 @@ var _ = Context("Inside of a new namespace", func() {

 				Eventually(
 					func() int {
-						err := k8sClient.List(ctx, &runners, client.InNamespace(ns.Name))
-						if err != nil {
+						selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{
+							MatchLabels: map[string]string{
+								"foo": "bar",
+							},
+						})
+						Expect(err).ToNot(HaveOccurred())
+
+						var regOnly actionsv1alpha1.Runner
+						if err := k8sClient.Get(ctx, types.NamespacedName{Namespace: ns.Name, Name: registrationOnlyRunnerNameFor(name)}, &regOnly); err != nil {
+							logf.Log.Info(fmt.Sprintf("Failed getting registration-only runner in test: %v", err))
+							return -1
+						} else {
+							updated := regOnly.DeepCopy()
+							updated.Status.Phase = "Completed"
+
+							if err := k8sClient.Status().Patch(ctx, updated, client.MergeFrom(&regOnly)); err != nil {
+								logf.Log.Info(fmt.Sprintf("Failed updating registration-only runner in test: %v", err))
+								return -1
+							}
+
+							runnersList.Add(&github.Runner{
+								ID:     pointer.Int64Ptr(1001),
+								Name:   pointer.StringPtr(regOnly.Name),
+								OS:     pointer.StringPtr("linux"),
+								Status: pointer.StringPtr("offline"),
+								Busy:   pointer.BoolPtr(false),
+							})
+						}
+
+						if err := k8sClient.List(ctx, &runners, client.InNamespace(ns.Name), client.MatchingLabelsSelector{Selector: selector}); err != nil {
 							logf.Log.Error(err, "list runners")
 							return -1
 						}