feat: Repository-wide RunnerDeployment Autoscaling (#57)

* feat: Repository-wide RunnerDeployment Autoscaling

This adds `maxReplicas` and `minReplicas` to `RunnerDeploymentSpec`. If, and only if, both fields are set, the controller automatically computes and sets the desired `replicas` based on demand.

The number of demanded runner replicas is computed as `queued workflow runs + in_progress workflow runs` for the repository. Support for organizational runners is not included.
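The demanded count is then clamped to the `[minReplicas, maxReplicas]` range. A minimal sketch of that calculation (names are illustrative, not part of the commit):

```go
// desiredReplicas clamps the demanded count (queued + in_progress
// workflow runs) to the configured [min, max] range.
func desiredReplicas(queued, inProgress, min, max int) int {
	demanded := queued + inProgress
	if demanded < min {
		return min
	}
	if demanded > max {
		return max
	}
	return demanded
}
```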

Ref https://github.com/summerwind/actions-runner-controller/issues/10
Author: KUOKA Yusuke
Date: 2020-06-27 17:26:46 +09:00
Committed by: GitHub
Parent: 512cae68a1
Commit: 5bb2694349
12 changed files with 521 additions and 20 deletions


@@ -0,0 +1,92 @@
package controllers
import (
"context"
"fmt"
"strings"

"github.com/summerwind/actions-runner-controller/api/v1alpha1"
)
type NotSupported struct {
}
var _ error = NotSupported{}
func (e NotSupported) Error() string {
return "Autoscaling is currently supported only when spec.repository is set"
}
// determineDesiredReplicas queries GitHub for the repository's workflow runs
// and returns queued + in_progress, clamped to [minReplicas, maxReplicas].
func (r *RunnerDeploymentReconciler) determineDesiredReplicas(rd v1alpha1.RunnerDeployment) (*int, error) {
if rd.Spec.Replicas != nil {
return nil, fmt.Errorf("bug: determineDesiredReplicas should not be called for deplomeny with specific replicas")
} else if rd.Spec.MinReplicas == nil {
return nil, fmt.Errorf("runnerdeployment %s/%s is missing minReplicas", rd.Namespace, rd.Name)
} else if rd.Spec.MaxReplicas == nil {
return nil, fmt.Errorf("runnerdeployment %s/%s is missing maxReplicas", rd.Namespace, rd.Name)
}
repoID := rd.Spec.Template.Spec.Repository
if repoID == "" {
return nil, NotSupported{}
}
repo := strings.Split(repoID, "/")
if len(repo) != 2 {
return nil, fmt.Errorf("runnerdeployment %s/%s has invalid repository %q: expected the owner/name format", rd.Namespace, rd.Name, repoID)
}
user, repoName := repo[0], repo[1]
list, _, err := r.GitHubClient.Actions.ListRepositoryWorkflowRuns(context.TODO(), user, repoName, nil)
if err != nil {
return nil, err
}
var total, inProgress, queued, completed, unknown int
for _, run := range list.WorkflowRuns {
total++
// As of May 2020, the API reports only these three statuses.
// See the following for details:
// - https://developer.github.com/v3/actions/workflow-runs/#list-repository-workflow-runs
// - https://developer.github.com/v3/checks/runs/#create-a-check-run
switch run.GetStatus() {
case "completed":
completed++
case "in_progress":
inProgress++
case "queued":
queued++
default:
unknown++
}
}
minReplicas := *rd.Spec.MinReplicas
maxReplicas := *rd.Spec.MaxReplicas
necessaryReplicas := queued + inProgress
var desiredReplicas int
if necessaryReplicas < minReplicas {
desiredReplicas = minReplicas
} else if necessaryReplicas > maxReplicas {
desiredReplicas = maxReplicas
} else {
desiredReplicas = necessaryReplicas
}
// Note: rd is passed by value here, so mutating its status would be lost.
// The deployment's status is updated by the reconciler after the replica
// set is created.
r.Log.V(1).Info(
"Calculated desired replicas",
"computed_replicas_desired", desiredReplicas,
"spec_replicas_min", minReplicas,
"spec_replicas_max", maxReplicas,
"workflow_runs_completed", completed,
"workflow_runs_in_progress", inProgress,
"workflow_runs_queued", queued,
"workflow_runs_unknown", unknown,
)
return &desiredReplicas, nil
}
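One caveat: `ListRepositoryWorkflowRuns` is invoked with `nil` options, so only the first page of workflow runs (30 by default) is counted, which can undercount demand on a busy repository. A paginated tally could look roughly like this (a sketch against the underlying go-github client; the import version path is illustrative):

```go
package autoscaling

import (
	"context"

	"github.com/google/go-github/v33/github" // version path is illustrative
)

// countDemandedRuns tallies queued and in_progress workflow runs across
// every page of results, rather than only the first page.
func countDemandedRuns(ctx context.Context, c *github.Client, owner, repo string) (int, error) {
	opts := &github.ListWorkflowRunsOptions{
		ListOptions: github.ListOptions{PerPage: 100},
	}
	demanded := 0
	for {
		runs, res, err := c.Actions.ListRepositoryWorkflowRuns(ctx, owner, repo, opts)
		if err != nil {
			return 0, err
		}
		for _, run := range runs.WorkflowRuns {
			switch run.GetStatus() {
			case "queued", "in_progress":
				demanded++
			}
		}
		if res.NextPage == 0 {
			break
		}
		opts.Page = res.NextPage
	}
	return demanded, nil
}
```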


@@ -0,0 +1,199 @@
package controllers
import (
"fmt"
"net/http/httptest"
"net/url"
"testing"

"github.com/summerwind/actions-runner-controller/api/v1alpha1"
"github.com/summerwind/actions-runner-controller/github"
"github.com/summerwind/actions-runner-controller/github/fake"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
)
func newGithubClient(server *httptest.Server) *github.Client {
client, err := github.NewClientWithAccessToken("token")
if err != nil {
panic(err)
}
baseURL, err := url.Parse(server.URL + "/")
if err != nil {
panic(err)
}
client.Client.BaseURL = baseURL
return client
}
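Note that the helper rewrites the client's `BaseURL` to point at the `httptest` server, so every GitHub API call in the tests below is answered by the fake server; no network access is needed and the `"token"` access token is never validated.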
func TestDetermineDesiredReplicas_RepositoryRunner(t *testing.T) {
intPtr := func(v int) *int {
return &v
}
metav1Now := metav1.Now()
testcases := []struct {
repo string
org string
fixed *int
max *int
min *int
sReplicas *int
sTime *metav1.Time
workflowRuns string
want int
err string
}{
// 3 demanded, max at 3
{
repo: "test/valid",
min: intPtr(2),
max: intPtr(3),
workflowRuns: `{"total_count": 4, "workflow_runs":[{"status":"queued"}, {"status":"in_progress"}, {"status":"in_progress"}, {"status":"completed"}]}`,
want: 3,
},
// 2 demanded, max at 3, currently 3, delay scaling down due to grace period
{
repo: "test/valid",
min: intPtr(2),
max: intPtr(3),
sReplicas: intPtr(3),
sTime: &metav1Now,
workflowRuns: `{"total_count": 3, "workflow_runs":[{"status":"queued"}, {"status":"in_progress"}, {"status":"completed"}]}`,
want: 3,
},
// 3 demanded, max at 2
{
repo: "test/valid",
min: intPtr(2),
max: intPtr(2),
workflowRuns: `{"total_count": 4, "workflow_runs":[{"status":"queued"}, {"status":"in_progress"}, {"status":"in_progress"}, {"status":"completed"}]}`,
want: 2,
},
// 2 demanded, min at 2
{
repo: "test/valid",
min: intPtr(2),
max: intPtr(3),
workflowRuns: `{"total_count": 3, "workflow_runs":[{"status":"queued"}, {"status":"in_progress"}, {"status":"completed"}]}`,
want: 2,
},
// 1 demanded, min at 2
{
repo: "test/valid",
min: intPtr(2),
max: intPtr(3),
workflowRuns: `{"total_count": 2, "workflow_runs":[{"status":"queued"}, {"status":"completed"}]}`,
want: 2,
},
// 1 demanded, min at 2
{
repo: "test/valid",
min: intPtr(2),
max: intPtr(3),
workflowRuns: `{"total_count": 2, "workflow_runs":[{"status":"in_progress"}, {"status":"completed"}]}`,
want: 2,
},
// 1 demanded, min at 1
{
repo: "test/valid",
min: intPtr(1),
max: intPtr(3),
workflowRuns: `{"total_count": 2, "workflow_runs":[{"status":"queued"}, {"status":"completed"}]}`,
want: 1,
},
// 1 demanded, min at 1
{
repo: "test/valid",
min: intPtr(1),
max: intPtr(3),
workflowRuns: `{"total_count": 2, "workflow_runs":[{"status":"in_progress"}, {"status":"completed"}]}`,
want: 1,
},
// fixed at 3
{
repo: "test/valid",
fixed: intPtr(3),
want: 3,
},
// org runner, fixed at 3
{
org: "test",
fixed: intPtr(3),
want: 3,
},
// org runner, 1 demanded, min at 1
{
org: "test",
min: intPtr(1),
max: intPtr(3),
workflowRuns: `{"total_count": 2, "workflow_runs":[{"status":"in_progress"}, {"status":"completed"}]}`,
err: "Autoscaling is currently supported only when spec.repository is set",
},
}
for i := range testcases {
tc := testcases[i]
log := zap.New(func(o *zap.Options) {
o.Development = true
})
scheme := runtime.NewScheme()
_ = clientgoscheme.AddToScheme(scheme)
_ = v1alpha1.AddToScheme(scheme)
t.Run(fmt.Sprintf("case %d", i), func(t *testing.T) {
server := fake.NewServer(fake.WithListRepositoryWorkflowRunsResponse(200, tc.workflowRuns))
defer server.Close()
client := newGithubClient(server)
r := &RunnerDeploymentReconciler{
Log: log,
GitHubClient: client,
Scheme: scheme,
}
rd := v1alpha1.RunnerDeployment{
TypeMeta: metav1.TypeMeta{},
Spec: v1alpha1.RunnerDeploymentSpec{
Template: v1alpha1.RunnerTemplate{
Spec: v1alpha1.RunnerSpec{
Repository: tc.repo,
},
},
Replicas: tc.fixed,
MaxReplicas: tc.max,
MinReplicas: tc.min,
},
Status: v1alpha1.RunnerDeploymentStatus{
Replicas: tc.sReplicas,
LastSuccessfulScaleOutTime: tc.sTime,
},
}
rs, err := r.newRunnerReplicaSetWithAutoscaling(rd)
if err != nil {
if tc.err == "" {
t.Fatalf("unexpected error: expected none, got %v", err)
} else if err.Error() != tc.err {
t.Fatalf("unexpected error: expected %v, got %v", tc.err, err)
}
return
}
got := rs.Spec.Replicas
if got == nil {
t.Fatalf("unexpected value of rs.Spec.Replicas: nil")
}
if *got != tc.want {
t.Errorf("%d: incorrect desired replicas: want %d, got %d", i, tc.want, *got)
}
})
}
}


@@ -20,10 +20,12 @@ import (
"context"
"fmt"
"hash/fnv"
"k8s.io/apimachinery/pkg/types"
"sort"
"time"
"github.com/summerwind/actions-runner-controller/github"
"k8s.io/apimachinery/pkg/types"
"github.com/davecgh/go-spew/spew"
"github.com/go-logr/logr"
"k8s.io/apimachinery/pkg/runtime"
@@ -47,9 +49,10 @@ const (
// RunnerDeploymentReconciler reconciles a RunnerDeployment object
type RunnerDeploymentReconciler struct {
client.Client
GitHubClient *github.Client
Log logr.Logger
Recorder record.EventRecorder
Scheme *runtime.Scheme
}
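Because the reconciler now carries a `GitHubClient`, it must be injected wherever the controller is constructed. A sketch of the wiring in a typical kubebuilder-style `main.go` (`NewClientWithAccessToken` and the reconciler fields are from this commit; the env var and log names are illustrative):

```go
package main

import (
	"os"

	"github.com/summerwind/actions-runner-controller/controllers"
	"github.com/summerwind/actions-runner-controller/github"
	ctrl "sigs.k8s.io/controller-runtime"
)

var setupLog = ctrl.Log.WithName("setup")

func main() {
	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{})
	if err != nil {
		setupLog.Error(err, "unable to start manager")
		os.Exit(1)
	}

	// The reconciler now needs a GitHub client to query workflow runs.
	ghClient, err := github.NewClientWithAccessToken(os.Getenv("GITHUB_TOKEN"))
	if err != nil {
		setupLog.Error(err, "unable to create GitHub client")
		os.Exit(1)
	}

	if err := (&controllers.RunnerDeploymentReconciler{
		Client:       mgr.GetClient(),
		GitHubClient: ghClient,
		Log:          ctrl.Log.WithName("controllers").WithName("RunnerDeployment"),
		Recorder:     mgr.GetEventRecorderFor("runnerdeployment-controller"),
		Scheme:       mgr.GetScheme(),
	}).SetupWithManager(mgr); err != nil {
		setupLog.Error(err, "unable to create controller", "controller", "RunnerDeployment")
		os.Exit(1)
	}

	if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
		setupLog.Error(err, "problem running manager")
		os.Exit(1)
	}
}
```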
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerdeployments,verbs=get;list;watch;create;update;patch;delete
@@ -94,15 +97,19 @@ func (r *RunnerDeploymentReconciler) Reconcile(req ctrl.Request) (ctrl.Result, e
oldSets = myRunnerReplicaSets[1:]
}
desiredRS, err := r.newRunnerReplicaSetWithAutoscaling(rd)
if err != nil {
if _, ok := err.(NotSupported); ok {
r.Recorder.Event(&rd, corev1.EventTypeNormal, "RunnerReplicaSetAutoScaleNotSupported", err.Error())
}
log.Error(err, "Could not create runnerreplicaset")
return ctrl.Result{}, err
}
if newestSet == nil {
if err := r.Client.Create(ctx, desiredRS); err != nil {
log.Error(err, "Failed to create runnerreplicaset resource")
return ctrl.Result{}, err
@@ -118,7 +125,7 @@ func (r *RunnerDeploymentReconciler) Reconcile(req ctrl.Request) (ctrl.Result, e
return ctrl.Result{}, nil
}
desiredTemplateHash, ok := getTemplateHash(desiredRS)
if !ok {
log.Info("Failed to get template hash of desired runnerreplicaset resource. It must be in an invalid state. Please manually delete the runnerreplicaset so that it is recreated")
@@ -126,7 +133,7 @@ func (r *RunnerDeploymentReconciler) Reconcile(req ctrl.Request) (ctrl.Result, e
}
if newestTemplateHash != desiredTemplateHash {
if err := r.Client.Create(ctx, desiredRS); err != nil {
log.Error(err, "Failed to create runnerreplicaset resource")
return ctrl.Result{}, err
@@ -184,6 +191,23 @@ func (r *RunnerDeploymentReconciler) Reconcile(req ctrl.Request) (ctrl.Result, e
}
}
if rd.Spec.Replicas == nil && desiredRS.Spec.Replicas != nil {
updated := rd.DeepCopy()
updated.Status.Replicas = desiredRS.Spec.Replicas
if (rd.Status.Replicas == nil && *desiredRS.Spec.Replicas > 1) ||
(rd.Status.Replicas != nil && *desiredRS.Spec.Replicas > *rd.Status.Replicas) {
updated.Status.LastSuccessfulScaleOutTime = &metav1.Time{Time: time.Now()}
}
if err := r.Status().Update(ctx, updated); err != nil {
log.Error(err, "Failed to update runnerdeployment status")
return ctrl.Result{}, err
}
}
return ctrl.Result{}, nil
}
@@ -241,7 +265,7 @@ func CloneAndAddLabel(labels map[string]string, labelKey, labelValue string) map
return newLabels
}
func (r *RunnerDeploymentReconciler) newRunnerReplicaSet(rd v1alpha1.RunnerDeployment, computedReplicas *int) (*v1alpha1.RunnerReplicaSet, error) {
newRSTemplate := *rd.Spec.Template.DeepCopy()
templateHash := ComputeHash(&newRSTemplate)
// Add template hash label to selector.
@@ -262,11 +286,15 @@ func (r *RunnerDeploymentReconciler) newRunnerReplicaSet(rd v1alpha1.RunnerDeplo
},
}
if computedReplicas != nil {
rs.Spec.Replicas = computedReplicas
}
if err := ctrl.SetControllerReference(&rd, &rs, r.Scheme); err != nil {
return &rs, err
}
return &rs, nil
}
func (r *RunnerDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error {
@@ -293,3 +321,36 @@ func (r *RunnerDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error {
Owns(&v1alpha1.RunnerReplicaSet{}).
Complete(r)
}
// newRunnerReplicaSetWithAutoscaling computes the desired replica count for
// deployments without a fixed spec.replicas, holding the current count for a
// grace period after the last scale-out to avoid flapping.
func (r *RunnerDeploymentReconciler) newRunnerReplicaSetWithAutoscaling(rd v1alpha1.RunnerDeployment) (*v1alpha1.RunnerReplicaSet, error) {
var computedReplicas *int
if rd.Spec.Replicas == nil {
replicas, err := r.determineDesiredReplicas(rd)
if err != nil {
return nil, err
}
var scaleDownDelay time.Duration
if rd.Spec.ScaleDownDelaySecondsAfterScaleUp != nil {
scaleDownDelay = time.Duration(*rd.Spec.ScaleDownDelaySecondsAfterScaleUp) * time.Second
} else {
scaleDownDelay = 10 * time.Minute
}
now := time.Now()
if rd.Status.Replicas == nil ||
*rd.Status.Replicas < *replicas ||
rd.Status.LastSuccessfulScaleOutTime == nil ||
rd.Status.LastSuccessfulScaleOutTime.Add(scaleDownDelay).Before(now) {
computedReplicas = replicas
} else {
computedReplicas = rd.Status.Replicas
}
}
return r.newRunnerReplicaSet(rd, computedReplicas)
}
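To make the grace period concrete: with the default `scaleDownDelay` of 10 minutes, a deployment that scaled out to 3 replicas 4 minutes ago holds at 3 even if only 2 replicas are demanded now, and drops to 2 once the 10 minutes have elapsed. The same condition as a self-contained sketch:

```go
package main

import (
	"fmt"
	"time"
)

// holdCurrent mirrors the grace-period condition above: keep the current
// replica count when it is at least the computed one and the scale-down
// delay since the last successful scale-out has not yet elapsed.
func holdCurrent(current, computed int, lastScaleOut time.Time, delay time.Duration, now time.Time) bool {
	return current >= computed && !lastScaleOut.Add(delay).Before(now)
}

func main() {
	now := time.Now()
	fmt.Println(holdCurrent(3, 2, now.Add(-4*time.Minute), 10*time.Minute, now))  // true: hold at 3
	fmt.Println(holdCurrent(3, 2, now.Add(-11*time.Minute), 10*time.Minute, now)) // false: drop to 2
}
```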