feat: Workflow job based ephemeral runner scaling (#721)

This adds support for two upcoming enhancements on the GitHub side of self-hosted runners: ephemeral runners and `workflow_job` events. You can't use these yet.

**These features are not yet generally available to all GitHub users**. Please take this pull request as preparation to make them available to actions-runner-controller users as soon as possible after GitHub releases the necessary features on their end.

**Ephemeral runners**:

The former, ephemeral runners, is basically a reliable alternative to `--once`, which we've been using whenever you enabled `ephemeral: true` (the default in actions-runner-controller).

`--once` has been suffering from a race issue (#466). `--ephemeral` fixes that.

To enable ephemeral runners with `actions/runner`, you pass `--ephemeral` to `config.sh`. This updated version of `actions-runner-controller` does that for you, using `--ephemeral` instead of `--once` when you set `RUNNER_FEATURE_FLAG_EPHEMERAL=true`.
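For illustration, here is a minimal Go sketch of that flag selection; the `singleUseFlag` helper is hypothetical and not the controller's actual entrypoint code:

```go
package main

import (
	"fmt"
	"os"
	"strconv"
)

// singleUseFlag picks the flag that makes a runner single-use. When the
// RUNNER_FEATURE_FLAG_EPHEMERAL env var is truthy, the new --ephemeral mode
// (which fixes the race described in #466) is preferred over the legacy --once.
func singleUseFlag() string {
	if v, err := strconv.ParseBool(os.Getenv("RUNNER_FEATURE_FLAG_EPHEMERAL")); err == nil && v {
		return "--ephemeral"
	}
	return "--once"
}

func main() {
	fmt.Printf("registering the runner with %s\n", singleUseFlag())
}
```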

Please read the section `Ephemeral Runners` in the updated version of our README for more information.

Note that ephemeral runners are not released on GitHub yet, and `RUNNER_FEATURE_FLAG_EPHEMERAL=true` won't work at all until the feature is released on GitHub. Stay tuned for an announcement from GitHub!

**`workflow_job` events**:

`workflow_job` is the additional webhook event that corresponds to each GitHub Actions workflow job run. It gives `actions-runner-controller` a solid foundation for improving our webhook-based autoscaling.
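For reference, consuming `workflow_job` deliveries with go-github looks roughly like the standalone sketch below; the module version in the import path and the secret handling are assumptions, not necessarily what actions-runner-controller ships:

```go
package main

import (
	"log"
	"net/http"
	"os"

	"github.com/google/go-github/v37/github"
)

func handleWebhook(w http.ResponseWriter, r *http.Request) {
	// Verify the delivery against the webhook secret before trusting it.
	payload, err := github.ValidatePayload(r, []byte(os.Getenv("WEBHOOK_SECRET")))
	if err != nil {
		http.Error(w, "invalid signature", http.StatusUnauthorized)
		return
	}
	event, err := github.ParseWebHook(github.WebHookType(r), payload)
	if err != nil {
		http.Error(w, "unparsable payload", http.StatusBadRequest)
		return
	}
	if e, ok := event.(*github.WorkflowJobEvent); ok {
		if job := e.GetWorkflowJob(); job != nil {
			// The requested labels are available directly in the payload,
			// which is what makes label-based HRA selection possible
			// without extra API calls.
			log.Printf("action=%s labels=%v", e.GetAction(), job.Labels)
		}
	}
	w.WriteHeader(http.StatusOK)
}

func main() {
	http.HandleFunc("/webhook", handleWebhook)
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```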

Formerly, we relied on webhook events like `check_run` for autoscaling. However, as none of our supported events included `labels`, you had to configure each HRA to match only the relevant `check_run` events, which wasn't trivial.

In contrast, a `workflow_job` event payload contains the `labels` of the runners requested. `actions-runner-controller` can automatically decide which HRA to scale by filtering the corresponding RunnerDeployment by the `labels` included in the webhook payload. So all you need for webhook-based autoscaling is to enable `workflow_job` events on GitHub and expose actions-runner-controller's webhook server to the internet.
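The matching rule itself is small. Below is a condensed, standalone sketch of the subset check applied by the new `getJobScaleTarget` code in the diff further down; the `runnerMatchesJob` helper is illustrative only:

```go
package main

import "fmt"

// runnerMatchesJob reports whether runners carrying runnerLabels can serve a
// job requesting jobLabels: a job that only asks for "self-hosted" matches any
// target, otherwise every requested label must be present on the runners.
func runnerMatchesJob(runnerLabels, jobLabels []string) bool {
	if len(jobLabels) == 1 && jobLabels[0] == "self-hosted" {
		return true
	}
	has := make(map[string]bool, len(runnerLabels))
	for _, l := range runnerLabels {
		has[l] = true
	}
	for _, l := range jobLabels {
		if !has[l] {
			return false
		}
	}
	return true
}

func main() {
	fmt.Println(runnerMatchesJob([]string{"self-hosted", "linux", "gpu"}, []string{"self-hosted", "gpu"})) // true
	fmt.Println(runnerMatchesJob([]string{"self-hosted", "linux"}, []string{"self-hosted", "windows"}))    // false
}
```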

Note that the current implementation of `workflow_job` support works in two ways: increment and decrement. An increment happens when the webhook server receives a `workflow_job` event with `queued` status, and a decrement happens when it receives one with `completed` status. The latter makes scaling down faster, so you waste less money than before. You still don't suffer from flapping, as a scale-down is still subject to `scaleDownDelaySecondsAfterScaleOut`.
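The increment/decrement is realized through the HRA's capacity reservations. The standalone sketch below mirrors what the `tryScale` change in the diff does, with the `capacityReservation` type and `applyScale` helper as simplified stand-ins:

```go
package main

import (
	"fmt"
	"time"
)

type capacityReservation struct {
	ExpirationTime time.Time
	Replicas       int
}

// applyScale appends a reservation for a positive amount, and for a negative
// amount cancels the oldest reservation that exactly offsets it, so the
// desired replica count decreases by the same size.
func applyScale(reservations []capacityReservation, amount int, ttl time.Duration) []capacityReservation {
	if amount > 0 {
		return append(reservations, capacityReservation{
			ExpirationTime: time.Now().Add(ttl),
			Replicas:       amount,
		})
	}
	var out []capacityReservation
	found := false
	for _, r := range reservations {
		if !found && r.Replicas+amount == 0 {
			found = true
			continue
		}
		out = append(out, r)
	}
	return out
}

func main() {
	rs := applyScale(nil, 1, 10*time.Minute) // workflow_job "queued"
	rs = applyScale(rs, -1, 0)               // workflow_job "completed"
	fmt.Println(len(rs))                     // 0
}
```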

Please read the section "Example 3: Scale on each `workflow_job` event" in the updated version of our README for more information on its usage.
Yusuke Kuoka, 2021-08-11 09:52:04 +09:00 (committed by GitHub)
parent d528d18211, commit fabead8c8e
9 changed files with 338 additions and 16 deletions

@@ -183,6 +183,45 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) Handle(w http.Respons
"action", e.GetAction(),
)
}
case *gogithub.WorkflowJobEvent:
if workflowJob := e.GetWorkflowJob(); workflowJob != nil {
log = log.WithValues(
"workflowJob.status", workflowJob.GetStatus(),
"workflowJob.labels", workflowJob.Labels,
"repository.name", e.Repo.GetName(),
"repository.owner.login", e.Repo.Owner.GetLogin(),
"repository.owner.type", e.Repo.Owner.GetType(),
"action", e.GetAction(),
)
}
labels := e.WorkflowJob.Labels
switch e.GetAction() {
case "queued", "completed":
target, err = autoscaler.getJobScaleUpTargetForRepoOrOrg(
context.TODO(),
log,
e.Repo.GetName(),
e.Repo.Owner.GetLogin(),
e.Repo.Owner.GetType(),
labels,
)
if target != nil {
if e.GetAction() == "queued" {
target.Amount = 1
} else if e.GetAction() == "completed" {
// A negative amount is processed in the tryScale func as a scale-down request
// that erases the oldest CapacityReservation with the same amount.
// If the first CapacityReservation was created with Replicas=1, this negative scale target erases it,
// so that the resulting desired replica count decreases by 1.
target.Amount = -1
}
}
default:
}
case *gogithub.PingEvent:
ok = true
@@ -227,7 +266,7 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) Handle(w http.Respons
return
}
if err := autoscaler.tryScaleUp(context.TODO(), target); err != nil {
if err := autoscaler.tryScale(context.TODO(), target); err != nil {
log.Error(err, "could not scale up")
return
@@ -237,7 +276,7 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) Handle(w http.Respons
w.WriteHeader(http.StatusOK)
msg := fmt.Sprintf("scaled %s by 1", target.Name)
msg := fmt.Sprintf("scaled %s by %d", target.Name, target.Amount)
autoscaler.Log.Info(msg)
@@ -394,7 +433,137 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) getScaleUpTarget(ctx
return nil, nil
}
func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) tryScaleUp(ctx context.Context, target *ScaleTarget) error {
func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) getJobScaleUpTargetForRepoOrOrg(ctx context.Context, log logr.Logger, repo, owner, ownerType string, labels []string) (*ScaleTarget, error) {
repositoryRunnerKey := owner + "/" + repo
if target, err := autoscaler.getJobScaleTarget(ctx, repositoryRunnerKey, labels); err != nil {
log.Info("finding repository-wide runner", "repository", repositoryRunnerKey)
return nil, err
} else if target != nil {
log.Info("job scale up target is repository-wide runners", "repository", repo)
return target, nil
}
if ownerType == "User" {
log.V(1).Info("no repository runner found", "organization", owner)
return nil, nil
}
if target, err := autoscaler.getJobScaleTarget(ctx, owner, labels); err != nil {
log.Info("finding organizational runner", "organization", owner)
return nil, err
} else if target != nil {
log.Info("job scale up target is organizational runners", "organization", owner)
return target, nil
} else {
log.V(1).Info("no repository runner or organizational runner found",
"repository", repositoryRunnerKey,
"organization", owner,
)
}
return nil, nil
}
func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) getJobScaleTarget(ctx context.Context, name string, labels []string) (*ScaleTarget, error) {
hras, err := autoscaler.findHRAsByKey(ctx, name)
if err != nil {
return nil, err
}
autoscaler.Log.V(1).Info(fmt.Sprintf("Found %d HRAs by key", len(hras)), "key", name)
HRA:
for _, hra := range hras {
if !hra.ObjectMeta.DeletionTimestamp.IsZero() {
continue
}
if len(hra.Spec.ScaleUpTriggers) > 1 {
autoscaler.Log.V(1).Info("Skipping this HRA as it has too many ScaleUpTriggers to be used in workflow_job based scaling", "hra", hra.Name)
continue
}
var duration metav1.Duration
if len(hra.Spec.ScaleUpTriggers) > 0 {
duration = hra.Spec.ScaleUpTriggers[0].Duration
}
if duration.Duration <= 0 {
// Try to release the reserved capacity after at least 10 minutes by default,
// so that we don't end up with reserved capacity remaining forever in case GitHub somehow stops sending us "completed" workflow_job events.
// GitHub usually sends them, but nothing is 100% guaranteed, e.g. when something goes wrong on GitHub :)
// We should probably make this configurable via the custom resource in the future.
duration.Duration = 10 * time.Minute
}
switch hra.Spec.ScaleTargetRef.Kind {
case "RunnerSet":
var rs v1alpha1.RunnerSet
if err := autoscaler.Client.Get(context.Background(), types.NamespacedName{Namespace: hra.Namespace, Name: hra.Spec.ScaleTargetRef.Name}, &rs); err != nil {
return nil, err
}
if len(labels) == 1 && labels[0] == "self-hosted" {
return &ScaleTarget{HorizontalRunnerAutoscaler: hra, ScaleUpTrigger: v1alpha1.ScaleUpTrigger{Duration: duration}}, nil
}
// Ensure that the RunnerSet-managed runners have all the labels requested by the workflow_job.
for _, l := range labels {
var matched bool
for _, l2 := range rs.Spec.Labels {
if l == l2 {
matched = true
break
}
}
if !matched {
continue HRA
}
}
return &ScaleTarget{HorizontalRunnerAutoscaler: hra, ScaleUpTrigger: v1alpha1.ScaleUpTrigger{Duration: duration}}, nil
case "RunnerDeployment", "":
var rd v1alpha1.RunnerDeployment
if err := autoscaler.Client.Get(context.Background(), types.NamespacedName{Namespace: hra.Namespace, Name: hra.Spec.ScaleTargetRef.Name}, &rd); err != nil {
return nil, err
}
if len(labels) == 1 && labels[0] == "self-hosted" {
return &ScaleTarget{HorizontalRunnerAutoscaler: hra, ScaleUpTrigger: v1alpha1.ScaleUpTrigger{Duration: duration}}, nil
}
// Ensure that the RunnerDeployment-managed runners have all the labels requested by the workflow_job.
for _, l := range labels {
var matched bool
for _, l2 := range rd.Spec.Template.Labels {
if l == l2 {
matched = true
break
}
}
if !matched {
continue HRA
}
}
return &ScaleTarget{HorizontalRunnerAutoscaler: hra, ScaleUpTrigger: v1alpha1.ScaleUpTrigger{Duration: duration}}, nil
default:
return nil, fmt.Errorf("unsupported scaleTargetRef.kind: %v", hra.Spec.ScaleTargetRef.Kind)
}
}
return nil, nil
}
func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) tryScale(ctx context.Context, target *ScaleTarget) error {
if target == nil {
return nil
}
@@ -403,16 +572,38 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) tryScaleUp(ctx contex
amount := 1
if target.ScaleUpTrigger.Amount > 0 {
if target.ScaleUpTrigger.Amount != 0 {
amount = target.ScaleUpTrigger.Amount
}
capacityReservations := getValidCapacityReservations(copy)
copy.Spec.CapacityReservations = append(capacityReservations, v1alpha1.CapacityReservation{
ExpirationTime: metav1.Time{Time: time.Now().Add(target.ScaleUpTrigger.Duration.Duration)},
Replicas: amount,
})
if amount > 0 {
copy.Spec.CapacityReservations = append(capacityReservations, v1alpha1.CapacityReservation{
ExpirationTime: metav1.Time{Time: time.Now().Add(target.ScaleUpTrigger.Duration.Duration)},
Replicas: amount,
})
} else if amount < 0 {
var reservations []v1alpha1.CapacityReservation
var found bool
for _, r := range capacityReservations {
if !found && r.Replicas+amount == 0 {
found = true
} else {
reservations = append(reservations, r)
}
}
copy.Spec.CapacityReservations = reservations
}
autoscaler.Log.Info(
"Patching hra for capacityReservations update",
"before", target.HorizontalRunnerAutoscaler.Spec.CapacityReservations,
"after", copy.Spec.CapacityReservations,
)
if err := autoscaler.Client.Patch(ctx, copy, client.MergeFrom(&target.HorizontalRunnerAutoscaler)); err != nil {
return fmt.Errorf("patching horizontalrunnerautoscaler to add capacity reservation: %w", err)
@@ -450,13 +641,26 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) SetupWithManager(mgr
return nil
}
var rd v1alpha1.RunnerDeployment
switch hra.Spec.ScaleTargetRef.Kind {
case "", "RunnerDeployment":
var rd v1alpha1.RunnerDeployment
if err := autoscaler.Client.Get(context.Background(), types.NamespacedName{Namespace: hra.Namespace, Name: hra.Spec.ScaleTargetRef.Name}, &rd); err != nil {
return nil
if err := autoscaler.Client.Get(context.Background(), types.NamespacedName{Namespace: hra.Namespace, Name: hra.Spec.ScaleTargetRef.Name}, &rd); err != nil {
return nil
}
return []string{rd.Spec.Template.Spec.Repository, rd.Spec.Template.Spec.Organization}
case "RunnerSet":
var rs v1alpha1.RunnerSet
if err := autoscaler.Client.Get(context.Background(), types.NamespacedName{Namespace: hra.Namespace, Name: hra.Spec.ScaleTargetRef.Name}, &rs); err != nil {
return nil
}
return []string{rs.Spec.Repository, rs.Spec.Organization}
}
return []string{rd.Spec.Template.Spec.Repository, rd.Spec.Template.Spec.Organization}
return nil
}); err != nil {
return err
}