feat: Workflow job based ephemeral runner scaling (#721)
This adds support for two upcoming enhancements on the GitHub side of self-hosted runners: ephemeral runners and `workflow_job` events. You can't use these yet. **These features are not yet generally available to all GitHub users**. Please take this pull request as preparation to make them available to actions-runner-controller users as soon as possible after GitHub releases the necessary features on their end.

**Ephemeral runners**: The former, ephemeral runners, is basically the reliable alternative to `--once`, which we've been using when you enabled `ephemeral: true` (the default in actions-runner-controller). `--once` has been suffering from the race issue #466; `--ephemeral` fixes that. To enable ephemeral runners with `actions/runner`, you pass `--ephemeral` to `config.sh`. This updated version of actions-runner-controller does that for you, using `--ephemeral` instead of `--once` when you set `RUNNER_FEATURE_FLAG_EPHEMERAL=true`. Please read the `Ephemeral Runners` section in the updated version of our README for more information. Note that ephemeral runners are not released on GitHub yet, and `RUNNER_FEATURE_FLAG_EPHEMERAL=true` won't work at all until the feature is released on GitHub. Stay tuned for an announcement from GitHub!

**`workflow_job` events**: `workflow_job` is the additional webhook event that corresponds to each GitHub Actions workflow job run. It gives actions-runner-controller a solid foundation for improving our webhook-based autoscaling. Formerly, we relied on webhook events like `check_run` for autoscaling, but since none of the supported events included `labels`, you had to configure each HRA to match only the relevant `check_run` events, which wasn't trivial. In contrast, a `workflow_job` event payload contains the `labels` of the runners requested, so actions-runner-controller can automatically decide which HRA to scale by filtering the corresponding RunnerDeployment by the `labels` included in the webhook payload. All you need for webhook-based autoscaling is to enable `workflow_job` events on GitHub and expose actions-runner-controller's webhook server to the internet. Note that the current implementation of `workflow_job` support works in two directions: increment and decrement. An increment happens when the webhook server receives a `workflow_job` event with the `queued` status, and a decrement happens when it receives one with the `completed` status. The latter makes scaling down faster so that you waste less money on idle runners, and you still don't suffer from flapping, as a scale-down remains subject to `scaleDownDelaySecondsAfterScaleOut`. Please read the section "Example 3: Scale on each `workflow_job` event" in the updated version of our README for more information on its usage.
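For reference, the per-event scaling decision described above reduces to a signed amount applied to the matched HorizontalRunnerAutoscaler: `queued` reserves one runner's worth of capacity and `completed` releases one. Below is a minimal standalone sketch of that rule, simplified from the handler changes in the diff further down (the real code attaches the amount to a `ScaleTarget` rather than returning an int):

```go
package main

import "fmt"

// scaleAmountFor maps a workflow_job action to a capacity delta:
// a queued job reserves one extra runner, a completed job releases one,
// and any other action (e.g. "in_progress") leaves capacity untouched.
func scaleAmountFor(action string) int {
	switch action {
	case "queued":
		return 1
	case "completed":
		return -1
	default:
		return 0
	}
}

func main() {
	for _, action := range []string{"queued", "in_progress", "completed"} {
		fmt.Printf("%-12s -> %+d\n", action, scaleAmountFor(action))
	}
}
```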
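The HRA selection itself is a label-containment check: a RunnerDeployment (or RunnerSet) is a candidate only if it carries every label the job requested, and a job that asks for nothing beyond `self-hosted` matches any target. Here is a standalone sketch of that rule with a hypothetical helper name; in the actual diff below the check is inlined in `getJobScaleTarget`:

```go
package main

import "fmt"

// matchesRequestedLabels reports whether runners advertising runnerLabels can
// serve a workflow_job that requested jobLabels: every requested label must be
// present, and a job asking only for "self-hosted" matches any target.
func matchesRequestedLabels(jobLabels, runnerLabels []string) bool {
	if len(jobLabels) == 1 && jobLabels[0] == "self-hosted" {
		return true
	}

	available := map[string]bool{}
	for _, l := range runnerLabels {
		available[l] = true
	}

	for _, l := range jobLabels {
		if !available[l] {
			return false
		}
	}

	return true
}

func main() {
	fmt.Println(matchesRequestedLabels([]string{"self-hosted", "linux", "gpu"}, []string{"linux", "x64"})) // false: not every requested label is present
	fmt.Println(matchesRequestedLabels([]string{"self-hosted"}, []string{"linux", "x64"}))                 // true: a bare self-hosted request matches anything
}
```

The diff below wires this up in the webhook server: the `workflow_job` handler, the repository-then-organization target lookup, the label check for both RunnerDeployment and RunnerSet, and the capacity-reservation based scale up and down.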
@@ -183,6 +183,45 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) Handle(w http.Respons
                 "action", e.GetAction(),
             )
         }
+    case *gogithub.WorkflowJobEvent:
+        if workflowJob := e.GetWorkflowJob(); workflowJob != nil {
+            log = log.WithValues(
+                "workflowJob.status", workflowJob.GetStatus(),
+                "workflowJob.labels", workflowJob.Labels,
+                "repository.name", e.Repo.GetName(),
+                "repository.owner.login", e.Repo.Owner.GetLogin(),
+                "repository.owner.type", e.Repo.Owner.GetType(),
+                "action", e.GetAction(),
+            )
+        }
+
+        labels := e.WorkflowJob.Labels
+
+        switch e.GetAction() {
+        case "queued", "completed":
+            target, err = autoscaler.getJobScaleUpTargetForRepoOrOrg(
+                context.TODO(),
+                log,
+                e.Repo.GetName(),
+                e.Repo.Owner.GetLogin(),
+                e.Repo.Owner.GetType(),
+                labels,
+            )
+
+            if target != nil {
+                if e.GetAction() == "queued" {
+                    target.Amount = 1
+                } else if e.GetAction() == "completed" {
+                    // A negative amount is processed in the tryScale func as a scale-down request
+                    // that erases the oldest CapacityReservation with the same amount.
+                    // If the first CapacityReservation was with Replicas=1, this negative scale target erases it,
+                    // so that the resulting desired replicas decrease by 1.
+                    target.Amount = -1
+                }
+            }
+        default:
+
+        }
     case *gogithub.PingEvent:
         ok = true

@@ -227,7 +266,7 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) Handle(w http.Respons
         return
     }

-    if err := autoscaler.tryScaleUp(context.TODO(), target); err != nil {
+    if err := autoscaler.tryScale(context.TODO(), target); err != nil {
         log.Error(err, "could not scale up")

         return
@@ -237,7 +276,7 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) Handle(w http.Respons

     w.WriteHeader(http.StatusOK)

-    msg := fmt.Sprintf("scaled %s by 1", target.Name)
+    msg := fmt.Sprintf("scaled %s by %d", target.Name, target.Amount)

     autoscaler.Log.Info(msg)

@@ -394,7 +433,137 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) getScaleUpTarget(ctx
     return nil, nil
 }

-func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) tryScaleUp(ctx context.Context, target *ScaleTarget) error {
+func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) getJobScaleUpTargetForRepoOrOrg(ctx context.Context, log logr.Logger, repo, owner, ownerType string, labels []string) (*ScaleTarget, error) {
+    repositoryRunnerKey := owner + "/" + repo
+
+    if target, err := autoscaler.getJobScaleTarget(ctx, repositoryRunnerKey, labels); err != nil {
+        log.Info("finding repository-wide runner", "repository", repositoryRunnerKey)
+        return nil, err
+    } else if target != nil {
+        log.Info("job scale up target is repository-wide runners", "repository", repo)
+        return target, nil
+    }
+
+    if ownerType == "User" {
+        log.V(1).Info("no repository runner found", "organization", owner)
+
+        return nil, nil
+    }
+
+    if target, err := autoscaler.getJobScaleTarget(ctx, owner, labels); err != nil {
+        log.Info("finding organizational runner", "organization", owner)
+        return nil, err
+    } else if target != nil {
+        log.Info("job scale up target is organizational runners", "organization", owner)
+        return target, nil
+    } else {
+        log.V(1).Info("no repository runner or organizational runner found",
+            "repository", repositoryRunnerKey,
+            "organization", owner,
+        )
+    }
+
+    return nil, nil
+}
+
+func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) getJobScaleTarget(ctx context.Context, name string, labels []string) (*ScaleTarget, error) {
+    hras, err := autoscaler.findHRAsByKey(ctx, name)
+    if err != nil {
+        return nil, err
+    }
+
+    autoscaler.Log.V(1).Info(fmt.Sprintf("Found %d HRAs by key", len(hras)), "key", name)
+
+HRA:
+    for _, hra := range hras {
+        if !hra.ObjectMeta.DeletionTimestamp.IsZero() {
+            continue
+        }
+
+        if len(hra.Spec.ScaleUpTriggers) > 1 {
+            autoscaler.Log.V(1).Info("Skipping this HRA as it has too many ScaleUpTriggers to be used in workflow_job based scaling", "hra", hra.Name)
+
+            continue
+        }
+
+        var duration metav1.Duration
+
+        if len(hra.Spec.ScaleUpTriggers) > 0 {
+            duration = hra.Spec.ScaleUpTriggers[0].Duration
+        }
+
+        if duration.Duration <= 0 {
+            // Try to release the reserved capacity after at least 10 minutes by default, so that
+            // we don't end up with reserved capacity remaining forever in case GitHub somehow stops sending us "completed" workflow_job events.
+            // GitHub usually sends them, but nothing is 100% guaranteed, e.g. in case something goes wrong on GitHub :)
+            // We should probably make this configurable via the custom resource in the future.
+            duration.Duration = 10 * time.Minute
+        }
+
+        switch hra.Spec.ScaleTargetRef.Kind {
+        case "RunnerSet":
+            var rs v1alpha1.RunnerSet
+
+            if err := autoscaler.Client.Get(context.Background(), types.NamespacedName{Namespace: hra.Namespace, Name: hra.Spec.ScaleTargetRef.Name}, &rs); err != nil {
+                return nil, err
+            }
+
+            if len(labels) == 1 && labels[0] == "self-hosted" {
+                return &ScaleTarget{HorizontalRunnerAutoscaler: hra, ScaleUpTrigger: v1alpha1.ScaleUpTrigger{Duration: duration}}, nil
+            }
+
+            // Ensure that the RunnerSet-managed runners have all the labels requested by the workflow_job.
+            for _, l := range labels {
+                var matched bool
+                for _, l2 := range rs.Spec.Labels {
+                    if l == l2 {
+                        matched = true
+                        break
+                    }
+                }

+                if !matched {
+                    continue HRA
+                }
+            }
+
+            return &ScaleTarget{HorizontalRunnerAutoscaler: hra, ScaleUpTrigger: v1alpha1.ScaleUpTrigger{Duration: duration}}, nil
+        case "RunnerDeployment", "":
+            var rd v1alpha1.RunnerDeployment
+
+            if err := autoscaler.Client.Get(context.Background(), types.NamespacedName{Namespace: hra.Namespace, Name: hra.Spec.ScaleTargetRef.Name}, &rd); err != nil {
+                return nil, err
+            }
+
+            if len(labels) == 1 && labels[0] == "self-hosted" {
+                return &ScaleTarget{HorizontalRunnerAutoscaler: hra, ScaleUpTrigger: v1alpha1.ScaleUpTrigger{Duration: duration}}, nil
+            }
+
+            // Ensure that the RunnerDeployment-managed runners have all the labels requested by the workflow_job.
+            for _, l := range labels {
+                var matched bool
+                for _, l2 := range rd.Spec.Template.Labels {
+                    if l == l2 {
+                        matched = true
+                        break
+                    }
+                }
+
+                if !matched {
+                    continue HRA
+                }
+            }
+
+            return &ScaleTarget{HorizontalRunnerAutoscaler: hra, ScaleUpTrigger: v1alpha1.ScaleUpTrigger{Duration: duration}}, nil
+        default:
+            return nil, fmt.Errorf("unsupported scaleTargetRef.kind: %v", hra.Spec.ScaleTargetRef.Kind)
+        }
+    }
+
+    return nil, nil
+}
+
+func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) tryScale(ctx context.Context, target *ScaleTarget) error {
     if target == nil {
         return nil
     }
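The lookup order in `getJobScaleUpTargetForRepoOrOrg` above is: try HRAs keyed by the repository first, then fall back to HRAs keyed by the organization, skipping the organization lookup entirely when the owner is a personal account. A condensed sketch of that fallback, with logging and the real `ScaleTarget` type omitted (hypothetical names, for illustration only):

```go
package main

import "fmt"

// findTarget stands in for getJobScaleTarget: it returns the name of a matching
// scale target for the given index key and requested labels, or "" if none match.
type findTarget func(key string, labels []string) string

// pickTarget mirrors the repository-then-organization fallback above.
func pickTarget(find findTarget, owner, repo, ownerType string, labels []string) string {
	if t := find(owner+"/"+repo, labels); t != "" {
		return t // repository-wide runners win if they exist
	}
	if ownerType == "User" {
		return "" // personal accounts have no organization runners to fall back to
	}
	return find(owner, labels) // organization-wide runners
}

func main() {
	// Only an organization-level target exists in this toy index.
	find := func(key string, labels []string) string {
		if key == "myorg" {
			return "org-runners-hra"
		}
		return ""
	}
	fmt.Println(pickTarget(find, "myorg", "myrepo", "Organization", []string{"self-hosted"}))
}
```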
@@ -403,16 +572,38 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) tryScaleUp(ctx contex

     amount := 1

-    if target.ScaleUpTrigger.Amount > 0 {
+    if target.ScaleUpTrigger.Amount != 0 {
         amount = target.ScaleUpTrigger.Amount
     }

     capacityReservations := getValidCapacityReservations(copy)

-    copy.Spec.CapacityReservations = append(capacityReservations, v1alpha1.CapacityReservation{
-        ExpirationTime: metav1.Time{Time: time.Now().Add(target.ScaleUpTrigger.Duration.Duration)},
-        Replicas:       amount,
-    })
+    if amount > 0 {
+        copy.Spec.CapacityReservations = append(capacityReservations, v1alpha1.CapacityReservation{
+            ExpirationTime: metav1.Time{Time: time.Now().Add(target.ScaleUpTrigger.Duration.Duration)},
+            Replicas:       amount,
+        })
+    } else if amount < 0 {
+        var reservations []v1alpha1.CapacityReservation
+
+        var found bool
+
+        for _, r := range capacityReservations {
+            if !found && r.Replicas+amount == 0 {
+                found = true
+            } else {
+                reservations = append(reservations, r)
+            }
+        }
+
+        copy.Spec.CapacityReservations = reservations
+    }

+    autoscaler.Log.Info(
+        "Patching hra for capacityReservations update",
+        "before", target.HorizontalRunnerAutoscaler.Spec.CapacityReservations,
+        "after", copy.Spec.CapacityReservations,
+    )
+
     if err := autoscaler.Client.Patch(ctx, copy, client.MergeFrom(&target.HorizontalRunnerAutoscaler)); err != nil {
         return fmt.Errorf("patching horizontalrunnerautoscaler to add capacity reservation: %w", err)
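The scale-down path in the hunk above does not decrement a counter; it cancels a previously added capacity reservation: a negative amount removes the oldest reservation whose `Replicas` exactly offsets it, and the desired replica count drops once the controller recomputes it. A simplified standalone sketch of that cancellation rule (a hypothetical `reservation` type, not the controller's `CapacityReservation` API):

```go
package main

import "fmt"

type reservation struct {
	replicas int
}

// cancelOldest drops the oldest reservation whose size exactly offsets the
// (negative) amount, mirroring the scale-down branch of tryScale above.
func cancelOldest(reservations []reservation, amount int) []reservation {
	var kept []reservation
	var found bool

	for _, r := range reservations {
		if !found && r.replicas+amount == 0 {
			found = true // drop this one and keep everything else
			continue
		}
		kept = append(kept, r)
	}

	return kept
}

func main() {
	rs := []reservation{{replicas: 1}, {replicas: 1}, {replicas: 1}}
	fmt.Println(len(cancelOldest(rs, -1))) // 2: one reservation was released
}
```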
@@ -450,13 +641,26 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) SetupWithManager(mgr
             return nil
         }

-        var rd v1alpha1.RunnerDeployment
+        switch hra.Spec.ScaleTargetRef.Kind {
+        case "", "RunnerDeployment":
+            var rd v1alpha1.RunnerDeployment

-        if err := autoscaler.Client.Get(context.Background(), types.NamespacedName{Namespace: hra.Namespace, Name: hra.Spec.ScaleTargetRef.Name}, &rd); err != nil {
-            return nil
+            if err := autoscaler.Client.Get(context.Background(), types.NamespacedName{Namespace: hra.Namespace, Name: hra.Spec.ScaleTargetRef.Name}, &rd); err != nil {
+                return nil
+            }
+
+            return []string{rd.Spec.Template.Spec.Repository, rd.Spec.Template.Spec.Organization}
+        case "RunnerSet":
+            var rs v1alpha1.RunnerSet
+
+            if err := autoscaler.Client.Get(context.Background(), types.NamespacedName{Namespace: hra.Namespace, Name: hra.Spec.ScaleTargetRef.Name}, &rs); err != nil {
+                return nil
+            }
+
+            return []string{rs.Spec.Repository, rs.Spec.Organization}
         }

-        return []string{rd.Spec.Template.Spec.Repository, rd.Spec.Template.Spec.Organization}
+        return nil
     }); err != nil {
         return err
     }
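Finally, the indexer change above registers each HorizontalRunnerAutoscaler under the repository its scale target serves and under the owning organization, so `findHRAsByKey` can look up candidates by either the `owner/repo` key or the bare organization name. A rough sketch of that dual-key lookup, with a plain map standing in for the controller-runtime field index:

```go
package main

import "fmt"

// index maps a scale-target key (either "owner/repo" or an organization name)
// to the names of HorizontalRunnerAutoscalers registered under that key.
type index map[string][]string

func (ix index) add(hraName string, keys ...string) {
	for _, k := range keys {
		if k == "" {
			continue // e.g. an organization-level runner has no repository key
		}
		ix[k] = append(ix[k], hraName)
	}
}

func main() {
	ix := index{}
	ix.add("repo-hra", "myorg/myrepo", "") // HRA targeting a repository-level RunnerDeployment
	ix.add("org-hra", "", "myorg")         // HRA targeting an organization-level RunnerDeployment

	fmt.Println(ix["myorg/myrepo"]) // [repo-hra]
	fmt.Println(ix["myorg"])        // [org-hra]
}
```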