Make webhook-based scale race-free (#1477)

* Make webhook-based scale operation asynchronous. This prevents a race condition in the webhook-based autoscaler that occurred when it received a webhook event while still processing another one and both ended up scaling up the same horizontal runner autoscaler. Ref #1321 * Fix typos * Update rather than Patch HRA to avoid races among webhook-based autoscaler servers * Batch capacity reservation updates for efficient use of the apiserver * Fix potential never-ending HRA update conflicts in batch update * Extract batchScaler out of webhook-based autoscaler for testability * Fix log levels and batch scaler hang on start * Correlate webhook event with scale trigger amount in logs * Fix log message
2025-12-11 12:06:57 +00:00 · 2022-06-27 18:31:48 +09:00
parent 84d16c1c12
commit e2c8163b8c
6 changed files with 328 additions and 61 deletions
--- a/controllers/horizontal_runner_autoscaler_webhook.go
+++ b/controllers/horizontal_runner_autoscaler_webhook.go
@@ -23,9 +23,9 @@ import (
 	"io/ioutil"
 	"net/http"
 	"strings"
+	"sync"
 	"time"

-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"sigs.k8s.io/controller-runtime/pkg/reconcile"

@@ -46,6 +46,8 @@ const (

 	keyPrefixEnterprise = "enterprises/"
 	keyRunnerGroup      = "/group/"
+
+	DefaultQueueLimit = 100
 )

 // HorizontalRunnerAutoscalerGitHubWebhook autoscales a HorizontalRunnerAutoscaler and the RunnerDeployment on each
@@ -68,6 +70,15 @@ type HorizontalRunnerAutoscalerGitHubWebhook struct {
 	// Set to empty for letting it watch for all namespaces.
 	Namespace string
 	Name      string
+
+	// QueueLimit is the maximum length of the bounded queue of scale targets and their associated operations.
+	// A scale target is enqueued on receipt of each eligible webhook event, so that it is processed asynchronously.
+	QueueLimit int
+
+	worker      *worker
+	workerInit  sync.Once
+	workerStart sync.Once
+	batchCh     chan *ScaleTarget
 }

 func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) Reconcile(_ context.Context, request reconcile.Request) (reconcile.Result, error) {
@@ -312,9 +323,19 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) Handle(w http.Respons
 		return
 	}

-	if err := autoscaler.tryScale(context.TODO(), target); err != nil {
-		log.Error(err, "could not scale up")
+	autoscaler.workerInit.Do(func() {
+		batchScaler := newBatchScaler(context.Background(), autoscaler.Client, autoscaler.Log)

+		queueLimit := autoscaler.QueueLimit
+		if queueLimit == 0 {
+			queueLimit = DefaultQueueLimit
+		}
+		autoscaler.worker = newWorker(context.Background(), queueLimit, batchScaler.Add)
+	})
+
+	target.log = &log
+	if ok := autoscaler.worker.Add(target); !ok {
+		log.Error(err, "Could not scale up due to queue full")
 		return
 	}

@@ -383,6 +404,8 @@ func matchTriggerConditionAgainstEvent(types []string, eventAction *string) bool
 type ScaleTarget struct {
 	v1alpha1.HorizontalRunnerAutoscaler
 	v1alpha1.ScaleUpTrigger
+
+	log *logr.Logger
 }

 func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) searchScaleTargets(hras []v1alpha1.HorizontalRunnerAutoscaler, f func(v1alpha1.ScaleUpTrigger) bool) []ScaleTarget {
@@ -770,63 +793,6 @@ HRA:
 	return nil, nil
 }

-func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) tryScale(ctx context.Context, target *ScaleTarget) error {
-	if target == nil {
-		return nil
-	}
-
-	copy := target.HorizontalRunnerAutoscaler.DeepCopy()
-
-	amount := 1
-
-	if target.ScaleUpTrigger.Amount != 0 {
-		amount = target.ScaleUpTrigger.Amount
-	}
-
-	capacityReservations := getValidCapacityReservations(copy)
-
-	if amount > 0 {
-		now := time.Now()
-		copy.Spec.CapacityReservations = append(capacityReservations, v1alpha1.CapacityReservation{
-			EffectiveTime:  metav1.Time{Time: now},
-			ExpirationTime: metav1.Time{Time: now.Add(target.ScaleUpTrigger.Duration.Duration)},
-			Replicas:       amount,
-		})
-	} else if amount < 0 {
-		var reservations []v1alpha1.CapacityReservation
-
-		var found bool
-
-		for _, r := range capacityReservations {
-			if !found && r.Replicas+amount == 0 {
-				found = true
-			} else {
-				reservations = append(reservations, r)
-			}
-		}
-
-		copy.Spec.CapacityReservations = reservations
-	}
-
-	before := len(target.HorizontalRunnerAutoscaler.Spec.CapacityReservations)
-	expired := before - len(capacityReservations)
-	after := len(copy.Spec.CapacityReservations)
-
-	autoscaler.Log.V(1).Info(
-		fmt.Sprintf("Patching hra %s for capacityReservations update", target.HorizontalRunnerAutoscaler.Name),
-		"before", before,
-		"expired", expired,
-		"amount", amount,
-		"after", after,
-	)
-
-	if err := autoscaler.Client.Patch(ctx, copy, client.MergeFrom(&target.HorizontalRunnerAutoscaler)); err != nil {
-		return fmt.Errorf("patching horizontalrunnerautoscaler to add capacity reservation: %w", err)
-	}
-
-	return nil
-}
-
 func getValidCapacityReservations(autoscaler *v1alpha1.HorizontalRunnerAutoscaler) []v1alpha1.CapacityReservation {
 	var capacityReservations []v1alpha1.CapacityReservation