mirror of
https://github.com/actions/actions-runner-controller.git
synced 2025-12-11 12:06:57 +00:00
Make webhook-based scale race-free (#1477)
* Make webhook-based scale operation asynchronous This prevents race condition in the webhook-based autoscaler when it received another webhook event while processing another webhook event and both ended up scaling up the same horizontal runner autoscaler. Ref #1321 * Fix typos * Update rather than Patch HRA to avoid race among webhook-based autoscaler servers * Batch capacity reservation updates for efficient use of apiserver * Fix potential never-ending HRA update conflicts in batch update * Extract batchScaler out of webhook-based autoscaler for testability * Fix log levels and batch scaler hang on start * Correlate webhook event with scale trigger amount in logs * Fix log message
This commit is contained in:
@@ -23,9 +23,9 @@ import (
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
"sigs.k8s.io/controller-runtime/pkg/reconcile"
|
||||
|
||||
@@ -46,6 +46,8 @@ const (
|
||||
|
||||
keyPrefixEnterprise = "enterprises/"
|
||||
keyRunnerGroup = "/group/"
|
||||
|
||||
DefaultQueueLimit = 100
|
||||
)
|
||||
|
||||
// HorizontalRunnerAutoscalerGitHubWebhook autoscales a HorizontalRunnerAutoscaler and the RunnerDeployment on each
|
||||
@@ -68,6 +70,15 @@ type HorizontalRunnerAutoscalerGitHubWebhook struct {
|
||||
// Set to empty for letting it watch for all namespaces.
|
||||
Namespace string
|
||||
Name string
|
||||
|
||||
// QueueLimit is the maximum length of the bounded queue of scale targets and their associated operations
|
||||
// A scale target is enqueued on each retrieval of each eligible webhook event, so that it is processed asynchronously.
|
||||
QueueLimit int
|
||||
|
||||
worker *worker
|
||||
workerInit sync.Once
|
||||
workerStart sync.Once
|
||||
batchCh chan *ScaleTarget
|
||||
}
|
||||
|
||||
func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) Reconcile(_ context.Context, request reconcile.Request) (reconcile.Result, error) {
|
||||
@@ -312,9 +323,19 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) Handle(w http.Respons
|
||||
return
|
||||
}
|
||||
|
||||
if err := autoscaler.tryScale(context.TODO(), target); err != nil {
|
||||
log.Error(err, "could not scale up")
|
||||
autoscaler.workerInit.Do(func() {
|
||||
batchScaler := newBatchScaler(context.Background(), autoscaler.Client, autoscaler.Log)
|
||||
|
||||
queueLimit := autoscaler.QueueLimit
|
||||
if queueLimit == 0 {
|
||||
queueLimit = DefaultQueueLimit
|
||||
}
|
||||
autoscaler.worker = newWorker(context.Background(), queueLimit, batchScaler.Add)
|
||||
})
|
||||
|
||||
target.log = &log
|
||||
if ok := autoscaler.worker.Add(target); !ok {
|
||||
log.Error(err, "Could not scale up due to queue full")
|
||||
return
|
||||
}
|
||||
|
||||
@@ -383,6 +404,8 @@ func matchTriggerConditionAgainstEvent(types []string, eventAction *string) bool
|
||||
type ScaleTarget struct {
|
||||
v1alpha1.HorizontalRunnerAutoscaler
|
||||
v1alpha1.ScaleUpTrigger
|
||||
|
||||
log *logr.Logger
|
||||
}
|
||||
|
||||
func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) searchScaleTargets(hras []v1alpha1.HorizontalRunnerAutoscaler, f func(v1alpha1.ScaleUpTrigger) bool) []ScaleTarget {
|
||||
@@ -770,63 +793,6 @@ HRA:
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) tryScale(ctx context.Context, target *ScaleTarget) error {
|
||||
if target == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
copy := target.HorizontalRunnerAutoscaler.DeepCopy()
|
||||
|
||||
amount := 1
|
||||
|
||||
if target.ScaleUpTrigger.Amount != 0 {
|
||||
amount = target.ScaleUpTrigger.Amount
|
||||
}
|
||||
|
||||
capacityReservations := getValidCapacityReservations(copy)
|
||||
|
||||
if amount > 0 {
|
||||
now := time.Now()
|
||||
copy.Spec.CapacityReservations = append(capacityReservations, v1alpha1.CapacityReservation{
|
||||
EffectiveTime: metav1.Time{Time: now},
|
||||
ExpirationTime: metav1.Time{Time: now.Add(target.ScaleUpTrigger.Duration.Duration)},
|
||||
Replicas: amount,
|
||||
})
|
||||
} else if amount < 0 {
|
||||
var reservations []v1alpha1.CapacityReservation
|
||||
|
||||
var found bool
|
||||
|
||||
for _, r := range capacityReservations {
|
||||
if !found && r.Replicas+amount == 0 {
|
||||
found = true
|
||||
} else {
|
||||
reservations = append(reservations, r)
|
||||
}
|
||||
}
|
||||
|
||||
copy.Spec.CapacityReservations = reservations
|
||||
}
|
||||
|
||||
before := len(target.HorizontalRunnerAutoscaler.Spec.CapacityReservations)
|
||||
expired := before - len(capacityReservations)
|
||||
after := len(copy.Spec.CapacityReservations)
|
||||
|
||||
autoscaler.Log.V(1).Info(
|
||||
fmt.Sprintf("Patching hra %s for capacityReservations update", target.HorizontalRunnerAutoscaler.Name),
|
||||
"before", before,
|
||||
"expired", expired,
|
||||
"amount", amount,
|
||||
"after", after,
|
||||
)
|
||||
|
||||
if err := autoscaler.Client.Patch(ctx, copy, client.MergeFrom(&target.HorizontalRunnerAutoscaler)); err != nil {
|
||||
return fmt.Errorf("patching horizontalrunnerautoscaler to add capacity reservation: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func getValidCapacityReservations(autoscaler *v1alpha1.HorizontalRunnerAutoscaler) []v1alpha1.CapacityReservation {
|
||||
var capacityReservations []v1alpha1.CapacityReservation
|
||||
|
||||
|
||||
Reference in New Issue
Block a user