Fix overscaling when the controller is much faster then the listener (#3371)

Co-authored-by: Francesco Renzi <rentziass@gmail.com>
2025-12-20 06:56:51 +00:00 · 2024-03-20 15:36:12 +01:00
parent 46cfbb6ec7
commit 7a643a5107
16 changed files with 262 additions and 158 deletions
--- a/cmd/ghalistener/app/app.go
+++ b/cmd/ghalistener/app/app.go
@@ -34,7 +34,7 @@ type Listener interface {
 //go:generate mockery --name Worker --output ./mocks --outpkg mocks --case underscore
 type Worker interface {
 	HandleJobStarted(ctx context.Context, jobInfo *actions.JobStarted) error
-	HandleDesiredRunnerCount(ctx context.Context, count int) (int, error)
+	HandleDesiredRunnerCount(ctx context.Context, count int, jobsCompleted int) (int, error)
 }

 func New(config config.Config) (*App, error) {
--- a/cmd/ghalistener/app/mocks/worker.go
+++ b/cmd/ghalistener/app/mocks/worker.go
@@ -15,23 +15,23 @@ type Worker struct {
 	mock.Mock
 }

-// HandleDesiredRunnerCount provides a mock function with given fields: ctx, count
-func (_m *Worker) HandleDesiredRunnerCount(ctx context.Context, count int) (int, error) {
-	ret := _m.Called(ctx, count)
+// HandleDesiredRunnerCount provides a mock function with given fields: ctx, count, acquireCount
+func (_m *Worker) HandleDesiredRunnerCount(ctx context.Context, count int, acquireCount int) (int, error) {
+	ret := _m.Called(ctx, count, acquireCount)

 	var r0 int
 	var r1 error
-	if rf, ok := ret.Get(0).(func(context.Context, int) (int, error)); ok {
-		return rf(ctx, count)
+	if rf, ok := ret.Get(0).(func(context.Context, int, int) (int, error)); ok {
+		return rf(ctx, count, acquireCount)
 	}
-	if rf, ok := ret.Get(0).(func(context.Context, int) int); ok {
-		r0 = rf(ctx, count)
+	if rf, ok := ret.Get(0).(func(context.Context, int, int) int); ok {
+		r0 = rf(ctx, count, acquireCount)
 	} else {
 		r0 = ret.Get(0).(int)
 	}

-	if rf, ok := ret.Get(1).(func(context.Context, int) error); ok {
-		r1 = rf(ctx, count)
+	if rf, ok := ret.Get(1).(func(context.Context, int, int) error); ok {
+		r1 = rf(ctx, count, acquireCount)
 	} else {
 		r1 = ret.Error(1)
 	}
--- a/cmd/ghalistener/listener/listener.go
+++ b/cmd/ghalistener/listener/listener.go
@@ -114,7 +114,7 @@ func New(config Config) (*Listener, error) {
 //go:generate mockery --name Handler --output ./mocks --outpkg mocks --case underscore
 type Handler interface {
 	HandleJobStarted(ctx context.Context, jobInfo *actions.JobStarted) error
-	HandleDesiredRunnerCount(ctx context.Context, count int) (int, error)
+	HandleDesiredRunnerCount(ctx context.Context, count, jobsCompleted int) (int, error)
 }

 // Listen listens for incoming messages and handles them using the provided handler.
@@ -145,7 +145,7 @@ func (l *Listener) Listen(ctx context.Context, handler Handler) error {
 	}
 	l.metrics.PublishStatistics(initialMessage.Statistics)

-	desiredRunners, err := handler.HandleDesiredRunnerCount(ctx, initialMessage.Statistics.TotalAssignedJobs)
+	desiredRunners, err := handler.HandleDesiredRunnerCount(ctx, initialMessage.Statistics.TotalAssignedJobs, 0)
 	if err != nil {
 		return fmt.Errorf("handling initial message failed: %w", err)
 	}
@@ -207,7 +207,7 @@ func (l *Listener) handleMessage(ctx context.Context, handler Handler, msg *acti
 		l.metrics.PublishJobStarted(jobStarted)
 	}

-	desiredRunners, err := handler.HandleDesiredRunnerCount(ctx, parsedMsg.statistics.TotalAssignedJobs)
+	desiredRunners, err := handler.HandleDesiredRunnerCount(ctx, parsedMsg.statistics.TotalAssignedJobs, len(parsedMsg.jobsCompleted))
 	if err != nil {
 		return fmt.Errorf("failed to handle desired runner count: %w", err)
 	}
@@ -284,7 +284,6 @@ func (l *Listener) getMessage(ctx context.Context) (*actions.RunnerScaleSetMessa
 	}

 	return msg, nil
-
 }

 func (l *Listener) deleteLastMessage(ctx context.Context) error {
--- a/cmd/ghalistener/listener/listener_test.go
+++ b/cmd/ghalistener/listener/listener_test.go
@@ -427,7 +427,7 @@ func TestListener_Listen(t *testing.T) {

 		var called bool
 		handler := listenermocks.NewHandler(t)
-		handler.On("HandleDesiredRunnerCount", mock.Anything, mock.Anything).
+		handler.On("HandleDesiredRunnerCount", mock.Anything, mock.Anything, 0).
 			Return(0, nil).
 			Run(
 				func(mock.Arguments) {
@@ -485,11 +485,11 @@ func TestListener_Listen(t *testing.T) {
 		config.Client = client

 		handler := listenermocks.NewHandler(t)
-		handler.On("HandleDesiredRunnerCount", mock.Anything, mock.Anything).
+		handler.On("HandleDesiredRunnerCount", mock.Anything, mock.Anything, 0).
 			Return(0, nil).
 			Once()

-		handler.On("HandleDesiredRunnerCount", mock.Anything, mock.Anything).
+		handler.On("HandleDesiredRunnerCount", mock.Anything, mock.Anything, 0).
 			Return(0, nil).
 			Once()

--- a/cmd/ghalistener/listener/metrics_test.go
+++ b/cmd/ghalistener/listener/metrics_test.go
@@ -86,7 +86,7 @@ func TestInitialMetrics(t *testing.T) {
 		config.Client = client

 		handler := listenermocks.NewHandler(t)
-		handler.On("HandleDesiredRunnerCount", mock.Anything, sessionStatistics.TotalAssignedJobs).
+		handler.On("HandleDesiredRunnerCount", mock.Anything, sessionStatistics.TotalAssignedJobs, 0).
 			Return(sessionStatistics.TotalAssignedJobs, nil).
 			Once()

@@ -178,7 +178,7 @@ func TestHandleMessageMetrics(t *testing.T) {

 	handler := listenermocks.NewHandler(t)
 	handler.On("HandleJobStarted", mock.Anything, jobsStarted[0]).Return(nil).Once()
-	handler.On("HandleDesiredRunnerCount", mock.Anything, mock.Anything).Return(desiredResult, nil).Once()
+	handler.On("HandleDesiredRunnerCount", mock.Anything, mock.Anything, 2).Return(desiredResult, nil).Once()

 	client := listenermocks.NewClient(t)
 	client.On("DeleteMessage", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(nil).Once()
--- a/cmd/ghalistener/listener/mocks/handler.go
+++ b/cmd/ghalistener/listener/mocks/handler.go
@@ -15,23 +15,23 @@ type Handler struct {
 	mock.Mock
 }

-// HandleDesiredRunnerCount provides a mock function with given fields: ctx, count
-func (_m *Handler) HandleDesiredRunnerCount(ctx context.Context, count int) (int, error) {
-	ret := _m.Called(ctx, count)
+// HandleDesiredRunnerCount provides a mock function with given fields: ctx, count, jobsCompleted
+func (_m *Handler) HandleDesiredRunnerCount(ctx context.Context, count int, jobsCompleted int) (int, error) {
+	ret := _m.Called(ctx, count, jobsCompleted)

 	var r0 int
 	var r1 error
-	if rf, ok := ret.Get(0).(func(context.Context, int) (int, error)); ok {
-		return rf(ctx, count)
+	if rf, ok := ret.Get(0).(func(context.Context, int, int) (int, error)); ok {
+		return rf(ctx, count, jobsCompleted)
 	}
-	if rf, ok := ret.Get(0).(func(context.Context, int) int); ok {
-		r0 = rf(ctx, count)
+	if rf, ok := ret.Get(0).(func(context.Context, int, int) int); ok {
+		r0 = rf(ctx, count, jobsCompleted)
 	} else {
 		r0 = ret.Get(0).(int)
 	}

-	if rf, ok := ret.Get(1).(func(context.Context, int) error); ok {
-		r1 = rf(ctx, count)
+	if rf, ok := ret.Get(1).(func(context.Context, int, int) error); ok {
+		r1 = rf(ctx, count, jobsCompleted)
 	} else {
 		r1 = ret.Error(1)
 	}
--- a/cmd/ghalistener/worker/worker.go
+++ b/cmd/ghalistener/worker/worker.go
@@ -38,18 +38,20 @@ type Config struct {
 // The Worker's role is to process the messages it receives from the listener.
 // It then initiates Kubernetes API requests to carry out the necessary actions.
 type Worker struct {
-	clientset *kubernetes.Clientset
-	config    Config
-	lastPatch int
-	logger    *logr.Logger
+	clientset   *kubernetes.Clientset
+	config      Config
+	lastPatch   int
+	lastPatchID int
+	logger      *logr.Logger
 }

 var _ listener.Handler = (*Worker)(nil)

 func New(config Config, options ...Option) (*Worker, error) {
 	w := &Worker{
-		config:    config,
-		lastPatch: -1,
+		config:      config,
+		lastPatch:   -1,
+		lastPatchID: -1,
 	}

 	conf, err := rest.InClusterConfig()
@@ -161,7 +163,7 @@ func (w *Worker) HandleJobStarted(ctx context.Context, jobInfo *actions.JobStart
 // The function then scales the ephemeral runner set by applying the merge patch.
 // Finally, it logs the scaled ephemeral runner set details and returns nil if successful.
 // If any error occurs during the process, it returns an error with a descriptive message.
-func (w *Worker) HandleDesiredRunnerCount(ctx context.Context, count int) (int, error) {
+func (w *Worker) HandleDesiredRunnerCount(ctx context.Context, count int, jobsCompleted int) (int, error) {
 	// Max runners should always be set by the resource builder either to the configured value,
 	// or the maximum int32 (resourcebuilder.newAutoScalingListener()).
 	targetRunnerCount := min(w.config.MinRunners+count, w.config.MaxRunners)
@@ -172,17 +174,22 @@ func (w *Worker) HandleDesiredRunnerCount(ctx context.Context, count int) (int,
 		"min", w.config.MinRunners,
 		"max", w.config.MaxRunners,
 		"currentRunnerCount", w.lastPatch,
+		"jobsCompleted", jobsCompleted,
 	}

-	if targetRunnerCount == w.lastPatch {
-		w.logger.Info("Skipping patching of EphemeralRunnerSet as the desired count has not changed", logValues...)
+	if w.lastPatch == targetRunnerCount && jobsCompleted == 0 {
+		w.logger.Info("Skipping patch", logValues...)
 		return targetRunnerCount, nil
 	}

+	w.lastPatchID++
+	w.lastPatch = targetRunnerCount
+
 	original, err := json.Marshal(
 		&v1alpha1.EphemeralRunnerSet{
 			Spec: v1alpha1.EphemeralRunnerSetSpec{
 				Replicas: -1,
+				PatchID:  -1,
 			},
 		},
 	)
@@ -194,6 +201,7 @@ func (w *Worker) HandleDesiredRunnerCount(ctx context.Context, count int) (int,
 		&v1alpha1.EphemeralRunnerSet{
 			Spec: v1alpha1.EphemeralRunnerSetSpec{
 				Replicas: targetRunnerCount,
+				PatchID:  w.lastPatchID,
 			},
 		},
 	)