Mirror of https://github.com/actions/actions-runner-controller.git, synced 2025-12-11 03:57:01 +00:00

Provide scale-set listener metrics (#2559)

Co-authored-by: Tingluo Huang <tingluohuang@github.com>
Co-authored-by: Bassem Dghaidi <568794+Link-@users.noreply.github.com>
@@ -3,6 +3,7 @@ package main
 import (
 	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"math"
 	"strings"
@@ -25,6 +26,31 @@ type Service struct {
 	kubeManager        KubernetesManager
 	settings           *ScaleSettings
 	currentRunnerCount int
+	metricsExporter    metricsExporter
+	errs               []error
 }

+func WithPrometheusMetrics(conf RunnerScaleSetListenerConfig) func(*Service) {
+	return func(svc *Service) {
+		parsedURL, err := actions.ParseGitHubConfigFromURL(conf.ConfigureUrl)
+		if err != nil {
+			svc.errs = append(svc.errs, err)
+		}
+
+		svc.metricsExporter.withBaseLabels(baseLabels{
+			scaleSetName:      conf.EphemeralRunnerSetName,
+			scaleSetNamespace: conf.EphemeralRunnerSetNamespace,
+			enterprise:        parsedURL.Enterprise,
+			organization:      parsedURL.Organization,
+			repository:        parsedURL.Repository,
+		})
+	}
+}
+
+func WithLogger(logger logr.Logger) func(*Service) {
+	return func(s *Service) {
+		s.logger = logger.WithName("service")
+	}
+}
+
 func NewService(
@@ -33,7 +59,7 @@ func NewService(
 	manager KubernetesManager,
 	settings *ScaleSettings,
 	options ...func(*Service),
-) *Service {
+) (*Service, error) {
 	s := &Service{
 		ctx:      ctx,
 		rsClient: rsClient,
@@ -47,7 +73,11 @@ func NewService(
 		option(s)
 	}

-	return s
+	if len(s.errs) > 0 {
+		return nil, errors.Join(s.errs...)
+	}
+
+	return s, nil
 }

 func (s *Service) Start() error {
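The constructor change above completes a functional-options pattern with deferred error reporting: an option that fails appends to s.errs instead of returning an error, and NewService surfaces everything at once through errors.Join. A minimal, self-contained sketch of the pattern, with illustrative names rather than the listener's real API (errors.Join requires Go 1.20+):

package main

import (
	"errors"
	"fmt"
)

// Option mutates a Service during construction; keeping the signature
// error-free lets failing options record problems instead.
type Option func(*Service)

type Service struct {
	name string
	errs []error
}

// WithName is a hypothetical option. On bad input it appends to errs
// rather than panicking or silently ignoring the problem.
func WithName(name string) Option {
	return func(s *Service) {
		if name == "" {
			s.errs = append(s.errs, errors.New("name must not be empty"))
			return
		}
		s.name = name
	}
}

// NewService applies every option, then reports all recorded errors at once.
func NewService(opts ...Option) (*Service, error) {
	s := &Service{}
	for _, opt := range opts {
		opt(s)
	}
	if len(s.errs) > 0 {
		return nil, errors.Join(s.errs...)
	}
	return s, nil
}

func main() {
	if _, err := NewService(WithName("")); err != nil {
		fmt.Println(err) // name must not be empty
	}
}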
@@ -81,6 +111,8 @@ func (s *Service) processMessage(message *actions.RunnerScaleSetMessage) error {
 		"busy runners", message.Statistics.TotalBusyRunners,
 		"idle runners", message.Statistics.TotalIdleRunners)

+	s.metricsExporter.publishStatistics(message.Statistics)
+
 	if message.MessageType != "RunnerScaleSetJobMessages" {
 		s.logger.Info("skip message with unknown message type.", "messageType", message.MessageType)
 		return nil
@@ -110,27 +142,54 @@ func (s *Service) processMessage(message *actions.RunnerScaleSetMessage) error {
 		if err := json.Unmarshal(message, &jobAvailable); err != nil {
 			return fmt.Errorf("could not decode job available message. %w", err)
 		}
-		s.logger.Info("job available message received.", "RequestId", jobAvailable.RunnerRequestId)
+		s.logger.Info(
+			"job available message received.",
+			"RequestId",
+			jobAvailable.RunnerRequestId,
+		)
 		availableJobs = append(availableJobs, jobAvailable.RunnerRequestId)
 	case "JobAssigned":
 		var jobAssigned actions.JobAssigned
 		if err := json.Unmarshal(message, &jobAssigned); err != nil {
 			return fmt.Errorf("could not decode job assigned message. %w", err)
 		}
-		s.logger.Info("job assigned message received.", "RequestId", jobAssigned.RunnerRequestId)
+		s.logger.Info(
+			"job assigned message received.",
+			"RequestId",
+			jobAssigned.RunnerRequestId,
+		)
+		// s.metricsExporter.publishJobAssigned(&jobAssigned)
 	case "JobStarted":
 		var jobStarted actions.JobStarted
 		if err := json.Unmarshal(message, &jobStarted); err != nil {
 			return fmt.Errorf("could not decode job started message. %w", err)
 		}
-		s.logger.Info("job started message received.", "RequestId", jobStarted.RunnerRequestId, "RunnerId", jobStarted.RunnerId)
+		s.logger.Info(
+			"job started message received.",
+			"RequestId",
+			jobStarted.RunnerRequestId,
+			"RunnerId",
+			jobStarted.RunnerId,
+		)
+		s.metricsExporter.publishJobStarted(&jobStarted)
 		s.updateJobInfoForRunner(jobStarted)
 	case "JobCompleted":
 		var jobCompleted actions.JobCompleted
 		if err := json.Unmarshal(message, &jobCompleted); err != nil {
 			return fmt.Errorf("could not decode job completed message. %w", err)
 		}
-		s.logger.Info("job completed message received.", "RequestId", jobCompleted.RunnerRequestId, "Result", jobCompleted.Result, "RunnerId", jobCompleted.RunnerId, "RunnerName", jobCompleted.RunnerName)
+		s.logger.Info(
+			"job completed message received.",
+			"RequestId",
+			jobCompleted.RunnerRequestId,
+			"Result",
+			jobCompleted.Result,
+			"RunnerId",
+			jobCompleted.RunnerId,
+			"RunnerName",
+			jobCompleted.RunnerName,
+		)
+		s.metricsExporter.publishJobCompleted(&jobCompleted)
 	default:
 		s.logger.Info("unknown job message type.", "messageType", messageType.MessageType)
 	}
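Each entry in the batch is decoded twice: once to read only the messageType discriminator, then again into the concrete struct that type implies. A compact sketch of this two-pass decode, using hypothetical message shapes in place of the actions package types:

package main

import (
	"encoding/json"
	"fmt"
)

// Hypothetical message shapes; the listener's real types live in
// github.com/actions/actions-runner-controller/github/actions.
type jobMessageType struct {
	MessageType string `json:"messageType"`
}

type jobStarted struct {
	RunnerRequestId int64 `json:"runnerRequestId"`
	RunnerId        int   `json:"runnerId"`
}

func main() {
	raw := []byte(`{"messageType":"JobStarted","runnerRequestId":3,"runnerId":7}`)

	// First pass: read only the discriminator field.
	var mt jobMessageType
	if err := json.Unmarshal(raw, &mt); err != nil {
		panic(err)
	}

	// Second pass: unmarshal into the concrete type for that discriminator.
	switch mt.MessageType {
	case "JobStarted":
		var js jobStarted
		if err := json.Unmarshal(raw, &js); err != nil {
			panic(err)
		}
		fmt.Printf("request %d started on runner %d\n", js.RunnerRequestId, js.RunnerId)
	default:
		fmt.Println("unknown message type:", mt.MessageType)
	}
}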
@@ -146,13 +205,15 @@ func (s *Service) processMessage(message *actions.RunnerScaleSetMessage) error {

 func (s *Service) scaleForAssignedJobCount(count int) error {
 	targetRunnerCount := int(math.Max(math.Min(float64(s.settings.MaxRunners), float64(count)), float64(s.settings.MinRunners)))
+	s.metricsExporter.publishDesiredRunners(targetRunnerCount)
 	if targetRunnerCount != s.currentRunnerCount {
 		s.logger.Info("try scale runner request up/down base on assigned job count",
 			"assigned job", count,
 			"decision", targetRunnerCount,
 			"min", s.settings.MinRunners,
 			"max", s.settings.MaxRunners,
-			"currentRunnerCount", s.currentRunnerCount)
+			"currentRunnerCount", s.currentRunnerCount,
+		)
 		err := s.kubeManager.ScaleEphemeralRunnerSet(s.ctx, s.settings.Namespace, s.settings.ResourceName, targetRunnerCount)
 		if err != nil {
 			return fmt.Errorf("could not scale ephemeral runner set (%s/%s). %w", s.settings.Namespace, s.settings.ResourceName, err)
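The targetRunnerCount expression above clamps the assigned-job count into [MinRunners, MaxRunners], round-tripping through float64 only because math.Min and math.Max operate on floats. An equivalent integer formulation makes the behavior easier to see:

package main

import "fmt"

// clamp bounds count to [min, max]; it behaves like the float-based
// max(min(maxRunners, count), minRunners) expression above.
func clamp(count, min, max int) int {
	if count < min {
		return min
	}
	if count > max {
		return max
	}
	return count
}

func main() {
	fmt.Println(clamp(0, 1, 5))  // 1: below MinRunners, scale up to the floor
	fmt.Println(clamp(3, 1, 5))  // 3: within bounds, scale to demand
	fmt.Println(clamp(99, 1, 5)) // 5: above MaxRunners, cap at the ceiling
}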
@@ -173,7 +234,8 @@ func (s *Service) updateJobInfoForRunner(jobInfo actions.JobStarted) {
 		"workflowRef", jobInfo.JobWorkflowRef,
 		"workflowRunId", jobInfo.WorkflowRunId,
 		"jobDisplayName", jobInfo.JobDisplayName,
-		"requestId", jobInfo.RunnerRequestId)
+		"requestId", jobInfo.RunnerRequestId,
+	)
 	err := s.kubeManager.UpdateEphemeralRunnerWithJobInfo(s.ctx, s.settings.Namespace, jobInfo.RunnerName, jobInfo.OwnerName, jobInfo.RepositoryName, jobInfo.JobWorkflowRef, jobInfo.JobDisplayName, jobInfo.WorkflowRunId, jobInfo.RunnerRequestId)
 	if err != nil {
 		s.logger.Error(err, "could not update ephemeral runner with job info", "runnerName", jobInfo.RunnerName, "requestId", jobInfo.RunnerRequestId)
@@ -21,7 +21,7 @@ func TestNewService(t *testing.T) {

 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
-	service := NewService(
+	service, err := NewService(
 		ctx,
 		mockRsClient,
 		mockKubeManager,
@@ -36,6 +36,7 @@ func TestNewService(t *testing.T) {
 		},
 	)

+	require.NoError(t, err)
 	assert.Equal(t, logger, service.logger)
 }
@@ -47,7 +48,7 @@ func TestStart(t *testing.T) {
 	require.NoError(t, log_err, "Error creating logger")

 	ctx, cancel := context.WithCancel(context.Background())
-	service := NewService(
+	service, err := NewService(
 		ctx,
 		mockRsClient,
 		mockKubeManager,
@@ -61,9 +62,11 @@ func TestStart(t *testing.T) {
 			s.logger = logger
 		},
 	)
+	require.NoError(t, err)

 	mockRsClient.On("GetRunnerScaleSetMessage", service.ctx, mock.Anything).Run(func(args mock.Arguments) { cancel() }).Return(nil).Once()

-	err := service.Start()
+	err = service.Start()

 	assert.NoError(t, err, "Unexpected error")
 	assert.True(t, mockRsClient.AssertExpectations(t), "All expectations should be met")
@@ -79,7 +82,7 @@ func TestStart_ScaleToMinRunners(t *testing.T) {
 	require.NoError(t, log_err, "Error creating logger")

 	ctx, cancel := context.WithCancel(context.Background())
-	service := NewService(
+	service, err := NewService(
 		ctx,
 		mockRsClient,
 		mockKubeManager,
@@ -93,6 +96,7 @@ func TestStart_ScaleToMinRunners(t *testing.T) {
 			s.logger = logger
 		},
 	)
+	require.NoError(t, err)

 	mockRsClient.On("GetRunnerScaleSetMessage", ctx, mock.Anything).Run(func(args mock.Arguments) {
 		_ = service.scaleForAssignedJobCount(5)
@@ -100,9 +104,9 @@ func TestStart_ScaleToMinRunners(t *testing.T) {

 	mockKubeManager.On("ScaleEphemeralRunnerSet", ctx, service.settings.Namespace, service.settings.ResourceName, 5).Run(func(args mock.Arguments) { cancel() }).Return(nil).Once()

-	err := service.Start()
-
+	err = service.Start()
 	assert.NoError(t, err, "Unexpected error")

 	assert.True(t, mockRsClient.AssertExpectations(t), "All expectations should be met")
 	assert.True(t, mockKubeManager.AssertExpectations(t), "All expectations should be met")
 }
@@ -116,7 +120,7 @@ func TestStart_ScaleToMinRunnersFailed(t *testing.T) {

 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
-	service := NewService(
+	service, err := NewService(
 		ctx,
 		mockRsClient,
 		mockKubeManager,
@@ -130,13 +134,14 @@ func TestStart_ScaleToMinRunnersFailed(t *testing.T) {
 			s.logger = logger
 		},
 	)
+	require.NoError(t, err)

 	c := mockKubeManager.On("ScaleEphemeralRunnerSet", ctx, service.settings.Namespace, service.settings.ResourceName, 5).Return(fmt.Errorf("error")).Once()
 	mockRsClient.On("GetRunnerScaleSetMessage", ctx, mock.Anything).Run(func(args mock.Arguments) {
 		_ = service.scaleForAssignedJobCount(5)
 	}).Return(c.ReturnArguments.Get(0))

-	err := service.Start()
+	err = service.Start()

 	assert.ErrorContains(t, err, "could not get and process message", "Unexpected error")
 	assert.True(t, mockRsClient.AssertExpectations(t), "All expectations should be met")
@@ -151,7 +156,7 @@ func TestStart_GetMultipleMessages(t *testing.T) {
 	require.NoError(t, log_err, "Error creating logger")

 	ctx, cancel := context.WithCancel(context.Background())
-	service := NewService(
+	service, err := NewService(
 		ctx,
 		mockRsClient,
 		mockKubeManager,
@@ -165,10 +170,12 @@ func TestStart_GetMultipleMessages(t *testing.T) {
 			s.logger = logger
 		},
 	)
+	require.NoError(t, err)

 	mockRsClient.On("GetRunnerScaleSetMessage", service.ctx, mock.Anything).Return(nil).Times(5)
 	mockRsClient.On("GetRunnerScaleSetMessage", service.ctx, mock.Anything).Run(func(args mock.Arguments) { cancel() }).Return(nil).Once()

-	err := service.Start()
+	err = service.Start()

 	assert.NoError(t, err, "Unexpected error")
 	assert.True(t, mockRsClient.AssertExpectations(t), "All expectations should be met")
@@ -184,7 +191,7 @@ func TestStart_ErrorOnMessage(t *testing.T) {

 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
-	service := NewService(
+	service, err := NewService(
 		ctx,
 		mockRsClient,
 		mockKubeManager,
@@ -198,10 +205,12 @@ func TestStart_ErrorOnMessage(t *testing.T) {
 			s.logger = logger
 		},
 	)
+	require.NoError(t, err)

 	mockRsClient.On("GetRunnerScaleSetMessage", service.ctx, mock.Anything).Return(nil).Times(2)
 	mockRsClient.On("GetRunnerScaleSetMessage", service.ctx, mock.Anything).Return(fmt.Errorf("error")).Once()

-	err := service.Start()
+	err = service.Start()

 	assert.ErrorContains(t, err, "could not get and process message. error", "Unexpected error")
 	assert.True(t, mockRsClient.AssertExpectations(t), "All expectations should be met")
@@ -217,7 +226,7 @@ func TestProcessMessage_NoStatistic(t *testing.T) {

 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
-	service := NewService(
+	service, err := NewService(
 		ctx,
 		mockRsClient,
 		mockKubeManager,
@@ -231,8 +240,9 @@ func TestProcessMessage_NoStatistic(t *testing.T) {
 			s.logger = logger
 		},
 	)
+	require.NoError(t, err)

-	err := service.processMessage(&actions.RunnerScaleSetMessage{
+	err = service.processMessage(&actions.RunnerScaleSetMessage{
 		MessageId:   1,
 		MessageType: "test",
 		Body:        "test",
@@ -252,7 +262,7 @@ func TestProcessMessage_IgnoreUnknownMessageType(t *testing.T) {

 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
-	service := NewService(
+	service, err := NewService(
 		ctx,
 		mockRsClient,
 		mockKubeManager,
@@ -266,8 +276,9 @@ func TestProcessMessage_IgnoreUnknownMessageType(t *testing.T) {
 			s.logger = logger
 		},
 	)
+	require.NoError(t, err)

-	err := service.processMessage(&actions.RunnerScaleSetMessage{
+	err = service.processMessage(&actions.RunnerScaleSetMessage{
 		MessageId:   1,
 		MessageType: "unknown",
 		Statistics: &actions.RunnerScaleSetStatistic{
@@ -290,7 +301,7 @@ func TestProcessMessage_InvalidBatchMessageJson(t *testing.T) {

 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
-	service := NewService(
+	service, err := NewService(
 		ctx,
 		mockRsClient,
 		mockKubeManager,
@@ -305,7 +316,9 @@ func TestProcessMessage_InvalidBatchMessageJson(t *testing.T) {
 		},
 	)

-	err := service.processMessage(&actions.RunnerScaleSetMessage{
+	require.NoError(t, err)
+
+	err = service.processMessage(&actions.RunnerScaleSetMessage{
 		MessageId:   1,
 		MessageType: "RunnerScaleSetJobMessages",
 		Statistics: &actions.RunnerScaleSetStatistic{
@@ -328,7 +341,7 @@ func TestProcessMessage_InvalidJobMessageJson(t *testing.T) {

 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
-	service := NewService(
+	service, err := NewService(
 		ctx,
 		mockRsClient,
 		mockKubeManager,
@@ -342,8 +355,9 @@ func TestProcessMessage_InvalidJobMessageJson(t *testing.T) {
 			s.logger = logger
 		},
 	)
+	require.NoError(t, err)

-	err := service.processMessage(&actions.RunnerScaleSetMessage{
+	err = service.processMessage(&actions.RunnerScaleSetMessage{
 		MessageId:   1,
 		MessageType: "RunnerScaleSetJobMessages",
 		Statistics: &actions.RunnerScaleSetStatistic{
@@ -366,7 +380,7 @@ func TestProcessMessage_MultipleMessages(t *testing.T) {

 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
-	service := NewService(
+	service, err := NewService(
 		ctx,
 		mockRsClient,
 		mockKubeManager,
@@ -380,10 +394,12 @@ func TestProcessMessage_MultipleMessages(t *testing.T) {
 			s.logger = logger
 		},
 	)
+	require.NoError(t, err)

 	mockRsClient.On("AcquireJobsForRunnerScaleSet", ctx, mock.MatchedBy(func(ids []int64) bool { return ids[0] == 3 && ids[1] == 4 })).Return(nil).Once()
 	mockKubeManager.On("ScaleEphemeralRunnerSet", ctx, service.settings.Namespace, service.settings.ResourceName, 2).Run(func(args mock.Arguments) { cancel() }).Return(nil).Once()

-	err := service.processMessage(&actions.RunnerScaleSetMessage{
+	err = service.processMessage(&actions.RunnerScaleSetMessage{
 		MessageId:   1,
 		MessageType: "RunnerScaleSetJobMessages",
 		Statistics: &actions.RunnerScaleSetStatistic{
@@ -407,7 +423,7 @@ func TestProcessMessage_AcquireJobsFailed(t *testing.T) {

 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
-	service := NewService(
+	service, err := NewService(
 		ctx,
 		mockRsClient,
 		mockKubeManager,
@@ -421,9 +437,11 @@ func TestProcessMessage_AcquireJobsFailed(t *testing.T) {
 			s.logger = logger
 		},
 	)
+	require.NoError(t, err)

 	mockRsClient.On("AcquireJobsForRunnerScaleSet", ctx, mock.MatchedBy(func(ids []int64) bool { return ids[0] == 1 })).Return(fmt.Errorf("error")).Once()

-	err := service.processMessage(&actions.RunnerScaleSetMessage{
+	err = service.processMessage(&actions.RunnerScaleSetMessage{
 		MessageId:   1,
 		MessageType: "RunnerScaleSetJobMessages",
 		Statistics: &actions.RunnerScaleSetStatistic{
@@ -447,7 +465,7 @@ func TestScaleForAssignedJobCount_DeDupScale(t *testing.T) {

 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
-	service := NewService(
+	service, err := NewService(
 		ctx,
 		mockRsClient,
 		mockKubeManager,
@@ -461,9 +479,11 @@ func TestScaleForAssignedJobCount_DeDupScale(t *testing.T) {
 			s.logger = logger
 		},
 	)
+	require.NoError(t, err)

 	mockKubeManager.On("ScaleEphemeralRunnerSet", ctx, service.settings.Namespace, service.settings.ResourceName, 2).Return(nil).Once()

-	err := service.scaleForAssignedJobCount(2)
+	err = service.scaleForAssignedJobCount(2)
 	require.NoError(t, err, "Unexpected error")
 	err = service.scaleForAssignedJobCount(2)
 	require.NoError(t, err, "Unexpected error")
@@ -486,7 +506,7 @@ func TestScaleForAssignedJobCount_ScaleWithinMinMax(t *testing.T) {

 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
-	service := NewService(
+	service, err := NewService(
 		ctx,
 		mockRsClient,
 		mockKubeManager,
@@ -500,13 +520,15 @@ func TestScaleForAssignedJobCount_ScaleWithinMinMax(t *testing.T) {
 			s.logger = logger
 		},
 	)
+	require.NoError(t, err)

 	mockKubeManager.On("ScaleEphemeralRunnerSet", ctx, service.settings.Namespace, service.settings.ResourceName, 1).Return(nil).Once()
 	mockKubeManager.On("ScaleEphemeralRunnerSet", ctx, service.settings.Namespace, service.settings.ResourceName, 3).Return(nil).Once()
 	mockKubeManager.On("ScaleEphemeralRunnerSet", ctx, service.settings.Namespace, service.settings.ResourceName, 5).Return(nil).Once()
 	mockKubeManager.On("ScaleEphemeralRunnerSet", ctx, service.settings.Namespace, service.settings.ResourceName, 1).Return(nil).Once()
 	mockKubeManager.On("ScaleEphemeralRunnerSet", ctx, service.settings.Namespace, service.settings.ResourceName, 5).Return(nil).Once()

-	err := service.scaleForAssignedJobCount(0)
+	err = service.scaleForAssignedJobCount(0)
 	require.NoError(t, err, "Unexpected error")
 	err = service.scaleForAssignedJobCount(3)
 	require.NoError(t, err, "Unexpected error")
@@ -531,7 +553,7 @@ func TestScaleForAssignedJobCount_ScaleFailed(t *testing.T) {

 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
-	service := NewService(
+	service, err := NewService(
 		ctx,
 		mockRsClient,
 		mockKubeManager,
@@ -545,9 +567,11 @@ func TestScaleForAssignedJobCount_ScaleFailed(t *testing.T) {
 			s.logger = logger
 		},
 	)
+	require.NoError(t, err)

 	mockKubeManager.On("ScaleEphemeralRunnerSet", ctx, service.settings.Namespace, service.settings.ResourceName, 2).Return(fmt.Errorf("error"))

-	err := service.scaleForAssignedJobCount(2)
+	err = service.scaleForAssignedJobCount(2)

 	assert.ErrorContains(t, err, "could not scale ephemeral runner set (namespace/resource). error", "Unexpected error")
 	assert.True(t, mockRsClient.AssertExpectations(t), "All expectations should be met")
@@ -563,7 +587,7 @@ func TestProcessMessage_JobStartedMessage(t *testing.T) {

 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
-	service := NewService(
+	service, err := NewService(
 		ctx,
 		mockRsClient,
 		mockKubeManager,
@@ -577,12 +601,14 @@ func TestProcessMessage_JobStartedMessage(t *testing.T) {
 			s.logger = logger
 		},
 	)
+	require.NoError(t, err)

 	service.currentRunnerCount = 1

 	mockKubeManager.On("UpdateEphemeralRunnerWithJobInfo", ctx, service.settings.Namespace, "runner1", "owner1", "repo1", ".github/workflows/ci.yaml", "job1", int64(100), int64(3)).Run(func(args mock.Arguments) { cancel() }).Return(nil).Once()
 	mockRsClient.On("AcquireJobsForRunnerScaleSet", ctx, mock.MatchedBy(func(ids []int64) bool { return len(ids) == 0 })).Return(nil).Once()

-	err := service.processMessage(&actions.RunnerScaleSetMessage{
+	err = service.processMessage(&actions.RunnerScaleSetMessage{
 		MessageId:   1,
 		MessageType: "RunnerScaleSetJobMessages",
 		Statistics: &actions.RunnerScaleSetStatistic{
@@ -606,7 +632,7 @@ func TestProcessMessage_JobStartedMessageIgnoreRunnerUpdateError(t *testing.T) {

 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
-	service := NewService(
+	service, err := NewService(
 		ctx,
 		mockRsClient,
 		mockKubeManager,
@@ -620,12 +646,14 @@ func TestProcessMessage_JobStartedMessageIgnoreRunnerUpdateError(t *testing.T) {
 			s.logger = logger
 		},
 	)
+	require.NoError(t, err)

 	service.currentRunnerCount = 1

 	mockKubeManager.On("UpdateEphemeralRunnerWithJobInfo", ctx, service.settings.Namespace, "runner1", "owner1", "repo1", ".github/workflows/ci.yaml", "job1", int64(100), int64(3)).Run(func(args mock.Arguments) { cancel() }).Return(fmt.Errorf("error")).Once()
 	mockRsClient.On("AcquireJobsForRunnerScaleSet", ctx, mock.MatchedBy(func(ids []int64) bool { return len(ids) == 0 })).Return(nil).Once()

-	err := service.processMessage(&actions.RunnerScaleSetMessage{
+	err = service.processMessage(&actions.RunnerScaleSetMessage{
 		MessageId:   1,
 		MessageType: "RunnerScaleSetJobMessages",
 		Statistics: &actions.RunnerScaleSetStatistic{
@@ -25,13 +25,17 @@ import (
 	"os"
 	"os/signal"
 	"syscall"
+	"time"

 	"github.com/actions/actions-runner-controller/build"
 	"github.com/actions/actions-runner-controller/github/actions"
 	"github.com/actions/actions-runner-controller/logging"
 	"github.com/go-logr/logr"
 	"github.com/kelseyhightower/envconfig"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
 	"golang.org/x/net/http/httpproxy"
+	"golang.org/x/sync/errgroup"
 )

 type RunnerScaleSetListenerConfig struct {
@@ -45,9 +49,12 @@ type RunnerScaleSetListenerConfig struct {
 	MaxRunners         int    `split_words:"true"`
 	MinRunners         int    `split_words:"true"`
 	RunnerScaleSetId   int    `split_words:"true"`
+	RunnerScaleSetName string `split_words:"true"`
 	ServerRootCA       string `split_words:"true"`
 	LogLevel           string `split_words:"true"`
 	LogFormat          string `split_words:"true"`
+	MetricsAddr        string `split_words:"true"`
+	MetricsEndpoint    string `split_words:"true"`
 }

 func main() {
@@ -79,17 +86,95 @@ func main() {
 		os.Exit(1)
 	}

-	if err := run(rc, logger); err != nil {
-		logger.Error(err, "Run error")
+	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
+	defer stop()
+
+	g, ctx := errgroup.WithContext(ctx)
+
+	g.Go(func() error {
+		opts := runOptions{
+			serviceOptions: []func(*Service){
+				WithLogger(logger),
+			},
+		}
+		opts.serviceOptions = append(opts.serviceOptions, WithPrometheusMetrics(rc))
+
+		return run(ctx, rc, logger, opts)
+	})
+
+	if len(rc.MetricsAddr) != 0 {
+		g.Go(func() error {
+			metricsServer := metricsServer{
+				rc:     rc,
+				logger: logger,
+			}
+			g.Go(func() error {
+				<-ctx.Done()
+				return metricsServer.shutdown()
+			})
+			return metricsServer.listenAndServe()
+		})
+	}
+
+	if err := g.Wait(); err != nil {
+		logger.Error(err, "Error encountered")
 		os.Exit(1)
 	}
 }
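main is now structured around an errgroup: one goroutine runs the listener, one serves metrics, and a third blocks on ctx.Done() to trigger a graceful server shutdown, so g.Wait() returns as soon as any of them fails or a signal cancels the context. A stripped-down sketch of the same coordination, with a hypothetical server address:

package main

import (
	"context"
	"net/http"
	"os/signal"
	"syscall"
	"time"

	"golang.org/x/sync/errgroup"
)

func main() {
	// Root context is cancelled on SIGINT or SIGTERM.
	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer stop()

	srv := &http.Server{Addr: ":8080"} // hypothetical address

	g, ctx := errgroup.WithContext(ctx)
	g.Go(srv.ListenAndServe) // serve until shut down
	g.Go(func() error {      // shut down once the context is done
		<-ctx.Done()
		sctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
		defer cancel()
		return srv.Shutdown(sctx)
	})
	// The first non-nil error (typically http.ErrServerClosed after a
	// signal) unblocks Wait and cancels the sibling goroutines' context.
	_ = g.Wait()
}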
-func run(rc RunnerScaleSetListenerConfig, logger logr.Logger) error {
-	// Create root context and hook with sigint and sigterm
-	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
-	defer stop()
+
+type metricsServer struct {
+	rc     RunnerScaleSetListenerConfig
+	logger logr.Logger
+	srv    *http.Server
+}
+
+func (s *metricsServer) shutdown() error {
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+	defer cancel()
+	return s.srv.Shutdown(ctx)
+}
+
+func (s *metricsServer) listenAndServe() error {
+	reg := prometheus.NewRegistry()
+	reg.MustRegister(
+		// availableJobs,
+		// acquiredJobs,
+		assignedJobs,
+		runningJobs,
+		registeredRunners,
+		busyRunners,
+		minRunners,
+		maxRunners,
+		desiredRunners,
+		idleRunners,
+		startedJobsTotal,
+		completedJobsTotal,
+		// jobQueueDurationSeconds,
+		jobStartupDurationSeconds,
+		jobExecutionDurationSeconds,
+	)
+
+	mux := http.NewServeMux()
+	mux.Handle(
+		s.rc.MetricsEndpoint,
+		promhttp.HandlerFor(reg, promhttp.HandlerOpts{Registry: reg}),
+	)
+
+	s.srv = &http.Server{
+		Addr:    s.rc.MetricsAddr,
+		Handler: mux,
+	}
+
+	s.logger.Info("Starting metrics server", "address", s.srv.Addr)
+	return s.srv.ListenAndServe()
+}
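With the server running, the registry is exposed in the Prometheus text exposition format at MetricsEndpoint on MetricsAddr. A small client sketch for checking the endpoint by hand; the address and path here are assumptions standing in for whatever the deployment's environment configuration supplies:

package main

import (
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Assumed values for MetricsAddr (":8080") and MetricsEndpoint
	// ("/metrics"); substitute your deployment's configuration.
	resp, err := http.Get("http://localhost:8080/metrics")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	// Prints exposition-format lines such as:
	//   gha_assigned_jobs{name="my-set",namespace="arc",...} 3
	fmt.Print(string(body))
}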
+
+type runOptions struct {
+	serviceOptions []func(*Service)
+}
+
+func run(ctx context.Context, rc RunnerScaleSetListenerConfig, logger logr.Logger, opts runOptions) error {
 	creds := &actions.ActionsAuth{}
 	if rc.Token != "" {
 		creds.Token = rc.Token
@@ -131,9 +216,10 @@ func run(rc RunnerScaleSetListenerConfig, logger logr.Logger) error {
 		MinRunners: rc.MinRunners,
 	}

-	service := NewService(ctx, autoScalerClient, kubeManager, scaleSettings, func(s *Service) {
-		s.logger = logger.WithName("service")
-	})
+	service, err := NewService(ctx, autoScalerClient, kubeManager, scaleSettings, opts.serviceOptions...)
+	if err != nil {
+		return fmt.Errorf("failed to create new service: %v", err)
+	}

 	// Start listening for messages
 	if err = service.Start(); err != nil {
cmd/githubrunnerscalesetlistener/metrics.go (new file, 330 lines)
@@ -0,0 +1,330 @@
package main

import (
	"strconv"

	"github.com/actions/actions-runner-controller/github/actions"
	"github.com/prometheus/client_golang/prometheus"
)

// label names
const (
	labelKeyRunnerScaleSetName      = "name"
	labelKeyRunnerScaleSetNamespace = "namespace"
	labelKeyEnterprise              = "enterprise"
	labelKeyOrganization            = "organization"
	labelKeyRepository              = "repository"
	labelKeyJobName                 = "job_name"
	labelKeyJobWorkflowRef          = "job_workflow_ref"
	labelKeyEventName               = "event_name"
	labelKeyJobResult               = "job_result"
	labelKeyRunnerID                = "runner_id"
	labelKeyRunnerName              = "runner_name"
)

const githubScaleSetSubsystem = "gha"

// labels
var (
	scaleSetLabels = []string{
		labelKeyRunnerScaleSetName,
		labelKeyRepository,
		labelKeyOrganization,
		labelKeyEnterprise,
		labelKeyRunnerScaleSetNamespace,
	}

	jobLabels = []string{
		labelKeyRepository,
		labelKeyOrganization,
		labelKeyEnterprise,
		labelKeyJobName,
		labelKeyJobWorkflowRef,
		labelKeyEventName,
	}

	completedJobsTotalLabels   = append(jobLabels, labelKeyJobResult, labelKeyRunnerID, labelKeyRunnerName)
	jobExecutionDurationLabels = append(jobLabels, labelKeyJobResult, labelKeyRunnerID, labelKeyRunnerName)
	startedJobsTotalLabels     = append(jobLabels, labelKeyRunnerID, labelKeyRunnerName)
	jobStartupDurationLabels   = append(jobLabels, labelKeyRunnerID, labelKeyRunnerName)
)
// metrics
var (
	// availableJobs = prometheus.NewGaugeVec(
	// 	prometheus.GaugeOpts{
	// 		Subsystem: githubScaleSetSubsystem,
	// 		Name:      "available_jobs",
	// 		Help:      "Number of jobs with `runs-on` matching the runner scale set name. Jobs are not yet assigned to the runner scale set.",
	// 	},
	// 	scaleSetLabels,
	// )
	//
	// acquiredJobs = prometheus.NewGaugeVec(
	// 	prometheus.GaugeOpts{
	// 		Subsystem: githubScaleSetSubsystem,
	// 		Name:      "acquired_jobs",
	// 		Help:      "Number of jobs acquired by the scale set.",
	// 	},
	// 	scaleSetLabels,
	// )

	assignedJobs = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: githubScaleSetSubsystem,
			Name:      "assigned_jobs",
			Help:      "Number of jobs assigned to this scale set.",
		},
		scaleSetLabels,
	)

	runningJobs = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: githubScaleSetSubsystem,
			Name:      "running_jobs",
			Help:      "Number of jobs running (or about to be run).",
		},
		scaleSetLabels,
	)

	registeredRunners = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: githubScaleSetSubsystem,
			Name:      "registered_runners",
			Help:      "Number of runners registered by the scale set.",
		},
		scaleSetLabels,
	)

	busyRunners = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: githubScaleSetSubsystem,
			Name:      "busy_runners",
			Help:      "Number of registered runners running a job.",
		},
		scaleSetLabels,
	)

	minRunners = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: githubScaleSetSubsystem,
			Name:      "min_runners",
			Help:      "Minimum number of runners.",
		},
		scaleSetLabels,
	)

	maxRunners = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: githubScaleSetSubsystem,
			Name:      "max_runners",
			Help:      "Maximum number of runners.",
		},
		scaleSetLabels,
	)

	desiredRunners = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: githubScaleSetSubsystem,
			Name:      "desired_runners",
			Help:      "Number of runners desired by the scale set.",
		},
		scaleSetLabels,
	)

	idleRunners = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: githubScaleSetSubsystem,
			Name:      "idle_runners",
			Help:      "Number of registered runners not running a job.",
		},
		scaleSetLabels,
	)

	startedJobsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: githubScaleSetSubsystem,
			Name:      "started_jobs_total",
			Help:      "Total number of jobs started.",
		},
		startedJobsTotalLabels,
	)

	completedJobsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name:      "completed_jobs_total",
			Help:      "Total number of jobs completed.",
			Subsystem: githubScaleSetSubsystem,
		},
		completedJobsTotalLabels,
	)

	// jobQueueDurationSeconds = prometheus.NewHistogramVec(
	// 	prometheus.HistogramOpts{
	// 		Subsystem: githubScaleSetSubsystem,
	// 		Name:      "job_queue_duration_seconds",
	// 		Help:      "Time spent waiting for workflow jobs to get assigned to the scale set after queueing (in seconds).",
	// 		Buckets:   runtimeBuckets,
	// 	},
	// 	jobLabels,
	// )

	jobStartupDurationSeconds = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Subsystem: githubScaleSetSubsystem,
			Name:      "job_startup_duration_seconds",
			Help:      "Time spent waiting for workflow job to get started on the runner owned by the scale set (in seconds).",
			Buckets:   runtimeBuckets,
		},
		jobStartupDurationLabels,
	)

	jobExecutionDurationSeconds = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Subsystem: githubScaleSetSubsystem,
			Name:      "job_execution_duration_seconds",
			Help:      "Time spent executing workflow jobs by the scale set (in seconds).",
			Buckets:   runtimeBuckets,
		},
		jobExecutionDurationLabels,
	)
)
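Every gauge above is a GaugeVec, which keeps one time series per combination of label values and is written via With(labels).Set(value). A self-contained sketch of that flow with an abbreviated label set:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// assignedJobsDemo mirrors the assigned_jobs gauge above, trimmed to
	// two labels for brevity.
	assignedJobsDemo := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: "gha",
			Name:      "assigned_jobs",
			Help:      "Number of jobs assigned to this scale set.",
		},
		[]string{"name", "namespace"},
	)

	reg := prometheus.NewRegistry()
	reg.MustRegister(assignedJobsDemo)

	// Each distinct label combination creates and updates one time series.
	assignedJobsDemo.With(prometheus.Labels{"name": "my-set", "namespace": "arc"}).Set(3)

	mfs, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range mfs {
		// Prints: gha_assigned_jobs = 3
		fmt.Println(mf.GetName(), "=", mf.GetMetric()[0].GetGauge().GetValue())
	}
}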
var runtimeBuckets []float64 = []float64{
	0.01,
	0.05,
	0.1,
	0.5,
	1,
	2,
	3,
	4,
	5,
	6,
	7,
	8,
	9,
	10,
	12,
	15,
	18,
	20,
	25,
	30,
	40,
	50,
	60,
	70,
	80,
	90,
	100,
	110,
	120,
	150,
	180,
	210,
	240,
	300,
	360,
	420,
	480,
	540,
	600,
	900,
	1200,
	1800,
	2400,
	3000,
	3600,
}
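The buckets are dense below ten seconds and stretch out to an hour, which suits the spread of job startup and execution times. Note the exporter derives durations by subtracting Unix() timestamps, so observations land on whole seconds and the sub-second buckets mostly catch zero-length events. A sketch of one observation against a trimmed version of these buckets:

package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Trimmed stand-in for runtimeBuckets; same idea, fewer boundaries.
	startupDemo := prometheus.NewHistogram(prometheus.HistogramOpts{
		Subsystem: "gha",
		Name:      "job_startup_duration_seconds",
		Help:      "Demo histogram with coarse buckets.",
		Buckets:   []float64{1, 5, 10, 30, 60, 300},
	})

	reg := prometheus.NewRegistry()
	reg.MustRegister(startupDemo)

	// Whole-second observation, like the exporter's Unix() subtraction;
	// a roughly 7s startup is counted in the 10s bucket and above.
	assigned := time.Now().Add(-7 * time.Second)
	started := time.Now()
	startupDemo.Observe(float64(started.Unix() - assigned.Unix()))

	mfs, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	h := mfs[0].GetMetric()[0].GetHistogram()
	fmt.Println("count:", h.GetSampleCount(), "sum:", h.GetSampleSum())
}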
type metricsExporter struct {
	// Initialized during creation.
	baseLabels
}

type baseLabels struct {
	scaleSetName      string
	scaleSetNamespace string
	enterprise        string
	organization      string
	repository        string
}

func (b *baseLabels) jobLabels(jobBase *actions.JobMessageBase) prometheus.Labels {
	return prometheus.Labels{
		labelKeyEnterprise:     b.enterprise,
		labelKeyOrganization:   b.organization,
		labelKeyRepository:     b.repository,
		labelKeyJobName:        jobBase.JobDisplayName,
		labelKeyJobWorkflowRef: jobBase.JobWorkflowRef,
		labelKeyEventName:      jobBase.EventName,
	}
}

func (b *baseLabels) scaleSetLabels() prometheus.Labels {
	return prometheus.Labels{
		labelKeyRunnerScaleSetName:      b.scaleSetName,
		labelKeyRunnerScaleSetNamespace: b.scaleSetNamespace,
		labelKeyEnterprise:              b.enterprise,
		labelKeyOrganization:            b.organization,
		labelKeyRepository:              b.repository,
	}
}

func (b *baseLabels) completedJobLabels(msg *actions.JobCompleted) prometheus.Labels {
	l := b.jobLabels(&msg.JobMessageBase)
	l[labelKeyRunnerID] = strconv.Itoa(msg.RunnerId)
	l[labelKeyJobResult] = msg.Result
	l[labelKeyRunnerName] = msg.RunnerName
	return l
}

func (b *baseLabels) startedJobLabels(msg *actions.JobStarted) prometheus.Labels {
	l := b.jobLabels(&msg.JobMessageBase)
	l[labelKeyRunnerID] = strconv.Itoa(msg.RunnerId)
	l[labelKeyRunnerName] = msg.RunnerName
	return l
}

func (m *metricsExporter) withBaseLabels(base baseLabels) {
	m.baseLabels = base
}

func (m *metricsExporter) publishStatistics(stats *actions.RunnerScaleSetStatistic) {
	l := m.scaleSetLabels()

	// availableJobs.With(l).Set(float64(stats.TotalAvailableJobs))
	// acquiredJobs.With(l).Set(float64(stats.TotalAcquiredJobs))
	assignedJobs.With(l).Set(float64(stats.TotalAssignedJobs))
	runningJobs.With(l).Set(float64(stats.TotalRunningJobs))
	registeredRunners.With(l).Set(float64(stats.TotalRegisteredRunners))
	busyRunners.With(l).Set(float64(stats.TotalBusyRunners))
	idleRunners.With(l).Set(float64(stats.TotalIdleRunners))
}

func (m *metricsExporter) publishJobStarted(msg *actions.JobStarted) {
	l := m.startedJobLabels(msg)
	startedJobsTotal.With(l).Inc()

	startupDuration := msg.JobMessageBase.RunnerAssignTime.Unix() - msg.JobMessageBase.ScaleSetAssignTime.Unix()
	jobStartupDurationSeconds.With(l).Observe(float64(startupDuration))
}

// func (m *metricsExporter) publishJobAssigned(msg *actions.JobAssigned) {
// 	l := m.jobLabels(&msg.JobMessageBase)
// 	queueDuration := msg.JobMessageBase.ScaleSetAssignTime.Unix() - msg.JobMessageBase.QueueTime.Unix()
// 	jobQueueDurationSeconds.With(l).Observe(float64(queueDuration))
// }

func (m *metricsExporter) publishJobCompleted(msg *actions.JobCompleted) {
	l := m.completedJobLabels(msg)
	completedJobsTotal.With(l).Inc()

	executionDuration := msg.JobMessageBase.FinishTime.Unix() - msg.JobMessageBase.RunnerAssignTime.Unix()
	jobExecutionDurationSeconds.With(l).Observe(float64(executionDuration))
}