Provide scale-set listener metrics (#2559)

Co-authored-by: Tingluo Huang <tingluohuang@github.com>
Co-authored-by: Bassem Dghaidi <568794+Link-@users.noreply.github.com>
Author: Nikola Jokic
Date: 2023-08-21 13:50:07 +02:00
Committed by: GitHub
Commit: a0a3916c80 (parent 1c360d7e26)
20 changed files with 975 additions and 427 deletions


@@ -3,6 +3,7 @@ package main
import (
"context"
"encoding/json"
"errors"
"fmt"
"math"
"strings"
@@ -25,6 +26,31 @@ type Service struct {
kubeManager KubernetesManager
settings *ScaleSettings
currentRunnerCount int
metricsExporter metricsExporter
errs []error
}
func WithPrometheusMetrics(conf RunnerScaleSetListenerConfig) func(*Service) {
return func(svc *Service) {
parsedURL, err := actions.ParseGitHubConfigFromURL(conf.ConfigureUrl)
if err != nil {
svc.errs = append(svc.errs, err)
}
svc.metricsExporter.withBaseLabels(baseLabels{
scaleSetName: conf.EphemeralRunnerSetName,
scaleSetNamespace: conf.EphemeralRunnerSetNamespace,
enterprise: parsedURL.Enterprise,
organization: parsedURL.Organization,
repository: parsedURL.Repository,
})
}
}
func WithLogger(logger logr.Logger) func(*Service) {
return func(s *Service) {
s.logger = logger.WithName("service")
}
}
func NewService(
@@ -33,7 +59,7 @@ func NewService(
manager KubernetesManager,
settings *ScaleSettings,
options ...func(*Service),
) *Service {
) (*Service, error) {
s := &Service{
ctx: ctx,
rsClient: rsClient,
@@ -47,7 +73,11 @@ func NewService(
option(s)
}
return s
if len(s.errs) > 0 {
return nil, errors.Join(s.errs...)
}
return s, nil
}
func (s *Service) Start() error {
@@ -81,6 +111,8 @@ func (s *Service) processMessage(message *actions.RunnerScaleSetMessage) error {
"busy runners", message.Statistics.TotalBusyRunners,
"idle runners", message.Statistics.TotalIdleRunners)
s.metricsExporter.publishStatistics(message.Statistics)
if message.MessageType != "RunnerScaleSetJobMessages" {
s.logger.Info("skip message with unknown message type.", "messageType", message.MessageType)
return nil
@@ -110,27 +142,54 @@ func (s *Service) processMessage(message *actions.RunnerScaleSetMessage) error {
if err := json.Unmarshal(message, &jobAvailable); err != nil {
return fmt.Errorf("could not decode job available message. %w", err)
}
s.logger.Info("job available message received.", "RequestId", jobAvailable.RunnerRequestId)
s.logger.Info(
"job available message received.",
"RequestId",
jobAvailable.RunnerRequestId,
)
availableJobs = append(availableJobs, jobAvailable.RunnerRequestId)
case "JobAssigned":
var jobAssigned actions.JobAssigned
if err := json.Unmarshal(message, &jobAssigned); err != nil {
return fmt.Errorf("could not decode job assigned message. %w", err)
}
s.logger.Info("job assigned message received.", "RequestId", jobAssigned.RunnerRequestId)
s.logger.Info(
"job assigned message received.",
"RequestId",
jobAssigned.RunnerRequestId,
)
// s.metricsExporter.publishJobAssigned(&jobAssigned)
case "JobStarted":
var jobStarted actions.JobStarted
if err := json.Unmarshal(message, &jobStarted); err != nil {
return fmt.Errorf("could not decode job started message. %w", err)
}
s.logger.Info("job started message received.", "RequestId", jobStarted.RunnerRequestId, "RunnerId", jobStarted.RunnerId)
s.logger.Info(
"job started message received.",
"RequestId",
jobStarted.RunnerRequestId,
"RunnerId",
jobStarted.RunnerId,
)
s.metricsExporter.publishJobStarted(&jobStarted)
s.updateJobInfoForRunner(jobStarted)
case "JobCompleted":
var jobCompleted actions.JobCompleted
if err := json.Unmarshal(message, &jobCompleted); err != nil {
return fmt.Errorf("could not decode job completed message. %w", err)
}
s.logger.Info("job completed message received.", "RequestId", jobCompleted.RunnerRequestId, "Result", jobCompleted.Result, "RunnerId", jobCompleted.RunnerId, "RunnerName", jobCompleted.RunnerName)
s.logger.Info(
"job completed message received.",
"RequestId",
jobCompleted.RunnerRequestId,
"Result",
jobCompleted.Result,
"RunnerId",
jobCompleted.RunnerId,
"RunnerName",
jobCompleted.RunnerName,
)
s.metricsExporter.publishJobCompleted(&jobCompleted)
default:
s.logger.Info("unknown job message type.", "messageType", messageType.MessageType)
}
@@ -146,13 +205,15 @@ func (s *Service) processMessage(message *actions.RunnerScaleSetMessage) error {
func (s *Service) scaleForAssignedJobCount(count int) error {
targetRunnerCount := int(math.Max(math.Min(float64(s.settings.MaxRunners), float64(count)), float64(s.settings.MinRunners)))
s.metricsExporter.publishDesiredRunners(targetRunnerCount)
if targetRunnerCount != s.currentRunnerCount {
s.logger.Info("try scale runner request up/down base on assigned job count",
"assigned job", count,
"decision", targetRunnerCount,
"min", s.settings.MinRunners,
"max", s.settings.MaxRunners,
"currentRunnerCount", s.currentRunnerCount)
"currentRunnerCount", s.currentRunnerCount,
)
err := s.kubeManager.ScaleEphemeralRunnerSet(s.ctx, s.settings.Namespace, s.settings.ResourceName, targetRunnerCount)
if err != nil {
return fmt.Errorf("could not scale ephemeral runner set (%s/%s). %w", s.settings.Namespace, s.settings.ResourceName, err)
@@ -173,7 +234,8 @@ func (s *Service) updateJobInfoForRunner(jobInfo actions.JobStarted) {
"workflowRef", jobInfo.JobWorkflowRef,
"workflowRunId", jobInfo.WorkflowRunId,
"jobDisplayName", jobInfo.JobDisplayName,
"requestId", jobInfo.RunnerRequestId)
"requestId", jobInfo.RunnerRequestId,
)
err := s.kubeManager.UpdateEphemeralRunnerWithJobInfo(s.ctx, s.settings.Namespace, jobInfo.RunnerName, jobInfo.OwnerName, jobInfo.RepositoryName, jobInfo.JobWorkflowRef, jobInfo.JobDisplayName, jobInfo.WorkflowRunId, jobInfo.RunnerRequestId)
if err != nil {
s.logger.Error(err, "could not update ephemeral runner with job info", "runnerName", jobInfo.RunnerName, "requestId", jobInfo.RunnerRequestId)
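
The constructor change above lets functional options record failures in `Service.errs`, which `NewService` then joins with `errors.Join` before returning. A self-contained sketch of that same pattern in isolation (the `widget` type, `withName` option, and error text are invented for illustration and are not part of the commit):

```go
package main

import (
	"errors"
	"fmt"
)

// widget mimics Service: options append to errs instead of returning errors.
type widget struct {
	name string
	errs []error
}

// withName is a functional option that records an invalid argument.
func withName(name string) func(*widget) {
	return func(w *widget) {
		if name == "" {
			w.errs = append(w.errs, errors.New("name must not be empty"))
			return
		}
		w.name = name
	}
}

// newWidget applies every option first, then surfaces all failures at once.
func newWidget(options ...func(*widget)) (*widget, error) {
	w := &widget{}
	for _, option := range options {
		option(w)
	}
	if len(w.errs) > 0 {
		return nil, errors.Join(w.errs...)
	}
	return w, nil
}

func main() {
	if _, err := newWidget(withName("")); err != nil {
		fmt.Println(err) // name must not be empty
	}
}
```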


@@ -21,7 +21,7 @@ func TestNewService(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
service := NewService(
service, err := NewService(
ctx,
mockRsClient,
mockKubeManager,
@@ -36,6 +36,7 @@ func TestNewService(t *testing.T) {
},
)
require.NoError(t, err)
assert.Equal(t, logger, service.logger)
}
@@ -47,7 +48,7 @@ func TestStart(t *testing.T) {
require.NoError(t, log_err, "Error creating logger")
ctx, cancel := context.WithCancel(context.Background())
service := NewService(
service, err := NewService(
ctx,
mockRsClient,
mockKubeManager,
@@ -61,9 +62,11 @@ func TestStart(t *testing.T) {
s.logger = logger
},
)
require.NoError(t, err)
mockRsClient.On("GetRunnerScaleSetMessage", service.ctx, mock.Anything).Run(func(args mock.Arguments) { cancel() }).Return(nil).Once()
err := service.Start()
err = service.Start()
assert.NoError(t, err, "Unexpected error")
assert.True(t, mockRsClient.AssertExpectations(t), "All expectations should be met")
@@ -79,7 +82,7 @@ func TestStart_ScaleToMinRunners(t *testing.T) {
require.NoError(t, log_err, "Error creating logger")
ctx, cancel := context.WithCancel(context.Background())
service := NewService(
service, err := NewService(
ctx,
mockRsClient,
mockKubeManager,
@@ -93,6 +96,7 @@ func TestStart_ScaleToMinRunners(t *testing.T) {
s.logger = logger
},
)
require.NoError(t, err)
mockRsClient.On("GetRunnerScaleSetMessage", ctx, mock.Anything).Run(func(args mock.Arguments) {
_ = service.scaleForAssignedJobCount(5)
@@ -100,9 +104,9 @@ func TestStart_ScaleToMinRunners(t *testing.T) {
mockKubeManager.On("ScaleEphemeralRunnerSet", ctx, service.settings.Namespace, service.settings.ResourceName, 5).Run(func(args mock.Arguments) { cancel() }).Return(nil).Once()
err := service.Start()
err = service.Start()
assert.NoError(t, err, "Unexpected error")
assert.True(t, mockRsClient.AssertExpectations(t), "All expectations should be met")
assert.True(t, mockKubeManager.AssertExpectations(t), "All expectations should be met")
}
@@ -116,7 +120,7 @@ func TestStart_ScaleToMinRunnersFailed(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
service := NewService(
service, err := NewService(
ctx,
mockRsClient,
mockKubeManager,
@@ -130,13 +134,14 @@ func TestStart_ScaleToMinRunnersFailed(t *testing.T) {
s.logger = logger
},
)
require.NoError(t, err)
c := mockKubeManager.On("ScaleEphemeralRunnerSet", ctx, service.settings.Namespace, service.settings.ResourceName, 5).Return(fmt.Errorf("error")).Once()
mockRsClient.On("GetRunnerScaleSetMessage", ctx, mock.Anything).Run(func(args mock.Arguments) {
_ = service.scaleForAssignedJobCount(5)
}).Return(c.ReturnArguments.Get(0))
err := service.Start()
err = service.Start()
assert.ErrorContains(t, err, "could not get and process message", "Unexpected error")
assert.True(t, mockRsClient.AssertExpectations(t), "All expectations should be met")
@@ -151,7 +156,7 @@ func TestStart_GetMultipleMessages(t *testing.T) {
require.NoError(t, log_err, "Error creating logger")
ctx, cancel := context.WithCancel(context.Background())
service := NewService(
service, err := NewService(
ctx,
mockRsClient,
mockKubeManager,
@@ -165,10 +170,12 @@ func TestStart_GetMultipleMessages(t *testing.T) {
s.logger = logger
},
)
require.NoError(t, err)
mockRsClient.On("GetRunnerScaleSetMessage", service.ctx, mock.Anything).Return(nil).Times(5)
mockRsClient.On("GetRunnerScaleSetMessage", service.ctx, mock.Anything).Run(func(args mock.Arguments) { cancel() }).Return(nil).Once()
err := service.Start()
err = service.Start()
assert.NoError(t, err, "Unexpected error")
assert.True(t, mockRsClient.AssertExpectations(t), "All expectations should be met")
@@ -184,7 +191,7 @@ func TestStart_ErrorOnMessage(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
service := NewService(
service, err := NewService(
ctx,
mockRsClient,
mockKubeManager,
@@ -198,10 +205,12 @@ func TestStart_ErrorOnMessage(t *testing.T) {
s.logger = logger
},
)
require.NoError(t, err)
mockRsClient.On("GetRunnerScaleSetMessage", service.ctx, mock.Anything).Return(nil).Times(2)
mockRsClient.On("GetRunnerScaleSetMessage", service.ctx, mock.Anything).Return(fmt.Errorf("error")).Once()
err := service.Start()
err = service.Start()
assert.ErrorContains(t, err, "could not get and process message. error", "Unexpected error")
assert.True(t, mockRsClient.AssertExpectations(t), "All expectations should be met")
@@ -217,7 +226,7 @@ func TestProcessMessage_NoStatistic(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
service := NewService(
service, err := NewService(
ctx,
mockRsClient,
mockKubeManager,
@@ -231,8 +240,9 @@ func TestProcessMessage_NoStatistic(t *testing.T) {
s.logger = logger
},
)
require.NoError(t, err)
err := service.processMessage(&actions.RunnerScaleSetMessage{
err = service.processMessage(&actions.RunnerScaleSetMessage{
MessageId: 1,
MessageType: "test",
Body: "test",
@@ -252,7 +262,7 @@ func TestProcessMessage_IgnoreUnknownMessageType(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
service := NewService(
service, err := NewService(
ctx,
mockRsClient,
mockKubeManager,
@@ -266,8 +276,9 @@ func TestProcessMessage_IgnoreUnknownMessageType(t *testing.T) {
s.logger = logger
},
)
require.NoError(t, err)
err := service.processMessage(&actions.RunnerScaleSetMessage{
err = service.processMessage(&actions.RunnerScaleSetMessage{
MessageId: 1,
MessageType: "unknown",
Statistics: &actions.RunnerScaleSetStatistic{
@@ -290,7 +301,7 @@ func TestProcessMessage_InvalidBatchMessageJson(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
service := NewService(
service, err := NewService(
ctx,
mockRsClient,
mockKubeManager,
@@ -305,7 +316,9 @@ func TestProcessMessage_InvalidBatchMessageJson(t *testing.T) {
},
)
err := service.processMessage(&actions.RunnerScaleSetMessage{
require.NoError(t, err)
err = service.processMessage(&actions.RunnerScaleSetMessage{
MessageId: 1,
MessageType: "RunnerScaleSetJobMessages",
Statistics: &actions.RunnerScaleSetStatistic{
@@ -328,7 +341,7 @@ func TestProcessMessage_InvalidJobMessageJson(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
service := NewService(
service, err := NewService(
ctx,
mockRsClient,
mockKubeManager,
@@ -342,8 +355,9 @@ func TestProcessMessage_InvalidJobMessageJson(t *testing.T) {
s.logger = logger
},
)
require.NoError(t, err)
err := service.processMessage(&actions.RunnerScaleSetMessage{
err = service.processMessage(&actions.RunnerScaleSetMessage{
MessageId: 1,
MessageType: "RunnerScaleSetJobMessages",
Statistics: &actions.RunnerScaleSetStatistic{
@@ -366,7 +380,7 @@ func TestProcessMessage_MultipleMessages(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
service := NewService(
service, err := NewService(
ctx,
mockRsClient,
mockKubeManager,
@@ -380,10 +394,12 @@ func TestProcessMessage_MultipleMessages(t *testing.T) {
s.logger = logger
},
)
require.NoError(t, err)
mockRsClient.On("AcquireJobsForRunnerScaleSet", ctx, mock.MatchedBy(func(ids []int64) bool { return ids[0] == 3 && ids[1] == 4 })).Return(nil).Once()
mockKubeManager.On("ScaleEphemeralRunnerSet", ctx, service.settings.Namespace, service.settings.ResourceName, 2).Run(func(args mock.Arguments) { cancel() }).Return(nil).Once()
err := service.processMessage(&actions.RunnerScaleSetMessage{
err = service.processMessage(&actions.RunnerScaleSetMessage{
MessageId: 1,
MessageType: "RunnerScaleSetJobMessages",
Statistics: &actions.RunnerScaleSetStatistic{
@@ -407,7 +423,7 @@ func TestProcessMessage_AcquireJobsFailed(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
service := NewService(
service, err := NewService(
ctx,
mockRsClient,
mockKubeManager,
@@ -421,9 +437,11 @@ func TestProcessMessage_AcquireJobsFailed(t *testing.T) {
s.logger = logger
},
)
require.NoError(t, err)
mockRsClient.On("AcquireJobsForRunnerScaleSet", ctx, mock.MatchedBy(func(ids []int64) bool { return ids[0] == 1 })).Return(fmt.Errorf("error")).Once()
err := service.processMessage(&actions.RunnerScaleSetMessage{
err = service.processMessage(&actions.RunnerScaleSetMessage{
MessageId: 1,
MessageType: "RunnerScaleSetJobMessages",
Statistics: &actions.RunnerScaleSetStatistic{
@@ -447,7 +465,7 @@ func TestScaleForAssignedJobCount_DeDupScale(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
service := NewService(
service, err := NewService(
ctx,
mockRsClient,
mockKubeManager,
@@ -461,9 +479,11 @@ func TestScaleForAssignedJobCount_DeDupScale(t *testing.T) {
s.logger = logger
},
)
require.NoError(t, err)
mockKubeManager.On("ScaleEphemeralRunnerSet", ctx, service.settings.Namespace, service.settings.ResourceName, 2).Return(nil).Once()
err := service.scaleForAssignedJobCount(2)
err = service.scaleForAssignedJobCount(2)
require.NoError(t, err, "Unexpected error")
err = service.scaleForAssignedJobCount(2)
require.NoError(t, err, "Unexpected error")
@@ -486,7 +506,7 @@ func TestScaleForAssignedJobCount_ScaleWithinMinMax(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
service := NewService(
service, err := NewService(
ctx,
mockRsClient,
mockKubeManager,
@@ -500,13 +520,15 @@ func TestScaleForAssignedJobCount_ScaleWithinMinMax(t *testing.T) {
s.logger = logger
},
)
require.NoError(t, err)
mockKubeManager.On("ScaleEphemeralRunnerSet", ctx, service.settings.Namespace, service.settings.ResourceName, 1).Return(nil).Once()
mockKubeManager.On("ScaleEphemeralRunnerSet", ctx, service.settings.Namespace, service.settings.ResourceName, 3).Return(nil).Once()
mockKubeManager.On("ScaleEphemeralRunnerSet", ctx, service.settings.Namespace, service.settings.ResourceName, 5).Return(nil).Once()
mockKubeManager.On("ScaleEphemeralRunnerSet", ctx, service.settings.Namespace, service.settings.ResourceName, 1).Return(nil).Once()
mockKubeManager.On("ScaleEphemeralRunnerSet", ctx, service.settings.Namespace, service.settings.ResourceName, 5).Return(nil).Once()
err := service.scaleForAssignedJobCount(0)
err = service.scaleForAssignedJobCount(0)
require.NoError(t, err, "Unexpected error")
err = service.scaleForAssignedJobCount(3)
require.NoError(t, err, "Unexpected error")
@@ -531,7 +553,7 @@ func TestScaleForAssignedJobCount_ScaleFailed(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
service := NewService(
service, err := NewService(
ctx,
mockRsClient,
mockKubeManager,
@@ -545,9 +567,11 @@ func TestScaleForAssignedJobCount_ScaleFailed(t *testing.T) {
s.logger = logger
},
)
require.NoError(t, err)
mockKubeManager.On("ScaleEphemeralRunnerSet", ctx, service.settings.Namespace, service.settings.ResourceName, 2).Return(fmt.Errorf("error"))
err := service.scaleForAssignedJobCount(2)
err = service.scaleForAssignedJobCount(2)
assert.ErrorContains(t, err, "could not scale ephemeral runner set (namespace/resource). error", "Unexpected error")
assert.True(t, mockRsClient.AssertExpectations(t), "All expectations should be met")
@@ -563,7 +587,7 @@ func TestProcessMessage_JobStartedMessage(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
service := NewService(
service, err := NewService(
ctx,
mockRsClient,
mockKubeManager,
@@ -577,12 +601,14 @@ func TestProcessMessage_JobStartedMessage(t *testing.T) {
s.logger = logger
},
)
require.NoError(t, err)
service.currentRunnerCount = 1
mockKubeManager.On("UpdateEphemeralRunnerWithJobInfo", ctx, service.settings.Namespace, "runner1", "owner1", "repo1", ".github/workflows/ci.yaml", "job1", int64(100), int64(3)).Run(func(args mock.Arguments) { cancel() }).Return(nil).Once()
mockRsClient.On("AcquireJobsForRunnerScaleSet", ctx, mock.MatchedBy(func(ids []int64) bool { return len(ids) == 0 })).Return(nil).Once()
err := service.processMessage(&actions.RunnerScaleSetMessage{
err = service.processMessage(&actions.RunnerScaleSetMessage{
MessageId: 1,
MessageType: "RunnerScaleSetJobMessages",
Statistics: &actions.RunnerScaleSetStatistic{
@@ -606,7 +632,7 @@ func TestProcessMessage_JobStartedMessageIgnoreRunnerUpdateError(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
service := NewService(
service, err := NewService(
ctx,
mockRsClient,
mockKubeManager,
@@ -620,12 +646,14 @@ func TestProcessMessage_JobStartedMessageIgnoreRunnerUpdateError(t *testing.T) {
s.logger = logger
},
)
require.NoError(t, err)
service.currentRunnerCount = 1
mockKubeManager.On("UpdateEphemeralRunnerWithJobInfo", ctx, service.settings.Namespace, "runner1", "owner1", "repo1", ".github/workflows/ci.yaml", "job1", int64(100), int64(3)).Run(func(args mock.Arguments) { cancel() }).Return(fmt.Errorf("error")).Once()
mockRsClient.On("AcquireJobsForRunnerScaleSet", ctx, mock.MatchedBy(func(ids []int64) bool { return len(ids) == 0 })).Return(nil).Once()
err := service.processMessage(&actions.RunnerScaleSetMessage{
err = service.processMessage(&actions.RunnerScaleSetMessage{
MessageId: 1,
MessageType: "RunnerScaleSetJobMessages",
Statistics: &actions.RunnerScaleSetStatistic{


@@ -25,13 +25,17 @@ import (
"os"
"os/signal"
"syscall"
"time"
"github.com/actions/actions-runner-controller/build"
"github.com/actions/actions-runner-controller/github/actions"
"github.com/actions/actions-runner-controller/logging"
"github.com/go-logr/logr"
"github.com/kelseyhightower/envconfig"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"golang.org/x/net/http/httpproxy"
"golang.org/x/sync/errgroup"
)
type RunnerScaleSetListenerConfig struct {
@@ -45,9 +49,12 @@ type RunnerScaleSetListenerConfig struct {
MaxRunners int `split_words:"true"`
MinRunners int `split_words:"true"`
RunnerScaleSetId int `split_words:"true"`
RunnerScaleSetName string `split_words:"true"`
ServerRootCA string `split_words:"true"`
LogLevel string `split_words:"true"`
LogFormat string `split_words:"true"`
MetricsAddr string `split_words:"true"`
MetricsEndpoint string `split_words:"true"`
}
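
The two new fields are populated by envconfig, so with `split_words:"true"` they resolve to `<PREFIX>_METRICS_ADDR` and `<PREFIX>_METRICS_ENDPOINT`, where the prefix is whatever `main()` passes to `envconfig.Process`. A self-contained sketch using a hypothetical `listener` prefix (not the listener's real prefix):

```go
package main

import (
	"fmt"
	"os"

	"github.com/kelseyhightower/envconfig"
)

// config mirrors only the two new listener fields.
type config struct {
	MetricsAddr     string `split_words:"true"`
	MetricsEndpoint string `split_words:"true"`
}

func main() {
	// Hypothetical variable names; the real names depend on the listener's prefix.
	os.Setenv("LISTENER_METRICS_ADDR", "0.0.0.0:8080")
	os.Setenv("LISTENER_METRICS_ENDPOINT", "/metrics")

	var c config
	if err := envconfig.Process("listener", &c); err != nil {
		panic(err)
	}
	fmt.Println(c.MetricsAddr, c.MetricsEndpoint) // 0.0.0.0:8080 /metrics
}
```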
func main() {
@@ -79,17 +86,95 @@ func main() {
os.Exit(1)
}
if err := run(rc, logger); err != nil {
logger.Error(err, "Run error")
ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
defer stop()
g, ctx := errgroup.WithContext(ctx)
g.Go(func() error {
opts := runOptions{
serviceOptions: []func(*Service){
WithLogger(logger),
},
}
opts.serviceOptions = append(opts.serviceOptions, WithPrometheusMetrics(rc))
return run(ctx, rc, logger, opts)
})
if len(rc.MetricsAddr) != 0 {
g.Go(func() error {
metricsServer := metricsServer{
rc: rc,
logger: logger,
}
g.Go(func() error {
<-ctx.Done()
return metricsServer.shutdown()
})
return metricsServer.listenAndServe()
})
}
if err := g.Wait(); err != nil {
logger.Error(err, "Error encountered")
os.Exit(1)
}
}
func run(rc RunnerScaleSetListenerConfig, logger logr.Logger) error {
// Create root context and hook with sigint and sigterm
ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
defer stop()
type metricsServer struct {
rc RunnerScaleSetListenerConfig
logger logr.Logger
srv *http.Server
}
func (s *metricsServer) shutdown() error {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
return s.srv.Shutdown(ctx)
}
func (s *metricsServer) listenAndServe() error {
reg := prometheus.NewRegistry()
reg.MustRegister(
// availableJobs,
// acquiredJobs,
assignedJobs,
runningJobs,
registeredRunners,
busyRunners,
minRunners,
maxRunners,
desiredRunners,
idleRunners,
startedJobsTotal,
completedJobsTotal,
// jobQueueDurationSeconds,
jobStartupDurationSeconds,
jobExecutionDurationSeconds,
)
mux := http.NewServeMux()
mux.Handle(
s.rc.MetricsEndpoint,
promhttp.HandlerFor(reg, promhttp.HandlerOpts{Registry: reg}),
)
s.srv = &http.Server{
Addr: s.rc.MetricsAddr,
Handler: mux,
}
s.logger.Info("Starting metrics server", "address", s.srv.Addr)
return s.srv.ListenAndServe()
}
type runOptions struct {
serviceOptions []func(*Service)
}
func run(ctx context.Context, rc RunnerScaleSetListenerConfig, logger logr.Logger, opts runOptions) error {
// Create root context and hook with sigint and sigterm
creds := &actions.ActionsAuth{}
if rc.Token != "" {
creds.Token = rc.Token
@@ -131,9 +216,10 @@ func run(rc RunnerScaleSetListenerConfig, logger logr.Logger) error {
MinRunners: rc.MinRunners,
}
service := NewService(ctx, autoScalerClient, kubeManager, scaleSettings, func(s *Service) {
s.logger = logger.WithName("service")
})
service, err := NewService(ctx, autoScalerClient, kubeManager, scaleSettings, opts.serviceOptions...)
if err != nil {
return fmt.Errorf("failed to create new service: %v", err)
}
// Start listening for messages
if err = service.Start(); err != nil {
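
Taken together, `main()` now runs the listener and the metrics server under one errgroup and shuts the HTTP server down once the signal context is canceled. A self-contained sketch of just that lifecycle (the `:8080` address and `/metrics` path are placeholders; the 2-second shutdown timeout mirrors `metricsServer.shutdown`):

```go
package main

import (
	"context"
	"errors"
	"net/http"
	"os/signal"
	"syscall"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
	"golang.org/x/sync/errgroup"
)

func main() {
	// Root context is canceled by SIGINT/SIGTERM, as in the listener's main().
	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer stop()

	reg := prometheus.NewRegistry()
	mux := http.NewServeMux()
	mux.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{Registry: reg}))
	srv := &http.Server{Addr: ":8080", Handler: mux}

	g, ctx := errgroup.WithContext(ctx)
	g.Go(func() error {
		// Stop serving once the root context is done.
		<-ctx.Done()
		shutdownCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
		defer cancel()
		return srv.Shutdown(shutdownCtx)
	})
	g.Go(srv.ListenAndServe)

	if err := g.Wait(); err != nil && !errors.Is(err, http.ErrServerClosed) {
		panic(err)
	}
}
```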


@@ -0,0 +1,330 @@
package main
import (
"strconv"
"github.com/actions/actions-runner-controller/github/actions"
"github.com/prometheus/client_golang/prometheus"
)
// label names
const (
labelKeyRunnerScaleSetName = "name"
labelKeyRunnerScaleSetNamespace = "namespace"
labelKeyEnterprise = "enterprise"
labelKeyOrganization = "organization"
labelKeyRepository = "repository"
labelKeyJobName = "job_name"
labelKeyJobWorkflowRef = "job_workflow_ref"
labelKeyEventName = "event_name"
labelKeyJobResult = "job_result"
labelKeyRunnerID = "runner_id"
labelKeyRunnerName = "runner_name"
)
const githubScaleSetSubsystem = "gha"
// labels
var (
scaleSetLabels = []string{
labelKeyRunnerScaleSetName,
labelKeyRepository,
labelKeyOrganization,
labelKeyEnterprise,
labelKeyRunnerScaleSetNamespace,
}
jobLabels = []string{
labelKeyRepository,
labelKeyOrganization,
labelKeyEnterprise,
labelKeyJobName,
labelKeyJobWorkflowRef,
labelKeyEventName,
}
completedJobsTotalLabels = append(jobLabels, labelKeyJobResult, labelKeyRunnerID, labelKeyRunnerName)
jobExecutionDurationLabels = append(jobLabels, labelKeyJobResult, labelKeyRunnerID, labelKeyRunnerName)
startedJobsTotalLabels = append(jobLabels, labelKeyRunnerID, labelKeyRunnerName)
jobStartupDurationLabels = append(jobLabels, labelKeyRunnerID, labelKeyRunnerName)
)
// metrics
var (
// availableJobs = prometheus.NewGaugeVec(
// prometheus.GaugeOpts{
// Subsystem: githubScaleSetSubsystem,
// Name: "available_jobs",
// Help: "Number of jobs with `runs-on` matching the runner scale set name. Jobs are not yet assigned to the runner scale set.",
// },
// scaleSetLabels,
// )
//
// acquiredJobs = prometheus.NewGaugeVec(
// prometheus.GaugeOpts{
// Subsystem: githubScaleSetSubsystem,
// Name: "acquired_jobs",
// Help: "Number of jobs acquired by the scale set.",
// },
// scaleSetLabels,
// )
assignedJobs = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: githubScaleSetSubsystem,
Name: "assigned_jobs",
Help: "Number of jobs assigned to this scale set.",
},
scaleSetLabels,
)
runningJobs = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: githubScaleSetSubsystem,
Name: "running_jobs",
Help: "Number of jobs running (or about to be run).",
},
scaleSetLabels,
)
registeredRunners = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: githubScaleSetSubsystem,
Name: "registered_runners",
Help: "Number of runners registered by the scale set.",
},
scaleSetLabels,
)
busyRunners = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: githubScaleSetSubsystem,
Name: "busy_runners",
Help: "Number of registered runners running a job.",
},
scaleSetLabels,
)
minRunners = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: githubScaleSetSubsystem,
Name: "min_runners",
Help: "Minimum number of runners.",
},
scaleSetLabels,
)
maxRunners = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: githubScaleSetSubsystem,
Name: "max_runners",
Help: "Maximum number of runners.",
},
scaleSetLabels,
)
desiredRunners = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: githubScaleSetSubsystem,
Name: "desired_runners",
Help: "Number of runners desired by the scale set.",
},
scaleSetLabels,
)
idleRunners = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: githubScaleSetSubsystem,
Name: "idle_runners",
Help: "Number of registered runners not running a job.",
},
scaleSetLabels,
)
startedJobsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Subsystem: githubScaleSetSubsystem,
Name: "started_jobs_total",
Help: "Total number of jobs started.",
},
startedJobsTotalLabels,
)
completedJobsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "completed_jobs_total",
Help: "Total number of jobs completed.",
Subsystem: githubScaleSetSubsystem,
},
completedJobsTotalLabels,
)
// jobQueueDurationSeconds = prometheus.NewHistogramVec(
// prometheus.HistogramOpts{
// Subsystem: githubScaleSetSubsystem,
// Name: "job_queue_duration_seconds",
// Help: "Time spent waiting for workflow jobs to get assigned to the scale set after queueing (in seconds).",
// Buckets: runtimeBuckets,
// },
// jobLabels,
// )
jobStartupDurationSeconds = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Subsystem: githubScaleSetSubsystem,
Name: "job_startup_duration_seconds",
Help: "Time spent waiting for workflow job to get started on the runner owned by the scale set (in seconds).",
Buckets: runtimeBuckets,
},
jobStartupDurationLabels,
)
jobExecutionDurationSeconds = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Subsystem: githubScaleSetSubsystem,
Name: "job_execution_duration_seconds",
Help: "Time spent executing workflow jobs by the scale set (in seconds).",
Buckets: runtimeBuckets,
},
jobExecutionDurationLabels,
)
)
var runtimeBuckets []float64 = []float64{
0.01,
0.05,
0.1,
0.5,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
12,
15,
18,
20,
25,
30,
40,
50,
60,
70,
80,
90,
100,
110,
120,
150,
180,
210,
240,
300,
360,
420,
480,
540,
600,
900,
1200,
1800,
2400,
3000,
3600,
}
type metricsExporter struct {
// Initialized during creation.
baseLabels
}
type baseLabels struct {
scaleSetName string
scaleSetNamespace string
enterprise string
organization string
repository string
}
func (b *baseLabels) jobLabels(jobBase *actions.JobMessageBase) prometheus.Labels {
return prometheus.Labels{
labelKeyEnterprise: b.enterprise,
labelKeyOrganization: b.organization,
labelKeyRepository: b.repository,
labelKeyJobName: jobBase.JobDisplayName,
labelKeyJobWorkflowRef: jobBase.JobWorkflowRef,
labelKeyEventName: jobBase.EventName,
}
}
func (b *baseLabels) scaleSetLabels() prometheus.Labels {
return prometheus.Labels{
labelKeyRunnerScaleSetName: b.scaleSetName,
labelKeyRunnerScaleSetNamespace: b.scaleSetNamespace,
labelKeyEnterprise: b.enterprise,
labelKeyOrganization: b.organization,
labelKeyRepository: b.repository,
}
}
func (b *baseLabels) completedJobLabels(msg *actions.JobCompleted) prometheus.Labels {
l := b.jobLabels(&msg.JobMessageBase)
l[labelKeyRunnerID] = strconv.Itoa(msg.RunnerId)
l[labelKeyJobResult] = msg.Result
l[labelKeyRunnerName] = msg.RunnerName
return l
}
func (b *baseLabels) startedJobLabels(msg *actions.JobStarted) prometheus.Labels {
l := b.jobLabels(&msg.JobMessageBase)
l[labelKeyRunnerID] = strconv.Itoa(msg.RunnerId)
l[labelKeyRunnerName] = msg.RunnerName
return l
}
func (m *metricsExporter) withBaseLabels(base baseLabels) {
m.baseLabels = base
}
func (m *metricsExporter) publishStatistics(stats *actions.RunnerScaleSetStatistic) {
l := m.scaleSetLabels()
// availableJobs.With(l).Set(float64(stats.TotalAvailableJobs))
// acquiredJobs.With(l).Set(float64(stats.TotalAcquiredJobs))
assignedJobs.With(l).Set(float64(stats.TotalAssignedJobs))
runningJobs.With(l).Set(float64(stats.TotalRunningJobs))
registeredRunners.With(l).Set(float64(stats.TotalRegisteredRunners))
busyRunners.With(l).Set(float64(stats.TotalBusyRunners))
idleRunners.With(l).Set(float64(stats.TotalIdleRunners))
}
func (m *metricsExporter) publishJobStarted(msg *actions.JobStarted) {
l := m.startedJobLabels(msg)
startedJobsTotal.With(l).Inc()
startupDuration := msg.JobMessageBase.RunnerAssignTime.Unix() - msg.JobMessageBase.ScaleSetAssignTime.Unix()
jobStartupDurationSeconds.With(l).Observe(float64(startupDuration))
}
// func (m *metricsExporter) publishJobAssigned(msg *actions.JobAssigned) {
// l := m.jobLabels(&msg.JobMessageBase)
// queueDuration := msg.JobMessageBase.ScaleSetAssignTime.Unix() - msg.JobMessageBase.QueueTime.Unix()
// jobQueueDurationSeconds.With(l).Observe(float64(queueDuration))
// }
func (m *metricsExporter) publishJobCompleted(msg *actions.JobCompleted) {
l := m.completedJobLabels(msg)
completedJobsTotal.With(l).Inc()
executionDuration := msg.JobMessageBase.FinishTime.Unix() - msg.JobMessageBase.RunnerAssignTime.Unix()
jobExecutionDurationSeconds.With(l).Observe(float64(executionDuration))
}
func (m *metricsExporter) publishDesiredRunners(count int) {
desiredRunners.With(m.scaleSetLabels()).Set(float64(count))
}
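
To see how the exporter's fixed base labels flow into the metric families above, here is a minimal test-style sketch that could sit in the same package as the listener. The label values and statistic counts are invented, and the check uses `prometheus/testutil` instead of an HTTP scrape:

```go
package main

import (
	"testing"

	"github.com/actions/actions-runner-controller/github/actions"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func TestMetricsExporterPublishesStatistics(t *testing.T) {
	// Base labels are set once; every published metric reuses them.
	var exporter metricsExporter
	exporter.withBaseLabels(baseLabels{
		scaleSetName:      "arc-runner-set",
		scaleSetNamespace: "arc-runners",
		organization:      "my-org",
		repository:        "my-repo",
	})

	// publishStatistics sets the scale-set gauges (assigned, running, idle, ...).
	exporter.publishStatistics(&actions.RunnerScaleSetStatistic{
		TotalAssignedJobs:      3,
		TotalRunningJobs:       2,
		TotalRegisteredRunners: 4,
		TotalBusyRunners:       2,
		TotalIdleRunners:       2,
	})

	if got := testutil.ToFloat64(assignedJobs.With(exporter.scaleSetLabels())); got != 3 {
		t.Fatalf("expected gha_assigned_jobs gauge to be 3, got %v", got)
	}
}
```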