Mirror of https://github.com/actions/actions-runner-controller.git, synced 2025-12-11 03:57:01 +00:00
Expose workflow job metrics via new actions-metrics-server (#2057)
* Add workflow job metrics to the GitHub webhook server
* Fix handling of workflow_job.Conclusion
* Make the Prometheus metrics exporter for the workflow jobs a dedicated application
* chart: Add support for deploying actions-metrics-server
* A few improvements to make it easy to cover in E2E
* chart: Add missing actionsmetrics.service.yaml
* chart: Do not modify actionsMetricsServer.replicaCount
* chart: Add documentation for actionsMetrics and actionsMetricsServer

Co-authored-by: Colin Heathman <cheathman@benchsci.com>
227	pkg/actionsmetrics/event_reader.go	Normal file
@@ -0,0 +1,227 @@
package actionsmetrics

import (
	"bufio"
	"context"
	"fmt"
	"net/http"
	"regexp"
	"strings"
	"time"

	"github.com/go-logr/logr"
	gogithub "github.com/google/go-github/v47/github"
	"github.com/prometheus/client_golang/prometheus"

	"github.com/actions-runner-controller/actions-runner-controller/github"
)

// EventReader consumes workflow_job webhook events and records Prometheus metrics for them.
type EventReader struct {
	Log logr.Logger

	// GitHub Client to fetch information about job failures
	GitHubClient *github.Client

	// Event queue
	Events chan interface{}
}

// HandleWorkflowJobEvent sends an event to the reader channel for processing.
//
// Forcing the events through a channel ensures they are processed sequentially,
// and prevents race conditions when updating the workflow job metrics.
func (reader *EventReader) HandleWorkflowJobEvent(event interface{}) {
	reader.Events <- event
}

// ProcessWorkflowJobEvents pops events in a loop for processing.
//
// Should be called asynchronously with `go`.
func (reader *EventReader) ProcessWorkflowJobEvents(ctx context.Context) {
	for {
		select {
		case event := <-reader.Events:
			reader.ProcessWorkflowJobEvent(ctx, event)
		case <-ctx.Done():
			return
		}
	}
}

// ProcessWorkflowJobEvent processes a single event.
//
// Events should be processed in the same order that GitHub emits them.
func (reader *EventReader) ProcessWorkflowJobEvent(ctx context.Context, event interface{}) {
	e, ok := event.(*gogithub.WorkflowJobEvent)
	if !ok {
		return
	}

	// collect labels
	labels := make(prometheus.Labels)

	runsOn := strings.Join(e.WorkflowJob.Labels, `,`)
	labels["runs_on"] = runsOn
	labels["job_name"] = *e.WorkflowJob.Name

	// switch on job status
	switch action := e.GetAction(); action {
	case "queued":
		githubWorkflowJobsQueuedTotal.With(labels).Inc()

	case "in_progress":
		githubWorkflowJobsStartedTotal.With(labels).Inc()

		if reader.GitHubClient == nil {
			return
		}

		parseResult, err := reader.fetchAndParseWorkflowJobLogs(ctx, e)
		if err != nil {
			reader.Log.Error(err, "reading workflow job log")
			return
		} else {
			reader.Log.Info("reading workflow_job logs",
				"job_name", *e.WorkflowJob.Name,
				"job_id", fmt.Sprint(*e.WorkflowJob.ID),
			)
		}

		githubWorkflowJobQueueDurationSeconds.With(labels).Observe(parseResult.QueueTime.Seconds())

	case "completed":
		githubWorkflowJobsCompletedTotal.With(labels).Inc()

		// job_conclusion -> (neutral, success, skipped, cancelled, timed_out, action_required, failure)
		githubWorkflowJobConclusionsTotal.With(extraLabel("job_conclusion", *e.WorkflowJob.Conclusion, labels)).Inc()

		parseResult, err := reader.fetchAndParseWorkflowJobLogs(ctx, e)
		if err != nil {
			reader.Log.Error(err, "reading workflow job log")
			return
		} else {
			reader.Log.Info("reading workflow_job logs",
				"job_name", *e.WorkflowJob.Name,
				"job_id", fmt.Sprint(*e.WorkflowJob.ID),
			)
		}

		if *e.WorkflowJob.Conclusion == "failure" {
			failedStep := "null"
			for i, step := range e.WorkflowJob.Steps {
				// *step.Conclusion is one of:
				// "success", "failure", "neutral", "cancelled",
				// "skipped", "timed_out", "action_required", or null
				if *step.Conclusion == "failure" {
					failedStep = fmt.Sprint(i)
					break
				}
				if *step.Conclusion == "timed_out" {
					failedStep = fmt.Sprint(i)
					parseResult.ExitCode = "timed_out"
					break
				}
			}
			githubWorkflowJobFailuresTotal.With(
				extraLabel("failed_step", failedStep,
					extraLabel("exit_code", parseResult.ExitCode, labels),
				),
			).Inc()
		}

		githubWorkflowJobRunDurationSeconds.With(extraLabel("job_conclusion", *e.WorkflowJob.Conclusion, labels)).Observe(parseResult.RunTime.Seconds())
	}
}

// extraLabel returns a copy of labels with key=value added, leaving the original map untouched.
func extraLabel(key string, value string, labels prometheus.Labels) prometheus.Labels {
	fixedLabels := make(prometheus.Labels)
	for k, v := range labels {
		fixedLabels[k] = v
	}
	fixedLabels[key] = value
	return fixedLabels
}

// ParseResult holds the values extracted from a workflow job log.
type ParseResult struct {
	ExitCode  string
	QueueTime time.Duration
	RunTime   time.Duration
}

var logLine = regexp.MustCompile(`^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{7}Z)\s(.+)$`)
var exitCodeLine = regexp.MustCompile(`##\[error\]Process completed with exit code (\d+)\.`)

func (reader *EventReader) fetchAndParseWorkflowJobLogs(ctx context.Context, e *gogithub.WorkflowJobEvent) (*ParseResult, error) {
	owner := *e.Repo.Owner.Login
	repo := *e.Repo.Name
	id := *e.WorkflowJob.ID
	url, _, err := reader.GitHubClient.Actions.GetWorkflowJobLogs(ctx, owner, repo, id, true)
	if err != nil {
		return nil, err
	}
	jobLogs, err := http.DefaultClient.Get(url.String())
	if err != nil {
		return nil, err
	}

	exitCode := "null"

	var (
		queuedTime    time.Time
		startedTime   time.Time
		completedTime time.Time
	)

	func() {
		// Read jobLogs.Body line by line
		defer jobLogs.Body.Close()
		lines := bufio.NewScanner(jobLogs.Body)

		for lines.Scan() {
			matches := logLine.FindStringSubmatch(lines.Text())
			if matches == nil {
				continue
			}
			timestamp := matches[1]
			line := matches[2]

			if strings.HasPrefix(line, "##[error]") {
				// Get exit code
				exitCodeMatch := exitCodeLine.FindStringSubmatch(line)
				if exitCodeMatch != nil {
					exitCode = exitCodeMatch[1]
				}
				continue
			}

			if strings.HasPrefix(line, "Waiting for a runner to pick up this job...") {
				queuedTime, _ = time.Parse(time.RFC3339, timestamp)
				continue
			}

			if strings.HasPrefix(line, "Job is about to start running on the runner:") {
				startedTime, _ = time.Parse(time.RFC3339, timestamp)
				continue
			}

			// Last line in the log will count as the completed time
			completedTime, _ = time.Parse(time.RFC3339, timestamp)
		}
	}()

	return &ParseResult{
		ExitCode:  exitCode,
		QueueTime: startedTime.Sub(queuedTime),
		RunTime:   completedTime.Sub(startedTime),
	}, nil
}
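For reference, the log parsing above keys on GitHub's timestamped job log format. The following is a minimal, self-contained sketch, not part of this commit, showing what logLine and exitCodeLine extract; the sample log line is illustrative only.

package main

import (
	"fmt"
	"regexp"
)

var (
	logLine      = regexp.MustCompile(`^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{7}Z)\s(.+)$`)
	exitCodeLine = regexp.MustCompile(`##\[error\]Process completed with exit code (\d+)\.`)
)

func main() {
	// Hypothetical line in the format returned by GetWorkflowJobLogs.
	sample := "2022-10-14T07:00:03.1234567Z ##[error]Process completed with exit code 2."

	if m := logLine.FindStringSubmatch(sample); m != nil {
		timestamp, line := m[1], m[2]
		fmt.Println("timestamp:", timestamp) // fed to time.Parse in the real code
		if code := exitCodeLine.FindStringSubmatch(line); code != nil {
			fmt.Println("exit code:", code[1]) // "2", recorded as the exit_code label
		}
	}
}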
126	pkg/actionsmetrics/metrics.go	Normal file
@@ -0,0 +1,126 @@
// Package actionsmetrics provides monitoring of GitHub Actions related metrics.
//
// This depends on the metrics exporter of kubebuilder.
// See https://book.kubebuilder.io/reference/metrics.html for details.
package actionsmetrics

import (
	"github.com/prometheus/client_golang/prometheus"
	"sigs.k8s.io/controller-runtime/pkg/metrics"
)

func init() {
	metrics.Registry.MustRegister(
		githubWorkflowJobQueueDurationSeconds,
		githubWorkflowJobRunDurationSeconds,
		githubWorkflowJobConclusionsTotal,
		githubWorkflowJobsQueuedTotal,
		githubWorkflowJobsStartedTotal,
		githubWorkflowJobsCompletedTotal,
		githubWorkflowJobFailuresTotal,
	)
}

// runtimeBuckets covers durations from 10ms up to one hour, with finer resolution at the short end.
var runtimeBuckets = []float64{
	0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
	12, 15, 18, 20, 25, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120,
	150, 180, 210, 240, 300, 360, 420, 480, 540, 600,
	900, 1200, 1800, 2400, 3000, 3600,
}

var (
	githubWorkflowJobQueueDurationSeconds = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "github_workflow_job_queue_duration_seconds",
			Help:    "Queue times for workflow jobs in seconds",
			Buckets: runtimeBuckets,
		},
		[]string{"runs_on", "job_name"},
	)
	githubWorkflowJobRunDurationSeconds = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "github_workflow_job_run_duration_seconds",
			Help:    "Run times for workflow jobs in seconds",
			Buckets: runtimeBuckets,
		},
		[]string{"runs_on", "job_name", "job_conclusion"},
	)
	githubWorkflowJobConclusionsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "github_workflow_job_conclusions_total",
			Help: "Conclusions for tracked workflow jobs",
		},
		[]string{"runs_on", "job_name", "job_conclusion"},
	)
	githubWorkflowJobsQueuedTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "github_workflow_jobs_queued_total",
			Help: "Total count of workflow jobs queued (events where job_status=queued)",
		},
		[]string{"runs_on", "job_name"},
	)
	githubWorkflowJobsStartedTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "github_workflow_jobs_started_total",
			Help: "Total count of workflow jobs started (events where job_status=in_progress)",
		},
		[]string{"runs_on", "job_name"},
	)
	githubWorkflowJobsCompletedTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "github_workflow_jobs_completed_total",
			Help: "Total count of workflow jobs completed (events where job_status=completed)",
		},
		[]string{"runs_on", "job_name"},
	)
	githubWorkflowJobFailuresTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "github_workflow_job_failures_total",
			Help: "Total count of workflow job failures, labelled by failed step and exit code",
		},
		[]string{"runs_on", "job_name", "failed_step", "exit_code"},
	)
)
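Because these collectors are registered with controller-runtime's global metrics.Registry, any binary that serves that registry exposes them on its /metrics endpoint. The sketch below is an assumption about that wiring, not code from this commit; the listen address is illustrative, and the real actions-metrics-server configures this through its own flags.

package main

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"
	"sigs.k8s.io/controller-runtime/pkg/metrics"

	// Imported for side effects: the package's init() registers the workflow job metrics.
	_ "github.com/actions-runner-controller/actions-runner-controller/pkg/actionsmetrics"
)

func main() {
	mux := http.NewServeMux()
	// Serve controller-runtime's registry, which now includes the github_workflow_job_* metrics.
	mux.Handle("/metrics", promhttp.HandlerFor(metrics.Registry, promhttp.HandlerOpts{}))
	_ = http.ListenAndServe(":8080", mux) // illustrative address
}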
157	pkg/actionsmetrics/webhookserver.go	Normal file
@@ -0,0 +1,157 @@
package actionsmetrics

/*
Copyright 2022 The actions-runner-controller authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

import (
	"context"
	"fmt"
	"io"
	"net/http"

	"sigs.k8s.io/controller-runtime/pkg/reconcile"

	"github.com/go-logr/logr"
	gogithub "github.com/google/go-github/v47/github"
	ctrl "sigs.k8s.io/controller-runtime"

	"github.com/actions-runner-controller/actions-runner-controller/github"
)

// EventHook is a callback that receives every parsed webhook event.
type EventHook func(interface{})

// WebhookServer is an HTTP server that handles workflow_job events sent from GitHub Actions
type WebhookServer struct {
	Log logr.Logger

	// SecretKeyBytes is the byte representation of the webhook secret token
	// that the administrator generated and specified in the GitHub Web UI.
	SecretKeyBytes []byte

	// GitHub Client to discover runner groups assigned to a repository
	GitHubClient *github.Client

	// When the WebhookServer handles a request, each EventHook is sent the webhook event
	EventHooks []EventHook
}

func (server *WebhookServer) Reconcile(_ context.Context, request reconcile.Request) (reconcile.Result, error) {
	return ctrl.Result{}, nil
}

func (server *WebhookServer) Handle(w http.ResponseWriter, r *http.Request) {
	var (
		ok bool

		err error
	)

	defer func() {
		if !ok {
			w.WriteHeader(http.StatusInternalServerError)

			if err != nil {
				msg := err.Error()
				if written, err := w.Write([]byte(msg)); err != nil {
					server.Log.V(1).Error(err, "failed writing http error response", "msg", msg, "written", written)
				}
			}
		}
	}()

	defer func() {
		if r.Body != nil {
			r.Body.Close()
		}
	}()

	// respond ok to GET / e.g. for health check
	if r.Method == http.MethodGet {
		ok = true
		fmt.Fprintln(w, "actions-metrics-server is running")
		return
	}

	var payload []byte

	if len(server.SecretKeyBytes) > 0 {
		payload, err = gogithub.ValidatePayload(r, server.SecretKeyBytes)
		if err != nil {
			server.Log.Error(err, "error validating request body")

			return
		}
	} else {
		payload, err = io.ReadAll(r.Body)
		if err != nil {
			server.Log.Error(err, "error reading request body")

			return
		}
	}

	webhookType := gogithub.WebHookType(r)
	event, err := gogithub.ParseWebHook(webhookType, payload)
	if err != nil {
		var s string
		if payload != nil {
			s = string(payload)
		}

		server.Log.Error(err, "could not parse webhook", "webhookType", webhookType, "payload", s)

		return
	}

	log := server.Log.WithValues(
		"event", webhookType,
		"hookID", r.Header.Get("X-GitHub-Hook-ID"),
		"delivery", r.Header.Get("X-GitHub-Delivery"),
	)

	switch event.(type) {
	case *gogithub.PingEvent:
		ok = true

		w.WriteHeader(http.StatusOK)

		msg := "pong"

		if written, err := w.Write([]byte(msg)); err != nil {
			log.Error(err, "failed writing http response", "msg", msg, "written", written)
		}

		log.Info("handled ping event")

		return
	}

	for _, eventHook := range server.EventHooks {
		eventHook(event)
	}

	ok = true

	w.WriteHeader(http.StatusOK)

	msg := "ok"

	log.Info(msg)

	if written, err := w.Write([]byte(msg)); err != nil {
		log.Error(err, "failed writing http response", "msg", msg, "written", written)
	}
}
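Putting the pieces together, the dedicated actions-metrics-server presumably mounts WebhookServer.Handle on an HTTP listener and forwards each parsed event to an EventReader, roughly along these lines. This sketch is illustrative only: the listen address, webhook path, channel buffer size, and the GITHUB_WEBHOOK_SECRET_TOKEN environment variable are assumptions, not taken from this commit.

package main

import (
	"context"
	"net/http"
	"os"

	"sigs.k8s.io/controller-runtime/pkg/log/zap"

	"github.com/actions-runner-controller/actions-runner-controller/pkg/actionsmetrics"
)

func main() {
	log := zap.New()

	// EventReader drains workflow_job events sequentially and records the metrics.
	reader := &actionsmetrics.EventReader{
		Log:    log.WithName("event-reader"),
		Events: make(chan interface{}, 100), // buffered so webhook handling rarely blocks
		// GitHubClient is optional; without it, log-based queue/run durations are skipped.
	}
	go reader.ProcessWorkflowJobEvents(context.Background())

	// WebhookServer validates and parses deliveries, then fans them out to the hooks.
	server := &actionsmetrics.WebhookServer{
		Log:            log.WithName("webhook-server"),
		SecretKeyBytes: []byte(os.Getenv("GITHUB_WEBHOOK_SECRET_TOKEN")), // hypothetical env var
		EventHooks:     []actionsmetrics.EventHook{reader.HandleWorkflowJobEvent},
	}

	mux := http.NewServeMux()
	mux.HandleFunc("/", server.Handle)
	_ = http.ListenAndServe(":8000", mux) // illustrative address
}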