Expose workflow job metrics via new actions-metrics-server (#2057)

* Add workflow job metrics to Github webhook server

* Fix handling of workflow_job.Conclusion

* Make the prometheus metrics exporter for the workflow jobs a dedicated application

* chart: Add support for deploying actions-metrics-server

* A few improvements to make it easy to cover in E2E

* chart: Add missing actionsmetrics.service.yaml

* chart: Do not modify actionsMetricsServer.replicaCount

* chart: Add documentation for actionsMetrics and actionsMetricsServer

Co-authored-by: Colin Heathman <cheathman@benchsci.com>
This commit is contained in:
Yusuke Kuoka
2022-12-10 08:24:28 +09:00
committed by GitHub
parent 0285da1a32
commit 300e93c59d
16 changed files with 1019 additions and 2 deletions

View File

@@ -0,0 +1,227 @@
package actionsmetrics
import (
"bufio"
"context"
"fmt"
"net/http"
"regexp"
"strings"
"time"
"github.com/go-logr/logr"
gogithub "github.com/google/go-github/v47/github"
"github.com/prometheus/client_golang/prometheus"
"github.com/actions-runner-controller/actions-runner-controller/github"
)
type EventReader struct {
Log logr.Logger
// GitHub Client to fetch information about job failures
GitHubClient *github.Client
// Event queue
Events chan interface{}
}
// HandleWorkflowJobEvent send event to reader channel for processing
//
// forcing the events through a channel ensures they are processed in sequentially,
// and prevents any race conditions with githubWorkflowJobStatus
func (reader *EventReader) HandleWorkflowJobEvent(event interface{}) {
reader.Events <- event
}
// ProcessWorkflowJobEvents pop events in a loop for processing
//
// Should be called asynchronously with `go`
func (reader *EventReader) ProcessWorkflowJobEvents(ctx context.Context) {
for {
select {
case event := <-reader.Events:
reader.ProcessWorkflowJobEvent(ctx, event)
case <-ctx.Done():
return
}
}
}
// ProcessWorkflowJobEvent processes a single event
//
// Events should be processed in the same order that Github emits them
func (reader *EventReader) ProcessWorkflowJobEvent(ctx context.Context, event interface{}) {
e, ok := event.(*gogithub.WorkflowJobEvent)
if !ok {
return
}
// collect labels
labels := make(prometheus.Labels)
runsOn := strings.Join(e.WorkflowJob.Labels, `,`)
labels["runs_on"] = runsOn
labels["job_name"] = *e.WorkflowJob.Name
// switch on job status
switch action := e.GetAction(); action {
case "queued":
githubWorkflowJobsQueuedTotal.With(labels).Inc()
case "in_progress":
githubWorkflowJobsStartedTotal.With(labels).Inc()
if reader.GitHubClient == nil {
return
}
parseResult, err := reader.fetchAndParseWorkflowJobLogs(ctx, e)
if err != nil {
reader.Log.Error(err, "reading workflow job log")
return
} else {
reader.Log.Info("reading workflow_job logs",
"job_name", *e.WorkflowJob.Name,
"job_id", fmt.Sprint(*e.WorkflowJob.ID),
)
}
githubWorkflowJobQueueDurationSeconds.With(labels).Observe(parseResult.QueueTime.Seconds())
case "completed":
githubWorkflowJobsCompletedTotal.With(labels).Inc()
// job_conclusion -> (neutral, success, skipped, cancelled, timed_out, action_required, failure)
githubWorkflowJobConclusionsTotal.With(extraLabel("job_conclusion", *e.WorkflowJob.Conclusion, labels)).Inc()
parseResult, err := reader.fetchAndParseWorkflowJobLogs(ctx, e)
if err != nil {
reader.Log.Error(err, "reading workflow job log")
return
} else {
reader.Log.Info("reading workflow_job logs",
"job_name", *e.WorkflowJob.Name,
"job_id", fmt.Sprint(*e.WorkflowJob.ID),
)
}
if *e.WorkflowJob.Conclusion == "failure" {
failedStep := "null"
for i, step := range e.WorkflowJob.Steps {
// *step.Conclusion ~
// "success",
// "failure",
// "neutral",
// "cancelled",
// "skipped",
// "timed_out",
// "action_required",
// null
if *step.Conclusion == "failure" {
failedStep = fmt.Sprint(i)
break
}
if *step.Conclusion == "timed_out" {
failedStep = fmt.Sprint(i)
parseResult.ExitCode = "timed_out"
break
}
}
githubWorkflowJobFailuresTotal.With(
extraLabel("failed_step", failedStep,
extraLabel("exit_code", parseResult.ExitCode, labels),
),
).Inc()
}
githubWorkflowJobRunDurationSeconds.With(extraLabel("job_conclusion", *e.WorkflowJob.Conclusion, labels)).Observe(parseResult.RunTime.Seconds())
}
}
func extraLabel(key string, value string, labels prometheus.Labels) prometheus.Labels {
fixedLabels := make(prometheus.Labels)
for k, v := range labels {
fixedLabels[k] = v
}
fixedLabels[key] = value
return fixedLabels
}
type ParseResult struct {
ExitCode string
QueueTime time.Duration
RunTime time.Duration
}
var logLine = regexp.MustCompile(`^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}.\d{7}Z)\s(.+)$`)
var exitCodeLine = regexp.MustCompile(`##\[error\]Process completed with exit code (\d)\.`)
func (reader *EventReader) fetchAndParseWorkflowJobLogs(ctx context.Context, e *gogithub.WorkflowJobEvent) (*ParseResult, error) {
owner := *e.Repo.Owner.Login
repo := *e.Repo.Name
id := *e.WorkflowJob.ID
url, _, err := reader.GitHubClient.Actions.GetWorkflowJobLogs(ctx, owner, repo, id, true)
if err != nil {
return nil, err
}
jobLogs, err := http.DefaultClient.Get(url.String())
if err != nil {
return nil, err
}
exitCode := "null"
var (
queuedTime time.Time
startedTime time.Time
completedTime time.Time
)
func() {
// Read jobLogs.Body line by line
defer jobLogs.Body.Close()
lines := bufio.NewScanner(jobLogs.Body)
for lines.Scan() {
matches := logLine.FindStringSubmatch(lines.Text())
if matches == nil {
continue
}
timestamp := matches[1]
line := matches[2]
if strings.HasPrefix(line, "##[error]") {
// Get exit code
exitCodeMatch := exitCodeLine.FindStringSubmatch(line)
if exitCodeMatch != nil {
exitCode = exitCodeMatch[1]
}
continue
}
if strings.HasPrefix(line, "Waiting for a runner to pick up this job...") {
queuedTime, _ = time.Parse(time.RFC3339, timestamp)
continue
}
if strings.HasPrefix(line, "Job is about to start running on the runner:") {
startedTime, _ = time.Parse(time.RFC3339, timestamp)
continue
}
// Last line in the log will count as the completed time
completedTime, _ = time.Parse(time.RFC3339, timestamp)
}
}()
return &ParseResult{
ExitCode: exitCode,
QueueTime: startedTime.Sub(queuedTime),
RunTime: completedTime.Sub(startedTime),
}, nil
}

View File

@@ -0,0 +1,126 @@
// Package metrics provides monitoring of the GitHub related metrics.
//
// This depends on the metrics exporter of kubebuilder.
// See https://book.kubebuilder.io/reference/metrics.html for details.
package actionsmetrics
import (
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)
func init() {
metrics.Registry.MustRegister(
githubWorkflowJobQueueDurationSeconds,
githubWorkflowJobRunDurationSeconds,
githubWorkflowJobConclusionsTotal,
githubWorkflowJobsQueuedTotal,
githubWorkflowJobsStartedTotal,
githubWorkflowJobsCompletedTotal,
githubWorkflowJobFailuresTotal,
)
}
var (
runtimeBuckets []float64 = []float64{
0.01,
0.05,
0.1,
0.5,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
12,
15,
18,
20,
25,
30,
40,
50,
60,
70,
80,
90,
100,
110,
120,
150,
180,
210,
240,
300,
360,
420,
480,
540,
600,
900,
1200,
1800,
2400,
3000,
3600,
}
)
var (
githubWorkflowJobQueueDurationSeconds = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "github_workflow_job_queue_duration_seconds",
Help: "Queue times for workflow jobs in seconds",
Buckets: runtimeBuckets,
},
[]string{"runs_on", "job_name"},
)
githubWorkflowJobRunDurationSeconds = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "github_workflow_job_run_duration_seconds",
Help: "Run times for workflow jobs in seconds",
Buckets: runtimeBuckets,
},
[]string{"runs_on", "job_name", "job_conclusion"},
)
githubWorkflowJobConclusionsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "github_workflow_job_conclusions_total",
Help: "Conclusions for tracked workflow jobs",
},
[]string{"runs_on", "job_name", "job_conclusion"},
)
githubWorkflowJobsQueuedTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "github_workflow_jobs_queued_total",
Help: "Total count of workflow jobs queued (events where job_status=queued)",
},
[]string{"runs_on", "job_name"},
)
githubWorkflowJobsStartedTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "github_workflow_jobs_started_total",
Help: "Total count of workflow jobs started (events where job_status=in_progress)",
},
[]string{"runs_on", "job_name"},
)
githubWorkflowJobsCompletedTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "github_workflow_jobs_completed_total",
Help: "Total count of workflow jobs completed (events where job_status=completed)",
},
[]string{"runs_on", "job_name"},
)
githubWorkflowJobFailuresTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "github_workflow_job_failures_total",
Help: "Conclusions for tracked workflow runs",
},
[]string{"runs_on", "job_name", "failed_step", "exit_code"},
)
)

View File

@@ -0,0 +1,157 @@
package actionsmetrics
/*
Copyright 2022 The actions-runner-controller authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import (
"context"
"fmt"
"io"
"net/http"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"github.com/go-logr/logr"
gogithub "github.com/google/go-github/v47/github"
ctrl "sigs.k8s.io/controller-runtime"
"github.com/actions-runner-controller/actions-runner-controller/github"
)
type EventHook func(interface{})
// WebhookServer is a HTTP server that handles workflow_job events sent from GitHub Actions
type WebhookServer struct {
Log logr.Logger
// SecretKeyBytes is the byte representation of the Webhook secret token
// the administrator is generated and specified in GitHub Web UI.
SecretKeyBytes []byte
// GitHub Client to discover runner groups assigned to a repository
GitHubClient *github.Client
// When HorizontalRunnerAutoscalerGitHubWebhook handles a request, each EventHook is sent the webhook event
EventHooks []EventHook
}
func (autoscaler *WebhookServer) Reconcile(_ context.Context, request reconcile.Request) (reconcile.Result, error) {
return ctrl.Result{}, nil
}
func (autoscaler *WebhookServer) Handle(w http.ResponseWriter, r *http.Request) {
var (
ok bool
err error
)
defer func() {
if !ok {
w.WriteHeader(http.StatusInternalServerError)
if err != nil {
msg := err.Error()
if written, err := w.Write([]byte(msg)); err != nil {
autoscaler.Log.V(1).Error(err, "failed writing http error response", "msg", msg, "written", written)
}
}
}
}()
defer func() {
if r.Body != nil {
r.Body.Close()
}
}()
// respond ok to GET / e.g. for health check
if r.Method == http.MethodGet {
ok = true
fmt.Fprintln(w, "actions-metrics-server is running")
return
}
var payload []byte
if len(autoscaler.SecretKeyBytes) > 0 {
payload, err = gogithub.ValidatePayload(r, autoscaler.SecretKeyBytes)
if err != nil {
autoscaler.Log.Error(err, "error validating request body")
return
}
} else {
payload, err = io.ReadAll(r.Body)
if err != nil {
autoscaler.Log.Error(err, "error reading request body")
return
}
}
webhookType := gogithub.WebHookType(r)
event, err := gogithub.ParseWebHook(webhookType, payload)
if err != nil {
var s string
if payload != nil {
s = string(payload)
}
autoscaler.Log.Error(err, "could not parse webhook", "webhookType", webhookType, "payload", s)
return
}
log := autoscaler.Log.WithValues(
"event", webhookType,
"hookID", r.Header.Get("X-GitHub-Hook-ID"),
"delivery", r.Header.Get("X-GitHub-Delivery"),
)
switch event.(type) {
case *gogithub.PingEvent:
ok = true
w.WriteHeader(http.StatusOK)
msg := "pong"
if written, err := w.Write([]byte(msg)); err != nil {
log.Error(err, "failed writing http response", "msg", msg, "written", written)
}
log.Info("handled ping event")
return
}
for _, eventHook := range autoscaler.EventHooks {
eventHook(event)
}
ok = true
w.WriteHeader(http.StatusOK)
msg := "ok"
log.Info(msg)
if written, err := w.Write([]byte(msg)); err != nil {
log.Error(err, "failed writing http response", "msg", msg, "written", written)
}
}