refactor: Make RunnerReplicaSet and Runner backed by the same logic that backs RunnerSet

Yusuke Kuoka
2022-03-05 12:13:22 +00:00
parent c95e84a528
commit 14a878bfae
12 changed files with 278 additions and 903 deletions
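
For context before the diff: the change makes the RunnerReplicaSet reconciler convert its live Runner resources into generic client.Object values and hand replica management to a shared helper (syncRunnerPodsOwners in the diff below), the same routine that backs RunnerSet. The snippet below is a minimal, self-contained sketch of that shape only; apart from the role attributed to syncRunnerPodsOwners, every name and type in it is invented for illustration and is not the actual ARC code.

package main

import "fmt"

// object stands in for controller-runtime's client.Object in this sketch.
type object interface{ GetName() string }

type runner struct{ name string }

func (r runner) GetName() string { return r.name }

// syncOwners plays the role of the shared helper (syncRunnerPodsOwners in the
// diff): given the live children, the desired replica count, and a factory for
// new children, it decides what to create and what to delete, without caring
// which owner kind (RunnerReplicaSet, RunnerSet, ...) invoked it.
func syncOwners(live []object, desired int, newObj func() object) (create, del []object) {
	for len(live)+len(create) < desired {
		create = append(create, newObj())
	}
	for i := desired; i < len(live); i++ {
		del = append(del, live[i])
	}
	return create, del
}

func main() {
	live := []object{runner{name: "runner-a"}, runner{name: "runner-b"}}
	toCreate, toDelete := syncOwners(live, 3, func() object { return runner{name: "runner-c"} })
	fmt.Printf("create %d, delete %d\n", len(toCreate), len(toDelete))
}

The point of the indirection, as the commit title suggests, is that create/delete decisions are written once against a generic interface so that RunnerReplicaSet, Runner and RunnerSet can reuse them.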


@@ -18,13 +18,10 @@ package controllers
import (
"context"
"errors"
"fmt"
"reflect"
"time"
"github.com/go-logr/logr"
gogithub "github.com/google/go-github/v39/github"
kerrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
@@ -32,7 +29,6 @@ import (
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1"
@@ -72,15 +68,35 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
return ctrl.Result{}, nil
}
if rs.ObjectMeta.Labels == nil {
rs.ObjectMeta.Labels = map[string]string{}
}
// The template hash is usually set by the upstream controller (the RunnerDeployment controller) when authoring the
// RunnerReplicaSet resource, but it may be missing when the user created the RunnerReplicaSet directly.
// As the template hash is required by the runner replica management, we dynamically add it here without ever persisting it.
if rs.ObjectMeta.Labels[LabelKeyRunnerTemplateHash] == "" {
template := rs.Spec.DeepCopy()
template.Replicas = nil
template.EffectiveTime = nil
templateHash := ComputeHash(template)
log.Info("Using auto-generated template hash", "value", templateHash)
rs.ObjectMeta.Labels = CloneAndAddLabel(rs.ObjectMeta.Labels, LabelKeyRunnerTemplateHash, templateHash)
rs.Spec.Template.ObjectMeta.Labels = CloneAndAddLabel(rs.Spec.Template.ObjectMeta.Labels, LabelKeyRunnerTemplateHash, templateHash)
}
selector, err := metav1.LabelSelectorAsSelector(rs.Spec.Selector)
if err != nil {
return ctrl.Result{}, err
}
// Get the Runners managed by the target RunnerReplicaSet
var allRunners v1alpha1.RunnerList
var runnerList v1alpha1.RunnerList
if err := r.List(
ctx,
&allRunners,
&runnerList,
client.InNamespace(req.Namespace),
client.MatchingLabelsSelector{Selector: selector},
); err != nil {
@@ -89,218 +105,43 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
}
}
var (
current int
ready int
available int
lastSyncTime *time.Time
)
for _, r := range allRunners.Items {
// This guard is required so that a RunnerReplicaSet created by controller v0.17.0 or earlier
// does not treat all the runners in the namespace as its children.
if metav1.IsControlledBy(&r, &rs) && !metav1.HasAnnotation(r.ObjectMeta, annotationKeyRegistrationOnly) {
// If the runner is already marked for deletion (i.e. has a non-zero deletion timestamp) by the runner controller (which can be caused by an ephemeral runner completion)
// or by the runnerreplicaset controller (in case it was deleted in the previous reconciliation loop),
// we don't need to bother calling the GitHub API to re-mark the runner for deletion.
// Just hold on, and the runners will disappear as long as the runner controller is up and running.
if !r.DeletionTimestamp.IsZero() {
continue
}
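// Track the most recent sync-time annotation across the runners; it is compared against EffectiveTime
// further below to decide whether missing ephemeral runners should be recreated yet.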
if r.Annotations != nil {
if a, ok := r.Annotations[SyncTimeAnnotationKey]; ok {
t, err := time.Parse(time.RFC3339, a)
if err == nil {
if lastSyncTime == nil || lastSyncTime.Before(t) {
lastSyncTime = &t
}
}
}
}
current += 1
if r.Status.Phase == string(corev1.PodRunning) {
ready += 1
// available is currently the same as ready, as we don't yet have minReadySeconds for runners
available += 1
}
}
}
var desired int
replicas := 1
if rs.Spec.Replicas != nil {
desired = *rs.Spec.Replicas
} else {
desired = 1
}
// TODO: remove this registration runner cleanup later (v0.23.0 or v0.24.0)
//
// We previously had to have a registration-only runner to support scale-from-zero.
// But since the Sep 2021 Actions update on GitHub Cloud and GHES 3.3, it is unnecessary.
// See the below issues for more context:
// https://github.com/actions-runner-controller/actions-runner-controller/issues/516
// https://github.com/actions-runner-controller/actions-runner-controller/issues/859
//
// In the below block, we have logic to remove existing registration-only runners as unnecessary.
// This logic was introduced in actions-runner-controller 0.21.0 and will probably last only one or two more minor releases,
// so that the actions-runner-controller instance in everyone's cluster won't leave dangling registration-only runners.
registrationOnlyRunnerNsName := req.NamespacedName
registrationOnlyRunnerNsName.Name = registrationOnlyRunnerNameFor(rs.Name)
registrationOnlyRunner := v1alpha1.Runner{}
registrationOnlyRunnerExists := false
if err := r.Get(
ctx,
registrationOnlyRunnerNsName,
&registrationOnlyRunner,
); err != nil {
if !kerrors.IsNotFound(err) {
return ctrl.Result{}, err
}
} else {
registrationOnlyRunnerExists = true
}
if registrationOnlyRunnerExists {
if err := r.Client.Delete(ctx, &registrationOnlyRunner); err != nil {
log.Error(err, "Retrying soon because we failed to delete registration-only runner")
return ctrl.Result{Requeue: true}, nil
}
replicas = *rs.Spec.Replicas
}
effectiveTime := rs.Spec.EffectiveTime
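// Runners are treated as ephemeral by default, i.e. unless Ephemeral is explicitly set to false in the template.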
ephemeral := rs.Spec.Template.Spec.Ephemeral == nil || *rs.Spec.Template.Spec.Ephemeral
if current < desired && ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) {
log.V(1).Info("Detected that some ephemeral runners have disappeared. Usually this is due to that ephemeral runner completions so ARC does not create new runners until EffectiveTime is updated.", "lastSyncTime", metav1.Time{Time: *lastSyncTime}, "effectiveTime", *effectiveTime, "desired", desired, "available", current, "ready", ready)
} else if current > desired {
// If you use ephemeral runners with the webhook-based autoscaler and the runner controller is working normally,
// you're unlikely to fall into this branch.
//
// That's because all the stakeholders work like this:
//
// 1. A runner pod completes with the runner container exiting with code 0
// 2. ARC runner controller detects the pod completion, marks the runner resource on k8s for deletion (=Runner.DeletionTimestamp becomes non-zero)
// 3. GitHub triggers a corresponding workflow_job "complete" webhook event
// 4. ARC github-webhook-server (webhook-based autoscaler) receives the webhook event and updates the HRA by removing the oldest capacity reservation
// 5. ARC horizontalrunnerautoscaler updates RunnerDeployment's desired replicas based on capacity reservations
// 6. ARC runnerdeployment controller updates RunnerReplicaSet's desired replicas
// 7. (We're here) ARC runnerreplicaset controller (this controller) starts reconciling the RunnerReplicaSet
//
// In a normally working ARC installation, the runner that was used to run the workflow job should already have been
// marked for deletion by the runner controller.
// This runnerreplicaset controller doesn't count runners marked for deletion toward the `current` value, hence you're unlikely to
// fall into this branch when you're using ephemeral runners with the webhook-based autoscaler.
desired, err := r.newRunner(rs)
if err != nil {
log.Error(err, "Could not create runner")
n := current - desired
log.V(0).Info(fmt.Sprintf("Deleting %d runners from RunnerReplicaSet %s", n, req.NamespacedName), "desired", desired, "current", current, "ready", ready)
// get runners that are currently offline, not busy, or timed out while registering
var deletionCandidates []v1alpha1.Runner
for _, runner := range allRunners.Items {
busy, err := r.GitHubClient.IsRunnerBusy(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
if err != nil {
notRegistered := false
offline := false
var notFoundException *github.RunnerNotFound
var offlineException *github.RunnerOffline
if errors.As(err, &notFoundException) {
log.V(1).Info("Failed to check if runner is busy. Either this runner has never been successfully registered to GitHub or it still needs more time.", "runnerName", runner.Name)
notRegistered = true
} else if errors.As(err, &offlineException) {
offline = true
} else {
var e *gogithub.RateLimitError
if errors.As(err, &e) {
// We log the underlying error when we fail calling the GitHub API to list or unregister runners,
// or when the runner is still busy.
log.Error(
err,
fmt.Sprintf(
"Failed to check if runner is busy due to GitHub API rate limit. Retrying in %s to avoid excessive GitHub API calls",
retryDelayOnGitHubAPIRateLimitError,
),
)
return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
}
return ctrl.Result{}, err
}
registrationTimeout := 15 * time.Minute
currentTime := time.Now()
registrationDidTimeout := currentTime.Sub(runner.CreationTimestamp.Add(registrationTimeout)) > 0
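// i.e. more than registrationTimeout has passed since this runner resource was created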
if notRegistered && registrationDidTimeout {
log.Info(
"Runner failed to register itself to GitHub in timely manner. "+
"Marking the runner for scale down. "+
"CAUTION: If you see this a lot, you should investigate the root cause. "+
"See https://github.com/actions-runner-controller/actions-runner-controller/issues/288",
"runnerCreationTimestamp", runner.CreationTimestamp,
"currentTime", currentTime,
"configuredRegistrationTimeout", registrationTimeout,
)
deletionCandidates = append(deletionCandidates, runner)
}
// offline runners should always be a great target for scale down
if offline {
deletionCandidates = append(deletionCandidates, runner)
}
} else if !busy {
deletionCandidates = append(deletionCandidates, runner)
}
}
if len(deletionCandidates) < n {
n = len(deletionCandidates)
}
log.V(0).Info(fmt.Sprintf("Deleting %d runner(s)", n), "desired", desired, "current", current, "ready", ready)
for i := 0; i < n; i++ {
if err := r.Client.Delete(ctx, &deletionCandidates[i]); client.IgnoreNotFound(err) != nil {
log.Error(err, "Failed to delete runner resource")
return ctrl.Result{}, err
}
r.Recorder.Event(&rs, corev1.EventTypeNormal, "RunnerDeleted", fmt.Sprintf("Deleted runner '%s'", deletionCandidates[i].Name))
log.Info(fmt.Sprintf("Deleted runner %s", deletionCandidates[i].Name))
}
} else if desired > current {
n := desired - current
log.V(0).Info(fmt.Sprintf("Creating %d runner(s)", n), "desired", desired, "available", current, "ready", ready)
for i := 0; i < n; i++ {
newRunner, err := r.newRunner(rs)
if err != nil {
log.Error(err, "Could not create runner")
return ctrl.Result{}, err
}
if err := r.Client.Create(ctx, &newRunner); err != nil {
log.Error(err, "Failed to create runner resource")
return ctrl.Result{}, err
}
}
return ctrl.Result{}, err
}
var status v1alpha1.RunnerReplicaSetStatus
var live []client.Object
for _, r := range runnerList.Items {
r := r
live = append(live, &r)
}
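// Delegate the actual scale up/down to the shared sync helper that also backs RunnerSet,
// passing the live Runners as generic client.Objects plus a factory for new ones.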
res, err := syncRunnerPodsOwners(ctx, r.Client, log, effectiveTime, replicas, func() client.Object { return desired.DeepCopy() }, ephemeral, live)
if err != nil || res == nil {
return ctrl.Result{}, err
}
var (
status v1alpha1.RunnerReplicaSetStatus
current, available, ready int
)
for _, o := range res.currentObjects {
current += o.total
available += o.running
ready += o.running
}
status.Replicas = &current
status.AvailableReplicas = &available
@@ -322,6 +163,8 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
}
func (r *RunnerReplicaSetReconciler) newRunner(rs v1alpha1.RunnerReplicaSet) (v1alpha1.Runner, error) {
// Note that the upstream controller (runnerdeployment) is expected to add
// the "runner template hash" label to the template.meta, which is necessary for this controller to work correctly
objectMeta := rs.Spec.Template.ObjectMeta.DeepCopy()
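// GenerateName makes the API server append a random suffix on creation, so every Runner stamped out of this template gets a unique name.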
objectMeta.GenerateName = rs.ObjectMeta.Name + "-"