Merge branch 'master' into improve-logs

Felipe Galindo Sanchez
2022-02-28 09:25:30 -08:00
committed by GitHub
36 changed files with 725 additions and 211 deletions

View File

@@ -768,8 +768,10 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) tryScale(ctx context.
capacityReservations := getValidCapacityReservations(copy)
if amount > 0 {
now := time.Now()
copy.Spec.CapacityReservations = append(capacityReservations, v1alpha1.CapacityReservation{
ExpirationTime: metav1.Time{Time: time.Now().Add(target.ScaleUpTrigger.Duration.Duration)},
EffectiveTime: metav1.Time{Time: now},
ExpirationTime: metav1.Time{Time: now.Add(target.ScaleUpTrigger.Duration.Duration)},
Replicas: amount,
})
} else if amount < 0 {
@@ -788,10 +790,16 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) tryScale(ctx context.
copy.Spec.CapacityReservations = reservations
}
autoscaler.Log.Info(
before := len(target.HorizontalRunnerAutoscaler.Spec.CapacityReservations)
expired := before - len(capacityReservations)
after := len(copy.Spec.CapacityReservations)
autoscaler.Log.V(1).Info(
fmt.Sprintf("Patching hra %s for capacityReservations update", target.HorizontalRunnerAutoscaler.Name),
"before", target.HorizontalRunnerAutoscaler.Spec.CapacityReservations,
"after", copy.Spec.CapacityReservations,
"before", before,
"expired", expired,
"amount", amount,
"after", after,
)
if err := autoscaler.Client.Patch(ctx, copy, client.MergeFrom(&target.HorizontalRunnerAutoscaler)); err != nil {
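For context, the "before"/"expired"/"after" counts above come from getValidCapacityReservations, whose implementation is not part of this diff. A minimal sketch of such a filter, assuming it simply drops reservations whose ExpirationTime has passed (hypothetical, for illustration only):

// Hypothetical sketch of getValidCapacityReservations; the real implementation
// in ARC may differ. It illustrates why expired == before - len(capacityReservations).
func getValidCapacityReservationsSketch(hra *v1alpha1.HorizontalRunnerAutoscaler) []v1alpha1.CapacityReservation {
	now := time.Now()
	var valid []v1alpha1.CapacityReservation
	for _, r := range hra.Spec.CapacityReservations {
		if r.ExpirationTime.Time.After(now) {
			valid = append(valid, r)
		}
	}
	return valid
}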

View File

@@ -99,11 +99,33 @@ func (r *HorizontalRunnerAutoscalerReconciler) Reconcile(ctx context.Context, re
return r.reconcile(ctx, req, log, hra, st, func(newDesiredReplicas int) error {
currentDesiredReplicas := getIntOrDefault(rd.Spec.Replicas, defaultReplicas)
ephemeral := rd.Spec.Template.Spec.Ephemeral == nil || *rd.Spec.Template.Spec.Ephemeral
var effectiveTime *time.Time
for _, r := range hra.Spec.CapacityReservations {
t := r.EffectiveTime
if effectiveTime == nil || effectiveTime.Before(t.Time) {
effectiveTime = &t.Time
}
}
// Please add more conditions under which we can in-place update the newest runnerreplicaset without disruption
if currentDesiredReplicas != newDesiredReplicas {
copy := rd.DeepCopy()
copy.Spec.Replicas = &newDesiredReplicas
if ephemeral && effectiveTime != nil {
copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime}
}
if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rd)); err != nil {
return fmt.Errorf("patching runnerdeployment to have %d replicas: %w", newDesiredReplicas, err)
}
} else if ephemeral && effectiveTime != nil {
copy := rd.DeepCopy()
copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime}
if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rd)); err != nil {
return fmt.Errorf("patching runnerdeployment to have %d replicas: %w", newDesiredReplicas, err)
}
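The loop above can be read as "pick the newest EffectiveTime across all capacity reservations". An equivalent standalone helper, shown only for illustration (the controller inlines this logic rather than defining such a function):

// Illustrative helper equivalent to the inline loop above: returns the newest
// EffectiveTime among the capacity reservations, or nil if there are none.
func latestEffectiveTime(reservations []v1alpha1.CapacityReservation) *time.Time {
	var latest *time.Time
	for _, r := range reservations {
		t := r.EffectiveTime.Time
		if latest == nil || latest.Before(t) {
			latest = &t
		}
	}
	return latest
}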

View File

@@ -110,6 +110,8 @@ func SetupIntegrationTest(ctx2 context.Context) *testEnvironment {
Name: controllerName("runner"),
RegistrationRecheckInterval: time.Millisecond,
RegistrationRecheckJitter: time.Millisecond,
UnregistrationTimeout: 1 * time.Second,
UnregistrationRetryDelay: 1 * time.Second,
}
err = runnerController.SetupWithManager(mgr)
Expect(err).NotTo(HaveOccurred(), "failed to setup runner controller")

View File

@@ -72,6 +72,9 @@ type RunnerReconciler struct {
Name string
RegistrationRecheckInterval time.Duration
RegistrationRecheckJitter time.Duration
UnregistrationTimeout time.Duration
UnregistrationRetryDelay time.Duration
}
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runners,verbs=get;list;watch;create;update;patch;delete
@@ -110,8 +113,23 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
return ctrl.Result{}, nil
}
} else {
var p *corev1.Pod
{
var pod corev1.Pod
if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
if !kerrors.IsNotFound(err) {
log.Info(fmt.Sprintf("Retrying soon as we failed to get registration-only runner pod: %v", err))
return ctrl.Result{Requeue: true}, nil
}
} else {
p = &pod
}
}
// Request to remove a runner. DeletionTimestamp was set on the runner - we need to unregister the runner
return r.processRunnerDeletion(runner, ctx, log)
return r.processRunnerDeletion(runner, ctx, log, p)
}
registrationOnly := metav1.HasAnnotation(runner.ObjectMeta, annotationKeyRegistrationOnly)
@@ -159,20 +177,27 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
// If pod has ended up succeeded we need to restart it
// Happens e.g. when dind is in runner and run completes
stopped := pod.Status.Phase == corev1.PodSucceeded
stopped := runnerPodOrContainerIsStopped(&pod)
if !stopped {
if pod.Status.Phase == corev1.PodRunning {
for _, status := range pod.Status.ContainerStatuses {
if status.Name != containerName {
continue
}
ephemeral := runner.Spec.Ephemeral == nil || *runner.Spec.Ephemeral
if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 {
stopped = true
}
}
if stopped && ephemeral {
log.V(1).Info("Ephemeral runner has been stopped successfully. Marking this runner for deletion.")
// This is the key to making ephemeral runners work reliably with webhook-based autoscaling.
// See https://github.com/actions-runner-controller/actions-runner-controller/issues/911#issuecomment-1046161384 for more context.
//
// In the next reconciliation loop, this triggers a runner unregistration.
// (Note that the unregistration can fail safely because an ephemeral runner usually unregisters itself from GitHub; we do it just for confirmation.)
//
// See the code path above that is executed when `runner.ObjectMeta.DeletionTimestamp.IsZero()` isn't true,
// which handles the unregistration, the removal of the completed pod, and so on.
if err := r.Delete(ctx, &runner); err != nil {
log.V(1).Error(err, "Retrying to mark this runner for deletion in 10 seconds.")
return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
}
return ctrl.Result{Requeue: true}, nil
}
restart := stopped
@@ -404,64 +429,53 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
return ctrl.Result{}, nil
}
// Try to delete current pod if recreation is needed
safeToDeletePod := false
ok, err := r.unregisterRunner(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
if err != nil {
log.Error(err, "Failed to unregister runner before deleting the pod.", "runner", runner.Name)
} else {
// `r.unregisterRunner()` returns `false, nil` if the runner is not found on GitHub.
if !ok {
log.Info("Runner no longer exists on GitHub", "runner", runner.Name)
}
safeToDeletePod = true
updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, &pod)
if res != nil {
return *res, err
}
if safeToDeletePod {
// Only delete the pod if we successfully unregistered the runner or the runner is already deleted from the service.
// This should help us avoid a race where the runner picks up a new job right after we conclude it is not busy.
if err := r.Delete(ctx, &pod); err != nil {
log.Error(err, "Failed to delete pod resource")
return ctrl.Result{}, err
}
r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", newPod.Name))
log.Info("Deleted runner pod", "repository", runner.Spec.Repository)
// Only delete the pod if we successfully unregistered the runner or the runner is already deleted from the service.
// This should help us avoid a race where the runner picks up a new job right after we conclude it is not busy.
if err := r.Delete(ctx, updatedPod); err != nil {
log.Error(err, "Failed to delete pod resource")
return ctrl.Result{}, err
}
r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", newPod.Name))
log.Info("Deleted runner pod", "repository", runner.Spec.Repository)
return ctrl.Result{}, nil
}
func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx context.Context, log logr.Logger) (reconcile.Result, error) {
func runnerPodOrContainerIsStopped(pod *corev1.Pod) bool {
// If pod has ended up succeeded we need to restart it
// Happens e.g. when dind is in runner and run completes
stopped := pod.Status.Phase == corev1.PodSucceeded
if !stopped {
if pod.Status.Phase == corev1.PodRunning {
for _, status := range pod.Status.ContainerStatuses {
if status.Name != containerName {
continue
}
if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 {
stopped = true
}
}
}
}
return stopped
}
func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx context.Context, log logr.Logger, pod *corev1.Pod) (reconcile.Result, error) {
finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers, finalizerName)
if removed {
if len(runner.Status.Registration.Token) > 0 {
ok, err := r.unregisterRunner(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
if err != nil {
if errors.Is(err, &gogithub.RateLimitError{}) {
// We log the underlying error when we fail to call the GitHub API to list or unregister runners,
// or when the runner is still busy.
log.Error(
err,
fmt.Sprintf(
"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
retryDelayOnGitHubAPIRateLimitError,
),
)
return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
}
return ctrl.Result{}, err
}
if !ok {
log.V(1).Info("Runner no longer exists on GitHub")
}
} else {
log.V(1).Info("Runner was never registered on GitHub")
_, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, pod)
if res != nil {
return *res, err
}
newRunner := runner.DeepCopy()
@@ -478,6 +492,24 @@ func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx con
return ctrl.Result{}, nil
}
func (r *RunnerReconciler) unregistrationTimeout() time.Duration {
unregistrationTimeout := DefaultUnregistrationTimeout
if r.UnregistrationTimeout > 0 {
unregistrationTimeout = r.UnregistrationTimeout
}
return unregistrationTimeout
}
func (r *RunnerReconciler) unregistrationRetryDelay() time.Duration {
retryDelay := DefaultUnregistrationRetryDelay
if r.UnregistrationRetryDelay > 0 {
retryDelay = r.UnregistrationRetryDelay
}
return retryDelay
}
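Both helpers fall back to the package defaults when the corresponding field is left at its zero value, so deployments that never set the new fields keep the previous behavior. A minimal illustration of that fallback (assumed values; how main.go actually populates the fields is not shown in this diff):

// Illustration only: the zero value of each field selects the package default.
r := &RunnerReconciler{}
fmt.Println(r.unregistrationTimeout())    // 1m0s (DefaultUnregistrationTimeout)
fmt.Println(r.unregistrationRetryDelay()) // 30s  (DefaultUnregistrationRetryDelay)
r.UnregistrationTimeout = 2 * time.Minute
r.UnregistrationRetryDelay = 45 * time.Second
fmt.Println(r.unregistrationTimeout())    // 2m0s
fmt.Println(r.unregistrationRetryDelay()) // 45s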
func (r *RunnerReconciler) processRunnerPodDeletion(ctx context.Context, runner v1alpha1.Runner, log logr.Logger, pod corev1.Pod) (reconcile.Result, error) {
deletionTimeout := 1 * time.Minute
currentTime := time.Now()

View File

@@ -0,0 +1,249 @@
package controllers
import (
"context"
"errors"
"fmt"
"time"
"github.com/actions-runner-controller/actions-runner-controller/github"
"github.com/go-logr/logr"
gogithub "github.com/google/go-github/v39/github"
corev1 "k8s.io/api/core/v1"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
)
const (
unregistrationCompleteTimestamp = "unregistration-complete-timestamp"
unregistrationStartTimestamp = "unregistration-start-timestamp"
// DefaultUnregistrationTimeout is the duration until ARC gives up retrying the combo of ListRunners API (to detect the runner ID by name)
// and RemoveRunner API (to actually unregister the runner) calls.
// This needs to be longer than 60 seconds because a part of the combo, the ListRunners API, seems to use the Cache-Control header of max-age=60s
// and that instructs our cache library httpcache to cache responses for 60 seconds, which results in ARC being unable to see the runner in the ListRunners response
// for up to 60 seconds (or even more depending on the situation).
DefaultUnregistrationTimeout = 60 * time.Second
// This can be any value but a larger value can make an unregistration timeout longer than configured in practice.
DefaultUnregistrationRetryDelay = 30 * time.Second
)
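To make the note above concrete (illustrative arithmetic, not text from the commit): in the worst case an unregistration attempt is requeued just before the 60-second deadline, so the timeout is only acted on one retry delay later; with the defaults the observed unregistration timeout can therefore stretch to roughly 90 seconds, i.e. up to one retry delay beyond the configured value.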
// tickRunnerGracefulStop reconciles the runner and the runner pod in a way so that
// we can delete the runner pod without disrupting a workflow job.
//
// This function returns a non-nil pointer to corev1.Pod as the first return value
// if the runner is considered to have gracefully stopped, hence its pod is safe for deletion.
//
// It's a "tick" operation so a graceful stop can take multiple calls to complete.
// This function is designed to complete a lengthy graceful-stop process in a non-blocking way.
// When it wants to be retried later, the function returns a non-nil *ctrl.Result as the second return value, optionally populating the error in the third return value.
// The caller is expected to return the returned ctrl.Result and error to postpone the current reconciliation loop and trigger a scheduled retry.
func tickRunnerGracefulStop(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
if pod != nil {
if _, ok := getAnnotation(pod, unregistrationStartTimestamp); !ok {
updated := pod.DeepCopy()
setAnnotation(updated, unregistrationStartTimestamp, time.Now().Format(time.RFC3339))
if err := c.Patch(ctx, updated, client.MergeFrom(pod)); err != nil {
log.Error(err, fmt.Sprintf("Failed to patch pod to have %s annotation", unregistrationStartTimestamp))
return nil, &ctrl.Result{}, err
}
pod = updated
log.Info("Runner has started unregistration")
} else {
log.Info("Runner has already started unregistration")
}
}
if res, err := ensureRunnerUnregistration(ctx, unregistrationTimeout, retryDelay, log, ghClient, enterprise, organization, repository, runner, pod); res != nil {
return nil, res, err
}
if pod != nil {
if _, ok := getAnnotation(pod, unregistrationCompleteTimestamp); !ok {
updated := pod.DeepCopy()
setAnnotation(updated, unregistrationCompleteTimestamp, time.Now().Format(time.RFC3339))
if err := c.Patch(ctx, updated, client.MergeFrom(pod)); err != nil {
log.Error(err, fmt.Sprintf("Failed to patch pod to have %s annotation", unregistrationCompleteTimestamp))
return nil, &ctrl.Result{}, err
}
pod = updated
log.Info("Runner has completed unregistration")
} else {
log.Info("Runner has already completed unregistration")
}
}
return pod, nil, nil
}
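Every caller of tickRunnerGracefulStop in this diff follows the same pattern: a non-nil *ctrl.Result means "return it to retry later", otherwise the returned pod is safe to delete. A condensed, hypothetical wrapper capturing that pattern (the real controllers inline it):

// Hypothetical wrapper around the caller pattern used by the runner and
// runnerpod controllers in this diff; not a function the commit adds.
func deleteRunnerPodGracefully(ctx context.Context, log logr.Logger, ghClient *github.Client, c client.Client,
	unregistrationTimeout, retryDelay time.Duration, enterprise, org, repo, runnerName string, pod *corev1.Pod) (ctrl.Result, error) {
	updatedPod, res, err := tickRunnerGracefulStop(ctx, unregistrationTimeout, retryDelay, log, ghClient, c, enterprise, org, repo, runnerName, pod)
	if res != nil {
		return *res, err // graceful stop still in progress; err may be nil
	}
	// Graceful stop completed: the (possibly annotated) pod is now safe to delete.
	if updatedPod != nil {
		if err := c.Delete(ctx, updatedPod); err != nil {
			log.Error(err, "Failed to delete pod resource")
			return ctrl.Result{}, err
		}
	}
	return ctrl.Result{}, nil
}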
// If the first return value is nil, it's safe to delete the runner pod.
func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner)
if err != nil {
if errors.Is(err, &gogithub.RateLimitError{}) {
// We log the underlying error when we fail to call the GitHub API to list or unregister runners,
// or when the runner is still busy.
log.Error(
err,
fmt.Sprintf(
"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
retryDelayOnGitHubAPIRateLimitError,
),
)
return &ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
}
log.Error(err, "Failed to unregister runner before deleting the pod.")
return &ctrl.Result{}, err
} else if ok {
log.Info("Runner has just been unregistered. Removing the runner pod.")
} else if pod == nil {
// `unregisterRunner()` returns `false, nil` if the runner is not found on GitHub.
// However, that doesn't always mean the pod can be safely removed.
//
// If the pod does not exist for the runner,
// it may be because the runner pod has never been created.
// In that case we can safely assume that the runner will never be registered.
log.Info("Runner was not found on GitHub and the runner pod was not found on Kuberntes.")
} else if pod.Annotations[unregistrationCompleteTimestamp] != "" {
// If it's already unregistered in the previous reconciliation loop,
// you can safely assume that it won't get registered again so it's safe to delete the runner pod.
log.Info("Runner pod is marked as already unregistered.")
} else if runnerPodOrContainerIsStopped(pod) {
// If it's an ephemeral runner whose actions/runner container exited with code 0,
// we can safely assume that it has unregistered itself from GitHub Actions
// so it's natural that RemoveRunner fails due to 404.
// If pod has ended up succeeded we need to restart it
// Happens e.g. when dind is in runner and run completes
log.Info("Runner pod has been stopped with a successful status.")
} else if ts := pod.Annotations[unregistrationStartTimestamp]; ts != "" {
t, err := time.Parse(time.RFC3339, ts)
if err != nil {
return &ctrl.Result{RequeueAfter: retryDelay}, err
}
if r := time.Until(t.Add(unregistrationTimeout)); r > 0 {
log.Info("Runner unregistration is in-progress.", "timeout", unregistrationTimeout, "remaining", r)
return &ctrl.Result{RequeueAfter: retryDelay}, err
}
log.Info("Runner unregistration has been timed out. The runner pod will be deleted soon.", "timeout", unregistrationTimeout)
} else {
// A runner and a runner pod created by this version of ARC should match
// one of the above branches.
//
// But we leave this catch-all branch for potential backward compatibility.
// The caller is expected to take appropriate actions, like annotating the pod as started the unregistration process,
// and retry later.
log.V(1).Info("Runner unregistration is being retried later.")
return &ctrl.Result{RequeueAfter: retryDelay}, nil
}
return nil, nil
}
func getAnnotation(pod *corev1.Pod, key string) (string, bool) {
if pod.Annotations == nil {
return "", false
}
v, ok := pod.Annotations[key]
return v, ok
}
func setAnnotation(pod *corev1.Pod, key, value string) {
if pod.Annotations == nil {
pod.Annotations = map[string]string{}
}
pod.Annotations[key] = value
}
// unregisterRunner unregisters the runner from GitHub Actions by name.
//
// This function returns:
//
// Case 1. (true, nil) when it has successfully unregistered the runner.
// Case 2. (false, nil) when (2-1.) the runner has already been unregistered OR (2-2.) the runner will never be created OR (2-3.) the runner is not created yet and it is about to be registered (hence we couldn't see its existence via the GitHub Actions API yet)
// Case 3. (false, err) when it postponed unregistration due to the runner being busy, or it tried to unregister the runner but failed due to
// an error returned by the GitHub API.
//
// When the return value is "Case 2. (false, nil)", the caller must handle the three possible sub-cases appropriately.
// In other words, those three sub-cases cannot be distinguished by this function alone.
//
// - Case "2-1." can happen when e.g. ARC has successfully unregistered in a previous reconcilation loop or it was an ephemeral runner that finished it's job run(an ephemeral runner is designed to stop after a job run).
// You'd need to maintain the runner state(i.e. if it's already unregistered or not) somewhere,
// so that you can either not call this function at all if the runner state says it's already unregistered, or determine that it's case "2-1." when you got (false, nil).
//
// - Case "2-2." can happen when e.g. the runner registration token was somehow broken so that `config.sh` within the runner container was never meant to succeed.
// Waiting and retrying forever on this case is not a solution, because `config.sh` won't succeed with a wrong token hence the runner gets stuck in this state forever.
// There isn't a perfect solution to this, but a practical workaround would be implement a "grace period" in the caller side.
//
// - Case "2-3." can happen when e.g. ARC recreated an ephemral runner pod in a previous reconcilation loop and then it was requested to delete the runner before the runner comes up.
// If handled inappropriately, this can cause a race condition betweeen a deletion of the runner pod and GitHub scheduling a workflow job onto the runner.
//
// Once you've successfully detected case "2-1." or "2-2.", you can safely delete the runner pod because you know that the runner won't come back
// unless you recreate the runner pod.
//
// If it was "2-3.", you need a workaround to avoid the race condition.
//
// You should introduce a "grace period" mechanism, similar or identical to the one required for case "2-2.", so that you only
// start the runner pod deletion after it's more and more likely that the runner pod is not coming up.
//
// Beware though, you need extra care to set an appropriate grace period depending on your environment.
// There isn't a single right grace period that works for everyone.
// The longer the grace period is, the earlier a cluster resource shortage can occur due to throttled runner pod deletions,
// while the shorter the grace period is, the more likely you are to encounter the race issue.
func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) {
runners, err := client.ListRunners(ctx, enterprise, org, repo)
if err != nil {
return false, err
}
id := int64(0)
for _, runner := range runners {
if runner.GetName() == name {
id = runner.GetID()
break
}
}
if id == int64(0) {
return false, nil
}
// For the record, historically ARC did not try to call RemoveRunner on a busy runner, but that's no longer true.
// The reason ARC did so was to prevent a runner that was running a job from stopping prematurely.
//
// However, we learned that RemoveRunner already has an ability to prevent stopping a busy runner,
// so ARC doesn't need to do anything special for a graceful runner stop.
// It can just call RemoveRunner, and if it returned 200 you're guaranteed that the runner will not automatically come back and
// the runner pod is safe for deletion.
//
// Trying to remove a busy runner can result in errors like the following:
// failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" []
//
// # NOTES
//
// - It can be "status=offline" at the same time but that's another story.
// - After https://github.com/actions-runner-controller/actions-runner-controller/pull/1127, ListRunners responses that are used to
// determine if the runner is busy can be more outdated than before, as those responses are now cached for 60 seconds.
// - Note that 60 seconds is controlled by the Cache-Control response header provided by GitHub so we don't have a strict control on it but we assume it won't
// change from 60 seconds.
//
// TODO: Probably we can just remove the runner by ID without seeing if the runner is busy, by treating it as busy when a remove-runner call failed with 422?
if err := client.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
return false, err
}
return true, nil
}
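The "grace period" workaround discussed in the comment block above is left to callers. A deliberately simple sketch of one possible check, using the pod's creation timestamp as the reference point (the helper name and heuristic are assumptions, not something this commit adds):

// Hypothetical grace-period check for case "2-3.": treat a (false, nil) result
// as safe to act on only once the pod has had time to come up and register.
func pastRegistrationGracePeriod(pod *corev1.Pod, grace time.Duration) bool {
	if pod == nil {
		// No pod was ever created, so nothing can register (cases "2-1."/"2-2.").
		return true
	}
	return time.Since(pod.CreationTimestamp.Time) > grace
}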

View File

@@ -47,6 +47,9 @@ type RunnerPodReconciler struct {
Name string
RegistrationRecheckInterval time.Duration
RegistrationRecheckJitter time.Duration
UnregistrationTimeout time.Duration
UnregistrationRetryDelay time.Duration
}
const (
@@ -105,33 +108,16 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
finalizers, removed := removeFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName)
if removed {
ok, err := r.unregisterRunner(ctx, enterprise, org, repo, runnerPod.Name)
if err != nil {
if errors.Is(err, &gogithub.RateLimitError{}) {
// We log the underlying error when we fail to call the GitHub API to list or unregister runners,
// or when the runner is still busy.
log.Error(
err,
fmt.Sprintf(
"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
retryDelayOnGitHubAPIRateLimitError,
),
)
return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
}
return ctrl.Result{}, err
updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
if res != nil {
return *res, err
}
if !ok {
log.V(1).Info("Runner no longer exists on GitHub")
}
patchedPod := updatedPod.DeepCopy()
patchedPod.ObjectMeta.Finalizers = finalizers
newRunner := runnerPod.DeepCopy()
newRunner.ObjectMeta.Finalizers = finalizers
if err := r.Patch(ctx, newRunner, client.MergeFrom(&runnerPod)); err != nil {
// We commit the removal of the finalizer so that Kubernetes notices it and deletes the pod resource from the cluster.
if err := r.Patch(ctx, patchedPod, client.MergeFrom(&runnerPod)); err != nil {
log.Error(err, "Failed to update runner for finalizer removal")
return ctrl.Result{}, err
}
@@ -365,8 +351,13 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
return ctrl.Result{}, nil
}
updated, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
if res != nil {
return *res, err
}
// Delete current pod if recreation is needed
if err := r.Delete(ctx, &runnerPod); err != nil {
if err := r.Delete(ctx, updated); err != nil {
log.Error(err, "Failed to delete pod resource")
return ctrl.Result{}, err
}
@@ -377,8 +368,22 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
return ctrl.Result{}, nil
}
func (r *RunnerPodReconciler) unregisterRunner(ctx context.Context, enterprise, org, repo, name string) (bool, error) {
return unregisterRunner(ctx, r.GitHubClient, enterprise, org, repo, name)
func (r *RunnerPodReconciler) unregistrationTimeout() time.Duration {
unregistrationTimeout := DefaultUnregistrationTimeout
if r.UnregistrationTimeout > 0 {
unregistrationTimeout = r.UnregistrationTimeout
}
return unregistrationTimeout
}
func (r *RunnerPodReconciler) unregistrationRetryDelay() time.Duration {
retryDelay := DefaultUnregistrationRetryDelay
if r.UnregistrationRetryDelay > 0 {
retryDelay = r.UnregistrationRetryDelay
}
return retryDelay
}
func (r *RunnerPodReconciler) SetupWithManager(mgr ctrl.Manager) error {

View File

@@ -177,6 +177,7 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req
// Please add more conditions under which we can in-place update the newest runnerreplicaset without disruption
if currentDesiredReplicas != newDesiredReplicas {
newestSet.Spec.Replicas = &newDesiredReplicas
newestSet.Spec.EffectiveTime = rd.Spec.EffectiveTime
if err := r.Client.Update(ctx, newestSet); err != nil {
log.Error(err, "Failed to update runnerreplicaset resource")
@@ -417,9 +418,10 @@ func newRunnerReplicaSet(rd *v1alpha1.RunnerDeployment, commonRunnerLabels []str
Labels: newRSTemplate.ObjectMeta.Labels,
},
Spec: v1alpha1.RunnerReplicaSetSpec{
Replicas: rd.Spec.Replicas,
Selector: newRSSelector,
Template: newRSTemplate,
Replicas: rd.Spec.Replicas,
Selector: newRSSelector,
Template: newRSTemplate,
EffectiveTime: rd.Spec.EffectiveTime,
},
}

View File

@@ -49,6 +49,10 @@ type RunnerReplicaSetReconciler struct {
Name string
}
const (
SyncTimeAnnotationKey = "sync-time"
)
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets/finalizers,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets/status,verbs=get;update;patch
@@ -85,19 +89,36 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
}
}
var myRunners []v1alpha1.Runner
var (
current int
ready int
available int
lastSyncTime *time.Time
)
for _, r := range allRunners.Items {
// This guard is required to prevent a RunnerReplicaSet created by controller v0.17.0 or earlier
// from treating all the runners in the namespace as its children.
if metav1.IsControlledBy(&r, &rs) && !metav1.HasAnnotation(r.ObjectMeta, annotationKeyRegistrationOnly) {
myRunners = append(myRunners, r)
// If the runner is already marked for deletion (= has a non-zero deletion timestamp) by the runner controller (which can be caused by an ephemeral runner completion)
// or by the runnerreplicaset controller (in case it was deleted in the previous reconciliation loop),
// we don't need to bother calling the GitHub API to re-mark the runner for deletion.
// Just hold on, and runners will disappear as long as the runner controller is up and running.
if !r.DeletionTimestamp.IsZero() {
continue
}
if r.Annotations != nil {
if a, ok := r.Annotations[SyncTimeAnnotationKey]; ok {
t, err := time.Parse(time.RFC3339, a)
if err == nil {
if lastSyncTime == nil || lastSyncTime.Before(t) {
lastSyncTime = &t
}
}
}
}
current += 1
@@ -152,7 +173,30 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
}
}
if current > desired {
effectiveTime := rs.Spec.EffectiveTime
ephemeral := rs.Spec.Template.Spec.Ephemeral == nil || *rs.Spec.Template.Spec.Ephemeral
if current < desired && ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) {
log.V(1).Info("Detected that some ephemeral runners have disappeared. Usually this is due to that ephemeral runner completions so ARC does not create new runners until EffectiveTime is updated.", "lastSyncTime", metav1.Time{Time: *lastSyncTime}, "effectiveTime", *effectiveTime, "desired", desired, "available", current, "ready", ready)
} else if current > desired {
// If you use ephemeral runners with webhook-based autoscaler and the runner controller is working normally,
// you're unlikely to fall into this branch.
//
// That's because all the stakeholders work like this:
//
// 1. A runner pod completes with the runner container exiting with code 0
// 2. ARC runner controller detects the pod completion, marks the runner resource on k8s for deletion (=Runner.DeletionTimestamp becomes non-zero)
// 3. GitHub triggers a corresponding workflow_job "complete" webhook event
// 4. ARC github-webhook-server (webhook-based autoscaler) receives the webhook event and updates the HRA by removing the oldest capacity reservation
// 5. ARC horizontalrunnerautoscaler updates RunnerDeployment's desired replicas based on capacity reservations
// 6. ARC runnerdeployment controller updates RunnerReplicaSet's desired replicas
// 7. (We're here) ARC runnerreplicaset controller (this controller) starts reconciling the RunnerReplicaSet
//
// In a normally working ARC installation, the runner that was used to run the workflow job should already have been
// marked for deletion by the runner controller.
// This runnerreplicaset controller doesn't count marked runners toward the `current` value, hence you're unlikely to
// fall into this branch when you're using ephemeral runners with the webhook-based autoscaler.
n := current - desired
log.V(0).Info(fmt.Sprintf("Deleting %d runners from RunnerReplicaSet %s", n, req.NamespacedName), "desired", desired, "current", current, "ready", ready)
@@ -282,6 +326,10 @@ func (r *RunnerReplicaSetReconciler) newRunner(rs v1alpha1.RunnerReplicaSet) (v1
objectMeta.GenerateName = rs.ObjectMeta.Name + "-"
objectMeta.Namespace = rs.ObjectMeta.Namespace
if objectMeta.Annotations == nil {
objectMeta.Annotations = map[string]string{}
}
objectMeta.Annotations[SyncTimeAnnotationKey] = time.Now().Format(time.RFC3339)
runner := v1alpha1.Runner{
TypeMeta: metav1.TypeMeta{},

View File

@@ -1,49 +0,0 @@
package controllers
import (
"context"
"fmt"
"github.com/actions-runner-controller/actions-runner-controller/github"
)
// unregisterRunner unregisters the runner from GitHub Actions by name.
//
// This function returns:
// - (true, nil) when it has successfully unregistered the runner.
// - (false, nil) when the runner has already been unregistered.
// - (false, err) when it postponed unregistration due to the runner being busy, or it tried to unregister the runner but failed due to
// an error returned by the GitHub API.
func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) {
runners, err := client.ListRunners(ctx, enterprise, org, repo)
if err != nil {
return false, err
}
id := int64(0)
for _, runner := range runners {
if runner.GetName() == name {
// Note that sometimes a runner can get stuck "busy" even though it is already "offline".
// But we assume that it's not actually offline and is still running a job.
if runner.GetBusy() {
return false, fmt.Errorf("runner is busy")
}
id = runner.GetID()
break
}
}
if id == int64(0) {
return false, nil
}
// Trying to remove a busy runner can result in errors like the following:
// failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" []
//
// TODO: Probably we can just remove the runner by ID without seeing if the runner is busy, by treating it as busy when a remove-runner call failed with 422?
if err := client.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
return false, err
}
return true, nil
}