actions-runner-controller/controllers/runner_graceful_stop.go

package controllers

import (
	"context"
	"errors"
	"fmt"
	"strconv"
	"time"

	"github.com/actions-runner-controller/actions-runner-controller/github"
	"github.com/go-logr/logr"
	gogithub "github.com/google/go-github/v39/github"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// tickRunnerGracefulStop reconciles the runner and the runner pod in a way so that
// we can delete the runner pod without disrupting a workflow job.
//
// This function returns a non-nil pointer to corev1.Pod as the first return value
// if the runner is considered to have gracefully stopped, hence it's pod is safe for deletion.
//
// It's a "tick" operation so a graceful stop can take multiple calls to complete.
// This function is designed to complete a lengthy graceful stop process in a unblocking way.
// When it wants to be retried later, the function returns a non-nil *ctrl.Result as the second return value, may or may not populating the error in the second return value.
// The caller is expected to return the returned ctrl.Result and error to postpone the current reconcilation loop and trigger a scheduled retry.
func tickRunnerGracefulStop(ctx context.Context, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
	pod, err := annotatePodOnce(ctx, c, log, pod, AnnotationKeyUnregistrationStartTimestamp, time.Now().Format(time.RFC3339))
	if err != nil {
		return nil, &ctrl.Result{}, err
	}

	if res, err := ensureRunnerUnregistration(ctx, retryDelay, log, ghClient, c, enterprise, organization, repository, runner, pod); res != nil {
		return nil, res, err
	}

	pod, err = annotatePodOnce(ctx, c, log, pod, AnnotationKeyUnregistrationCompleteTimestamp, time.Now().Format(time.RFC3339))
	if err != nil {
		return nil, &ctrl.Result{}, err
	}

	return pod, nil, nil
}

// annotatePodOnce annotates the pod if it wasn't.
// Returns the provided pod as-is if it was already annotated.
// Returns the updated pod if the pod was missing the annotation and the update to add the annotation succeeded.
func annotatePodOnce(ctx context.Context, c client.Client, log logr.Logger, pod *corev1.Pod, k, v string) (*corev1.Pod, error) {
	if pod == nil {
		return nil, nil
	}

	if _, ok := getAnnotation(pod, k); ok {
		return pod, nil
	}

	updated := pod.DeepCopy()
	setAnnotation(&updated.ObjectMeta, k, v)
	if err := c.Patch(ctx, updated, client.MergeFrom(pod)); err != nil {
		log.Error(err, fmt.Sprintf("Failed to patch pod to have %s annotation", k))
		return nil, err
	}

	log.V(2).Info("Annotated pod", "key", k, "value", v)

	return updated, nil
}

// If the first return value is nil, it's safe to delete the runner pod.
func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
	var runnerID *int64

	if id, ok := getAnnotation(pod, AnnotationKeyRunnerID); ok {
		v, err := strconv.ParseInt(id, 10, 64)
		if err != nil {
			return &ctrl.Result{}, err
		}

		runnerID = &v
	}

	code := runnerContainerExitCode(pod)

	if pod != nil && pod.Annotations[AnnotationKeyUnregistrationCompleteTimestamp] != "" {
		// If it's already unregistered in the previous reconcilation loop,
		// you can safely assume that it won't get registered again so it's safe to delete the runner pod.
		log.Info("Runner pod is marked as already unregistered.")
	} else if pod != nil && runnerPodOrContainerIsStopped(pod) {
		// If it's an ephemeral runner with the actions/runner container exited with 0,
		// we can safely assume that it has unregistered itself from GitHub Actions
		// so it's natural that RemoveRunner fails due to 404.

		// If pod has ended up succeeded we need to restart it
		// Happens e.g. when dind is in runner and run completes
		log.Info("Runner pod has been stopped with a successful status.")
	} else if pod != nil && pod.Annotations[AnnotationKeyRunnerCompletionWaitStartTimestamp] != "" {
		log.Info("Runner pod is annotated to wait for completion")

		return &ctrl.Result{RequeueAfter: retryDelay}, nil
	} else if ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner, runnerID); err != nil {
		if errors.Is(err, &gogithub.RateLimitError{}) {
			// We log the underlying error when we failed calling GitHub API to list or unregisters,
			// or the runner is still busy.
			log.Error(
				err,
				fmt.Sprintf(
					"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
					retryDelayOnGitHubAPIRateLimitError,
				),
			)

			return &ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
		}

		log.V(1).Info("Failed to unregister runner before deleting the pod.", "error", err)

		var runnerBusy bool

		errRes := &gogithub.ErrorResponse{}
		if errors.As(err, &errRes) {
			runner, _ := getRunner(ctx, ghClient, enterprise, organization, repository, runner)

			var runnerID int64

			if runner != nil && runner.ID != nil {
				runnerID = *runner.ID
			}

			runnerBusy = errRes.Response.StatusCode == 422

			if runnerBusy && code != nil {
				log.V(2).Info("Runner container has already stopped but the unregistration attempt failed. "+
					"This can happen when the runner container crashed due to an unhandled error, OOM, etc. "+
					"ARC terminates the pod anyway. You'd probably need to manually delete the runner later by calling the GitHub API",
					"runnerExitCode", *code,
					"runnerID", runnerID,
				)

				return nil, nil
			}
		}

		if runnerBusy {
			// We want to prevent spamming the deletion attemps but returning ctrl.Result with RequeueAfter doesn't
			// work as the reconcilation can happen earlier due to pod status update.
			// For ephemeral runners, we can expect it to stop and unregister itself on completion.
			// So we can just wait for the completion without actively retrying unregistration.
			ephemeral := getRunnerEnv(pod, EnvVarEphemeral)
			if ephemeral == "true" {
				pod, err = annotatePodOnce(ctx, c, log, pod, AnnotationKeyRunnerCompletionWaitStartTimestamp, time.Now().Format(time.RFC3339))
				if err != nil {
					return &ctrl.Result{}, err
				}

				return &ctrl.Result{}, nil
			}

			log.V(2).Info("Retrying runner unregistration because the static runner is still busy")
			// Otherwise we may end up spamming 422 errors,
			// each call consuming GitHub API rate limit
			// https://github.com/actions-runner-controller/actions-runner-controller/pull/1167#issuecomment-1064213271
			return &ctrl.Result{RequeueAfter: retryDelay}, nil
		}

		return &ctrl.Result{}, err
	} else if ok {
		log.Info("Runner has just been unregistered.")
	} else if pod == nil {
		// `r.unregisterRunner()` will returns `false, nil` if the runner is not found on GitHub.
		// However, that doesn't always mean the pod can be safely removed.
		//
		// If the pod does not exist for the runner,
		// it may be due to that the runner pod has never been created.
		// In that case we can safely assume that the runner will never be registered.

		log.Info("Runner was not found on GitHub and the runner pod was not found on Kuberntes.")
	} else if ts := pod.Annotations[AnnotationKeyUnregistrationStartTimestamp]; ts != "" {
		log.Info("Runner unregistration is in-progress. It can take forever to complete if if it's a static runner constantly running jobs."+
			" It can also take very long time if it's an ephemeral runner that is running a log-running job.", "error", err)

		return &ctrl.Result{RequeueAfter: retryDelay}, nil
	} else {
		// A runner and a runner pod that is created by this version of ARC should match
		// any of the above branches.
		//
		// But we leave this match all branch for potential backward-compatibility.
		// The caller is expected to take appropriate actions, like annotating the pod as started the unregistration process,
		// and retry later.
		log.V(1).Info("Runner unregistration is being retried later.")

		return &ctrl.Result{RequeueAfter: retryDelay}, nil
	}

	return nil, nil
}

func ensureRunnerPodRegistered(ctx context.Context, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
	_, hasRunnerID := getAnnotation(pod, AnnotationKeyRunnerID)
	if runnerPodOrContainerIsStopped(pod) || hasRunnerID {
		return pod, nil, nil
	}

	r, err := getRunner(ctx, ghClient, enterprise, organization, repository, runner)
	if err != nil {
		return nil, &ctrl.Result{RequeueAfter: 10 * time.Second}, err
	}

	if r == nil || r.ID == nil {
		return nil, &ctrl.Result{RequeueAfter: 10 * time.Second}, err
	}

	id := *r.ID

	updated, err := annotatePodOnce(ctx, c, log, pod, AnnotationKeyRunnerID, fmt.Sprintf("%d", id))
	if err != nil {
		return nil, &ctrl.Result{RequeueAfter: 10 * time.Second}, err
	}

	return updated, nil, nil
}

func getAnnotation(obj client.Object, key string) (string, bool) {
	if obj.GetAnnotations() == nil {
		return "", false
	}

	v, ok := obj.GetAnnotations()[key]

	return v, ok
}

func setAnnotation(meta *metav1.ObjectMeta, key, value string) {
	if meta.Annotations == nil {
		meta.Annotations = map[string]string{}
	}

	meta.Annotations[key] = value
}

func podConditionTransitionTime(pod *corev1.Pod, tpe corev1.PodConditionType, v corev1.ConditionStatus) *metav1.Time {
	for _, c := range pod.Status.Conditions {
		if c.Type == tpe && c.Status == v {
			return &c.LastTransitionTime
		}
	}

	return nil
}

func podConditionTransitionTimeAfter(pod *corev1.Pod, tpe corev1.PodConditionType, d time.Duration) bool {
	c := podConditionTransitionTime(pod, tpe, corev1.ConditionTrue)
	if c == nil {
		return false
	}

	return c.Add(d).Before(time.Now())
}

func podRunnerID(pod *corev1.Pod) string {
	id, _ := getAnnotation(pod, AnnotationKeyRunnerID)
	return id
}

func getRunnerEnv(pod *corev1.Pod, key string) string {
	for _, c := range pod.Spec.Containers {
		if c.Name == containerName {
			for _, e := range c.Env {
				if e.Name == key {
					return e.Value
				}
			}
		}
	}
	return ""
}

func setRunnerEnv(pod *corev1.Pod, key, value string) {
	for i := range pod.Spec.Containers {
		c := pod.Spec.Containers[i]
		if c.Name == containerName {
			for j, env := range c.Env {
				if env.Name == key {
					pod.Spec.Containers[i].Env[j].Value = value
					return
				}
			}
			pod.Spec.Containers[i].Env = append(c.Env, corev1.EnvVar{Name: key, Value: value})
		}
	}
}

// unregisterRunner unregisters the runner from GitHub Actions by name.
//
// This function returns:
//
// Case 1. (true, nil) when it has successfully unregistered the runner.
// Case 2. (false, nil) when (2-1.) the runner has been already unregistered OR (2-2.) the runner will never be created OR (2-3.) the runner is not created yet and it is about to be registered(hence we couldn't see it's existence from GitHub Actions API yet)
// Case 3. (false, err) when it postponed unregistration due to the runner being busy, or it tried to unregister the runner but failed due to
//   an error returned by GitHub API.
//
// When the returned values is "Case 2. (false, nil)", the caller must handle the three possible sub-cases appropriately.
// In other words, all those three sub-cases cannot be distinguished by this function alone.
//
// - Case "2-1." can happen when e.g. ARC has successfully unregistered in a previous reconcilation loop or it was an ephemeral runner that finished it's job run(an ephemeral runner is designed to stop after a job run).
//   You'd need to maintain the runner state(i.e. if it's already unregistered or not) somewhere,
//   so that you can either not call this function at all if the runner state says it's already unregistered, or determine that it's case "2-1." when you got (false, nil).
//
// - Case "2-2." can happen when e.g. the runner registration token was somehow broken so that `config.sh` within the runner container was never meant to succeed.
//   Waiting and retrying forever on this case is not a solution, because `config.sh` won't succeed with a wrong token hence the runner gets stuck in this state forever.
//   There isn't a perfect solution to this, but a practical workaround would be implement a "grace period" in the caller side.
//
// - Case "2-3." can happen when e.g. ARC recreated an ephemral runner pod in a previous reconcilation loop and then it was requested to delete the runner before the runner comes up.
//   If handled inappropriately, this can cause a race condition betweeen a deletion of the runner pod and GitHub scheduling a workflow job onto the runner.
//
// Once successfully detected case "2-1." or "2-2.", you can safely delete the runner pod because you know that the runner won't come back
// as long as you recreate the runner pod.
//
// If it was "2-3.", you need a workaround to avoid the race condition.
//
// You shall introduce a "grace period" mechanism, similar or equal to that is required for "Case 2-2.", so that you ever
// start the runner pod deletion only after it's more and more likely that the runner pod is not coming up.
//
// Beware though, you need extra care to set an appropriate grace period depending on your environment.
// There isn't a single right grace period that works for everyone.
// The longer the grace period is, the earlier a cluster resource shortage can occur due to throttoled runner pod deletions,
// while the shorter the grace period is, the more likely you may encounter the race issue.
func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string, id *int64) (bool, error) {
	if id == nil {
		runner, err := getRunner(ctx, client, enterprise, org, repo, name)
		if err != nil {
			return false, err
		}

		if runner == nil || runner.ID == nil {
			return false, nil
		}

		id = runner.ID
	}

	// For the record, historically ARC did not try to call RemoveRunner on a busy runner, but it's no longer true.
	// The reason ARC did so was to let a runner running a job to not stop prematurely.
	//
	// However, we learned that RemoveRunner already has an ability to prevent stopping a busy runner,
	// so ARC doesn't need to do anything special for a graceful runner stop.
	// It can just call RemoveRunner, and if it returned 200 you're guaranteed that the runner will not automatically come back and
	// the runner pod is safe for deletion.
	//
	// Trying to remove a busy runner can result in errors like the following:
	//    failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" []
	//
	// # NOTES
	//
	// - It can be "status=offline" at the same time but that's another story.
	// - After https://github.com/actions-runner-controller/actions-runner-controller/pull/1127, ListRunners responses that are used to
	//   determine if the runner is busy can be more outdated than before, as those responeses are now cached for 60 seconds.
	// - Note that 60 seconds is controlled by the Cache-Control response header provided by GitHub so we don't have a strict control on it but we assume it won't
	//   change from 60 seconds.
	//
	// TODO: Probably we can just remove the runner by ID without seeing if the runner is busy, by treating it as busy when a remove-runner call failed with 422?
	if err := client.RemoveRunner(ctx, enterprise, org, repo, *id); err != nil {
		return false, err
	}

	return true, nil
}

func getRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (*gogithub.Runner, error) {
	runners, err := client.ListRunners(ctx, enterprise, org, repo)
	if err != nil {
		return nil, err
	}

	for _, runner := range runners {
		if runner.GetName() == name {
			return runner, nil
		}
	}

	return nil, nil
}