Fix externally deleted runner pod to not block unregistration process

This commit is contained in:
Yusuke Kuoka
2022-03-13 12:15:49 +00:00
parent e4280dcb0d
commit efb7fca308

View File

@@ -81,12 +81,29 @@ func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, l
runnerID = &v runnerID = &v
} }
if runnerID == nil {
runner, err := getRunner(ctx, ghClient, enterprise, organization, repository, runner)
if err != nil {
return &ctrl.Result{}, err
}
if runner != nil && runner.ID != nil {
runnerID = runner.ID
}
}
code := runnerContainerExitCode(pod) code := runnerContainerExitCode(pod)
if pod != nil && pod.Annotations[AnnotationKeyUnregistrationCompleteTimestamp] != "" { if pod != nil && pod.Annotations[AnnotationKeyUnregistrationCompleteTimestamp] != "" {
// If it's already unregistered in the previous reconcilation loop, // If it's already unregistered in the previous reconcilation loop,
// you can safely assume that it won't get registered again so it's safe to delete the runner pod. // you can safely assume that it won't get registered again so it's safe to delete the runner pod.
log.Info("Runner pod is marked as already unregistered.") log.Info("Runner pod is marked as already unregistered.")
} else if runnerID == nil {
log.Info(
"Unregistration started before runner ID is assigned. " +
"Perhaps the runner pod was terminated by anyone other than ARC? Was it OOM killed? " +
"Marking unregistration as completed anyway because there's nothing ARC can do.",
)
} else if pod != nil && runnerPodOrContainerIsStopped(pod) { } else if pod != nil && runnerPodOrContainerIsStopped(pod) {
// If it's an ephemeral runner with the actions/runner container exited with 0, // If it's an ephemeral runner with the actions/runner container exited with 0,
// we can safely assume that it has unregistered itself from GitHub Actions // we can safely assume that it has unregistered itself from GitHub Actions
@@ -99,7 +116,7 @@ func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, l
log.Info("Runner pod is annotated to wait for completion") log.Info("Runner pod is annotated to wait for completion")
return &ctrl.Result{RequeueAfter: retryDelay}, nil return &ctrl.Result{RequeueAfter: retryDelay}, nil
} else if ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner, runnerID); err != nil { } else if ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner, *runnerID); err != nil {
if errors.Is(err, &gogithub.RateLimitError{}) { if errors.Is(err, &gogithub.RateLimitError{}) {
// We log the underlying error when we failed calling GitHub API to list or unregisters, // We log the underlying error when we failed calling GitHub API to list or unregisters,
// or the runner is still busy. // or the runner is still busy.
@@ -335,20 +352,7 @@ func setRunnerEnv(pod *corev1.Pod, key, value string) {
// There isn't a single right grace period that works for everyone. // There isn't a single right grace period that works for everyone.
// The longer the grace period is, the earlier a cluster resource shortage can occur due to throttoled runner pod deletions, // The longer the grace period is, the earlier a cluster resource shortage can occur due to throttoled runner pod deletions,
// while the shorter the grace period is, the more likely you may encounter the race issue. // while the shorter the grace period is, the more likely you may encounter the race issue.
func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string, id *int64) (bool, error) { func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string, id int64) (bool, error) {
if id == nil {
runner, err := getRunner(ctx, client, enterprise, org, repo, name)
if err != nil {
return false, err
}
if runner == nil || runner.ID == nil {
return false, nil
}
id = runner.ID
}
// For the record, historically ARC did not try to call RemoveRunner on a busy runner, but it's no longer true. // For the record, historically ARC did not try to call RemoveRunner on a busy runner, but it's no longer true.
// The reason ARC did so was to let a runner running a job to not stop prematurely. // The reason ARC did so was to let a runner running a job to not stop prematurely.
// //
@@ -369,7 +373,7 @@ func unregisterRunner(ctx context.Context, client *github.Client, enterprise, or
// change from 60 seconds. // change from 60 seconds.
// //
// TODO: Probably we can just remove the runner by ID without seeing if the runner is busy, by treating it as busy when a remove-runner call failed with 422? // TODO: Probably we can just remove the runner by ID without seeing if the runner is busy, by treating it as busy when a remove-runner call failed with 422?
if err := client.RemoveRunner(ctx, enterprise, org, repo, *id); err != nil { if err := client.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
return false, err return false, err
} }