Handle offline runners gracefully (#341)

* if a runner pod starts up with an invalid token, it will go into an infinite retry loop, appearing as RUNNING from the outside * normally, this error situation is detected because no corresponding runner object exists in GitHub and the pod will get removed after the registration timeout * if the GitHub runner object already existed before - e.g. because a finalizer was not properly run as part of a partial Kubernetes crash - the runner will always stay in a running mode; even updating the registration token will not kill the problematic pod * introducing a RunnerOffline error type that can be handled in the runner controller and replicaset controller * as runners are offline when a pod is completed and marked for restart, only do additional restart checks if no restart was already decided, making the code a bit cleaner and saving GitHub API calls after each job completion
2026-03-13 04:21:58 +08:00 · 2021-02-22 02:08:04 +01:00
parent dd0b9f3e95
commit 2d7fbbfb68
3 changed files with 94 additions and 48 deletions
--- a/github/github.go
+++ b/github/github.go
@@ -310,6 +310,14 @@ func (e *RunnerNotFound) Error() string {
 	return fmt.Sprintf("runner %q not found", e.runnerName)
 }

+type RunnerOffline struct {
+	runnerName string
+}
+
+func (e *RunnerOffline) Error() string {
+	return fmt.Sprintf("runner %q offline", e.runnerName)
+}
+
 func (r *Client) IsRunnerBusy(ctx context.Context, enterprise, org, repo, name string) (bool, error) {
 	runners, err := r.ListRunners(ctx, enterprise, org, repo)
 	if err != nil {
@@ -318,6 +326,9 @@ func (r *Client) IsRunnerBusy(ctx context.Context, enterprise, org, repo, name s

 	for _, runner := range runners {
 		if runner.GetName() == name {
+			if runner.GetStatus() == "offline" {
+				return false, &RunnerOffline{runnerName: name}
+			}
 			return runner.GetBusy(), nil
 		}
 	}