Handle offline runners gracefully (#341)

* if a runner pod starts up with an invalid token, it will go in an 
infinite retry loop, appearing as RUNNING from the outside
* normally, this error situation is detected because no corresponding 
runner objects exists in GitHub and the pod will get removed after 
registration timeout
* if the GitHub runner object already existed before - e.g. because a 
finalizer was not properly run as part of a partial Kubernetes crash, 
the runner will always stay in a running mode, even updating the 
registration token will not kill the problematic pod
* introducing RunnerOffline exception that can be handled in runner 
controller and replicaset controller
* as runners are offline when a pod is completed and marked for restart, 
only do additional restart checks if no restart was already decided, 
making code a bit cleaner and saving GitHub API calls after each job 
completion
This commit is contained in:
Johannes Nicolai
2021-02-22 02:08:04 +01:00
committed by GitHub
parent dd0b9f3e95
commit 2d7fbbfb68
3 changed files with 94 additions and 48 deletions

View File

@@ -310,6 +310,14 @@ func (e *RunnerNotFound) Error() string {
return fmt.Sprintf("runner %q not found", e.runnerName)
}
type RunnerOffline struct {
runnerName string
}
func (e *RunnerOffline) Error() string {
return fmt.Sprintf("runner %q offline", e.runnerName)
}
func (r *Client) IsRunnerBusy(ctx context.Context, enterprise, org, repo, name string) (bool, error) {
runners, err := r.ListRunners(ctx, enterprise, org, repo)
if err != nil {
@@ -318,6 +326,9 @@ func (r *Client) IsRunnerBusy(ctx context.Context, enterprise, org, repo, name s
for _, runner := range runners {
if runner.GetName() == name {
if runner.GetStatus() == "offline" {
return false, &RunnerOffline{runnerName: name}
}
return runner.GetBusy(), nil
}
}