Prevent RunnerSet pod unregistration until the pod gets its runner ID

This eliminates the race condition that resulted in the runner being terminated prematurely when RunnerSet triggered unregistration of a StatefulSet that had been added just a few seconds earlier.
This commit is contained in:
Yusuke Kuoka
2022-03-01 02:28:15 +00:00
parent 15b402bb32
commit a3072c110d
4 changed files with 225 additions and 64 deletions

View File

@@ -102,9 +102,13 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
return ctrl.Result{}, err
}
log.V(2).Info("Added finalizer")
return ctrl.Result{}, nil
}
} else {
log.V(2).Info("Seen deletion-timestamp is already set")
finalizers, removed := removeFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName)
if removed {
@@ -122,7 +126,9 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
return ctrl.Result{}, err
}
log.Info("Removed runner from GitHub", "repository", repo, "organization", org)
log.V(2).Info("Removed finalizer")
return ctrl.Result{}, nil
}
deletionTimeout := 1 * time.Minute
@@ -160,6 +166,35 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
return ctrl.Result{}, nil
}
po, res, err := ensureRunnerPodRegistered(ctx, log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
if res != nil {
return *res, err
}
runnerPod = *po
if _, unregistrationRequested := getAnnotation(&runnerPod.ObjectMeta, unregistrationRequestTimestamp); unregistrationRequested {
log.V(2).Info("Progressing unregistration because unregistration-request timestamp is set")
// At this point we're sure that DeletionTimestamp is not set yet, but the unregistration process is triggered by an upstream controller like runnerset-controller.
//
// In a standard scenario, ARC starts the unregistration process before marking the pod for deletion at all,
// so that it isn't subject to terminationGracePeriod and can safely take hours to finish its work.
_, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
if res != nil {
return *res, err
}
// At this point we are sure that the runner has successfully unregistered, hence is safe to be deleted.
// But we don't delete the pod here. Instead, we let the upstream controller/parent object delete this pod as
// a part of a cascade deletion.
// This is to prevent a parent object, like a statefulset, from recreating the deleted pod.
// If the pod were recreated, it would start a registration process, and that may race with the statefulset deleting the pod.
log.V(2).Info("Unregistration seems complete")
return ctrl.Result{}, nil
}
// If pod has ended up succeeded we need to restart it
// Happens e.g. when dind is in runner and run completes
stopped := runnerPod.Status.Phase == corev1.PodSucceeded