Mirror of https://github.com/actions/actions-runner-controller.git, synced 2025-12-10 11:41:27 +00:00
Fix dead-lock when runner unregistration triggered before PV attachment (#1975)
This fixes an issue discovered while I was testing #1759. Please see the new comment in the code for more information.
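Before the diff itself, here is a rough, self-contained sketch of the decision the fix introduces: a runner pod that never left the Pending phase cannot have registered a runner, so unregistration can be treated as complete instead of requeueing forever while the PV provider declines to provision volumes for a pod that is already marked for deletion. Everything except the corev1 types and the Pending-phase check is invented for illustration; decideUnregistration and its string results are not part of the commit, and the snippet assumes k8s.io/api is available as a dependency.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

// decideUnregistration is an illustrative stand-in for the branch added to
// ensureRunnerUnregistration in this commit; it is not the actual function.
func decideUnregistration(runnerID *int64, pod *corev1.Pod) string {
	switch {
	case runnerID == nil && pod.Status.Phase == corev1.PodPending:
		// New branch: no runner ID yet and the pod was never scheduled
		// (e.g. its PV was never attached), so finish unregistration.
		return "complete"
	case runnerID == nil:
		// Pre-existing behaviour: requeue and wait for the runner to obtain
		// an ID, the registration timeout to elapse, or the pod to stop.
		return "requeue"
	default:
		// A runner ID exists, so unregister it through the usual path.
		return "unregister"
	}
}

func main() {
	pending := &corev1.Pod{Status: corev1.PodStatus{Phase: corev1.PodPending}}
	fmt.Println(decideUnregistration(nil, pending)) // prints "complete"
}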
@@ -98,12 +98,27 @@ func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, l
 		// If it's already unregistered in the previous reconcilation loop,
 		// you can safely assume that it won't get registered again so it's safe to delete the runner pod.
 		log.Info("Runner pod is marked as already unregistered.")
-	} else if runnerID == nil && !runnerPodOrContainerIsStopped(pod) && !podConditionTransitionTimeAfter(pod, corev1.PodReady, registrationTimeout) {
+	} else if runnerID == nil && !runnerPodOrContainerIsStopped(pod) && !podConditionTransitionTimeAfter(pod, corev1.PodReady, registrationTimeout) &&
+		!podIsPending(pod) {
+
 		log.Info(
 			"Unregistration started before runner obtains ID. Waiting for the regisration timeout to elapse, or the runner to obtain ID, or the runner pod to stop",
 			"registrationTimeout", registrationTimeout,
 		)
 		return &ctrl.Result{RequeueAfter: retryDelay}, nil
+	} else if runnerID == nil && podIsPending(pod) {
+		// Note: This logic is here to prevent a dead-lock between ARC and the PV provider.
+		//
+		// The author of this logic thinks that some (or all?) of CSI plugins and PV providers
+		// do not supported to provision dynamic PVs for a pod that is already marked for deletion.
+		// If we didn't handle this case here, ARC would end up with waiting forever until the
+		// PV provider to provision PVs for the pod, which seems to never happen.
+		log.Info(
+			"Unregistration started before runner pod gets scheduled onto a node. "+
+				"Perhaps the runner is taking a long time due to e.g. slow CSI slugin not giving us a PV in a timely manner, or your Kubernetes cluster is overloaded? "+
+				"Marking unregistration as completed anyway because there's nothing ARC can do.",
+			"registrationTimeout", registrationTimeout,
+		)
 	} else if runnerID == nil && runnerPodOrContainerIsStopped(pod) {
 		log.Info(
 			"Unregistration started before runner ID is assigned and the runner stopped before obtaining ID within registration timeout. "+
@@ -327,6 +342,10 @@ func podConditionTransitionTimeAfter(pod *corev1.Pod, tpe corev1.PodConditionTyp
 	return c.Add(d).Before(time.Now())
 }
 
+func podIsPending(pod *corev1.Pod) bool {
+	return pod.Status.Phase == corev1.PodPending
+}
+
 func podRunnerID(pod *corev1.Pod) string {
 	id, _ := getAnnotation(pod, AnnotationKeyRunnerID)
 	return id
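A minimal table-driven test for the new helper could look like the following. This is a sketch and not part of the commit; it assumes podIsPending lives in the controllers package alongside the code shown above, and the test name is an invention for illustration.

package controllers

import (
	"testing"

	corev1 "k8s.io/api/core/v1"
)

// TestPodIsPending is a hypothetical test exercising the helper added in this commit.
func TestPodIsPending(t *testing.T) {
	cases := []struct {
		phase corev1.PodPhase
		want  bool
	}{
		{corev1.PodPending, true},
		{corev1.PodRunning, false},
		{corev1.PodSucceeded, false},
	}

	for _, c := range cases {
		pod := &corev1.Pod{Status: corev1.PodStatus{Phase: c.phase}}
		if got := podIsPending(pod); got != c.want {
			t.Errorf("podIsPending(phase=%s) = %v, want %v", c.phase, got, c.want)
		}
	}
}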