Fix dead-lock when runner unregistration is triggered before PV attachment (#1975)

This fixes an issue discovered while I was testing #1759. Please see the new comment in the code for more information.
Author: Yusuke Kuoka
Date: 2022-11-04 06:29:19 +09:00
Committed by: GitHub
Parent: 8505c95719
Commit: 23c8fe4a8b


@@ -98,12 +98,27 @@ func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, l
 		// If it's already unregistered in the previous reconciliation loop,
 		// you can safely assume that it won't get registered again so it's safe to delete the runner pod.
 		log.Info("Runner pod is marked as already unregistered.")
-	} else if runnerID == nil && !runnerPodOrContainerIsStopped(pod) && !podConditionTransitionTimeAfter(pod, corev1.PodReady, registrationTimeout) {
+	} else if runnerID == nil && !runnerPodOrContainerIsStopped(pod) && !podConditionTransitionTimeAfter(pod, corev1.PodReady, registrationTimeout) &&
+		!podIsPending(pod) {
 		log.Info(
 			"Unregistration started before runner obtains ID. Waiting for the registration timeout to elapse, or the runner to obtain ID, or the runner pod to stop",
 			"registrationTimeout", registrationTimeout,
 		)
 		return &ctrl.Result{RequeueAfter: retryDelay}, nil
+	} else if runnerID == nil && podIsPending(pod) {
+		// Note: This logic is here to prevent a dead-lock between ARC and the PV provider.
+		//
+		// The author of this logic thinks that some (or all?) CSI plugins and PV providers
+		// do not support provisioning dynamic PVs for a pod that is already marked for deletion.
+		// If we didn't handle this case here, ARC would end up waiting forever for the
+		// PV provider to provision PVs for the pod, which seems to never happen.
+		log.Info(
+			"Unregistration started before runner pod gets scheduled onto a node. "+
+				"Perhaps the runner is taking a long time due to e.g. a slow CSI plugin not giving us a PV in a timely manner, or your Kubernetes cluster is overloaded? "+
+				"Marking unregistration as completed anyway because there's nothing ARC can do.",
+			"registrationTimeout", registrationTimeout,
+		)
 	} else if runnerID == nil && runnerPodOrContainerIsStopped(pod) {
 		log.Info(
 			"Unregistration started before runner ID is assigned and the runner stopped before obtaining ID within registration timeout. "+
@@ -327,6 +342,10 @@ func podConditionTransitionTimeAfter(pod *corev1.Pod, tpe corev1.PodConditionTyp
 	return c.Add(d).Before(time.Now())
 }
 
+func podIsPending(pod *corev1.Pod) bool {
+	return pod.Status.Phase == corev1.PodPending
+}
+
 func podRunnerID(pod *corev1.Pod) string {
 	id, _ := getAnnotation(pod, AnnotationKeyRunnerID)
 	return id
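
For reference, a self-contained sketch of the new helper in isolation; it assumes only the k8s.io/api module, and the pod literal and main function are illustrative rather than part of ARC:

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

// podIsPending mirrors the helper added by this commit: a Pending pod has
// not been scheduled onto a node (or its volumes are not yet bound).
func podIsPending(pod *corev1.Pod) bool {
	return pod.Status.Phase == corev1.PodPending
}

func main() {
	// An illustrative pod stuck in Pending, e.g. waiting on a PV that the
	// CSI plugin will never provision because the pod is marked for deletion.
	pod := &corev1.Pod{Status: corev1.PodStatus{Phase: corev1.PodPending}}

	if podIsPending(pod) {
		// Mirrors the new branch in ensureRunnerUnregistration: treat
		// unregistration as completed instead of requeueing forever.
		fmt.Println("runner pod never scheduled; marking unregistration as completed")
	}
}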