Mirror of https://github.com/actions/actions-runner-controller.git, synced 2025-12-10 11:41:27 +00:00
Fix dead-lock when runner unregistration triggered before PV attachment (#1975)
This fixes an issue discovered while I was testing #1759. Please see the new comment in the code for more information.
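Before the diff itself, here is a rough, self-contained sketch of the decision the fix introduces: a runner pod that never left the Pending phase cannot have registered a runner, so unregistration can be treated as complete instead of requeueing forever while the PV provider declines to provision volumes for a pod that is already marked for deletion. Everything except the corev1 types and the Pending-phase check is invented for illustration; decideUnregistration and its string results are not part of the commit, and the snippet assumes k8s.io/api is available as a dependency.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

// decideUnregistration is an illustrative stand-in for the branch added to
// ensureRunnerUnregistration in this commit; it is not the actual function.
func decideUnregistration(runnerID *int64, pod *corev1.Pod) string {
	switch {
	case runnerID == nil && pod.Status.Phase == corev1.PodPending:
		// New branch: no runner ID yet and the pod was never scheduled
		// (e.g. its PV was never attached), so finish unregistration.
		return "complete"
	case runnerID == nil:
		// Pre-existing behaviour: requeue and wait for the runner to obtain
		// an ID, the registration timeout to elapse, or the pod to stop.
		return "requeue"
	default:
		// A runner ID exists, so unregister it through the usual path.
		return "unregister"
	}
}

func main() {
	pending := &corev1.Pod{Status: corev1.PodStatus{Phase: corev1.PodPending}}
	fmt.Println(decideUnregistration(nil, pending)) // prints "complete"
}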
@@ -98,12 +98,27 @@ func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, l
 		// If it's already unregistered in the previous reconcilation loop,
 		// you can safely assume that it won't get registered again so it's safe to delete the runner pod.
 		log.Info("Runner pod is marked as already unregistered.")
-	} else if runnerID == nil && !runnerPodOrContainerIsStopped(pod) && !podConditionTransitionTimeAfter(pod, corev1.PodReady, registrationTimeout) {
+	} else if runnerID == nil && !runnerPodOrContainerIsStopped(pod) && !podConditionTransitionTimeAfter(pod, corev1.PodReady, registrationTimeout) &&
+		!podIsPending(pod) {
+
 		log.Info(
 			"Unregistration started before runner obtains ID. Waiting for the regisration timeout to elapse, or the runner to obtain ID, or the runner pod to stop",
 			"registrationTimeout", registrationTimeout,
 		)
 		return &ctrl.Result{RequeueAfter: retryDelay}, nil
+	} else if runnerID == nil && podIsPending(pod) {
+		// Note: This logic is here to prevent a dead-lock between ARC and the PV provider.
+		//
+		// The author of this logic thinks that some (or all?) of CSI plugins and PV providers
+		// do not supported to provision dynamic PVs for a pod that is already marked for deletion.
+		// If we didn't handle this case here, ARC would end up with waiting forever until the
+		// PV provider to provision PVs for the pod, which seems to never happen.
+		log.Info(
+			"Unregistration started before runner pod gets scheduled onto a node. "+
+				"Perhaps the runner is taking a long time due to e.g. slow CSI slugin not giving us a PV in a timely manner, or your Kubernetes cluster is overloaded? "+
+				"Marking unregistration as completed anyway because there's nothing ARC can do.",
+			"registrationTimeout", registrationTimeout,
+		)
 	} else if runnerID == nil && runnerPodOrContainerIsStopped(pod) {
 		log.Info(
 			"Unregistration started before runner ID is assigned and the runner stopped before obtaining ID within registration timeout. "+
@@ -327,6 +342,10 @@ func podConditionTransitionTimeAfter(pod *corev1.Pod, tpe corev1.PodConditionTyp
 	return c.Add(d).Before(time.Now())
 }
 
+func podIsPending(pod *corev1.Pod) bool {
+	return pod.Status.Phase == corev1.PodPending
+}
+
 func podRunnerID(pod *corev1.Pod) string {
 	id, _ := getAnnotation(pod, AnnotationKeyRunnerID)
 	return id
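A minimal table-driven test for the new helper could look like the following. This is a sketch and not part of the commit; it assumes podIsPending lives in the controllers package alongside the code shown above, and the test name is an invention for illustration.

package controllers

import (
	"testing"

	corev1 "k8s.io/api/core/v1"
)

// TestPodIsPending is a hypothetical test exercising the helper added in this commit.
func TestPodIsPending(t *testing.T) {
	cases := []struct {
		phase corev1.PodPhase
		want  bool
	}{
		{corev1.PodPending, true},
		{corev1.PodRunning, false},
		{corev1.PodSucceeded, false},
	}

	for _, c := range cases {
		pod := &corev1.Pod{Status: corev1.PodStatus{Phase: c.phase}}
		if got := podIsPending(pod); got != c.want {
			t.Errorf("podIsPending(phase=%s) = %v, want %v", c.phase, got, c.want)
		}
	}
}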