diff --git a/controllers/new_runner_pod_test.go b/controllers/new_runner_pod_test.go index c1b3b54b..e70ac509 100644 --- a/controllers/new_runner_pod_test.go +++ b/controllers/new_runner_pod_test.go @@ -155,7 +155,7 @@ func TestNewRunnerPod(t *testing.T) { }, }, }, - RestartPolicy: corev1.RestartPolicyOnFailure, + RestartPolicy: corev1.RestartPolicyNever, }, } @@ -237,7 +237,7 @@ func TestNewRunnerPod(t *testing.T) { }, }, }, - RestartPolicy: corev1.RestartPolicyOnFailure, + RestartPolicy: corev1.RestartPolicyNever, }, } @@ -315,7 +315,7 @@ func TestNewRunnerPod(t *testing.T) { }, }, }, - RestartPolicy: corev1.RestartPolicyOnFailure, + RestartPolicy: corev1.RestartPolicyNever, }, } @@ -576,7 +576,7 @@ func TestNewRunnerPodFromRunnerController(t *testing.T) { }, }, }, - RestartPolicy: corev1.RestartPolicyOnFailure, + RestartPolicy: corev1.RestartPolicyNever, }, } @@ -673,7 +673,7 @@ func TestNewRunnerPodFromRunnerController(t *testing.T) { }, }, }, - RestartPolicy: corev1.RestartPolicyOnFailure, + RestartPolicy: corev1.RestartPolicyNever, }, } @@ -770,7 +770,7 @@ func TestNewRunnerPodFromRunnerController(t *testing.T) { }, }, }, - RestartPolicy: corev1.RestartPolicyOnFailure, + RestartPolicy: corev1.RestartPolicyNever, }, } diff --git a/controllers/pod_runner_token_injector.go b/controllers/pod_runner_token_injector.go index 5c7dd487..2d107d79 100644 --- a/controllers/pod_runner_token_injector.go +++ b/controllers/pod_runner_token_injector.go @@ -78,9 +78,7 @@ func (t *PodRunnerTokenInjector) Handle(ctx context.Context, req admission.Reque updated.Annotations[AnnotationKeyTokenExpirationDate] = ts - if pod.Spec.RestartPolicy != corev1.RestartPolicyOnFailure { - updated.Spec.RestartPolicy = corev1.RestartPolicyOnFailure - } + forceRunnerPodRestartPolicyNever(updated) buf, err := json.Marshal(updated) if err != nil { diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go index 7f6ec506..9c2c466c 100644 --- a/controllers/runner_controller.go 
+++ b/controllers/runner_controller.go @@ -676,9 +676,7 @@ func newRunnerPod(runnerName string, template corev1.Pod, runnerSpec v1alpha1.Ru pod := template.DeepCopy() - if pod.Spec.RestartPolicy == "" { - pod.Spec.RestartPolicy = "OnFailure" - } + forceRunnerPodRestartPolicyNever(pod) if mtu := runnerSpec.DockerMTU; mtu != nil && dockerdInRunner { runnerContainer.Env = append(runnerContainer.Env, []corev1.EnvVar{ diff --git a/controllers/runner_pod.go b/controllers/runner_pod.go new file mode 100644 index 00000000..349eb993 --- /dev/null +++ b/controllers/runner_pod.go @@ -0,0 +1,22 @@ +package controllers + +import corev1 "k8s.io/api/core/v1" + +// Force the runner pod managed by either RunnerDeployment or RunnerSet to have restartPolicy=Never. +// See https://github.com/actions-runner-controller/actions-runner-controller/issues/1369 for more context. +// +// This is to prevent runner pods from getting stuck in Terminating when a K8s node disappears along with the runner pod and the runner container within it. +// +// Previously, we used a restartPolicy of OnFailure, but it turned out to be wrong later, and therefore we now set Never. +// +// When the restartPolicy is OnFailure and the node disappeared, runner pods on the node seem to get stuck in state.terminated==nil, state.waiting!=nil, and state.lastTerminationState!=nil, +// and will never become Running. +// It's probably because the node onto which the pods have been scheduled will never come back, hence the container restart attempts will never succeed, +// and the pods stay stuck waiting for successful restarts forever. +// +// By forcing runner pods to never restart, we hope there will be no chance of pods being stuck waiting. +func forceRunnerPodRestartPolicyNever(pod *corev1.Pod) { + if pod.Spec.RestartPolicy != corev1.RestartPolicyNever { + pod.Spec.RestartPolicy = corev1.RestartPolicyNever + } +}