Remove ephemeral runner when exit code != 0 and is patched with the job (#4239)

This commit is contained in:
Nikola Jokic
2025-09-17 21:40:37 +02:00
committed by GitHub
parent 2035e13724
commit 088e2a3a90
11 changed files with 186 additions and 74 deletions

View File

@@ -192,10 +192,12 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
case err == nil:
// create secret if not created
log.Info("Creating new ephemeral runner secret for jitconfig.")
if err := r.createSecret(ctx, ephemeralRunner, jitConfig, log); err != nil {
jitSecret, err := r.createSecret(ctx, ephemeralRunner, jitConfig, log)
if err != nil {
return ctrl.Result{}, fmt.Errorf("failed to create secret: %w", err)
}
log.Info("Created new ephemeral runner secret for jitconfig.")
secret = jitSecret
case errors.Is(err, retryableError):
log.Info("Encountered retryable error, requeueing", "error", err.Error())
@@ -226,12 +228,15 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
return ctrl.Result{Requeue: true}, nil
}
runnerName := string(secret.Data["runnerName"])
if err := patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) {
obj.Status.RunnerId = runnerID
obj.Status.RunnerName = string(secret.Data["runnerName"])
obj.Status.RunnerName = runnerName
}); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to update runner status for RunnerId/RunnerName/RunnerJITConfig: %w", err)
}
ephemeralRunner.Status.RunnerId = runnerID
ephemeralRunner.Status.RunnerName = runnerName
log.Info("Updated ephemeral runner status with runnerId and runnerName")
}
@@ -321,6 +326,18 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
case cs.State.Terminated.ExitCode != 0: // failed
log.Info("Ephemeral runner container failed", "exitCode", cs.State.Terminated.ExitCode)
if ephemeralRunner.HasJob() {
log.Error(
errors.New("ephemeral runner has a job assigned, but the pod has failed"),
"Ephemeral runner either has faulty entrypoint or something external killing the runner",
)
log.Info("Deleting the ephemeral runner that has a job assigned but the pod has failed")
if err := r.Delete(ctx, ephemeralRunner); err != nil {
log.Error(err, "Failed to delete the ephemeral runner that has a job assigned but the pod has failed")
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}
if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil {
log.Error(err, "Failed to delete runner pod on failure")
return ctrl.Result{}, err
@@ -328,9 +345,9 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
return ctrl.Result{}, nil
default: // succeeded
log.Info("Ephemeral runner has finished successfully")
if err := r.markAsFinished(ctx, ephemeralRunner, log); err != nil {
log.Error(err, "Failed to mark ephemeral runner as finished")
log.Info("Ephemeral runner has finished successfully, deleting ephemeral runner", "exitCode", cs.State.Terminated.ExitCode)
if err := r.Delete(ctx, ephemeralRunner); err != nil {
log.Error(err, "Failed to delete ephemeral runner after successful completion")
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
@@ -500,18 +517,6 @@ func (r *EphemeralRunnerReconciler) markAsFailed(ctx context.Context, ephemeralR
return nil
}
func (r *EphemeralRunnerReconciler) markAsFinished(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) error {
log.Info("Updating ephemeral runner status to Finished")
if err := patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) {
obj.Status.Phase = corev1.PodSucceeded
}); err != nil {
return fmt.Errorf("failed to update ephemeral runner with status finished: %w", err)
}
log.Info("EphemeralRunner status is marked as Finished")
return nil
}
// deletePodAsFailed is responsible for deleting the pod and updating the .Status.Failures for tracking failure count.
// It should not be responsible for setting the status to Failed.
func (r *EphemeralRunnerReconciler) deletePodAsFailed(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, pod *corev1.Pod, log logr.Logger) error {
@@ -680,21 +685,21 @@ func (r *EphemeralRunnerReconciler) createPod(ctx context.Context, runner *v1alp
return ctrl.Result{}, nil
}
func (r *EphemeralRunnerReconciler) createSecret(ctx context.Context, runner *v1alpha1.EphemeralRunner, jitConfig *actions.RunnerScaleSetJitRunnerConfig, log logr.Logger) error {
func (r *EphemeralRunnerReconciler) createSecret(ctx context.Context, runner *v1alpha1.EphemeralRunner, jitConfig *actions.RunnerScaleSetJitRunnerConfig, log logr.Logger) (*corev1.Secret, error) {
log.Info("Creating new secret for ephemeral runner")
jitSecret := r.newEphemeralRunnerJitSecret(runner, jitConfig)
if err := ctrl.SetControllerReference(runner, jitSecret, r.Scheme); err != nil {
return fmt.Errorf("failed to set controller reference: %w", err)
return nil, fmt.Errorf("failed to set controller reference: %w", err)
}
log.Info("Created new secret spec for ephemeral runner")
if err := r.Create(ctx, jitSecret); err != nil {
return fmt.Errorf("failed to create jit secret: %w", err)
return nil, fmt.Errorf("failed to create jit secret: %w", err)
}
log.Info("Created ephemeral runner secret", "secretName", jitSecret.Name)
return nil
return jitSecret, nil
}
// updateRunStatusFromPod is responsible for updating non-exiting statuses.

View File

@@ -176,7 +176,7 @@ var _ = Describe("EphemeralRunner", func() {
).Should(BeEquivalentTo(ephemeralRunner.Name))
})
It("It should re-create pod on failure", func() {
It("It should re-create pod on failure and no job assigned", func() {
pod := new(corev1.Pod)
Eventually(func() (bool, error) {
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
@@ -200,6 +200,67 @@ var _ = Describe("EphemeralRunner", func() {
).Should(BeEquivalentTo(true))
})
It("It should delete ephemeral runner on failure and job assigned", func() {
er := new(v1alpha1.EphemeralRunner)
// Check if finalizer is added
Eventually(
func() error {
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
return err
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(Succeed(), "failed to get ephemeral runner")
// update job id to simulate job assigned
er.Status.JobID = "1"
err := k8sClient.Status().Update(ctx, er)
Expect(err).To(BeNil(), "failed to update ephemeral runner status")
er = new(v1alpha1.EphemeralRunner)
Eventually(
func() (string, error) {
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
if err != nil {
return "", err
}
return er.Status.JobID, nil
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(BeEquivalentTo("1"))
pod := new(corev1.Pod)
Eventually(func() (bool, error) {
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
return false, err
}
return true, nil
}).Should(BeEquivalentTo(true))
// delete pod to simulate failure
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
Name: v1alpha1.EphemeralRunnerContainerName,
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
ExitCode: 1,
},
},
})
err = k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "Failed to update pod status")
er = new(v1alpha1.EphemeralRunner)
Eventually(
func() bool {
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, er)
return kerrors.IsNotFound(err)
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(BeTrue(), "Ephemeral runner should eventually be deleted")
})
It("It should failed if a pod template is invalid", func() {
invalideEphemeralRunner := newExampleRunner("invalid-ephemeral-runner", autoscalingNS.Name, configSecret.Name)
invalideEphemeralRunner.Spec.Spec.PriorityClassName = "notexist"
@@ -208,13 +269,22 @@ var _ = Describe("EphemeralRunner", func() {
Expect(err).To(BeNil())
updated := new(v1alpha1.EphemeralRunner)
Eventually(func() (corev1.PodPhase, error) {
err := k8sClient.Get(ctx, client.ObjectKey{Name: invalideEphemeralRunner.Name, Namespace: invalideEphemeralRunner.Namespace}, updated)
if err != nil {
return "", nil
}
return updated.Status.Phase, nil
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(corev1.PodFailed))
Eventually(
func() (corev1.PodPhase, error) {
err := k8sClient.Get(
ctx,
client.ObjectKey{Name: invalideEphemeralRunner.Name, Namespace: invalideEphemeralRunner.Namespace},
updated,
)
if err != nil {
return "", nil
}
return updated.Status.Phase, nil
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(BeEquivalentTo(corev1.PodFailed))
Expect(updated.Status.Reason).Should(Equal("InvalidPod"))
Expect(updated.Status.Message).Should(Equal("Failed to create the pod: pods \"invalid-ephemeral-runner\" is forbidden: no PriorityClass with name notexist was found"))
})
@@ -775,7 +845,7 @@ var _ = Describe("EphemeralRunner", func() {
startManagers(GinkgoT(), mgr)
})
It("It should set the Phase to Succeeded", func() {
It("It should delete EphemeralRunner when pod exits successfully", func() {
ephemeralRunner := newExampleRunner("test-runner", autoscalingNS.Name, configSecret.Name)
err := k8sClient.Create(ctx, ephemeralRunner)
@@ -801,13 +871,18 @@ var _ = Describe("EphemeralRunner", func() {
Expect(err).To(BeNil(), "failed to update pod status")
updated := new(v1alpha1.EphemeralRunner)
Eventually(func() (corev1.PodPhase, error) {
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
if err != nil {
return "", nil
}
return updated.Status.Phase, nil
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(corev1.PodSucceeded))
Eventually(
func() bool {
err := k8sClient.Get(
ctx,
client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace},
updated,
)
return kerrors.IsNotFound(err)
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(BeTrue())
})
})

View File

@@ -453,8 +453,13 @@ func (r *EphemeralRunnerSetReconciler) deleteIdleEphemeralRunners(ctx context.Co
continue
}
if !isDone && ephemeralRunner.Status.JobRequestId > 0 {
log.Info("Skipping ephemeral runner since it is running a job", "name", ephemeralRunner.Name, "jobRequestId", ephemeralRunner.Status.JobRequestId)
if !isDone && ephemeralRunner.HasJob() {
log.Info(
"Skipping ephemeral runner since it is running a job",
"name", ephemeralRunner.Name,
"workflowRunId", ephemeralRunner.Status.WorkflowRunId,
"jobId", ephemeralRunner.Status.JobID,
)
continue
}