From c27541140a72dfef145da4d32abc374b78654614 Mon Sep 17 00:00:00 2001 From: Nikola Jokic Date: Mon, 4 Aug 2025 12:35:04 +0200 Subject: [PATCH] Remove JIT config from ephemeral runner status field (#4191) --- .../v1alpha1/ephemeralrunner_types.go | 2 - .../actions.github.com_ephemeralrunners.yaml | 2 - .../actions.github.com_ephemeralrunners.yaml | 2 - .../ephemeralrunner_controller.go | 192 +++++++++--------- controllers/actions.github.com/error.go | 12 ++ .../actions.github.com/resourcebuilder.go | 9 +- 6 files changed, 116 insertions(+), 103 deletions(-) create mode 100644 controllers/actions.github.com/error.go diff --git a/apis/actions.github.com/v1alpha1/ephemeralrunner_types.go b/apis/actions.github.com/v1alpha1/ephemeralrunner_types.go index 7667174f..838996d7 100644 --- a/apis/actions.github.com/v1alpha1/ephemeralrunner_types.go +++ b/apis/actions.github.com/v1alpha1/ephemeralrunner_types.go @@ -145,8 +145,6 @@ type EphemeralRunnerStatus struct { RunnerId int `json:"runnerId,omitempty"` // +optional RunnerName string `json:"runnerName,omitempty"` - // +optional - RunnerJITConfig string `json:"runnerJITConfig,omitempty"` // +optional Failures map[string]metav1.Time `json:"failures,omitempty"` diff --git a/charts/gha-runner-scale-set-controller/crds/actions.github.com_ephemeralrunners.yaml b/charts/gha-runner-scale-set-controller/crds/actions.github.com_ephemeralrunners.yaml index 3e9812c2..ef4ccda9 100644 --- a/charts/gha-runner-scale-set-controller/crds/actions.github.com_ephemeralrunners.yaml +++ b/charts/gha-runner-scale-set-controller/crds/actions.github.com_ephemeralrunners.yaml @@ -7874,8 +7874,6 @@ spec: type: string runnerId: type: integer - runnerJITConfig: - type: string runnerName: type: string workflowRunId: diff --git a/config/crd/bases/actions.github.com_ephemeralrunners.yaml b/config/crd/bases/actions.github.com_ephemeralrunners.yaml index 3e9812c2..ef4ccda9 100644 --- a/config/crd/bases/actions.github.com_ephemeralrunners.yaml +++ b/config/crd/bases/actions.github.com_ephemeralrunners.yaml @@ -7874,8 +7874,6 @@ spec: type: string runnerId: type: integer - runnerJITConfig: - type: string runnerName: type: string workflowRunId: diff --git a/controllers/actions.github.com/ephemeralrunner_controller.go b/controllers/actions.github.com/ephemeralrunner_controller.go index 74138fc7..0590627f 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller.go +++ b/controllers/actions.github.com/ephemeralrunner_controller.go @@ -21,6 +21,7 @@ import ( "errors" "fmt" "net/http" + "strconv" "time" "github.com/actions/actions-runner-controller/apis/actions.github.com/v1alpha1" @@ -179,11 +180,59 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ return ctrl.Result{}, nil } - if ephemeralRunner.Status.RunnerId == 0 { - log.Info("Creating new ephemeral runner registration and updating status with runner config") - if r, err := r.updateStatusWithRunnerConfig(ctx, ephemeralRunner, log); r != nil { - return *r, err + secret := new(corev1.Secret) + if err := r.Get(ctx, req.NamespacedName, secret); err != nil { + if !kerrors.IsNotFound(err) { + log.Error(err, "Failed to fetch secret") + return ctrl.Result{}, err } + + jitConfig, err := r.createRunnerJitConfig(ctx, ephemeralRunner, log) + switch { + case err == nil: + // create secret if not created + log.Info("Creating new ephemeral runner secret for jitconfig.") + if err := r.createSecret(ctx, ephemeralRunner, jitConfig, log); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to create secret: %w", err) + } + log.Info("Created new ephemeral runner secret for jitconfig.") + + case errors.Is(err, retryableError): + log.Info("Encountered retryable error, requeueing", "error", err.Error()) + return ctrl.Result{Requeue: true}, nil + case errors.Is(err, fatalError): + log.Info("JIT config cannot be created for this ephemeral runner, issuing delete", "error", err.Error()) + if err := r.Delete(ctx, ephemeralRunner); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to delete the ephemeral runner: %w", err) + } + log.Info("Request to delete ephemeral runner has been issued") + return ctrl.Result{}, nil + default: + log.Error(err, "Failed to create ephemeral runners secret", "error", err.Error()) + return ctrl.Result{}, err + } + } + + if ephemeralRunner.Status.RunnerId == 0 { + log.Info("Updating ephemeral runner status with runnerId and runnerName") + runnerID, err := strconv.Atoi(string(secret.Data["runnerId"])) + if err != nil { + log.Error(err, "Runner config secret is corrupted: missing runnerId") + log.Info("Deleting corrupted runner config secret") + if err := r.Delete(ctx, secret); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to delete the corrupted runner config secret") + } + log.Info("Corrupted runner config secret has been deleted") + return ctrl.Result{Requeue: true}, nil + } + + if err := patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) { + obj.Status.RunnerId = runnerID + obj.Status.RunnerName = string(secret.Data["runnerName"]) + }); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to update runner status for RunnerId/RunnerName/RunnerJITConfig: %w", err) + } + log.Info("Updated ephemeral runner status with runnerId and runnerName") } if len(ephemeralRunner.Status.Failures) > maxFailures { @@ -213,27 +262,6 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ }, nil } - secret := new(corev1.Secret) - if err := r.Get(ctx, req.NamespacedName, secret); err != nil { - if !kerrors.IsNotFound(err) { - log.Error(err, "Failed to fetch secret") - return ctrl.Result{}, err - } - // create secret if not created - log.Info("Creating new ephemeral runner secret for jitconfig.") - if r, err := r.createSecret(ctx, ephemeralRunner, log); r != nil { - return *r, err - } - - // Retry to get the secret that was just created. - // Otherwise, even though we want to continue to create the pod, - // it fails due to the missing secret resulting in an invalid pod spec. - if err := r.Get(ctx, req.NamespacedName, secret); err != nil { - log.Error(err, "Failed to fetch secret") - return ctrl.Result{}, err - } - } - pod := new(corev1.Pod) if err := r.Get(ctx, req.NamespacedName, pod); err != nil { if !kerrors.IsNotFound(err) { @@ -509,14 +537,12 @@ func (r *EphemeralRunnerReconciler) deletePodAsFailed(ctx context.Context, ephem return nil } -// updateStatusWithRunnerConfig fetches runtime configuration needed by the runner -// This method should always set .status.runnerId and .status.runnerJITConfig -func (r *EphemeralRunnerReconciler) updateStatusWithRunnerConfig(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) (*ctrl.Result, error) { +func (r *EphemeralRunnerReconciler) createRunnerJitConfig(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) (*actions.RunnerScaleSetJitRunnerConfig, error) { // Runner is not registered with the service. We need to register it first log.Info("Creating ephemeral runner JIT config") actionsClient, err := r.GetActionsService(ctx, ephemeralRunner) if err != nil { - return &ctrl.Result{}, fmt.Errorf("failed to get actions client for generating JIT config: %w", err) + return nil, fmt.Errorf("failed to get actions client for generating JIT config: %w", err) } jitSettings := &actions.RunnerScaleSetJitRunnerSetting{ @@ -531,74 +557,52 @@ func (r *EphemeralRunnerReconciler) updateStatusWithRunnerConfig(ctx context.Con } jitConfig, err := actionsClient.GenerateJitRunnerConfig(ctx, jitSettings, ephemeralRunner.Spec.RunnerScaleSetId) + if err == nil { // if NO error + log.Info("Created ephemeral runner JIT config", "runnerId", jitConfig.Runner.Id) + return jitConfig, nil + } + + actionsError := &actions.ActionsError{} + if !errors.As(err, &actionsError) { + return nil, fmt.Errorf("failed to generate JIT config with generic error: %w", err) + } + + if actionsError.StatusCode != http.StatusConflict || + !actionsError.IsException("AgentExistsException") { + return nil, fmt.Errorf("failed to generate JIT config with Actions service error: %w", err) + } + + // If the runner with the name we want already exists it means: + // - We might have a name collision. + // - Our previous reconciliation loop failed to update the + // status with the runnerId and runnerJITConfig after the `GenerateJitRunnerConfig` + // created the runner registration on the service. + // We will try to get the runner and see if it's belong to this AutoScalingRunnerSet, + // if so, we can simply delete the runner registration and create a new one. + log.Info("Getting runner jit config failed with conflict error, trying to get the runner by name", "runnerName", ephemeralRunner.Name) + existingRunner, err := actionsClient.GetRunnerByName(ctx, ephemeralRunner.Name) if err != nil { - actionsError := &actions.ActionsError{} - if !errors.As(err, &actionsError) { - return &ctrl.Result{}, fmt.Errorf("failed to generate JIT config with generic error: %w", err) - } + return nil, fmt.Errorf("failed to get runner by name: %w", err) + } - if actionsError.StatusCode != http.StatusConflict || - !actionsError.IsException("AgentExistsException") { - return &ctrl.Result{}, fmt.Errorf("failed to generate JIT config with Actions service error: %w", err) - } + if existingRunner == nil { + log.Info("Runner with the same name does not exist anymore, re-queuing the reconciliation") + return nil, fmt.Errorf("%w: runner existed, retry configuration", retryableError) + } - // If the runner with the name we want already exists it means: - // - We might have a name collision. - // - Our previous reconciliation loop failed to update the - // status with the runnerId and runnerJITConfig after the `GenerateJitRunnerConfig` - // created the runner registration on the service. - // We will try to get the runner and see if it's belong to this AutoScalingRunnerSet, - // if so, we can simply delete the runner registration and create a new one. - log.Info("Getting runner jit config failed with conflict error, trying to get the runner by name", "runnerName", ephemeralRunner.Name) - existingRunner, err := actionsClient.GetRunnerByName(ctx, ephemeralRunner.Name) + log.Info("Found the runner with the same name", "runnerId", existingRunner.Id, "runnerScaleSetId", existingRunner.RunnerScaleSetId) + if existingRunner.RunnerScaleSetId == ephemeralRunner.Spec.RunnerScaleSetId { + log.Info("Removing the runner with the same name") + err := actionsClient.RemoveRunner(ctx, int64(existingRunner.Id)) if err != nil { - return &ctrl.Result{}, fmt.Errorf("failed to get runner by name: %w", err) + return nil, fmt.Errorf("failed to remove runner from the service: %w", err) } - if existingRunner == nil { - log.Info("Runner with the same name does not exist, re-queuing the reconciliation") - return &ctrl.Result{Requeue: true}, nil - } - - log.Info("Found the runner with the same name", "runnerId", existingRunner.Id, "runnerScaleSetId", existingRunner.RunnerScaleSetId) - if existingRunner.RunnerScaleSetId == ephemeralRunner.Spec.RunnerScaleSetId { - log.Info("Removing the runner with the same name") - err := actionsClient.RemoveRunner(ctx, int64(existingRunner.Id)) - if err != nil { - return &ctrl.Result{}, fmt.Errorf("failed to remove runner from the service: %w", err) - } - - log.Info("Removed the runner with the same name, re-queuing the reconciliation") - return &ctrl.Result{Requeue: true}, nil - } - - // TODO: Do we want to mark the ephemeral runner as failed, and let EphemeralRunnerSet to clean it up, so we can recover from this situation? - // The situation is that the EphemeralRunner's name is already used by something else to register a runner, and we can't take the control back. - return &ctrl.Result{}, fmt.Errorf("runner with the same name but doesn't belong to this RunnerScaleSet: %w", err) - } - log.Info("Created ephemeral runner JIT config", "runnerId", jitConfig.Runner.Id) - - log.Info("Updating ephemeral runner status with runnerId and runnerJITConfig") - err = patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) { - obj.Status.RunnerId = jitConfig.Runner.Id - obj.Status.RunnerName = jitConfig.Runner.Name - obj.Status.RunnerJITConfig = jitConfig.EncodedJITConfig - }) - if err != nil { - return &ctrl.Result{}, fmt.Errorf("failed to update runner status for RunnerId/RunnerName/RunnerJITConfig: %w", err) + log.Info("Removed the runner with the same name, re-queuing the reconciliation") + return nil, fmt.Errorf("%w: runner existed belonging to the scale set, retry configuration", retryableError) } - // We want to continue without a requeue for faster pod creation. - // - // To do so, we update the status in-place, so that both continuing the loop and - // and requeuing and skipping updateStatusWithRunnerConfig in the next loop, will - // have the same effect. - ephemeralRunner.Status.RunnerId = jitConfig.Runner.Id - ephemeralRunner.Status.RunnerName = jitConfig.Runner.Name - ephemeralRunner.Status.RunnerJITConfig = jitConfig.EncodedJITConfig - - log.Info("Updated ephemeral runner status with runnerId and runnerJITConfig") - return nil, nil + return nil, fmt.Errorf("%w: runner with the same name but doesn't belong to this RunnerScaleSet: %w", fatalError, err) } func (r *EphemeralRunnerReconciler) createPod(ctx context.Context, runner *v1alpha1.EphemeralRunner, secret *corev1.Secret, log logr.Logger) (ctrl.Result, error) { @@ -674,21 +678,21 @@ func (r *EphemeralRunnerReconciler) createPod(ctx context.Context, runner *v1alp return ctrl.Result{}, nil } -func (r *EphemeralRunnerReconciler) createSecret(ctx context.Context, runner *v1alpha1.EphemeralRunner, log logr.Logger) (*ctrl.Result, error) { +func (r *EphemeralRunnerReconciler) createSecret(ctx context.Context, runner *v1alpha1.EphemeralRunner, jitConfig *actions.RunnerScaleSetJitRunnerConfig, log logr.Logger) error { log.Info("Creating new secret for ephemeral runner") - jitSecret := r.newEphemeralRunnerJitSecret(runner) + jitSecret := r.newEphemeralRunnerJitSecret(runner, jitConfig) if err := ctrl.SetControllerReference(runner, jitSecret, r.Scheme); err != nil { - return &ctrl.Result{}, fmt.Errorf("failed to set controller reference: %w", err) + return fmt.Errorf("failed to set controller reference: %w", err) } log.Info("Created new secret spec for ephemeral runner") if err := r.Create(ctx, jitSecret); err != nil { - return &ctrl.Result{}, fmt.Errorf("failed to create jit secret: %w", err) + return fmt.Errorf("failed to create jit secret: %w", err) } log.Info("Created ephemeral runner secret", "secretName", jitSecret.Name) - return nil, nil + return nil } // updateRunStatusFromPod is responsible for updating non-exiting statuses. diff --git a/controllers/actions.github.com/error.go b/controllers/actions.github.com/error.go new file mode 100644 index 00000000..7c295a8c --- /dev/null +++ b/controllers/actions.github.com/error.go @@ -0,0 +1,12 @@ +package actionsgithubcom + +type controllerError string + +func (e controllerError) Error() string { + return string(e) +} + +const ( + retryableError = controllerError("retryable error") + fatalError = controllerError("fatal error") +) diff --git a/controllers/actions.github.com/resourcebuilder.go b/controllers/actions.github.com/resourcebuilder.go index abdb0706..39e17350 100644 --- a/controllers/actions.github.com/resourcebuilder.go +++ b/controllers/actions.github.com/resourcebuilder.go @@ -618,7 +618,7 @@ func (b *ResourceBuilder) newEphemeralRunnerPod(ctx context.Context, runner *v1a FilterLabels(labels, LabelKeyRunnerTemplateHash), annotations, runner.Spec, - runner.Status.RunnerJITConfig, + secret.Data, ) objectMeta := metav1.ObjectMeta{ @@ -671,14 +671,17 @@ func (b *ResourceBuilder) newEphemeralRunnerPod(ctx context.Context, runner *v1a return &newPod } -func (b *ResourceBuilder) newEphemeralRunnerJitSecret(ephemeralRunner *v1alpha1.EphemeralRunner) *corev1.Secret { +func (b *ResourceBuilder) newEphemeralRunnerJitSecret(ephemeralRunner *v1alpha1.EphemeralRunner, jitConfig *actions.RunnerScaleSetJitRunnerConfig) *corev1.Secret { return &corev1.Secret{ ObjectMeta: metav1.ObjectMeta{ Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace, }, Data: map[string][]byte{ - jitTokenKey: []byte(ephemeralRunner.Status.RunnerJITConfig), + jitTokenKey: []byte(jitConfig.EncodedJITConfig), + "runnerName": []byte(jitConfig.Runner.Name), + "runnerId": []byte(strconv.Itoa(jitConfig.Runner.Id)), + "scaleSetId": []byte(strconv.Itoa(jitConfig.Runner.RunnerScaleSetId)), }, } }