diff --git a/README.md b/README.md
index e0daa215..ecce405e 100644
--- a/README.md
+++ b/README.md
@@ -1320,7 +1320,9 @@ Once the ephemeral runner has completed running a workflow job, it stops with a
 As it's removed after a workflow job run, the runner pod is never reused across multiple GitHub Actions workflow jobs, providing you a clean environment per each workflow job.
 
-Although not recommended, it's possible to disable passing `--ephemeral` flag by explicitly setting `ephemeral: false` in the `RunnerDeployment` or `RunnerSet` spec. When disabled, your runner becomes "static". A static runner does not stop after workflow job run, and `actions/runner` is known to clean only runner's work dir after each job. That means your runner's environment, including various actions cache, docker images stored in the `dind` and layer cache, is retained across multiple workflow job runs. It may worth trying only when you do want to prioritize job run speed more than job reliability and security.
+Although not recommended, it's possible to disable passing the `--ephemeral` flag by explicitly setting `ephemeral: false` in the `RunnerDeployment` or `RunnerSet` spec. When disabled, your runner becomes "static". A static runner does not stop after a workflow job run, and in this mode `actions/runner` is known to clean only the runner's work dir after each job. That means your runner's environment, including various actions caches, docker images stored in the `dind`, and layer caches, is retained across multiple workflow job runs.
+
+Static runners may be worth trying only when you want to prioritize job run speed over job reliability and security.
 
 > In early days of the project, the flag passed to the runner when `ephemeral: true` was `--once` rather than `--ephemeral`.
 >
diff --git a/acceptance/testdata/runnerdeploy.envsubst.yaml b/acceptance/testdata/runnerdeploy.envsubst.yaml
index e0862f60..ae6375e7 100644
--- a/acceptance/testdata/runnerdeploy.envsubst.yaml
+++ b/acceptance/testdata/runnerdeploy.envsubst.yaml
@@ -17,6 +17,8 @@ spec:
       image: ${RUNNER_NAME}:${RUNNER_TAG}
       imagePullPolicy: IfNotPresent
 
+      ephemeral: ${TEST_EPHEMERAL}
+
       # Whether to pass --ephemeral (true) or --once (false, deprecated)
       env:
       - name: RUNNER_FEATURE_FLAG_EPHEMERAL
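Tying the README paragraph and the test manifest together, here is a minimal sketch of what opting into a static runner could look like in a `RunnerDeployment` manifest. The metadata and repository names are placeholders; only the `ephemeral: false` line is the setting discussed above.

```yaml
apiVersion: actions.summerwind.dev/v1alpha1
kind: RunnerDeployment
metadata:
  name: example-static-runnerdeploy
spec:
  replicas: 2
  template:
    spec:
      repository: example-org/example-repo
      # Setting this to false opts out of the recommended ephemeral behavior:
      # the runner survives across workflow job runs, keeping caches warm at
      # the cost of a less clean, less reliable per-job environment.
      ephemeral: false
```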
diff --git a/controllers/constants.go b/controllers/constants.go
index 510fa911..24dba05d 100644
--- a/controllers/constants.go
+++ b/controllers/constants.go
@@ -34,15 +34,14 @@ const (
 	AnnotationKeyRunnerID = annotationKeyPrefix + "id"
 
-	// DefaultUnregistrationTimeout is the duration until ARC gives up retrying the combo of ListRunners API (to detect the runner ID by name)
-	// and RemoveRunner API (to actually unregister the runner) calls.
-	// This needs to be longer than 60 seconds because a part of the combo, the ListRunners API, seems to use the Cache-Control header of max-age=60s
-	// and that instructs our cache library httpcache to cache responses for 60 seconds, which results in ARC unable to see the runner in the ListRunners response
-	// up to 60 seconds (or even more depending on the situation).
-	DefaultUnregistrationTimeout = 60 * time.Second
-	// This can be any value but a larger value can make an unregistration timeout longer than configured in practice.
-	DefaultUnregistrationRetryDelay = 30 * time.Second
+	DefaultUnregistrationRetryDelay = time.Minute
+
+	// RetryDelayOnCreateRegistrationError is the delay between retry attempts for runner registration token creation.
+	// Usually, a retry in this case happens when e.g. your PAT has no access to a certain scope of runners, like when you're using a repository admin's token
+	// for creating a broader-scoped runner token, like an organization or enterprise runner token.
+	// Such a permission issue will never be fixed automatically, so we don't need to retry so often, hence this value.
+	RetryDelayOnCreateRegistrationError = 3 * time.Minute
 
 	// registrationTimeout is the duration until a pod times out after it becomes Ready and Running.
 	// A pod that is timed out can be terminated if needed.
diff --git a/controllers/integration_test.go b/controllers/integration_test.go
index 105d8658..5695b44d 100644
--- a/controllers/integration_test.go
+++ b/controllers/integration_test.go
@@ -110,7 +110,6 @@ func SetupIntegrationTest(ctx2 context.Context) *testEnvironment {
 		Name:                        controllerName("runner"),
 		RegistrationRecheckInterval: time.Millisecond * 100,
 		RegistrationRecheckJitter:   time.Millisecond * 10,
-		UnregistrationTimeout:       1 * time.Second,
 		UnregistrationRetryDelay:    1 * time.Second,
 	}
 	err = runnerController.SetupWithManager(mgr)
diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go
index 5811494b..907bcc9f 100644
--- a/controllers/runner_controller.go
+++ b/controllers/runner_controller.go
@@ -73,7 +73,6 @@ type RunnerReconciler struct {
 	RegistrationRecheckInterval time.Duration
 	RegistrationRecheckJitter   time.Duration
 
-	UnregistrationTimeout    time.Duration
 	UnregistrationRetryDelay time.Duration
 }
 
@@ -200,11 +199,11 @@ func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx con
 		newRunner.ObjectMeta.Finalizers = finalizers
 
 		if err := r.Patch(ctx, newRunner, client.MergeFrom(&runner)); err != nil {
-			log.Error(err, "Failed to update runner for finalizer removal")
+			log.Error(err, "Unable to remove finalizer")
 			return ctrl.Result{}, err
 		}
 
-		log.Info("Removed runner from GitHub", "repository", runner.Spec.Repository, "organization", runner.Spec.Organization)
+		log.Info("Removed finalizer")
 	}
 
 	return ctrl.Result{}, nil
@@ -212,7 +211,7 @@ func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx con
 
 func (r *RunnerReconciler) processRunnerCreation(ctx context.Context, runner v1alpha1.Runner, log logr.Logger) (reconcile.Result, error) {
 	if updated, err := r.updateRegistrationToken(ctx, runner); err != nil {
-		return ctrl.Result{}, err
+		return ctrl.Result{RequeueAfter: RetryDelayOnCreateRegistrationError}, nil
 	} else if updated {
 		return ctrl.Result{Requeue: true}, nil
 	}
 
@@ -254,6 +253,10 @@ func (r *RunnerReconciler) updateRegistrationToken(ctx context.Context, runner v
 	rt, err := r.GitHubClient.GetRegistrationToken(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
 	if err != nil {
+		// An error can be a permanent permission issue like the one below:
+		// POST https://api.github.com/enterprises/YOUR_ENTERPRISE/actions/runners/registration-token: 403 Resource not accessible by integration []
+		// In such a case, retrying in seconds might not make much sense.
+
 		r.Recorder.Event(&runner, corev1.EventTypeWarning, "FailedUpdateRegistrationToken", "Updating registration token failed")
 		log.Error(err, "Failed to get new registration token")
 		return false, err
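The `processRunnerCreation` change above deliberately swallows the token-creation error and requeues after a fixed delay instead of returning it. The distinction matters because controller-runtime retries returned errors with exponential backoff, which is noisy and pointless for a permanent failure such as a 403. A minimal sketch (not ARC's actual code; `createToken` is a stand-in) of the pattern:

```go
// Contrasts the two retry styles in a controller-runtime reconciler.
package controllers

import (
	"context"
	"time"

	ctrl "sigs.k8s.io/controller-runtime"
)

const retryDelayOnPermanentError = 3 * time.Minute // mirrors RetryDelayOnCreateRegistrationError

func reconcileRegistrationToken(ctx context.Context, createToken func(context.Context) error) (ctrl.Result, error) {
	if err := createToken(ctx); err != nil {
		// `return ctrl.Result{}, err` would trigger exponential backoff.
		// A permission problem won't fix itself within seconds, so requeue
		// on a fixed, deliberately long schedule and keep the workqueue quiet.
		return ctrl.Result{RequeueAfter: retryDelayOnPermanentError}, nil
	}

	return ctrl.Result{}, nil
}
```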
diff --git a/controllers/runner_graceful_stop.go b/controllers/runner_graceful_stop.go
index d95d94a8..86de46bb 100644
--- a/controllers/runner_graceful_stop.go
+++ b/controllers/runner_graceful_stop.go
@@ -26,13 +26,13 @@ import (
 // This function is designed to complete a lengthy graceful stop process in a non-blocking way.
 // When it wants to be retried later, the function returns a non-nil *ctrl.Result as the second return value, optionally populating the error in the third return value.
 // The caller is expected to return the returned ctrl.Result and error to postpone the current reconciliation loop and trigger a scheduled retry.
-func tickRunnerGracefulStop(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
+func tickRunnerGracefulStop(ctx context.Context, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
 	pod, err := annotatePodOnce(ctx, c, log, pod, AnnotationKeyUnregistrationStartTimestamp, time.Now().Format(time.RFC3339))
 	if err != nil {
 		return nil, &ctrl.Result{}, err
 	}
 
-	if res, err := ensureRunnerUnregistration(ctx, unregistrationTimeout, retryDelay, log, ghClient, c, enterprise, organization, repository, runner, pod); res != nil {
+	if res, err := ensureRunnerUnregistration(ctx, retryDelay, log, ghClient, c, enterprise, organization, repository, runner, pod); res != nil {
 		return nil, res, err
 	}
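The function above records the unregistration start time through `annotatePodOnce`, whose body isn't shown in this diff. A minimal sketch, with the implementation assumed rather than copied from ARC, of what such an idempotent annotate-and-patch helper could look like: if the annotation already exists, the pod is returned unchanged, so the first recorded timestamp survives repeated reconciliation loops.

```go
package controllers

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

func annotatePodOnceSketch(ctx context.Context, c client.Client, pod *corev1.Pod, key, value string) (*corev1.Pod, error) {
	// Already annotated in a previous loop; keep the original value.
	if _, ok := pod.Annotations[key]; ok {
		return pod, nil
	}

	updated := pod.DeepCopy()
	if updated.Annotations == nil {
		updated.Annotations = map[string]string{}
	}
	updated.Annotations[key] = value

	// MergeFrom yields a patch containing only the annotation change.
	if err := c.Patch(ctx, updated, client.MergeFrom(pod)); err != nil {
		return nil, err
	}

	return updated, nil
}
```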
log.Info("Runner pod is annotated to wait for completion") return &ctrl.Result{RequeueAfter: retryDelay}, nil - } else if ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner, runnerID); err != nil { + } else if ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner, *runnerID); err != nil { if errors.Is(err, &gogithub.RateLimitError{}) { // We log the underlying error when we failed calling GitHub API to list or unregisters, // or the runner is still busy. @@ -120,6 +137,15 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time. errRes := &gogithub.ErrorResponse{} if errors.As(err, &errRes) { + if errRes.Response.StatusCode == 403 { + log.Error(err, "Unable to unregister due to permission error. "+ + "Perhaps you've changed the permissions of PAT or GitHub App, or you updated authentication method of ARC in a wrong way? "+ + "ARC considers it as already unregistered and continue removing the pod. "+ + "You may need to remove the runner on GitHub UI.") + + return nil, nil + } + runner, _ := getRunner(ctx, ghClient, enterprise, organization, repository, runner) var runnerID int64 @@ -177,17 +203,10 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time. log.Info("Runner was not found on GitHub and the runner pod was not found on Kuberntes.") } else if ts := pod.Annotations[AnnotationKeyUnregistrationStartTimestamp]; ts != "" { - t, err := time.Parse(time.RFC3339, ts) - if err != nil { - return &ctrl.Result{RequeueAfter: retryDelay}, err - } + log.Info("Runner unregistration is in-progress. It can take forever to complete if if it's a static runner constantly running jobs."+ + " It can also take very long time if it's an ephemeral runner that is running a log-running job.", "error", err) - if r := time.Until(t.Add(unregistrationTimeout)); r > 0 { - log.Info("Runner unregistration is in-progress.", "timeout", unregistrationTimeout, "remaining", r) - return &ctrl.Result{RequeueAfter: retryDelay}, err - } - - log.Info("Runner unregistration has been timed out. The runner pod will be deleted soon.", "timeout", unregistrationTimeout) + return &ctrl.Result{RequeueAfter: retryDelay}, nil } else { // A runner and a runner pod that is created by this version of ARC should match // any of the above branches. @@ -333,20 +352,7 @@ func setRunnerEnv(pod *corev1.Pod, key, value string) { // There isn't a single right grace period that works for everyone. // The longer the grace period is, the earlier a cluster resource shortage can occur due to throttoled runner pod deletions, // while the shorter the grace period is, the more likely you may encounter the race issue. -func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string, id *int64) (bool, error) { - if id == nil { - runner, err := getRunner(ctx, client, enterprise, org, repo, name) - if err != nil { - return false, err - } - - if runner == nil || runner.ID == nil { - return false, nil - } - - id = runner.ID - } - +func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string, id int64) (bool, error) { // For the record, historically ARC did not try to call RemoveRunner on a busy runner, but it's no longer true. // The reason ARC did so was to let a runner running a job to not stop prematurely. // @@ -367,7 +373,7 @@ func unregisterRunner(ctx context.Context, client *github.Client, enterprise, or // change from 60 seconds. 
 	//
 	// TODO: Probably we can just remove the runner by ID without checking whether the runner is busy, by treating it as busy when a remove-runner call fails with 422?
-	if err := client.RemoveRunner(ctx, enterprise, org, repo, *id); err != nil {
+	if err := client.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
 		return false, err
 	}
diff --git a/controllers/runner_pod_controller.go b/controllers/runner_pod_controller.go
index ab683124..3a631671 100644
--- a/controllers/runner_pod_controller.go
+++ b/controllers/runner_pod_controller.go
@@ -45,7 +45,6 @@ type RunnerPodReconciler struct {
 	RegistrationRecheckInterval time.Duration
 	RegistrationRecheckJitter   time.Duration
 
-	UnregistrationTimeout    time.Duration
 	UnregistrationRetryDelay time.Duration
 }
 
@@ -104,7 +103,7 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 		// In a standard scenario, the upstream controller, like runnerset-controller, ensures this runner is gracefully stopped before the deletion timestamp is set.
 		// But in case the user manually deleted it for whatever reason,
 		// we have to ensure that it gracefully stops now.
-		updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
+		updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
 		if res != nil {
 			return *res, err
 		}
@@ -172,7 +171,7 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 		//
 		// In a standard scenario, ARC starts the unregistration process before marking the pod for deletion at all,
 		// so that it isn't subject to terminationGracePeriod and can safely take hours to finish its work.
-		_, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
+		_, res, err := tickRunnerGracefulStop(ctx, r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
 		if res != nil {
 			return *res, err
 		}
@@ -190,15 +189,6 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 	return ctrl.Result{}, nil
 }
 
-func (r *RunnerPodReconciler) unregistrationTimeout() time.Duration {
-	unregistrationTimeout := DefaultUnregistrationTimeout
-
-	if r.UnregistrationTimeout > 0 {
-		unregistrationTimeout = r.UnregistrationTimeout
-	}
-	return unregistrationTimeout
-}
-
 func (r *RunnerPodReconciler) unregistrationRetryDelay() time.Duration {
 	retryDelay := DefaultUnregistrationRetryDelay
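The comments above describe the calling convention these reconcilers share: the graceful-stop helper reports "retry me later" through a non-nil `*ctrl.Result`, which the caller propagates verbatim. A condensed sketch of that contract, not ARC's actual code, with `gracefulStopFunc` standing in for `tickRunnerGracefulStop`:

```go
package controllers

import (
	"context"

	ctrl "sigs.k8s.io/controller-runtime"
)

// gracefulStopFunc stands in for a helper like tickRunnerGracefulStop.
type gracefulStopFunc func(ctx context.Context) (*ctrl.Result, error)

func reconcileWithGracefulStop(ctx context.Context, tick gracefulStopFunc) (ctrl.Result, error) {
	res, err := tick(ctx)
	if res != nil {
		// Unregistration is still in progress; reschedule instead of blocking
		// this reconciliation loop for what could be hours.
		return *res, err
	}
	if err != nil {
		return ctrl.Result{}, err
	}

	// A nil Result means the graceful stop completed, so it is now safe to
	// proceed with, e.g., actually deleting the runner pod.
	return ctrl.Result{}, nil
}
```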
diff --git a/controllers/runner_pod_owner.go b/controllers/runner_pod_owner.go
index 6cf1ca23..abc5020a 100644
--- a/controllers/runner_pod_owner.go
+++ b/controllers/runner_pod_owner.go
@@ -302,9 +302,11 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger,
 		log.V(2).Info("Detected some current object(s)", "creationTimestampFirst", timestampFirst, "creationTimestampLast", timestampLast, "names", names)
 	}
 
-	var pending, running, regTimeout int
+	var total, terminating, pending, running, regTimeout int
 
 	for _, ss := range currentObjects {
+		total += ss.total
+		terminating += ss.terminating
 		pending += ss.pending
 		running += ss.running
 		regTimeout += ss.regTimeout
@@ -319,6 +321,8 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger,
 
 	log.V(2).Info(
 		"Found some pods across owner(s)",
+		"total", total,
+		"terminating", terminating,
 		"pending", pending,
 		"running", running,
 		"regTimeout", regTimeout,
@@ -329,7 +333,7 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger,
 	maybeRunning := pending + running
 
 	wantMoreRunners := newDesiredReplicas > maybeRunning
-	alreadySyncedAfterEffectiveTime := lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time)
+	alreadySyncedAfterEffectiveTime := ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time)
 	runnerPodRecreationDelayAfterWebhookScale := lastSyncTime != nil && time.Now().Before(lastSyncTime.Add(DefaultRunnerPodRecreationDelayAfterWebhookScale))
 
 	log = log.WithValues(
@@ -344,6 +348,16 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger,
 	)
 
 	if wantMoreRunners && alreadySyncedAfterEffectiveTime && runnerPodRecreationDelayAfterWebhookScale {
+		// This is our special handling of the situation for ephemeral runners only.
+		//
+		// Handling static runners this way would result in scale-up not working at all,
+		// because any scale-up attempt for static runners would fall within this condition, for two reasons.
+		// First, static (persistent) runners will never restart on their own.
+		// Second, we don't update EffectiveTime for static runners.
+		//
+		// We do need to skip this condition for static runners, and that's why we take the `ephemeral` flag into account when
+		// computing `alreadySyncedAfterEffectiveTime`.
+
 		log.V(2).Info(
 			"Detected that some ephemeral runners have disappeared. " +
 				"Usually this is due to that ephemeral runner completions " +
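The new comment block explains why `ephemeral` must gate `alreadySyncedAfterEffectiveTime`. Restated as a tiny pure function (a sketch, with the inputs reduced to plain values): for static (persistent) runners, EffectiveTime is never updated and pods never terminate on their own, so without the flag in the condition every scale-up attempt would be misread as "already synced" and skipped.

```go
package controllers

import "time"

func alreadySyncedAfterEffectiveTime(ephemeral bool, lastSyncTime, effectiveTime *time.Time) bool {
	return ephemeral &&
		lastSyncTime != nil &&
		effectiveTime != nil &&
		lastSyncTime.After(*effectiveTime)
}
```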
" + "Usually this is due to that ephemeral runner completions " + @@ -415,24 +429,26 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger, // or actions/runner may potentially misbehave on SIGTERM immediately sent by K8s. // We'd better unregister first and then start a pod deletion process. // The annotation works as a mark to start the pod unregistration and deletion process of ours. + + if _, ok := getAnnotation(ss.owner, AnnotationKeyUnregistrationRequestTimestamp); ok { + log.V(2).Info("Still waiting for runner pod(s) unregistration to complete") + + continue + } + for _, po := range ss.pods { if _, err := annotatePodOnce(ctx, c, log, &po, AnnotationKeyUnregistrationRequestTimestamp, time.Now().Format(time.RFC3339)); err != nil { return nil, err } } - if _, ok := getAnnotation(ss.owner, AnnotationKeyUnregistrationRequestTimestamp); !ok { - updated := ss.owner.withAnnotation(AnnotationKeyUnregistrationRequestTimestamp, time.Now().Format(time.RFC3339)) - - if err := c.Patch(ctx, updated, client.MergeFrom(ss.object)); err != nil { - log.Error(err, fmt.Sprintf("Failed to patch object to have %s annotation", AnnotationKeyUnregistrationRequestTimestamp)) - return nil, err - } - - log.V(2).Info("Redundant object has been annotated to start the unregistration before deletion") - } else { - log.V(2).Info("BUG: Redundant object was already annotated") + updated := ss.owner.withAnnotation(AnnotationKeyUnregistrationRequestTimestamp, time.Now().Format(time.RFC3339)) + if err := c.Patch(ctx, updated, client.MergeFrom(ss.owner)); err != nil { + log.Error(err, fmt.Sprintf("Failed to patch owner to have %s annotation", AnnotationKeyUnregistrationRequestTimestamp)) + return nil, err } + + log.V(2).Info("Redundant owner has been annotated to start the unregistration before deletion") } } else if retained > newDesiredReplicas { log.V(2).Info("Waiting sync before scale down", "retained", retained, "newDesiredReplicas", newDesiredReplicas) @@ -508,6 +524,9 @@ func collectPodsForOwners(ctx context.Context, c client.Client, log logr.Logger, log.Error(err, "Failed to delete owner") return nil, err } + + log.V(2).Info("Started deletion of owner") + continue } @@ -523,13 +542,13 @@ func collectPodsForOwners(ctx context.Context, c client.Client, log logr.Logger, } } - log.V(2).Info("Marking owner for unregistration completion", "deletionSafe", deletionSafe, "total", res.total) - if deletionSafe == res.total { + log.V(2).Info("Marking owner for unregistration completion", "deletionSafe", deletionSafe, "total", res.total) + if _, ok := getAnnotation(res.owner, AnnotationKeyUnregistrationCompleteTimestamp); !ok { updated := res.owner.withAnnotation(AnnotationKeyUnregistrationCompleteTimestamp, time.Now().Format(time.RFC3339)) - if err := c.Patch(ctx, updated, client.MergeFrom(res.object)); err != nil { + if err := c.Patch(ctx, updated, client.MergeFrom(res.owner)); err != nil { log.Error(err, fmt.Sprintf("Failed to patch owner to have %s annotation", AnnotationKeyUnregistrationCompleteTimestamp)) return nil, err } @@ -538,9 +557,9 @@ func collectPodsForOwners(ctx context.Context, c client.Client, log logr.Logger, } else { log.V(2).Info("BUG: Redundant owner was already annotated to start the deletion") } - } - continue + continue + } } if annotations := res.owner.GetAnnotations(); annotations != nil { diff --git a/controllers/runnerdeployment_controller.go b/controllers/runnerdeployment_controller.go index 9a55ae3d..e893f33c 100644 --- a/controllers/runnerdeployment_controller.go +++ 
b/controllers/runnerdeployment_controller.go
@@ -118,6 +118,8 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req
 			return ctrl.Result{}, err
 		}
 
+		log.Info("Created runnerreplicaset", "runnerreplicaset", desiredRS.Name)
+
 		return ctrl.Result{}, nil
 	}
 
@@ -142,6 +144,8 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req
 			return ctrl.Result{}, err
 		}
 
+		log.Info("Created runnerreplicaset", "runnerreplicaset", desiredRS.Name)
+
 		// We requeue in order to clean up old runner replica sets later.
 		// Otherwise, they aren't cleaned up until the next re-sync interval.
 		return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
@@ -222,15 +226,38 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req
 		for i := range oldSets {
 			rs := oldSets[i]
 
+			rslog := log.WithValues("runnerreplicaset", rs.Name)
+
+			if rs.Status.Replicas != nil && *rs.Status.Replicas > 0 {
+				if rs.Spec.Replicas != nil && *rs.Spec.Replicas == 0 {
+					rslog.V(2).Info("Waiting for runnerreplicaset to scale to zero")
+
+					continue
+				}
+
+				updated := rs.DeepCopy()
+				zero := 0
+				updated.Spec.Replicas = &zero
+
+				if err := r.Client.Update(ctx, updated); err != nil {
+					rslog.Error(err, "Failed to scale runnerreplicaset to zero")
+
+					return ctrl.Result{}, err
+				}
+
+				rslog.Info("Scaled runnerreplicaset to zero")
+
+				continue
+			}
+
 			if err := r.Client.Delete(ctx, &rs); err != nil {
-				log.Error(err, "Failed to delete runnerreplicaset resource")
+				rslog.Error(err, "Failed to delete runnerreplicaset resource")
 
 				return ctrl.Result{}, err
 			}
 
 			r.Recorder.Event(&rd, corev1.EventTypeNormal, "RunnerReplicaSetDeleted", fmt.Sprintf("Deleted runnerreplicaset '%s'", rs.Name))
 
-			log.Info("Deleted runnerreplicaset", "runnerdeployment", rd.ObjectMeta.Name, "runnerreplicaset", rs.Name)
+			rslog.Info("Deleted runnerreplicaset")
 		}
 	}
diff --git a/controllers/runnerreplicaset_controller.go b/controllers/runnerreplicaset_controller.go
index 3433fc58..6ceb559b 100644
--- a/controllers/runnerreplicaset_controller.go
+++ b/controllers/runnerreplicaset_controller.go
@@ -65,6 +65,10 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
 	}
 
 	if !rs.ObjectMeta.DeletionTimestamp.IsZero() {
+		// A RunnerReplicaSet cannot be gracefully removed.
+		// That means any runner that is running a job can be prematurely terminated.
+		// To gracefully remove a RunnerReplicaSet, scale it down to zero first, observe the RunnerReplicaSet's status replicas,
+		// and remove it only after the status replicas become zero.
 		return ctrl.Result{}, nil
 	}
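One detail worth calling out in the scale-to-zero logic above is the `zero := 0; updated.Spec.Replicas = &zero` idiom: `Replicas` is a pointer so the API can distinguish "replicas not specified" from an explicit zero, which the scale-down-before-delete flow depends on. A tiny illustration (not ARC code):

```go
package main

import "fmt"

type spec struct{ Replicas *int }

func describe(s spec) string {
	switch {
	case s.Replicas == nil:
		return "unset (controller picks a default)"
	case *s.Replicas == 0:
		return "explicitly scaled to zero"
	default:
		return fmt.Sprintf("%d replicas", *s.Replicas)
	}
}

func main() {
	zero := 0
	fmt.Println(describe(spec{}))                // unset
	fmt.Println(describe(spec{Replicas: &zero})) // explicitly scaled to zero
}
```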
diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go
index c2994d23..293051c3 100644
--- a/test/e2e/e2e_test.go
+++ b/test/e2e/e2e_test.go
@@ -135,6 +135,7 @@ func TestE2ERunnerDeploy(t *testing.T) {
 	}
 
 	env := initTestEnv(t)
+	env.useApp = true
 
 	t.Run("build and load images", func(t *testing.T) {
 		env.buildAndLoadImages(t)
@@ -186,6 +187,7 @@ type env struct {
 	runnerLabel, githubToken, testRepo, testOrg, testOrgRepo string
 	githubTokenWebhook                                       string
 	testEnterprise                                           string
+	testEphemeral                                            string
 	featureFlagEphemeral                                     *bool
 	scaleDownDelaySecondsAfterScaleOut                       int64
 	minReplicas                                              int64
@@ -218,8 +220,9 @@ func initTestEnv(t *testing.T) *env {
 	e.testRepo = testing.Getenv(t, "TEST_REPO", "")
 	e.testOrg = testing.Getenv(t, "TEST_ORG", "")
 	e.testOrgRepo = testing.Getenv(t, "TEST_ORG_REPO", "")
-	e.testEnterprise = testing.Getenv(t, "TEST_ENTERPRISE")
-	e.testJobs = createTestJobs(id, testResultCMNamePrefix, 100)
+	e.testEnterprise = testing.Getenv(t, "TEST_ENTERPRISE", "")
+	e.testEphemeral = testing.Getenv(t, "TEST_EPHEMERAL", "")
+	e.testJobs = createTestJobs(id, testResultCMNamePrefix, 20)
 
 	if ephemeral, err := strconv.ParseBool(testing.Getenv(t, "TEST_FEATURE_FLAG_EPHEMERAL", "")); err == nil {
 		e.featureFlagEphemeral = &ephemeral
@@ -288,6 +291,7 @@ func (e *env) installActionsRunnerController(t *testing.T) {
 		"WEBHOOK_GITHUB_TOKEN=" + e.githubTokenWebhook,
 		"RUNNER_LABEL=" + e.runnerLabel,
 		"TEST_ID=" + e.testID,
+		"TEST_EPHEMERAL=" + e.testEphemeral,
 		fmt.Sprintf("RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT=%d", e.scaleDownDelaySecondsAfterScaleOut),
 		fmt.Sprintf("REPO_RUNNER_MIN_REPLICAS=%d", e.minReplicas),
 		fmt.Sprintf("ORG_RUNNER_MIN_REPLICAS=%d", e.minReplicas),