Fix runners to do their best to gracefully stop on pod eviction (#1759)

Ref #1535
Ref #1581

Signed-off-by: Yusuke Kuoka <ykuoka@gmail.com>
Authored by Yusuke Kuoka on 2022-11-01 20:30:10 +09:00, committed by GitHub
Parent 332548093a, commit c74ad6195f
30 changed files with 757 additions and 301 deletions
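A quick note on what "gracefully stop on pod eviction" means in practice: when a runner pod is evicted or deleted, the kubelet sends SIGTERM to the runner container and waits up to terminationGracePeriodSeconds before sending SIGKILL. The sketch below is illustrative only and is not the script added by this commit; it only shows the general shape of an entrypoint that traps SIGTERM and gives the in-flight job up to RUNNER_GRACEFUL_STOP_TIMEOUT seconds to finish (that variable name appears in this diff; everything else here is assumed).

#!/usr/bin/env bash
# Illustrative sketch only; helper paths and structure are assumptions.
RUNNER_GRACEFUL_STOP_TIMEOUT=${RUNNER_GRACEFUL_STOP_TIMEOUT:-15}

graceful_stop() {
  echo "SIGTERM received; waiting up to ${RUNNER_GRACEFUL_STOP_TIMEOUT}s for the current job to finish"
  local deadline=$((SECONDS + RUNNER_GRACEFUL_STOP_TIMEOUT))
  # Give the runner process a chance to finish its job before the deadline.
  while kill -0 "$runner_pid" 2>/dev/null && [ "$SECONDS" -lt "$deadline" ]; do
    sleep 1
  done
  kill "$runner_pid" 2>/dev/null || true
}

trap graceful_stop TERM

# Start the actual runner in the background so the trap can fire while we wait.
./run.sh &
runner_pid=$!
wait "$runner_pid"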


@@ -14,6 +14,7 @@ import (
"github.com/actions-runner-controller/actions-runner-controller/testing"
"github.com/google/go-github/v47/github"
"github.com/onsi/gomega"
"github.com/stretchr/testify/require"
"golang.org/x/oauth2"
"sigs.k8s.io/yaml"
)
@@ -330,6 +331,10 @@ func TestE2E(t *testing.T) {
t.Run(fmt.Sprintf("update runners - attempt %d", i), func(t *testing.T) {
env.deploy(t, RunnerDeployments, testID, fmt.Sprintf("ROLLING_UPDATE_PHASE=%d", i))
})
t.Run(fmt.Sprintf("set deletiontimestamps on runner pods - attempt %d", i), func(t *testing.T) {
env.setDeletionTimestampsOnRunningPods(t, RunnerDeployments)
})
}
}
}()
@@ -370,6 +375,8 @@ type env struct {
doDockerBuild bool
containerMode string
runnerServiceAccuontName string
runnerGracefulStopTimeout string
runnerTerminationGracePeriodSeconds string
runnerNamespace string
remoteKubeconfig string
imagePullSecretName string
@@ -500,6 +507,8 @@ func initTestEnv(t *testing.T, k8sMinorVer string, vars vars) *env {
e.testEnterprise = testing.Getenv(t, "TEST_ENTERPRISE", "")
e.testEphemeral = testing.Getenv(t, "TEST_EPHEMERAL", "")
e.runnerServiceAccuontName = testing.Getenv(t, "TEST_RUNNER_SERVICE_ACCOUNT_NAME", "")
e.runnerTerminationGracePeriodSeconds = testing.Getenv(t, "TEST_RUNNER_TERMINATION_GRACE_PERIOD_SECONDS", "30")
e.runnerGracefulStopTimeout = testing.Getenv(t, "TEST_RUNNER_GRACEFUL_STOP_TIMEOUT", "15")
e.runnerNamespace = testing.Getenv(t, "TEST_RUNNER_NAMESPACE", "default")
e.remoteKubeconfig = testing.Getenv(t, "ARC_E2E_REMOTE_KUBECONFIG", "")
e.imagePullSecretName = testing.Getenv(t, "ARC_E2E_IMAGE_PULL_SECRET_NAME", "")
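The two new knobs default to a 30s pod termination grace period and a 15s graceful stop timeout, keeping the stop window comfortably inside the grace period. A hypothetical way to run the E2E suite with larger values (the package path is an assumption, not taken from this diff):

# Give runners a 60s grace period and let them spend up to 45s finishing a job.
TEST_RUNNER_TERMINATION_GRACE_PERIOD_SECONDS=60 \
TEST_RUNNER_GRACEFUL_STOP_TIMEOUT=45 \
  go test -count=1 -timeout 60m -run TestE2E ./test/e2e/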
@@ -712,6 +721,48 @@ func (e *env) undeploy(t *testing.T, kind DeployKind, testID string) {
e.do(t, "delete", kind, testID)
}
func (e *env) setDeletionTimestampsOnRunningPods(t *testing.T, deployKind DeployKind) {
	t.Helper()
	var scope, kind, labelKind string
	if e.testOrg != "" {
		scope = "org"
	} else if e.testEnterprise != "" {
		scope = "enterprise"
	} else {
		scope = "repo"
	}
	if deployKind == RunnerDeployments {
		kind = "runnerdeploy"
		labelKind = "runner-deployment"
	} else {
		kind = "runnerset"
		labelKind = "runnerset"
	}
	label := fmt.Sprintf("%s-name=%s-%s", labelKind, scope, kind)
	ctx := context.Background()
	c := e.getKubectlConfig()
	t.Logf("Finding pods with label %s", label)
	pods, err := e.Kubectl.FindPods(ctx, label, c)
	require.NoError(t, err)
	if len(pods) == 0 {
		return
	}
	t.Logf("Setting deletionTimestamps on pods %s", strings.Join(pods, ", "))
	err = e.Kubectl.DeletePods(ctx, pods, c)
	require.NoError(t, err)
	t.Logf("Deleted pods %s", strings.Join(pods, ", "))
}
func (e *env) do(t *testing.T, op string, kind DeployKind, testID string, env ...string) {
t.Helper()
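setDeletionTimestampsOnRunningPods does from Go what the following kubectl session does by hand: deleting a pod is what stamps its metadata.deletionTimestamp and starts graceful termination, which is the eviction-like condition the rolling-update test wants the runners to survive. The label value below is just an example of what the fmt.Sprintf above can produce.

# Find runner pods by the label the helper builds, e.g. for an org-scoped RunnerDeployment.
kubectl get pods -l runner-deployment-name=org-runnerdeploy -o name
# Deleting them sets deletionTimestamp and sends SIGTERM, starting the grace period.
kubectl delete pod -l runner-deployment-name=org-runnerdeploy --wait=false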
@@ -722,6 +773,8 @@ func (e *env) do(t *testing.T, op string, kind DeployKind, testID string, env ..
"OP=" + op,
"RUNNER_NAMESPACE=" + e.runnerNamespace,
"RUNNER_SERVICE_ACCOUNT_NAME=" + e.runnerServiceAccuontName,
"RUNNER_GRACEFUL_STOP_TIMEOUT=" + e.runnerGracefulStopTimeout,
"RUNNER_TERMINATION_GRACE_PERIOD_SECONDS=" + e.runnerTerminationGracePeriodSeconds,
}
scriptEnv = append(scriptEnv, env...)
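Adding the two variables to scriptEnv makes them visible to the deploy scripts that do() shells out to. As a rough sketch of what such a script could do with them (the template file name and the use of envsubst are assumptions, not shown in this diff):

# Render a runner manifest template with the forwarded knobs and apply it.
export RUNNER_GRACEFUL_STOP_TIMEOUT RUNNER_TERMINATION_GRACE_PERIOD_SECONDS
envsubst < runnerdeploy.envsubst.yaml | kubectl apply -n "${RUNNER_NAMESPACE}" -f -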
@@ -825,7 +878,7 @@ func (e *env) testJobs(testID string) []job {
func (e *env) verifyActionsWorkflowRun(t *testing.T, testID string) {
t.Helper()
verifyActionsWorkflowRun(t, e.Env, e.testJobs(testID), e.verifyTimeout())
verifyActionsWorkflowRun(t, e.Env, e.testJobs(testID), e.verifyTimeout(), e.getKubectlConfig())
}
func (e *env) verifyTimeout() time.Duration {
@@ -836,6 +889,18 @@ func (e *env) verifyTimeout() time.Duration {
return 8 * 60 * time.Second
}
func (e *env) getKubectlConfig() testing.KubectlConfig {
	kubectlEnv := []string{
		"KUBECONFIG=" + e.Kubeconfig,
	}
	cmCfg := testing.KubectlConfig{
		Env: kubectlEnv,
	}
	return cmCfg
}
type job struct {
name, testArg, configMapName string
}
@@ -969,10 +1034,18 @@ func installActionsWorkflow(t *testing.T, testName, runnerLabel, testResultCMNam
// When rootless, we need to use the `docker` buildx driver, which doesn't support cache export
// so we end up with the below error on docker-build:
// error: cache export feature is currently not supported for docker driver. Please switch to a different driver (eg. "docker buildx create --use")
// See https://docs.docker.com/engine/reference/commandline/buildx_create/#docker-container-driver
// for the `docker-container` driver.
dockerBuildCache = "--cache-from=type=local,src=/home/runner/.cache/buildx " +
"--cache-to=type=local,dest=/home/runner/.cache/buildx-new,mode=max "
dockerfile = "Dockerfile"
// Note, though, that if the cache does not exist yet, the buildx build seems to write cache data to /home/runner/.cache/buildx,
// not buildx-new.
// I think the following message emitted by buildx at the end is relevant to this behaviour, but I'm not 100% sure:
// WARNING: local cache import at /home/runner/.cache/buildx not found due to err: could not read /home/runner/.cache/buildx/index.json: open /home/runner/.cache/buildx/index.json: no such file or directory
} else {
// See https://docs.docker.com/engine/reference/commandline/buildx_create/#docker-driver
// for the `docker` driver.
setupBuildXActionWith.Driver = "docker"
dockerfile = "Dockerfile.nocache"
}
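The comments above come down to a difference between buildx drivers: only the docker-container driver can export a local cache, so whenever the runner has to use the plain docker driver the build falls back to Dockerfile.nocache. Roughly, and purely as an illustration (these commands are not taken from the workflow definition):

# docker-container driver: local cache export works.
docker buildx create --name cached-builder --driver docker-container --use
docker buildx build \
  --cache-from=type=local,src=/home/runner/.cache/buildx \
  --cache-to=type=local,dest=/home/runner/.cache/buildx-new,mode=max \
  -f Dockerfile .

# Default "docker" driver (the daemon's built-in builder): the same --cache-to fails with
# "cache export feature is currently not supported for docker driver", hence Dockerfile.nocache.
docker buildx use default
docker buildx build -f Dockerfile.nocache .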
@@ -997,20 +1070,35 @@ func installActionsWorkflow(t *testing.T, testName, runnerLabel, testResultCMNam
fmt.Sprintf("-f %s .", dockerfile),
},
)
}
}
if useSudo {
steps = append(steps,
testing.Step{
// https://github.com/docker/build-push-action/blob/master/docs/advanced/cache.md#local-cache
// See https://github.com/moby/buildkit/issues/1896 for why this is needed
Run: "rm -rf /home/runner/.cache/buildx && mv /home/runner/.cache/buildx-new /home/runner/.cache/buildx",
},
testing.Step{
Run: "ls -lah /home/runner/.cache/*",
},
)
if useSudo {
steps = append(steps,
testing.Step{
// https://github.com/docker/build-push-action/blob/master/docs/advanced/cache.md#local-cache
// See https://github.com/moby/buildkit/issues/1896 for why this is needed
Run: "if -d /home/runner/.cache/buildx-new; then " + sudo + "rm -rf /home/runner/.cache/buildx && " + sudo + `mv /home/runner/.cache/buildx-new /home/runner/.cache/buildx; else echo "/home/runner/.cache/buildx-new is not found. Perhaps you're running this on a stateleess runner?"; fi`,
},
testing.Step{
Run: "ls -lah /home/runner/.cache/*",
},
)
}
}
if useSudo {
if kind == RunnerDeployments {
steps = append(steps,
testing.Step{
// https://github.com/docker/build-push-action/blob/master/docs/advanced/cache.md#local-cache
// See https://github.com/moby/buildkit/issues/1896 for why this is needed
Run: sudo + "rm -rf /home/runner/.cache/buildx && mv /home/runner/.cache/buildx-new /home/runner/.cache/buildx",
},
testing.Step{
Run: sudo + "ls -lah /home/runner/.cache/*",
},
)
}
}
}
steps = append(steps,
@@ -1062,7 +1150,7 @@ kubectl create cm %s$id --from-literal=status=ok
}
}
func verifyActionsWorkflowRun(t *testing.T, env *testing.Env, testJobs []job, timeout time.Duration) {
func verifyActionsWorkflowRun(t *testing.T, env *testing.Env, testJobs []job, timeout time.Duration, cmCfg testing.KubectlConfig) {
t.Helper()
var expected []string
@@ -1079,14 +1167,6 @@ func verifyActionsWorkflowRun(t *testing.T, env *testing.Env, testJobs []job, ti
for i := range testJobs {
testResultCMName := testJobs[i].configMapName
kubectlEnv := []string{
"KUBECONFIG=" + env.Kubeconfig,
}
cmCfg := testing.KubectlConfig{
Env: kubectlEnv,
}
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()


@@ -3,14 +3,14 @@
# UNITTEST: retry config
# Will simulate a configuration failure and expects:
# - the configuration step to be run 10 times
# - the entrypoint script to exit with error code 2
# - the startup script to exit with error code 2
# - the run.sh script to never run.
source ../assets/logging.sh
entrypoint_log() {
startup_log() {
while read I; do
printf "\tentrypoint.sh: $I\n"
printf "\tstartup.sh: $I\n"
done
}
@@ -44,12 +44,12 @@ cleanup() {
# Always run cleanup when test ends regardless of how it ends
trap cleanup SIGINT SIGTERM SIGQUIT EXIT
log "Running the entrypoint"
log "Running the startup script"
log ""
# Run the runner entrypoint script which as a final step runs this
# Run the runner startup script which as a final step runs this
# unit tests run.sh as it was symlinked
../../../runner/entrypoint.sh 2> >(entrypoint_log)
../../../runner/startup.sh 2> >(startup_log)
if [ "$?" != "2" ]; then
error "========================================="


@@ -3,14 +3,14 @@
# UNITTEST: should work as non ephemeral
# Will simulate a scenario where ephemeral=false. expects:
# - the configuration step to be run exactly once
# - the entrypoint script to exit with no error
# - the startup script to exit with no error
# - the run.sh script to run without the --once flag
source ../assets/logging.sh
entrypoint_log() {
startup_log() {
while read I; do
printf "\tentrypoint.sh: $I\n"
printf "\tstartup.sh: $I\n"
done
}
@@ -44,16 +44,16 @@ cleanup() {
# Always run cleanup when test ends regardless of how it ends
trap cleanup SIGINT SIGTERM SIGQUIT EXIT
log "Running the entrypoint"
log "Running the startup script"
log ""
# Run the runner entrypoint script which as a final step runs this
# Run the runner startup script which as a final step runs this
# unit tests run.sh as it was symlinked
../../../runner/entrypoint.sh 2> >(entrypoint_log)
../../../runner/startup.sh 2> >(startup_log)
if [ "$?" != "0" ]; then
error "==========================================="
error "FAIL | Entrypoint script did not exit successfully"
error "FAIL | Startup script did not exit successfully"
exit 1
fi


@@ -3,14 +3,14 @@
# UNITTEST: should work normally
# Will simulate a normal execution scenario. expects:
# - the configuration step to be run exactly once
# - the entrypoint script to exit with no error
# - the startup script to exit with no error
# - the run.sh script to run with the --once flag activated.
source ../assets/logging.sh
entrypoint_log() {
startup_log() {
while read I; do
printf "\tentrypoint.sh: $I\n"
printf "\startup.sh: $I\n"
done
}
@@ -42,12 +42,12 @@ cleanup() {
# Always run cleanup when test ends regardless of how it ends
trap cleanup SIGINT SIGTERM SIGQUIT EXIT
log "Running the entrypoint"
log "Running the startup script"
log ""
# Run the runner entrypoint script which as a final step runs this
# Run the runner startup script which as a final step runs this
# unit tests run.sh as it was symlinked
../../../runner/entrypoint.sh 2> >(entrypoint_log)
../../../runner/startup.sh 2> >(startup_log)
if [ "$?" != "0" ]; then
error "=========================="


@@ -3,14 +3,14 @@
# UNITTEST: should work disable update
# Will simulate a scenario where disableupdate=true. expects:
# - the configuration step to be run exactly once
# - the entrypoint script to exit with no error
# - the startup script to exit with no error
# - the config.sh script to run with the --disableupdate flag set to 'true'.
source ../assets/logging.sh
entrypoint_log() {
startup_log() {
while read I; do
printf "\tentrypoint.sh: $I\n"
printf "\tstartup.sh: $I\n"
done
}
@@ -43,12 +43,12 @@ cleanup() {
# Always run cleanup when test ends regardless of how it ends
trap cleanup SIGINT SIGTERM SIGQUIT EXIT
log "Running the entrypoint"
log "Running the startup script"
log ""
# run.sh and config.sh get used by the runner's real entrypoint.sh and are part of actions/runner.
# We change symlink dummy versions so the entrypoint.sh can run allowing us to test the real entrypoint.sh
../../../runner/entrypoint.sh 2> >(entrypoint_log)
# run.sh and config.sh get used by the runner's real startup.sh and are part of actions/runner.
# We symlink dummy versions so that startup.sh can run, allowing us to test the real startup.sh
../../../runner/startup.sh 2> >(startup_log)
if [ "$?" != "0" ]; then
error "=========================="