mirror of
https://github.com/actions/actions-runner-controller.git
synced 2025-12-11 03:57:01 +00:00
Fix runners to do their best to gracefully stop on pod eviction (#1759)
Ref #1535 Ref #1581 Signed-off-by: Yusuke Kuoka <ykuoka@gmail.com>
This commit is contained in:
@@ -14,6 +14,7 @@ import (
|
||||
"github.com/actions-runner-controller/actions-runner-controller/testing"
|
||||
"github.com/google/go-github/v47/github"
|
||||
"github.com/onsi/gomega"
|
||||
"github.com/stretchr/testify/require"
|
||||
"golang.org/x/oauth2"
|
||||
"sigs.k8s.io/yaml"
|
||||
)
|
||||
@@ -330,6 +331,10 @@ func TestE2E(t *testing.T) {
|
||||
t.Run(fmt.Sprintf("update runners - attempt %d", i), func(t *testing.T) {
|
||||
env.deploy(t, RunnerDeployments, testID, fmt.Sprintf("ROLLING_UPDATE_PHASE=%d", i))
|
||||
})
|
||||
|
||||
t.Run(fmt.Sprintf("set deletiontimestamps on runner pods - attempt %d", i), func(t *testing.T) {
|
||||
env.setDeletionTimestampsOnRunningPods(t, RunnerDeployments)
|
||||
})
|
||||
}
|
||||
}
|
||||
}()
|
||||
@@ -370,6 +375,8 @@ type env struct {
|
||||
doDockerBuild bool
|
||||
containerMode string
|
||||
runnerServiceAccuontName string
|
||||
runnerGracefulStopTimeout string
|
||||
runnerTerminationGracePeriodSeconds string
|
||||
runnerNamespace string
|
||||
remoteKubeconfig string
|
||||
imagePullSecretName string
|
||||
@@ -500,6 +507,8 @@ func initTestEnv(t *testing.T, k8sMinorVer string, vars vars) *env {
|
||||
e.testEnterprise = testing.Getenv(t, "TEST_ENTERPRISE", "")
|
||||
e.testEphemeral = testing.Getenv(t, "TEST_EPHEMERAL", "")
|
||||
e.runnerServiceAccuontName = testing.Getenv(t, "TEST_RUNNER_SERVICE_ACCOUNT_NAME", "")
|
||||
e.runnerTerminationGracePeriodSeconds = testing.Getenv(t, "TEST_RUNNER_TERMINATION_GRACE_PERIOD_SECONDS", "30")
|
||||
e.runnerGracefulStopTimeout = testing.Getenv(t, "TEST_RUNNER_GRACEFUL_STOP_TIMEOUT", "15")
|
||||
e.runnerNamespace = testing.Getenv(t, "TEST_RUNNER_NAMESPACE", "default")
|
||||
e.remoteKubeconfig = testing.Getenv(t, "ARC_E2E_REMOTE_KUBECONFIG", "")
|
||||
e.imagePullSecretName = testing.Getenv(t, "ARC_E2E_IMAGE_PULL_SECRET_NAME", "")
|
||||
@@ -712,6 +721,48 @@ func (e *env) undeploy(t *testing.T, kind DeployKind, testID string) {
|
||||
e.do(t, "delete", kind, testID)
|
||||
}
|
||||
|
||||
// setDeletionTimestampsOnRunningPods looks up the runner pods that belong to
// the given deploy kind and deletes them via e.Kubectl.DeletePods.
//
// Pods are selected with the label "<labelKind>-name=<scope>-<kind>", where
// scope is derived from e.testOrg / e.testEnterprise and kind/labelKind are
// derived from deployKind. If no pod matches, the function returns without
// doing anything. Any error from finding or deleting pods fails the test
// through require.NoError.
//
// NOTE(review): judging by the log messages, deleting a pod is presumably what
// sets its deletionTimestamp, so the runner's graceful-stop path gets
// exercised — confirm against the Kubectl.DeletePods helper.
func (e *env) setDeletionTimestampsOnRunningPods(t *testing.T, deployKind DeployKind) {
|
||||
t.Helper()
|
||||
|
||||
var scope, kind, labelKind string
|
||||
// Pick the scope part of the label value; testOrg takes precedence over
// testEnterprise, and "repo" is the fallback when neither is set.
if e.testOrg != "" {
|
||||
scope = "org"
|
||||
} else if e.testEnterprise != "" {
|
||||
scope = "enterprise"
|
||||
} else {
|
||||
scope = "repo"
|
||||
}
|
||||
|
||||
// Any deploy kind other than RunnerDeployments is treated as a runner set.
if deployKind == RunnerDeployments {
|
||||
kind = "runnerdeploy"
|
||||
labelKind = "runner-deployment"
|
||||
} else {
|
||||
kind = "runnerset"
|
||||
labelKind = "runnerset"
|
||||
}
|
||||
|
||||
// Produces e.g. "runner-deployment-name=org-runnerdeploy".
label := fmt.Sprintf("%s-name=%s-%s", labelKind, scope, kind)
|
||||
|
||||
ctx := context.Background()
|
||||
c := e.getKubectlConfig()
|
||||
|
||||
t.Logf("Finding pods with label %s", label)
|
||||
|
||||
pods, err := e.Kubectl.FindPods(ctx, label, c)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Nothing to delete; this is not treated as a test failure.
if len(pods) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
t.Logf("Setting deletionTimestamps on pods %s", strings.Join(pods, ", "))
|
||||
|
||||
err = e.Kubectl.DeletePods(ctx, pods, c)
|
||||
require.NoError(t, err)
|
||||
|
||||
t.Logf("Deleted pods %s", strings.Join(pods, ", "))
|
||||
}
|
||||
|
||||
func (e *env) do(t *testing.T, op string, kind DeployKind, testID string, env ...string) {
|
||||
t.Helper()
|
||||
|
||||
@@ -722,6 +773,8 @@ func (e *env) do(t *testing.T, op string, kind DeployKind, testID string, env ..
|
||||
"OP=" + op,
|
||||
"RUNNER_NAMESPACE=" + e.runnerNamespace,
|
||||
"RUNNER_SERVICE_ACCOUNT_NAME=" + e.runnerServiceAccuontName,
|
||||
"RUNNER_GRACEFUL_STOP_TIMEOUT=" + e.runnerGracefulStopTimeout,
|
||||
"RUNNER_TERMINATION_GRACE_PERIOD_SECONDS=" + e.runnerTerminationGracePeriodSeconds,
|
||||
}
|
||||
scriptEnv = append(scriptEnv, env...)
|
||||
|
||||
@@ -825,7 +878,7 @@ func (e *env) testJobs(testID string) []job {
|
||||
func (e *env) verifyActionsWorkflowRun(t *testing.T, testID string) {
|
||||
t.Helper()
|
||||
|
||||
verifyActionsWorkflowRun(t, e.Env, e.testJobs(testID), e.verifyTimeout())
|
||||
verifyActionsWorkflowRun(t, e.Env, e.testJobs(testID), e.verifyTimeout(), e.getKubectlConfig())
|
||||
}
|
||||
|
||||
func (e *env) verifyTimeout() time.Duration {
|
||||
@@ -836,6 +889,18 @@ func (e *env) verifyTimeout() time.Duration {
|
||||
return 8 * 60 * time.Second
|
||||
}
|
||||
|
||||
// getKubectlConfig returns a testing.KubectlConfig whose Env holds a single
// entry, KUBECONFIG, set to e.Kubeconfig.
func (e *env) getKubectlConfig() testing.KubectlConfig {
|
||||
kubectlEnv := []string{
|
||||
"KUBECONFIG=" + e.Kubeconfig,
|
||||
}
|
||||
|
||||
cmCfg := testing.KubectlConfig{
|
||||
Env: kubectlEnv,
|
||||
}
|
||||
|
||||
return cmCfg
|
||||
}
|
||||
|
||||
// job groups three string attributes of a single test job: its name, its
// testArg, and its configMapName. verifyActionsWorkflowRun reads
// configMapName as the name of the test-result config map for the job.
// NOTE(review): the exact meaning of name and testArg is not visible in this
// part of the file — confirm against testJobs and installActionsWorkflow.
type job struct {
|
||||
name, testArg, configMapName string
|
||||
}
|
||||
@@ -969,10 +1034,18 @@ func installActionsWorkflow(t *testing.T, testName, runnerLabel, testResultCMNam
|
||||
// When rootless, we need to use the `docker` buildx driver, which doesn't support cache export
|
||||
// so we end up with the below error on docker-build:
|
||||
// error: cache export feature is currently not supported for docker driver. Please switch to a different driver (eg. "docker buildx create --use")
|
||||
// See https://docs.docker.com/engine/reference/commandline/buildx_create/#docker-container-driver
|
||||
// for the `docker-container` driver.
|
||||
dockerBuildCache = "--cache-from=type=local,src=/home/runner/.cache/buildx " +
|
||||
"--cache-to=type=local,dest=/home/runner/.cache/buildx-new,mode=max "
|
||||
dockerfile = "Dockerfile"
|
||||
// Note though, if the cache does not exist yet, the buildx build seems to write cache data to /home/runner/.cache/buildx,
|
||||
// not buildx-new.
|
||||
// I think the following message emitted by buildx in the end is relevant to this behaviour, but not 100% sure:
|
||||
// WARNING: local cache import at /home/runner/.cache/buildx not found due to err: could not read /home/runner/.cache/buildx/index.json: open /home/runner/.cache/buildx/index.json: no such file or directory
|
||||
} else {
|
||||
// See https://docs.docker.com/engine/reference/commandline/buildx_create/#docker-driver
|
||||
// for the `docker` driver.
|
||||
setupBuildXActionWith.Driver = "docker"
|
||||
dockerfile = "Dockerfile.nocache"
|
||||
}
|
||||
@@ -997,20 +1070,35 @@ func installActionsWorkflow(t *testing.T, testName, runnerLabel, testResultCMNam
|
||||
fmt.Sprintf("-f %s .", dockerfile),
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
if useSudo {
|
||||
steps = append(steps,
|
||||
testing.Step{
|
||||
// https://github.com/docker/build-push-action/blob/master/docs/advanced/cache.md#local-cache
|
||||
// See https://github.com/moby/buildkit/issues/1896 for why this is needed
|
||||
Run: "rm -rf /home/runner/.cache/buildx && mv /home/runner/.cache/buildx-new /home/runner/.cache/buildx",
|
||||
},
|
||||
testing.Step{
|
||||
Run: "ls -lah /home/runner/.cache/*",
|
||||
},
|
||||
)
|
||||
if useSudo {
|
||||
steps = append(steps,
|
||||
testing.Step{
|
||||
// https://github.com/docker/build-push-action/blob/master/docs/advanced/cache.md#local-cache
|
||||
// See https://github.com/moby/buildkit/issues/1896 for why this is needed
|
||||
Run: "if -d /home/runner/.cache/buildx-new; then " + sudo + "rm -rf /home/runner/.cache/buildx && " + sudo + `mv /home/runner/.cache/buildx-new /home/runner/.cache/buildx; else echo "/home/runner/.cache/buildx-new is not found. Perhaps you're running this on a stateleess runner?"; fi`,
|
||||
},
|
||||
testing.Step{
|
||||
Run: "ls -lah /home/runner/.cache/*",
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
if useSudo {
|
||||
if kind == RunnerDeployments {
|
||||
steps = append(steps,
|
||||
testing.Step{
|
||||
// https://github.com/docker/build-push-action/blob/master/docs/advanced/cache.md#local-cache
|
||||
// See https://github.com/moby/buildkit/issues/1896 for why this is needed
|
||||
Run: sudo + "rm -rf /home/runner/.cache/buildx && mv /home/runner/.cache/buildx-new /home/runner/.cache/buildx",
|
||||
},
|
||||
testing.Step{
|
||||
Run: sudo + "ls -lah /home/runner/.cache/*",
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
steps = append(steps,
|
||||
@@ -1062,7 +1150,7 @@ kubectl create cm %s$id --from-literal=status=ok
|
||||
}
|
||||
}
|
||||
|
||||
func verifyActionsWorkflowRun(t *testing.T, env *testing.Env, testJobs []job, timeout time.Duration) {
|
||||
func verifyActionsWorkflowRun(t *testing.T, env *testing.Env, testJobs []job, timeout time.Duration, cmCfg testing.KubectlConfig) {
|
||||
t.Helper()
|
||||
|
||||
var expected []string
|
||||
@@ -1079,14 +1167,6 @@ func verifyActionsWorkflowRun(t *testing.T, env *testing.Env, testJobs []job, ti
|
||||
for i := range testJobs {
|
||||
testResultCMName := testJobs[i].configMapName
|
||||
|
||||
kubectlEnv := []string{
|
||||
"KUBECONFIG=" + env.Kubeconfig,
|
||||
}
|
||||
|
||||
cmCfg := testing.KubectlConfig{
|
||||
Env: kubectlEnv,
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
|
||||
@@ -3,14 +3,14 @@
|
||||
# UNITTEST: retry config
|
||||
# Will simulate a configuration failure and expects:
|
||||
# - the configuration step to be run 10 times
|
||||
# - the entrypoint script to exit with error code 2
|
||||
# - the startup script to exit with error code 2
|
||||
# - the run.sh script to never run.
|
||||
|
||||
source ../assets/logging.sh
|
||||
|
||||
entrypoint_log() {
|
||||
startup_log() {
|
||||
while read I; do
|
||||
printf "\tentrypoint.sh: $I\n"
|
||||
printf "\tstartup.sh: $I\n"
|
||||
done
|
||||
}
|
||||
|
||||
@@ -44,12 +44,12 @@ cleanup() {
|
||||
# Always run cleanup when test ends regardless of how it ends
|
||||
trap cleanup SIGINT SIGTERM SIGQUIT EXIT
|
||||
|
||||
log "Running the entrypoint"
|
||||
log "Running the startup script"
|
||||
log ""
|
||||
|
||||
# Run the runner entrypoint script which as a final step runs this
|
||||
# Run the runner startup script which as a final step runs this
|
||||
# unit tests run.sh as it was symlinked
|
||||
../../../runner/entrypoint.sh 2> >(entrypoint_log)
|
||||
../../../runner/startup.sh 2> >(startup_log)
|
||||
|
||||
if [ "$?" != "2" ]; then
|
||||
error "========================================="
|
||||
@@ -3,14 +3,14 @@
|
||||
# UNITTEST: should work as non ephemeral
|
||||
# Will simulate a scenario where ephemeral=false. expects:
|
||||
# - the configuration step to be run exactly once
|
||||
# - the entrypoint script to exit with no error
|
||||
# - the startup script to exit with no error
|
||||
# - the run.sh script to run without the --once flag
|
||||
|
||||
source ../assets/logging.sh
|
||||
|
||||
entrypoint_log() {
|
||||
startup_log() {
|
||||
while read I; do
|
||||
printf "\tentrypoint.sh: $I\n"
|
||||
printf "\tstartup.sh: $I\n"
|
||||
done
|
||||
}
|
||||
|
||||
@@ -44,16 +44,16 @@ cleanup() {
|
||||
# Always run cleanup when test ends regardless of how it ends
|
||||
trap cleanup SIGINT SIGTERM SIGQUIT EXIT
|
||||
|
||||
log "Running the entrypoint"
|
||||
log "Running the startup script"
|
||||
log ""
|
||||
|
||||
# Run the runner entrypoint script which as a final step runs this
|
||||
# Run the runner startup script which as a final step runs this
|
||||
# unit tests run.sh as it was symlinked
|
||||
../../../runner/entrypoint.sh 2> >(entrypoint_log)
|
||||
../../../runner/startup.sh 2> >(startup_log)
|
||||
|
||||
if [ "$?" != "0" ]; then
|
||||
error "==========================================="
|
||||
error "FAIL | Entrypoint script did not exit successfully"
|
||||
error "FAIL | Startup script did not exit successfully"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -3,14 +3,14 @@
|
||||
# UNITTEST: should work normally
|
||||
# Will simulate a normal execution scenario. expects:
|
||||
# - the configuration step to be run exactly once
|
||||
# - the entrypoint script to exit with no error
|
||||
# - the startup script to exit with no error
|
||||
# - the run.sh script to run with the --once flag activated.
|
||||
|
||||
source ../assets/logging.sh
|
||||
|
||||
entrypoint_log() {
|
||||
startup_log() {
|
||||
while read I; do
|
||||
printf "\tentrypoint.sh: $I\n"
|
||||
printf "\startup.sh: $I\n"
|
||||
done
|
||||
}
|
||||
|
||||
@@ -42,12 +42,12 @@ cleanup() {
|
||||
# Always run cleanup when test ends regardless of how it ends
|
||||
trap cleanup SIGINT SIGTERM SIGQUIT EXIT
|
||||
|
||||
log "Running the entrypoint"
|
||||
log "Running the startup script"
|
||||
log ""
|
||||
|
||||
# Run the runner entrypoint script which as a final step runs this
|
||||
# Run the runner startup script which as a final step runs this
|
||||
# unit tests run.sh as it was symlinked
|
||||
../../../runner/entrypoint.sh 2> >(entrypoint_log)
|
||||
../../../runner/startup.sh 2> >(startup_log)
|
||||
|
||||
if [ "$?" != "0" ]; then
|
||||
error "=========================="
|
||||
@@ -3,14 +3,14 @@
|
||||
# UNITTEST: should work disable update
|
||||
# Will simulate a scenario where disableupdate=true. expects:
|
||||
# - the configuration step to be run exactly once
|
||||
# - the entrypoint script to exit with no error
|
||||
# - the startup script to exit with no error
|
||||
# - the config.sh script to run with the --disableupdate flag set to 'true'.
|
||||
|
||||
source ../assets/logging.sh
|
||||
|
||||
entrypoint_log() {
|
||||
startup_log() {
|
||||
while read I; do
|
||||
printf "\tentrypoint.sh: $I\n"
|
||||
printf "\tstartup.sh: $I\n"
|
||||
done
|
||||
}
|
||||
|
||||
@@ -43,12 +43,12 @@ cleanup() {
|
||||
# Always run cleanup when test ends regardless of how it ends
|
||||
trap cleanup SIGINT SIGTERM SIGQUIT EXIT
|
||||
|
||||
log "Running the entrypoint"
|
||||
log "Running the startup script"
|
||||
log ""
|
||||
|
||||
# run.sh and config.sh get used by the runner's real entrypoint.sh and are part of actions/runner.
|
||||
# We change symlink dummy versions so the entrypoint.sh can run allowing us to test the real entrypoint.sh
|
||||
../../../runner/entrypoint.sh 2> >(entrypoint_log)
|
||||
# run.sh and config.sh get used by the runner's real startup.sh and are part of actions/runner.
|
||||
# We change symlink dummy versions so the startup.sh can run allowing us to test the real startup.sh
|
||||
../../../runner/startup.sh 2> >(startup_log)
|
||||
|
||||
if [ "$?" != "0" ]; then
|
||||
error "=========================="
|
||||
Reference in New Issue
Block a user