Fix runners to do their best to gracefully stop on pod eviction (#1759)

Ref #1535
Ref #1581

Signed-off-by: Yusuke Kuoka <ykuoka@gmail.com>
Authored by Yusuke Kuoka on 2022-11-01 20:30:10 +09:00, committed by GitHub
Parent 332548093a, commit c74ad6195f
30 changed files with 757 additions and 301 deletions
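A quick note on what "gracefully stop on pod eviction" means in practice: when a runner pod is evicted or deleted, the kubelet sends SIGTERM to the runner container and waits up to terminationGracePeriodSeconds before sending SIGKILL. The sketch below is illustrative only and is not the script added by this commit; it only shows the general shape of an entrypoint that traps SIGTERM and gives the in-flight job up to RUNNER_GRACEFUL_STOP_TIMEOUT seconds to finish (that variable name appears in this diff; everything else here is assumed).

#!/usr/bin/env bash
# Illustrative sketch only; helper paths and structure are assumptions.
RUNNER_GRACEFUL_STOP_TIMEOUT=${RUNNER_GRACEFUL_STOP_TIMEOUT:-15}

graceful_stop() {
  echo "SIGTERM received; waiting up to ${RUNNER_GRACEFUL_STOP_TIMEOUT}s for the current job to finish"
  local deadline=$((SECONDS + RUNNER_GRACEFUL_STOP_TIMEOUT))
  # Give the runner process a chance to finish its job before the deadline.
  while kill -0 "$runner_pid" 2>/dev/null && [ "$SECONDS" -lt "$deadline" ]; do
    sleep 1
  done
  kill "$runner_pid" 2>/dev/null || true
}

trap graceful_stop TERM

# Start the actual runner in the background so the trap can fire while we wait.
./run.sh &
runner_pid=$!
wait "$runner_pid"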


@@ -14,6 +14,7 @@ import (
"github.com/actions-runner-controller/actions-runner-controller/testing"
"github.com/google/go-github/v47/github"
"github.com/onsi/gomega"
"github.com/stretchr/testify/require"
"golang.org/x/oauth2"
"sigs.k8s.io/yaml"
)
@@ -330,6 +331,10 @@ func TestE2E(t *testing.T) {
t.Run(fmt.Sprintf("update runners - attempt %d", i), func(t *testing.T) {
env.deploy(t, RunnerDeployments, testID, fmt.Sprintf("ROLLING_UPDATE_PHASE=%d", i))
})
t.Run(fmt.Sprintf("set deletiontimestamps on runner pods - attempt %d", i), func(t *testing.T) {
env.setDeletionTimestampsOnRunningPods(t, RunnerDeployments)
})
}
}
}()
@@ -370,6 +375,8 @@ type env struct {
doDockerBuild bool
containerMode string
runnerServiceAccuontName string
runnerGracefulStopTimeout string
runnerTerminationGracePeriodSeconds string
runnerNamespace string
remoteKubeconfig string
imagePullSecretName string
@@ -500,6 +507,8 @@ func initTestEnv(t *testing.T, k8sMinorVer string, vars vars) *env {
e.testEnterprise = testing.Getenv(t, "TEST_ENTERPRISE", "")
e.testEphemeral = testing.Getenv(t, "TEST_EPHEMERAL", "")
e.runnerServiceAccuontName = testing.Getenv(t, "TEST_RUNNER_SERVICE_ACCOUNT_NAME", "")
e.runnerTerminationGracePeriodSeconds = testing.Getenv(t, "TEST_RUNNER_TERMINATION_GRACE_PERIOD_SECONDS", "30")
e.runnerGracefulStopTimeout = testing.Getenv(t, "TEST_RUNNER_GRACEFUL_STOP_TIMEOUT", "15")
e.runnerNamespace = testing.Getenv(t, "TEST_RUNNER_NAMESPACE", "default")
e.remoteKubeconfig = testing.Getenv(t, "ARC_E2E_REMOTE_KUBECONFIG", "")
e.imagePullSecretName = testing.Getenv(t, "ARC_E2E_IMAGE_PULL_SECRET_NAME", "")
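The two new knobs default to a 30s pod termination grace period and a 15s graceful stop timeout, keeping the stop window comfortably inside the grace period. A hypothetical way to run the E2E suite with larger values (the package path is an assumption, not taken from this diff):

# Give runners a 60s grace period and let them spend up to 45s finishing a job.
TEST_RUNNER_TERMINATION_GRACE_PERIOD_SECONDS=60 \
TEST_RUNNER_GRACEFUL_STOP_TIMEOUT=45 \
  go test -count=1 -timeout 60m -run TestE2E ./test/e2e/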
@@ -712,6 +721,48 @@ func (e *env) undeploy(t *testing.T, kind DeployKind, testID string) {
e.do(t, "delete", kind, testID)
}
func (e *env) setDeletionTimestampsOnRunningPods(t *testing.T, deployKind DeployKind) {
	t.Helper()
	var scope, kind, labelKind string
	if e.testOrg != "" {
		scope = "org"
	} else if e.testEnterprise != "" {
		scope = "enterprise"
	} else {
		scope = "repo"
	}
	if deployKind == RunnerDeployments {
		kind = "runnerdeploy"
		labelKind = "runner-deployment"
	} else {
		kind = "runnerset"
		labelKind = "runnerset"
	}
	label := fmt.Sprintf("%s-name=%s-%s", labelKind, scope, kind)
	ctx := context.Background()
	c := e.getKubectlConfig()
	t.Logf("Finding pods with label %s", label)
	pods, err := e.Kubectl.FindPods(ctx, label, c)
	require.NoError(t, err)
	if len(pods) == 0 {
		return
	}
	t.Logf("Setting deletionTimestamps on pods %s", strings.Join(pods, ", "))
	err = e.Kubectl.DeletePods(ctx, pods, c)
	require.NoError(t, err)
	t.Logf("Deleted pods %s", strings.Join(pods, ", "))
}
func (e *env) do(t *testing.T, op string, kind DeployKind, testID string, env ...string) {
t.Helper()
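setDeletionTimestampsOnRunningPods does from Go what the following kubectl session does by hand: deleting a pod is what stamps its metadata.deletionTimestamp and starts graceful termination, which is the eviction-like condition the rolling-update test wants the runners to survive. The label value below is just an example of what the fmt.Sprintf above can produce.

# Find runner pods by the label the helper builds, e.g. for an org-scoped RunnerDeployment.
kubectl get pods -l runner-deployment-name=org-runnerdeploy -o name
# Deleting them sets deletionTimestamp and sends SIGTERM, starting the grace period.
kubectl delete pod -l runner-deployment-name=org-runnerdeploy --wait=false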
@@ -722,6 +773,8 @@ func (e *env) do(t *testing.T, op string, kind DeployKind, testID string, env ..
"OP=" + op,
"RUNNER_NAMESPACE=" + e.runnerNamespace,
"RUNNER_SERVICE_ACCOUNT_NAME=" + e.runnerServiceAccuontName,
"RUNNER_GRACEFUL_STOP_TIMEOUT=" + e.runnerGracefulStopTimeout,
"RUNNER_TERMINATION_GRACE_PERIOD_SECONDS=" + e.runnerTerminationGracePeriodSeconds,
}
scriptEnv = append(scriptEnv, env...)
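Adding the two variables to scriptEnv makes them visible to the deploy scripts that do() shells out to. As a rough sketch of what such a script could do with them (the template file name and the use of envsubst are assumptions, not shown in this diff):

# Render a runner manifest template with the forwarded knobs and apply it.
export RUNNER_GRACEFUL_STOP_TIMEOUT RUNNER_TERMINATION_GRACE_PERIOD_SECONDS
envsubst < runnerdeploy.envsubst.yaml | kubectl apply -n "${RUNNER_NAMESPACE}" -f -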
@@ -825,7 +878,7 @@ func (e *env) testJobs(testID string) []job {
func (e *env) verifyActionsWorkflowRun(t *testing.T, testID string) {
t.Helper()
verifyActionsWorkflowRun(t, e.Env, e.testJobs(testID), e.verifyTimeout())
verifyActionsWorkflowRun(t, e.Env, e.testJobs(testID), e.verifyTimeout(), e.getKubectlConfig())
}
func (e *env) verifyTimeout() time.Duration {
@@ -836,6 +889,18 @@ func (e *env) verifyTimeout() time.Duration {
return 8 * 60 * time.Second
}
func (e *env) getKubectlConfig() testing.KubectlConfig {
	kubectlEnv := []string{
		"KUBECONFIG=" + e.Kubeconfig,
	}
	cmCfg := testing.KubectlConfig{
		Env: kubectlEnv,
	}
	return cmCfg
}
type job struct {
name, testArg, configMapName string
}
@@ -969,10 +1034,18 @@ func installActionsWorkflow(t *testing.T, testName, runnerLabel, testResultCMNam
// When rootless, we need to use the `docker` buildx driver, which doesn't support cache export
// so we end up with the below error on docker-build:
// error: cache export feature is currently not supported for docker driver. Please switch to a different driver (eg. "docker buildx create --use")
// See https://docs.docker.com/engine/reference/commandline/buildx_create/#docker-container-driver
// for the `docker-container` driver.
dockerBuildCache = "--cache-from=type=local,src=/home/runner/.cache/buildx " +
"--cache-to=type=local,dest=/home/runner/.cache/buildx-new,mode=max "
dockerfile = "Dockerfile"
// Note, though, that if the cache does not exist yet, the buildx build seems to write cache data to /home/runner/.cache/buildx,
// not buildx-new.
// I think the following message emitted by buildx at the end is relevant to this behaviour, but I'm not 100% sure:
// WARNING: local cache import at /home/runner/.cache/buildx not found due to err: could not read /home/runner/.cache/buildx/index.json: open /home/runner/.cache/buildx/index.json: no such file or directory
} else {
// See https://docs.docker.com/engine/reference/commandline/buildx_create/#docker-driver
// for the `docker` driver.
setupBuildXActionWith.Driver = "docker"
dockerfile = "Dockerfile.nocache"
}
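The comments above come down to a difference between buildx drivers: only the docker-container driver can export a local cache, so whenever the runner has to use the plain docker driver the build falls back to Dockerfile.nocache. Roughly, and purely as an illustration (these commands are not taken from the workflow definition):

# docker-container driver: local cache export works.
docker buildx create --name cached-builder --driver docker-container --use
docker buildx build \
  --cache-from=type=local,src=/home/runner/.cache/buildx \
  --cache-to=type=local,dest=/home/runner/.cache/buildx-new,mode=max \
  -f Dockerfile .

# Default "docker" driver (the daemon's built-in builder): the same --cache-to fails with
# "cache export feature is currently not supported for docker driver", hence Dockerfile.nocache.
docker buildx use default
docker buildx build -f Dockerfile.nocache .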
@@ -997,20 +1070,35 @@ func installActionsWorkflow(t *testing.T, testName, runnerLabel, testResultCMNam
fmt.Sprintf("-f %s .", dockerfile),
},
)
}
}
if useSudo {
steps = append(steps,
testing.Step{
// https://github.com/docker/build-push-action/blob/master/docs/advanced/cache.md#local-cache
// See https://github.com/moby/buildkit/issues/1896 for why this is needed
Run: "rm -rf /home/runner/.cache/buildx && mv /home/runner/.cache/buildx-new /home/runner/.cache/buildx",
},
testing.Step{
Run: "ls -lah /home/runner/.cache/*",
},
)
if useSudo {
steps = append(steps,
testing.Step{
// https://github.com/docker/build-push-action/blob/master/docs/advanced/cache.md#local-cache
// See https://github.com/moby/buildkit/issues/1896 for why this is needed
Run: "if -d /home/runner/.cache/buildx-new; then " + sudo + "rm -rf /home/runner/.cache/buildx && " + sudo + `mv /home/runner/.cache/buildx-new /home/runner/.cache/buildx; else echo "/home/runner/.cache/buildx-new is not found. Perhaps you're running this on a stateleess runner?"; fi`,
},
testing.Step{
Run: "ls -lah /home/runner/.cache/*",
},
)
}
}
if useSudo {
if kind == RunnerDeployments {
steps = append(steps,
testing.Step{
// https://github.com/docker/build-push-action/blob/master/docs/advanced/cache.md#local-cache
// See https://github.com/moby/buildkit/issues/1896 for why this is needed
Run: sudo + "rm -rf /home/runner/.cache/buildx && mv /home/runner/.cache/buildx-new /home/runner/.cache/buildx",
},
testing.Step{
Run: sudo + "ls -lah /home/runner/.cache/*",
},
)
}
}
}
steps = append(steps,
@@ -1062,7 +1150,7 @@ kubectl create cm %s$id --from-literal=status=ok
}
}
func verifyActionsWorkflowRun(t *testing.T, env *testing.Env, testJobs []job, timeout time.Duration) {
func verifyActionsWorkflowRun(t *testing.T, env *testing.Env, testJobs []job, timeout time.Duration, cmCfg testing.KubectlConfig) {
t.Helper()
var expected []string
@@ -1079,14 +1167,6 @@ func verifyActionsWorkflowRun(t *testing.T, env *testing.Env, testJobs []job, ti
for i := range testJobs {
testResultCMName := testJobs[i].configMapName
kubectlEnv := []string{
"KUBECONFIG=" + env.Kubeconfig,
}
cmCfg := testing.KubectlConfig{
Env: kubectlEnv,
}
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()


@@ -3,14 +3,14 @@
# UNITTEST: retry config
# Will simulate a configuration failure and expects:
# - the configuration step to be run 10 times
# - the entrypoint script to exit with error code 2
# - the startup script to exit with error code 2
# - the run.sh script to never run.
source ../assets/logging.sh
entrypoint_log() {
startup_log() {
while read I; do
printf "\tentrypoint.sh: $I\n"
printf "\tstartup.sh: $I\n"
done
}
@@ -44,12 +44,12 @@ cleanup() {
# Always run cleanup when test ends regardless of how it ends
trap cleanup SIGINT SIGTERM SIGQUIT EXIT
log "Running the entrypoint"
log "Running the startup script"
log ""
# Run the runner entrypoint script which as a final step runs this
# Run the runner startup script which as a final step runs this
# unit tests run.sh as it was symlinked
../../../runner/entrypoint.sh 2> >(entrypoint_log)
../../../runner/startup.sh 2> >(startup_log)
if [ "$?" != "2" ]; then
error "========================================="


@@ -3,14 +3,14 @@
# UNITTEST: should work as non ephemeral
# Will simulate a scenario where ephemeral=false. expects:
# - the configuration step to be run exactly once
# - the entrypoint script to exit with no error
# - the startup script to exit with no error
# - the run.sh script to run without the --once flag
source ../assets/logging.sh
entrypoint_log() {
startup_log() {
while read I; do
printf "\tentrypoint.sh: $I\n"
printf "\tstartup.sh: $I\n"
done
}
@@ -44,16 +44,16 @@ cleanup() {
# Always run cleanup when test ends regardless of how it ends
trap cleanup SIGINT SIGTERM SIGQUIT EXIT
log "Running the entrypoint"
log "Running the startup script"
log ""
# Run the runner entrypoint script which as a final step runs this
# Run the runner startup script which as a final step runs this
# unit tests run.sh as it was symlinked
../../../runner/entrypoint.sh 2> >(entrypoint_log)
../../../runner/startup.sh 2> >(startup_log)
if [ "$?" != "0" ]; then
error "==========================================="
error "FAIL | Entrypoint script did not exit successfully"
error "FAIL | Startup script did not exit successfully"
exit 1
fi


@@ -3,14 +3,14 @@
# UNITTEST: should work normally
# Will simulate a normal execution scenario. expects:
# - the configuration step to be run exactly once
# - the entrypoint script to exit with no error
# - the startup script to exit with no error
# - the run.sh script to run with the --once flag activated.
source ../assets/logging.sh
entrypoint_log() {
startup_log() {
while read I; do
printf "\tentrypoint.sh: $I\n"
printf "\startup.sh: $I\n"
done
}
@@ -42,12 +42,12 @@ cleanup() {
# Always run cleanup when test ends regardless of how it ends
trap cleanup SIGINT SIGTERM SIGQUIT EXIT
log "Running the entrypoint"
log "Running the startup script"
log ""
# Run the runner entrypoint script which as a final step runs this
# Run the runner startup script which as a final step runs this
# unit tests run.sh as it was symlinked
../../../runner/entrypoint.sh 2> >(entrypoint_log)
../../../runner/startup.sh 2> >(startup_log)
if [ "$?" != "0" ]; then
error "=========================="


@@ -3,14 +3,14 @@
# UNITTEST: should work disable update
# Will simulate a scenario where disableupdate=true. expects:
# - the configuration step to be run exactly once
# - the entrypoint script to exit with no error
# - the startup script to exit with no error
# - the config.sh script to run with the --disableupdate flag set to 'true'.
source ../assets/logging.sh
entrypoint_log() {
startup_log() {
while read I; do
printf "\tentrypoint.sh: $I\n"
printf "\tstartup.sh: $I\n"
done
}
@@ -43,12 +43,12 @@ cleanup() {
# Always run cleanup when test ends regardless of how it ends
trap cleanup SIGINT SIGTERM SIGQUIT EXIT
log "Running the entrypoint"
log "Running the startup script"
log ""
# run.sh and config.sh get used by the runner's real entrypoint.sh and are part of actions/runner.
# We change symlink dummy versions so the entrypoint.sh can run allowing us to test the real entrypoint.sh
../../../runner/entrypoint.sh 2> >(entrypoint_log)
# run.sh and config.sh get used by the runner's real startup.sh and are part of actions/runner.
# We symlink dummy versions so that startup.sh can run, allowing us to test the real startup.sh
../../../runner/startup.sh 2> >(startup_log)
if [ "$?" != "0" ]; then
error "=========================="