Fix runners to do their best to gracefully stop on pod eviction (#1759)

Ref #1535 Ref #1581 Signed-off-by: Yusuke Kuoka <ykuoka@gmail.com>
2025-12-15 06:26:57 +00:00 · 2022-11-01 20:30:10 +09:00
parent 332548093a
commit c74ad6195f
30 changed files with 757 additions and 301 deletions
--- a/test/startup/assets/config.sh
+++ b/test/startup/assets/config.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+
+LIGHTGREEN='\e[0;32m'; export LIGHTGREEN  # used by success(); kept as raw escape text
+LIGHTRED='\e[0;31m';   export LIGHTRED    # used by error()
+WHITE='\e[0;97m';      export WHITE       # used by log()
+RESET='\e[0m';         export RESET       # appended after every message to reset the colour
+
+log(){  # Print the arguments, space-joined, as one tab-indented white line.
+  printf '\t%b%s%b\n' "${WHITE}" "$*" "${RESET}" 2>&1  # message is data, never the format
+}
+
+success(){  # Print the arguments, space-joined, as one tab-indented green line.
+  printf '\t%b%s%b\n' "${LIGHTGREEN}" "$*" "${RESET}" 2>&1  # message is data, never the format
+}
+
+error(){  # Print the arguments, space-joined, as one tab-indented red line (does not exit).
+  printf '\t%b%s%b\n' "${LIGHTRED}" "$*" "${RESET}" 2>&1  # message is data, never the format
+}
+
+success "I'm configured normally"
+
+# should_retry_configuring exports FAIL_RUNNER_CONFIG_SETUP so that .runner is never created.
+if [[ -z "${FAIL_RUNNER_CONFIG_SETUP:-}" ]]; then
+  touch .runner
+fi
+
+printf '%s\n' "$*" > runner_config  # record our arguments; echo would swallow a leading -n/-e
+success "created a dummy config file"
+# Bump the invocation counter (a missing file counts as 0) so tests can see how often we ran.
+count=$(cat counter 2>/dev/null || echo "0")
+count=$((count + 1))
+printf '%s\n' "${count}" > counter
+
--- a/test/startup/assets/logging.sh
+++ b/test/startup/assets/logging.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+LIGHTGREEN='\e[0;32m'; export LIGHTGREEN  # used by success(); kept as raw escape text
+LIGHTRED='\e[0;31m';   export LIGHTRED    # used by error()
+WHITE='\e[0;97m';      export WHITE       # used by log()
+RESET='\e[0m';         export RESET       # appended after every message to reset the colour
+
+log(){  # Print the arguments, space-joined, as one white line.
+  printf '%b%s%b\n' "${WHITE}" "$*" "${RESET}" 2>&1  # message is data, never the format
+}
+
+success(){  # Print the arguments, space-joined, as one green line.
+  printf '%b%s%b\n' "${LIGHTGREEN}" "$*" "${RESET}" 2>&1  # message is data, never the format
+}
+
+error(){  # Print the arguments, space-joined, as one red line (does not exit).
+  printf '%b%s%b\n' "${LIGHTRED}" "$*" "${RESET}" 2>&1  # message is data, never the format
+}
--- a/test/startup/assets/run.sh
+++ b/test/startup/assets/run.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+set -o errexit -o nounset -o pipefail
+
+LIGHTGREEN='\e[0;32m'; export LIGHTGREEN  # used by success(); kept as raw escape text
+LIGHTRED='\e[0;31m';   export LIGHTRED    # used by error()
+WHITE='\e[0;97m';      export WHITE       # used by log()
+RESET='\e[0m';         export RESET       # appended after every message to reset the colour
+
+log(){  # Print the arguments, space-joined, as one tab-indented white line.
+  printf '\t%b%s%b\n' "${WHITE}" "$*" "${RESET}" 2>&1  # message is data, never the format
+}
+
+success(){  # Print the arguments, space-joined, as one tab-indented green line.
+  printf '\t%b%s%b\n' "${LIGHTGREEN}" "$*" "${RESET}" 2>&1  # message is data, never the format
+}
+
+error(){  # Print the arguments as one tab-indented red line, then abort the script with status 1.
+  printf '\t%b%s%b\n' "${LIGHTRED}" "$*" "${RESET}" 2>&1  # message is data, never the format
+  exit 1
+}
+
+log "Dumping set runner arguments"
+printf '%s\n' "$*" > runner_args  # record our arguments; echo would swallow a leading -n/-e
+success "Pretending to run service..."
+touch run_sh_ran  # marker file the unit tests check to learn whether run.sh was invoked
+success "Success"
+success ""
--- a/test/startup/should_retry_configuring/test.sh
+++ b/test/startup/should_retry_configuring/test.sh
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+
+# UNITTEST: retry config
+# Will simulate a configuration failure and expects:
+# - the configuration step to be run 10 times
+# - the startup script to exit with error code 2
+# - the run.sh script to never run.
+
+source ../assets/logging.sh
+
+startup_log() {  # Re-emit every stdin line prefixed with a tab and "startup.sh: ".
+  while IFS= read -r line; do  # IFS=/-r keep indentation and backslashes intact
+    printf '\tstartup.sh: %s\n' "${line}"  # line is data, so any % or \ in it stays literal
+  done
+}
+
+log "Setting up test area"
+export RUNNER_HOME="testarea"
+mkdir -p "${RUNNER_HOME}"
+
+log "Setting up the test config"
+export UNITTEST="true"
+export FAIL_RUNNER_CONFIG_SETUP="true"  # makes the dummy config.sh skip creating .runner
+export RUNNER_NAME='example_runner_name'
+export RUNNER_REPO='myorg/myrepo'
+export RUNNER_TOKEN='xxxxxxxxxxxxx'
+
+# config.sh and run.sh normally ship with actions/runner and are used by the runner's startup script.
+# Dummy versions are symlinked into RUNNER_HOME so that the real startup script can be exercised.
+log "Symlink dummy config.sh and run.sh"
+ln -s "../../assets/config.sh" "${RUNNER_HOME}/config.sh"
+ln -s "../../assets/run.sh" "${RUNNER_HOME}/run.sh"
+
+cleanup() {  # Delete the scratch directory and drop every variable this test exported.
+  [ -z "${RUNNER_HOME:-}" ] || rm -rf -- "${RUNNER_HOME}"  # no-op when cleanup runs a second time
+  unset UNITTEST
+  unset RUNNER_HOME  # the variable exported by this script is RUNNER_HOME, not RUNNERHOME
+  unset RUNNER_NAME
+  unset RUNNER_REPO
+  unset RUNNER_TOKEN
+  unset FAIL_RUNNER_CONFIG_SETUP
+}
+
+# Always run cleanup when test ends regardless of how it ends
+trap cleanup SIGINT SIGTERM SIGQUIT EXIT
+
+log "Running the startup script"
+log ""
+
+# Run the runner startup script which as a final step runs this
+# unit tests run.sh as it was symlinked
+../../../runner/startup.sh 2> >(startup_log)
+
+if [ "$?" != "2" ]; then
+  error "========================================="
+  error "FAIL | Configuration should have thrown an error"
+  exit 1
+fi
+
+success "PASS | Entrypoint didn't complete successfully"
+
+log "Checking the counter, should have 10 iterations"
+count=`cat ${RUNNER_HOME}/counter || "notfound"`
+if [ "${count}" != "10" ]; then
+  error "============================================="
+  error "FAIL | The retry loop should have done 10 iterations"
+  exit 1
+fi
+success "PASS | Retry loop went up to 10"
+
+log "Checking that run.sh never ran"
+if [ -f ${RUNNER_HOME}/run_sh_ran ]; then
+  error "================================================================="
+  error "FAIL | run.sh was invoked, entrypoint.sh should have failed before that."
+  exit 1
+fi
+
+success "PASS | run.sh never ran"
+success
+success "==========================="
+success "Test completed successfully"
+exit 0
--- a/test/startup/should_work_non_ephemeral/test.sh
+++ b/test/startup/should_work_non_ephemeral/test.sh
@@ -0,0 +1,79 @@
+#!/usr/bin/env bash
+
+# UNITTEST: should work as non ephemeral
+# Will simulate a scenario where ephemeral=false. expects:
+# - the configuration step to be run exactly once
+# - the startup script to exit with no error
+# - the run.sh script to run without the --once flag
+
+source ../assets/logging.sh
+
+startup_log() {  # Re-emit every stdin line prefixed with a tab and "startup.sh: ".
+  while IFS= read -r line; do  # IFS=/-r keep indentation and backslashes intact
+    printf '\tstartup.sh: %s\n' "${line}"  # line is data, so any % or \ in it stays literal
+  done
+}
+
+log "Setting up test area"
+export RUNNER_HOME="testarea"
+mkdir -p "${RUNNER_HOME}"
+
+log "Setting up the test"
+export UNITTEST="true"
+export RUNNER_NAME='example_runner_name'
+export RUNNER_REPO='myorg/myrepo'
+export RUNNER_TOKEN='xxxxxxxxxxxxx'
+export RUNNER_EPHEMERAL="false"  # the non-ephemeral scenario this test is about
+
+# config.sh and run.sh normally ship with actions/runner and are used by the runner's startup script.
+# Dummy versions are symlinked into RUNNER_HOME so that the real startup script can be exercised.
+log "Symlink dummy config.sh and run.sh"
+ln -s "../../assets/config.sh" "${RUNNER_HOME}/config.sh"
+ln -s "../../assets/run.sh" "${RUNNER_HOME}/run.sh"
+
+cleanup() {  # Delete the scratch directory and drop every variable this test exported.
+  [ -z "${RUNNER_HOME:-}" ] || rm -rf -- "${RUNNER_HOME}"  # no-op when cleanup runs a second time
+  unset UNITTEST
+  unset RUNNER_HOME  # the variable exported by this script is RUNNER_HOME, not RUNNERHOME
+  unset RUNNER_NAME
+  unset RUNNER_REPO
+  unset RUNNER_TOKEN
+  unset RUNNER_EPHEMERAL
+}
+
+# Clean up on every exit path; a signal is turned into an exit so the EXIT trap still fires.
+trap cleanup EXIT; trap 'exit 1' SIGINT SIGTERM SIGQUIT
+
+log "Running the startup script"
+log ""
+
+# Run the real startup script, relabelling its stderr line by line through startup_log.
+# As its final step it is expected to invoke the dummy run.sh that was symlinked above.
+../../../runner/startup.sh 2> >(startup_log)
+
+if [ "$?" != "0" ]; then
+  error "==========================================="
+  error "FAIL | Startup script did not exit successfully"
+  exit 1
+fi
+
+log "Testing if we went through the configuration step only once"
+count=$(cat "${RUNNER_HOME}/counter" || echo "not_found")  # sentinel when the counter file is missing
+if [ "${count}" != "1" ]; then  # quoted: an empty counter must fail the check, not break the test
+  error "==============================================="
+  error "FAIL | The configuration step was not run exactly once"
+  exit 1
+fi
+
+success "PASS | The configuration ran ${count} time(s)"
+
+log "Testing if run.sh ran"
+if [ ! -f "${RUNNER_HOME}/run_sh_ran" ]; then  # marker file touched by the dummy run.sh
+  error "=============================="
+  error "FAIL | The runner service has not run"
+  exit 1
+fi
+success "PASS | run.sh ran"
+success ""
+success "==========================="
+success "Test completed successfully"
--- a/test/startup/should_work_normally/test.sh
+++ b/test/startup/should_work_normally/test.sh
@@ -0,0 +1,87 @@
+#!/usr/bin/env bash
+
+# UNITTEST: should work normally
+# Will simulate a normal execution scenario. expects:
+# - the configuration step to be run exactly once
+# - the startup script to exit with no error
+# - the run.sh script to run with the --once flag activated.
+
+source ../assets/logging.sh
+
+startup_log() {  # Re-emit every stdin line prefixed with a tab and "startup.sh: ".
+  while IFS= read -r line; do  # IFS=/-r keep indentation and backslashes intact
+    printf '\tstartup.sh: %s\n' "${line}"  # "\t", not "\s"; line is data, never the format
+  done
+}
+
+log "Setting up test area"
+export RUNNER_HOME="testarea"
+mkdir -p "${RUNNER_HOME}"
+
+log "Setting up the test"
+export UNITTEST="true"
+export RUNNER_NAME='example_runner_name'
+export RUNNER_REPO='myorg/myrepo'
+export RUNNER_TOKEN='xxxxxxxxxxxxx'
+
+# config.sh and run.sh normally ship with actions/runner and are used by the runner's startup script.
+# Dummy versions are symlinked into RUNNER_HOME so that the real startup script can be exercised.
+log "Symlink dummy config.sh and run.sh"
+ln -s "../../assets/config.sh" "${RUNNER_HOME}/config.sh"
+ln -s "../../assets/run.sh" "${RUNNER_HOME}/run.sh"
+
+cleanup() {  # Delete the scratch directory and drop every variable this test exported.
+  [ -z "${RUNNER_HOME:-}" ] || rm -rf -- "${RUNNER_HOME}"  # no-op when cleanup runs a second time
+  unset UNITTEST
+  unset RUNNER_HOME  # the variable exported by this script is RUNNER_HOME, not RUNNERHOME
+  unset RUNNER_NAME
+  unset RUNNER_REPO
+  unset RUNNER_TOKEN
+}
+
+# Clean up on every exit path; a signal is turned into an exit so the EXIT trap still fires.
+trap cleanup EXIT; trap 'exit 1' SIGINT SIGTERM SIGQUIT
+
+log "Running the startup script"
+log ""
+
+# Run the real startup script, relabelling its stderr line by line through startup_log.
+# As its final step it is expected to invoke the dummy run.sh that was symlinked above.
+../../../runner/startup.sh 2> >(startup_log)
+
+if [ "$?" != "0" ]; then
+  error "=========================="
+  error "Test completed with errors"
+  exit 1
+fi
+
+log "Testing if the configuration step was run only once"
+count=$(cat "${RUNNER_HOME}/counter" || echo "not_found")  # sentinel when the counter file is missing
+if [ "${count}" != "1" ]; then  # quoted: an empty counter must fail the check, not break the test
+  error "==============================================="
+  error "FAIL | The configuration step was not run exactly once"
+  exit 1
+fi
+
+success "PASS | The configuration ran ${count} time(s)"
+
+log "Testing that the configuration did not include the --ephemeral flag"
+if grep -q -- '--ephemeral' "${RUNNER_HOME}/runner_config"; then  # NOTE(review): asserts absence; RUNNER_EPHEMERAL is not set here - confirm against startup.sh
+  error "==============================================="
+  error "FAIL | The configuration should not include the --ephemeral flag"
+  exit 1
+fi
+
+success "PASS | The --ephemeral switch was not included in the configuration"
+
+log "Testing if run.sh ran"
+if [ ! -f "${RUNNER_HOME}/run_sh_ran" ]; then  # marker file touched by the dummy run.sh
+  error "=============================="
+  error "FAIL | The runner service has not run"
+  exit 1
+fi
+
+success "PASS | run.sh ran"
+success ""
+success "==========================="
+success "Test completed successfully"
--- a/test/startup/should_work_use_disable_update_switch/test.sh
+++ b/test/startup/should_work_use_disable_update_switch/test.sh
@@ -0,0 +1,87 @@
+#!/usr/bin/env bash
+
+# UNITTEST: should work disable update
+# Will simulate a scenario where disableupdate=true. expects:
+# - the configuration step to be run exactly once
+# - the startup script to exit with no error
+# - the config.sh script to run with the --disableupdate flag set to 'true'.
+
+source ../assets/logging.sh
+
+startup_log() {  # Re-emit every stdin line prefixed with a tab and "startup.sh: ".
+  while IFS= read -r line; do  # IFS=/-r keep indentation and backslashes intact
+    printf '\tstartup.sh: %s\n' "${line}"  # line is data, so any % or \ in it stays literal
+  done
+}
+
+log "Setting up test area"
+export RUNNER_HOME="testarea"
+mkdir -p "${RUNNER_HOME}"
+
+log "Setting up the test"
+export UNITTEST="true"
+export RUNNER_NAME='example_runner_name'
+export RUNNER_REPO='myorg/myrepo'
+export RUNNER_TOKEN='xxxxxxxxxxxxx'
+export DISABLE_RUNNER_UPDATE='true'  # the switch this test is about
+
+# config.sh and run.sh normally ship with actions/runner and are used by the runner's startup script.
+# Dummy versions are symlinked into RUNNER_HOME so that the real startup script can be exercised.
+log "Symlink dummy config.sh and run.sh"
+ln -s "../../assets/config.sh" "${RUNNER_HOME}/config.sh"
+ln -s "../../assets/run.sh" "${RUNNER_HOME}/run.sh"
+
+cleanup() {  # Delete the scratch directory and drop every variable this test exported.
+  [ -z "${RUNNER_HOME:-}" ] || rm -rf -- "${RUNNER_HOME}"  # no-op when cleanup runs a second time
+  unset UNITTEST
+  unset RUNNER_HOME  # the variable exported by this script is RUNNER_HOME, not RUNNERHOME
+  unset RUNNER_NAME
+  unset RUNNER_REPO
+  unset RUNNER_TOKEN DISABLE_RUNNER_UPDATE  # the latter is exported by this test as well
+}
+
+# Always run cleanup when test ends regardless of how it ends
+trap cleanup SIGINT SIGTERM SIGQUIT EXIT
+
+log "Running the startup script"
+log ""
+
+# run.sh and config.sh get used by the runner's real startup.sh and are part of actions/runner.
+# We change symlink dummy versions so the startup.sh can run allowing us to test the real entrypoint.sh
+../../../runner/startup.sh 2> >(startup_log)
+
+if [ "$?" != "0" ]; then
+  error "=========================="
+  error "FAIL | Test completed with errors"
+  exit 1
+fi
+
+log "Testing if the configuration step was run only once"
+count=`cat ${RUNNER_HOME}/counter || echo "not_found"`
+if [ ${count} != "1" ]; then
+  error "==============================================="
+  error "FAIL | The configuration step was not run exactly once"
+  exit 1
+fi
+success "PASS | The configuration ran ${count} time(s)"
+
+log "Testing if the configuration included the --disableupdate flag"
+if ! grep -q -- '--disableupdate' ${RUNNER_HOME}/runner_config; then
+  error "==============================================="
+  error "FAIL | The configuration should not include the --disableupdate flag"
+  exit 1
+fi
+
+success "PASS | The --disableupdate switch was included in the configuration"
+
+log "Testing if run.sh ran"
+if [ ! -f "${RUNNER_HOME}/run_sh_ran" ]; then
+  error "=============================="
+  error "FAIL | The runner service has not run"
+  exit 1
+fi
+
+success "PASS | run.sh ran"
+success ""
+success "==========================="
+success "Test completed successfully"
--- a/test/startup/test.sh
+++ b/test/startup/test.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+
+source assets/logging.sh
+
+failed=""  # start clean: a "failed" inherited from the environment must not fail the run
+for unittest in ./should*; do
+  log "**********************************"
+  log " UNIT TEST: ${unittest}"
+  log "**********************************"
+  log ""
+  # Subshell: our own cwd never changes, and a failed cd can no longer re-run this very script.
+  ( cd "${unittest}" && ./test.sh )
+  ret_code=$?
+
+  log ""
+  log ""
+  if [ "${ret_code}" = "0" ]; then
+    success "Completed: unit test ${unittest}"
+  else
+    error "Completed: unit test ${unittest} with errors"
+    failed="true"
+  fi
+done
+
+if [ -n "${failed}" ]; then  # at least one unit test returned non-zero
+  error ""
+  error "*************************************"
+  error "All unit tests completed, with errors"
+  error "*************************************"
+  exit 1
+else
+  success ""
+  success "***************************************"
+  success "All unit tests completed with no errors"
+  success "***************************************"
+fi