Fix runners to do their best to gracefully stop on pod eviction (#1759)

Ref #1535 Ref #1581 Signed-off-by: Yusuke Kuoka <ykuoka@gmail.com>
2025-12-24 02:27:54 +08:00 · 2022-11-01 20:30:10 +09:00
parent 332548093a
commit c74ad6195f
30 changed files with 757 additions and 301 deletions
--- a/runner/startup.sh
+++ b/runner/startup.sh
@@ -1,72 +1,172 @@
 #!/bin/bash
-source logger.bash
+source logger.sh

-function wait_for_process () {
-    local max_time_wait=30
-    local process_name="$1"
-    local waited_sec=0
-    while ! pgrep "$process_name" >/dev/null && ((waited_sec < max_time_wait)); do
-        log.debug "Process $process_name is not running yet. Retrying in 1 seconds"
-        log.debug "Waited $waited_sec seconds of $max_time_wait seconds"
-        sleep 1
-        ((waited_sec=waited_sec+1))
-        if ((waited_sec >= max_time_wait)); then
-            return 1
-        fi
-    done
-    return 0
-}
+RUNNER_ASSETS_DIR=${RUNNER_ASSETS_DIR:-/runnertmp}
+RUNNER_HOME=${RUNNER_HOME:-/runner}

-sudo /bin/bash <<SCRIPT
-mkdir -p /etc/docker
+# Let GitHub runner execute these hooks. These environment variables are used by GitHub's Runner as described here
+# https://github.com/actions/runner/blob/main/docs/adrs/1751-runner-job-hooks.md
+# Scripts referenced in the ACTIONS_RUNNER_HOOK_ environment variables must end in .sh or .ps1
+# to be valid hook scripts; otherwise GitHub will fail to run the hook
+export ACTIONS_RUNNER_HOOK_JOB_STARTED=/etc/arc/hooks/job-started.sh
+export ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/etc/arc/hooks/job-completed.sh

-if [ ! -f /etc/docker/daemon.json ]; then
-  echo "{}" > /etc/docker/daemon.json
+if [ -n "${STARTUP_DELAY_IN_SECONDS}" ]; then
+  log.notice "Delaying startup by ${STARTUP_DELAY_IN_SECONDS} seconds"
+  sleep "${STARTUP_DELAY_IN_SECONDS}"
 fi

-if [ -n "${MTU}" ]; then
-jq ".\"mtu\" = ${MTU}" /etc/docker/daemon.json > /tmp/.daemon.json && mv /tmp/.daemon.json /etc/docker/daemon.json
-# See https://docs.docker.com/engine/security/rootless/
-echo "environment=DOCKERD_ROOTLESS_ROOTLESSKIT_MTU=${MTU}" >> /etc/supervisor/conf.d/dockerd.conf
+if [ -z "${GITHUB_URL}" ]; then
+  log.debug 'Working with public GitHub'
+  GITHUB_URL="https://github.com/"
+else
+  length=${#GITHUB_URL}
+  last_char=${GITHUB_URL:length-1:1}
+
+  [[ $last_char != "/" ]] && GITHUB_URL="$GITHUB_URL/"; :
+  log.debug "Github endpoint URL ${GITHUB_URL}"
 fi

-if [ -n "${DOCKER_REGISTRY_MIRROR}" ]; then
-jq ".\"registry-mirrors\"[0] = \"${DOCKER_REGISTRY_MIRROR}\"" /etc/docker/daemon.json > /tmp/.daemon.json && mv /tmp/.daemon.json /etc/docker/daemon.json
+if [ -z "${RUNNER_NAME}" ]; then
+  log.error 'RUNNER_NAME must be set'
+  exit 1
 fi
-SCRIPT

-dump() {
-  local path=${1:?missing required <path> argument}
-  shift
-  printf -- "%s\n---\n" "${*//\{path\}/"$path"}" 1>&2
-  cat "$path" 1>&2
-  printf -- '---\n' 1>&2
-}
+if [ -n "${RUNNER_ORG}" ] && [ -n "${RUNNER_REPO}" ] && [ -n "${RUNNER_ENTERPRISE}" ]; then
+  ATTACH="${RUNNER_ORG}/${RUNNER_REPO}"
+elif [ -n "${RUNNER_ORG}" ]; then
+  ATTACH="${RUNNER_ORG}"
+elif [ -n "${RUNNER_REPO}" ]; then
+  ATTACH="${RUNNER_REPO}"
+elif [ -n "${RUNNER_ENTERPRISE}" ]; then
+  ATTACH="enterprises/${RUNNER_ENTERPRISE}"
+else
+  log.error 'At least one of RUNNER_ORG, RUNNER_REPO, or RUNNER_ENTERPRISE must be set'
+  exit 1
+fi

-for config in /etc/docker/daemon.json /etc/supervisor/conf.d/dockerd.conf; do
-  dump "$config" 'Using {path} with the following content:'
+if [ -z "${RUNNER_TOKEN}" ]; then
+  log.error 'RUNNER_TOKEN must be set'
+  exit 1
+fi
+
+if [ -z "${RUNNER_REPO}" ] && [ -n "${RUNNER_GROUP}" ];then
+  RUNNER_GROUPS=${RUNNER_GROUP}
+fi
+
+# Hack due to https://github.com/actions-runner-controller/actions-runner-controller/issues/252#issuecomment-758338483
+if [ ! -d "${RUNNER_HOME}" ]; then
+  log.error "$RUNNER_HOME should be an emptyDir mount. Please fix the pod spec."
+  exit 1
+fi
+
+# if this is not a testing environment
+if [[ "${UNITTEST:-}" == '' ]]; then
+  sudo chown -R runner:docker "$RUNNER_HOME"
+  # enable dotglob so we can copy a ".env" file to load in env vars as part of the service startup if one is provided
+  # loading a .env from the root of the service is part of the actions/runner logic
+  shopt -s dotglob
+  # use cp instead of mv to avoid issues when src and dst are on different devices
+  cp -r "$RUNNER_ASSETS_DIR"/* "$RUNNER_HOME"/
+  shopt -u dotglob
+fi
+
+if ! cd "${RUNNER_HOME}"; then
+  log.error "Failed to cd into ${RUNNER_HOME}"
+  exit 1
+fi
+
+# Past this point, all paths are relative to ${RUNNER_HOME} (/runner by default)
+
+config_args=()
+if [ "${RUNNER_FEATURE_FLAG_ONCE:-}" != "true" ] && [ "${RUNNER_EPHEMERAL}" == "true" ]; then
+  config_args+=(--ephemeral)
+  log.debug 'Passing --ephemeral to config.sh to enable the ephemeral runner.'
+fi
+if [ "${DISABLE_RUNNER_UPDATE:-}" == "true" ]; then
+  config_args+=(--disableupdate)
+  log.debug 'Passing --disableupdate to config.sh to disable automatic runner updates.'
+fi
+
+update-status "Registering"
+
+retries_left=10
+while [[ ${retries_left} -gt 0 ]]; do
+  log.debug 'Configuring the runner.'
+  ./config.sh --unattended --replace \
+    --name "${RUNNER_NAME}" \
+    --url "${GITHUB_URL}${ATTACH}" \
+    --token "${RUNNER_TOKEN}" \
+    --runnergroup "${RUNNER_GROUPS}" \
+    --labels "${RUNNER_LABELS}" \
+    --work "${RUNNER_WORKDIR}" "${config_args[@]}"
+
+  if [ -f .runner ]; then
+    log.debug 'Runner successfully configured.'
+    break
+  fi
+
+  log.debug 'Configuration failed. Retrying'
+  retries_left=$((retries_left - 1))
+  sleep 1
 done

-log.debug 'Starting supervisor daemon'
-sudo /usr/bin/supervisord -n >> /dev/null 2>&1 &
-
-log.debug 'Waiting for processes to be running...'
-processes=(dockerd)
-
-for process in "${processes[@]}"; do
-    if ! wait_for_process "$process"; then
-        log.error "$process is not running after max time"
-        dump /var/log/dockerd.err.log 'Dumping {path} to aid investigation'
-        dump /var/log/supervisor/supervisord.log 'Dumping {path} to aid investigation'
-        exit 1
-    else
-        log.debug "$process is running"
-    fi
-done
-
-if [ -n "${MTU}" ]; then
-  sudo ifconfig docker0 mtu "${MTU}" up
+if [ ! -f .runner ]; then
+  # we couldn't configure and register the runner; no point continuing
+  log.error 'Configuration failed!'
+  exit 2
 fi

-# Wait processes to be running
-entrypoint.sh
+cat .runner
+# Note: the `.runner` file's content should be something like the below:
+#
+# $ cat /runner/.runner
+# {
+# "agentId": 117, #=> corresponds to the ID of the runner
+# "agentName": "THE_RUNNER_POD_NAME",
+# "poolId": 1,
+# "poolName": "Default",
+# "serverUrl": "https://pipelines.actions.githubusercontent.com/SOME_RANDOM_ID",
+# "gitHubUrl": "https://github.com/USER/REPO",
+# "workFolder": "/some/work/dir" #=> corresponds to Runner.Spec.WorkDir
+# }
+#
+# The `agentId` is especially important: other than listing all the runners in the repo,
+# this is the only chance we get to learn the exact runner ID, which can be useful for further
+# GitHub API calls like the one below. Note that 117 is the agentId seen above.
+#   curl \
+#     -H "Accept: application/vnd.github.v3+json" \
+#     -H "Authorization: bearer ${GITHUB_TOKEN}" \
+#     https://api.github.com/repos/USER/REPO/actions/runners/117
+
+# Hack due to the DinD volumes
+if [ -z "${UNITTEST:-}" ] && [ -e ./externalstmp ]; then
+  mkdir -p ./externals
+  mv ./externalstmp/* ./externals/
+fi
+
+if [[ "${DISABLE_WAIT_FOR_DOCKER}" != "true" ]] && [[ "${DOCKER_ENABLED}" == "true" ]]; then
+    log.debug 'Docker enabled runner detected and Docker daemon wait is enabled'
+    log.debug 'Waiting until Docker is available or the timeout is reached'
+    timeout 120s bash -c 'until docker ps ;do sleep 1; done'
+else
+  log.notice 'Docker wait check skipped. Either Docker is disabled or the wait is disabled, continuing with entrypoint'
+fi
+
+# Unset entrypoint environment variables so they don't leak into the runner environment
+unset RUNNER_NAME RUNNER_REPO RUNNER_TOKEN STARTUP_DELAY_IN_SECONDS DISABLE_WAIT_FOR_DOCKER
+
+# Docker ignores PAM and thus never loads the system environment variables that
+# are meant to be set in every environment of every user. We emulate the PAM
+# behavior by reading the environment variables without interpreting them.
+#
+# https://github.com/actions-runner-controller/actions-runner-controller/issues/1135
+# https://github.com/actions/runner/issues/1703
+
+# /etc/environment may not exist when running unit tests depending on the platform being used
+# (e.g. Mac OS) so we just skip the mapping entirely
+if [ -z "${UNITTEST:-}" ]; then
+  mapfile -t env </etc/environment
+fi
+update-status "Idle"
+exec env -- "${env[@]}" ./run.sh