Fix runners to do their best to gracefully stop on pod eviction (#1759)

Ref #1535 Ref #1581 Signed-off-by: Yusuke Kuoka <ykuoka@gmail.com>
2025-12-28 04:29:08 +08:00 · 2022-11-01 20:30:10 +09:00
parent 332548093a
commit c74ad6195f
30 changed files with 757 additions and 301 deletions
--- a/runner/actions-runner-dind-rootless.dockerfile
+++ b/runner/actions-runner-dind-rootless.dockerfile
@@ -102,9 +102,9 @@ RUN export ARCH=$(echo ${TARGETPLATFORM} | cut -d / -f2) \
    && curl -f -L -o /usr/local/bin/dumb-init https://github.com/Yelp/dumb-init/releases/download/v${DUMB_INIT_VERSION}/dumb-init_${DUMB_INIT_VERSION}_${ARCH} \
    && chmod +x /usr/local/bin/dumb-init

-COPY entrypoint.sh logger.bash rootless-startup.sh update-status /usr/bin/
+COPY entrypoint-dind-rootless.sh startup.sh logger.sh graceful-stop.sh update-status /usr/bin/

-RUN chmod +x /usr/bin/rootless-startup.sh /usr/bin/entrypoint.sh
+RUN chmod +x /usr/bin/entrypoint-dind-rootless.sh /usr/bin/startup.sh

 # Copy the docker shim which propagates the docker MTU to underlying networks
 # to replace the docker binary in the PATH.
@@ -140,5 +140,5 @@ RUN curl -fsSL https://get.docker.com/rootless | sh
 RUN curl -L "https://github.com/docker/compose/releases/download/${COMPOSE_VERSION}/docker-compose-Linux-x86_64" -o /home/runner/bin/docker-compose ; \
    chmod +x /home/runner/bin/docker-compose

-ENTRYPOINT ["/usr/local/bin/dumb-init", "--"]
-CMD ["rootless-startup.sh"]
+ENTRYPOINT ["/bin/bash", "-c"]
+CMD ["entrypoint-dind-rootless.sh"]
--- a/runner/actions-runner-dind.dockerfile
+++ b/runner/actions-runner-dind.dockerfile
@@ -99,9 +99,9 @@ RUN mkdir /opt/hostedtoolcache \

 # We place the scripts in `/usr/bin` so that users who extend this image can
 # override them with scripts of the same name placed in `/usr/local/bin`.
-COPY entrypoint.sh logger.bash startup.sh update-status /usr/bin/
+COPY entrypoint-dind.sh startup.sh logger.sh wait.sh graceful-stop.sh update-status /usr/bin/
 COPY supervisor/ /etc/supervisor/conf.d/
-RUN chmod +x /usr/bin/startup.sh /usr/bin/entrypoint.sh
+RUN chmod +x /usr/bin/entrypoint-dind.sh /usr/bin/startup.sh

 # Copy the docker shim which propagates the docker MTU to underlying networks
 # to replace the docker binary in the PATH.
@@ -130,5 +130,5 @@ RUN echo "PATH=${PATH}" > /etc/environment \
 # No group definition, as that makes it harder to run docker.
 USER runner

-ENTRYPOINT ["/usr/local/bin/dumb-init", "--"]
-CMD ["startup.sh"]
+ENTRYPOINT ["/bin/bash", "-c"]
+CMD ["entrypoint-dind.sh"]
--- a/runner/actions-runner.dockerfile
+++ b/runner/actions-runner.dockerfile
@@ -117,7 +117,7 @@ RUN mkdir /opt/hostedtoolcache \

 # We place the scripts in `/usr/bin` so that users who extend this image can
 # override them with scripts of the same name placed in `/usr/local/bin`.
-COPY entrypoint.sh logger.bash update-status /usr/bin/
+COPY entrypoint.sh startup.sh logger.sh graceful-stop.sh update-status /usr/bin/

 # Copy the docker shim which propagates the docker MTU to underlying networks
 # to replace the docker binary in the PATH.
@@ -136,5 +136,5 @@ RUN echo "PATH=${PATH}" > /etc/environment \

 USER runner

-ENTRYPOINT ["/usr/local/bin/dumb-init", "--"]
+ENTRYPOINT ["/bin/bash", "-c"]
 CMD ["entrypoint.sh"]
--- a/runner/entrypoint-dind-rootless.sh
+++ b/runner/entrypoint-dind-rootless.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
-source logger.bash
+source logger.sh
+source graceful-stop.sh
+trap graceful_stop TERM

 log.notice "Writing out Docker config file"
 /bin/bash <<SCRIPT
@@ -21,7 +23,20 @@ fi
 SCRIPT

 log.notice "Starting Docker (rootless)"
+
+dumb-init bash <<'SCRIPT' &
+# Note that we don't want dockerd to be terminated before the runner agent,
+# because it defeats the goal of the runner agent graceful stop logic implemented above.
+# We can't rely on e.g. `dumb-init --single-child` for that, because with `--single-child` we can't even trap SIGTERM
+# for not only dockerd but also the runner agent.
 /home/runner/bin/dockerd-rootless.sh --config-file /home/runner/.config/docker/daemon.json >> /dev/null 2>&1 &

-# Wait processes to be running
-entrypoint.sh
+startup.sh
+SCRIPT
+
+RUNNER_INIT_PID=$!
+log.notice "Runner init started with pid $RUNNER_INIT_PID"
+wait $RUNNER_INIT_PID
+log.notice "Runner init exited. Exiting this process with code 0 so that the container and the pod is GC'ed Kubernetes soon."
+
+trap - TERM
--- a/runner/entrypoint-dind.sh
+++ b/runner/entrypoint-dind.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+source logger.sh
+source graceful-stop.sh
+trap graceful_stop TERM
+
+sudo /bin/bash <<SCRIPT
+mkdir -p /etc/docker
+
+if [ ! -f /etc/docker/daemon.json ]; then
+  echo "{}" > /etc/docker/daemon.json
+fi
+
+if [ -n "${MTU}" ]; then
+jq ".\"mtu\" = ${MTU}" /etc/docker/daemon.json > /tmp/.daemon.json && mv /tmp/.daemon.json /etc/docker/daemon.json
+# See https://docs.docker.com/engine/security/rootless/
+echo "environment=DOCKERD_ROOTLESS_ROOTLESSKIT_MTU=${MTU}" >> /etc/supervisor/conf.d/dockerd.conf
+fi
+
+if [ -n "${DOCKER_REGISTRY_MIRROR}" ]; then
+jq ".\"registry-mirrors\"[0] = \"${DOCKER_REGISTRY_MIRROR}\"" /etc/docker/daemon.json > /tmp/.daemon.json && mv /tmp/.daemon.json /etc/docker/daemon.json
+fi
+SCRIPT
+
+dumb-init bash <<'SCRIPT' &
+source logger.sh
+source wait.sh
+
+dump() {
+  local path=${1:?missing required <path> argument}
+  shift
+  printf -- "%s\n---\n" "${*//\{path\}/"$path"}" 1>&2
+  cat "$path" 1>&2
+  printf -- '---\n' 1>&2
+}
+
+for config in /etc/docker/daemon.json /etc/supervisor/conf.d/dockerd.conf; do
+  dump "$config" 'Using {path} with the following content:'
+done
+
+log.debug 'Starting supervisor daemon'
+sudo /usr/bin/supervisord -n >> /dev/null 2>&1 &
+
+log.debug 'Waiting for processes to be running...'
+processes=(dockerd)
+
+for process in "${processes[@]}"; do
+    if ! wait_for_process "$process"; then
+        log.error "$process is not running after max time"
+        dump /var/log/dockerd.err.log 'Dumping {path} to aid investigation'
+        dump /var/log/supervisor/supervisord.log 'Dumping {path} to aid investigation'
+        exit 1
+    else
+        log.debug "$process is running"
+    fi
+done
+
+if [ -n "${MTU}" ]; then
+  sudo ifconfig docker0 mtu "${MTU}" up
+fi
+
+startup.sh
+SCRIPT
+
+RUNNER_INIT_PID=$!
+log.notice "Runner init started with pid $RUNNER_INIT_PID"
+wait $RUNNER_INIT_PID
+log.notice "Runner init exited. Exiting this process with code 0 so that the container and the pod is GC'ed Kubernetes soon."
+
+trap - TERM
--- a/runner/entrypoint.sh
+++ b/runner/entrypoint.sh
@@ -1,172 +1,30 @@
 #!/bin/bash
-source logger.bash
+source logger.sh
+source graceful-stop.sh
+trap graceful_stop TERM

-RUNNER_ASSETS_DIR=${RUNNER_ASSETS_DIR:-/runnertmp}
-RUNNER_HOME=${RUNNER_HOME:-/runner}
+dumb-init bash <<'SCRIPT' &
+source logger.sh

-# Let GitHub runner execute these hooks. These environment variables are used by GitHub's Runner as described here
-# https://github.com/actions/runner/blob/main/docs/adrs/1751-runner-job-hooks.md
-# Scripts referenced in the ACTIONS_RUNNER_HOOK_ environment variables must end in .sh or .ps1
-# for it to become a valid hook script, otherwise GitHub will fail to run the hook
-export ACTIONS_RUNNER_HOOK_JOB_STARTED=/etc/arc/hooks/job-started.sh
-export ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/etc/arc/hooks/job-completed.sh
+startup.sh
+SCRIPT

-if [ -n "${STARTUP_DELAY_IN_SECONDS}" ]; then
-  log.notice "Delaying startup by ${STARTUP_DELAY_IN_SECONDS} seconds"
-  sleep "${STARTUP_DELAY_IN_SECONDS}"
+RUNNER_INIT_PID=$!
+log.notice "Runner init started with pid $RUNNER_INIT_PID"
+wait $RUNNER_INIT_PID
+log.notice "Runner init exited. Exiting this process with code 0 so that the container and the pod is GC'ed Kubernetes soon."
+
+if [ -f /runner/.runner ]; then
+# If the runner failed with the following error:
+#   √ Connected to GitHub
+#   Failed to create a session. The runner registration has been deleted from the server, please re-configure.
+#   Runner listener exit with terminated error, stop the service, no retry needed.
+#   Exiting runner...
+# It might have failed to delete the .runner file.
+# We use the existence of the .runner file as the indicator that the runner agent has not stopped yet.
+# Remove it by ourselves now, so that the dockerd sidecar prestop won't hang waiting for the .runner file to disappear.
+  echo "Removing the .runner file"
+  rm -f /runner/.runner
 fi

-if [ -z "${GITHUB_URL}" ]; then
-  log.debug 'Working with public GitHub'
-  GITHUB_URL="https://github.com/"
-else
-  length=${#GITHUB_URL}
-  last_char=${GITHUB_URL:length-1:1}
-
-  [[ $last_char != "/" ]] && GITHUB_URL="$GITHUB_URL/"; :
-  log.debug "Github endpoint URL ${GITHUB_URL}"
-fi
-
-if [ -z "${RUNNER_NAME}" ]; then
-  log.error 'RUNNER_NAME must be set'
-  exit 1
-fi
-
-if [ -n "${RUNNER_ORG}" ] && [ -n "${RUNNER_REPO}" ] && [ -n "${RUNNER_ENTERPRISE}" ]; then
-  ATTACH="${RUNNER_ORG}/${RUNNER_REPO}"
-elif [ -n "${RUNNER_ORG}" ]; then
-  ATTACH="${RUNNER_ORG}"
-elif [ -n "${RUNNER_REPO}" ]; then
-  ATTACH="${RUNNER_REPO}"
-elif [ -n "${RUNNER_ENTERPRISE}" ]; then
-  ATTACH="enterprises/${RUNNER_ENTERPRISE}"
-else
-  log.error 'At least one of RUNNER_ORG, RUNNER_REPO, or RUNNER_ENTERPRISE must be set'
-  exit 1
-fi
-
-if [ -z "${RUNNER_TOKEN}" ]; then
-  log.error 'RUNNER_TOKEN must be set'
-  exit 1
-fi
-
-if [ -z "${RUNNER_REPO}" ] && [ -n "${RUNNER_GROUP}" ];then
-  RUNNER_GROUPS=${RUNNER_GROUP}
-fi
-
-# Hack due to https://github.com/actions-runner-controller/actions-runner-controller/issues/252#issuecomment-758338483
-if [ ! -d "${RUNNER_HOME}" ]; then
-  log.error "$RUNNER_HOME should be an emptyDir mount. Please fix the pod spec."
-  exit 1
-fi
-
-# if this is not a testing environment
-if [[ "${UNITTEST:-}" == '' ]]; then
-  sudo chown -R runner:docker "$RUNNER_HOME"
-  # enable dotglob so we can copy a ".env" file to load in env vars as part of the service startup if one is provided
-  # loading a .env from the root of the service is part of the actions/runner logic
-  shopt -s dotglob
-  # use cp instead of mv to avoid issues when src and dst are on different devices
-  cp -r "$RUNNER_ASSETS_DIR"/* "$RUNNER_HOME"/
-  shopt -u dotglob
-fi
-
-if ! cd "${RUNNER_HOME}"; then
-  log.error "Failed to cd into ${RUNNER_HOME}"
-  exit 1
-fi
-
-# past that point, it's all relative pathes from /runner
-
-config_args=()
-if [ "${RUNNER_FEATURE_FLAG_ONCE:-}" != "true" ] && [ "${RUNNER_EPHEMERAL}" == "true" ]; then
-  config_args+=(--ephemeral)
-  log.debug 'Passing --ephemeral to config.sh to enable the ephemeral runner.'
-fi
-if [ "${DISABLE_RUNNER_UPDATE:-}" == "true" ]; then
-  config_args+=(--disableupdate)
-  log.debug 'Passing --disableupdate to config.sh to disable automatic runner updates.'
-fi
-
-update-status "Registering"
-
-retries_left=10
-while [[ ${retries_left} -gt 0 ]]; do
-  log.debug 'Configuring the runner.'
-  ./config.sh --unattended --replace \
-    --name "${RUNNER_NAME}" \
-    --url "${GITHUB_URL}${ATTACH}" \
-    --token "${RUNNER_TOKEN}" \
-    --runnergroup "${RUNNER_GROUPS}" \
-    --labels "${RUNNER_LABELS}" \
-    --work "${RUNNER_WORKDIR}" "${config_args[@]}"
-
-  if [ -f .runner ]; then
-    log.debug 'Runner successfully configured.'
-    break
-  fi
-
-  log.debug 'Configuration failed. Retrying'
-  retries_left=$((retries_left - 1))
-  sleep 1
-done
-
-if [ ! -f .runner ]; then
-  # we couldn't configure and register the runner; no point continuing
-  log.error 'Configuration failed!'
-  exit 2
-fi
-
-cat .runner
-# Note: the `.runner` file's content should be something like the below:
-#
-# $ cat /runner/.runner
-# {
-# "agentId": 117, #=> corresponds to the ID of the runner
-# "agentName": "THE_RUNNER_POD_NAME",
-# "poolId": 1,
-# "poolName": "Default",
-# "serverUrl": "https://pipelines.actions.githubusercontent.com/SOME_RANDOM_ID",
-# "gitHubUrl": "https://github.com/USER/REPO",
-# "workFolder": "/some/work/dir" #=> corresponds to Runner.Spec.WorkDir
-# }
-#
-# Especially `agentId` is important, as other than listing all the runners in the repo,
-# this is the only change we could get the exact runnner ID which can be useful for further
-# GitHub API call like the below. Note that 171 is the agentId seen above.
-#   curl \
-#     -H "Accept: application/vnd.github.v3+json" \
-#     -H "Authorization: bearer ${GITHUB_TOKEN}"
-#     https://api.github.com/repos/USER/REPO/actions/runners/171
-
-# Hack due to the DinD volumes
-if [ -z "${UNITTEST:-}" ] && [ -e ./externalstmp ]; then
-  mkdir -p ./externals
-  mv ./externalstmp/* ./externals/
-fi
-
-if [[ "${DISABLE_WAIT_FOR_DOCKER}" != "true" ]] && [[ "${DOCKER_ENABLED}" == "true" ]]; then
-    log.debug 'Docker enabled runner detected and Docker daemon wait is enabled'
-    log.debug 'Waiting until Docker is available or the timeout is reached'
-    timeout 120s bash -c 'until docker ps ;do sleep 1; done'
-else
-  log.notice 'Docker wait check skipped. Either Docker is disabled or the wait is disabled, continuing with entrypoint'
-fi
-
-# Unset entrypoint environment variables so they don't leak into the runner environment
-unset RUNNER_NAME RUNNER_REPO RUNNER_TOKEN STARTUP_DELAY_IN_SECONDS DISABLE_WAIT_FOR_DOCKER
-
-# Docker ignores PAM and thus never loads the system environment variables that
-# are meant to be set in every environment of every user. We emulate the PAM
-# behavior by reading the environment variables without interpreting them.
-#
-# https://github.com/actions-runner-controller/actions-runner-controller/issues/1135
-# https://github.com/actions/runner/issues/1703
-
-# /etc/environment may not exist when running unit tests depending on the platform being used
-# (e.g. Mac OS) so we just skip the mapping entirely
-if [ -z "${UNITTEST:-}" ]; then
-  mapfile -t env </etc/environment
-fi
-update-status "Idle"
-exec env -- "${env[@]}" ./run.sh
+trap - TERM
--- a/runner/graceful-stop.sh
+++ b/runner/graceful-stop.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+
+# This should be sufficiently shorter than the terminationGracePeriodSeconds,
+# so that the job is cancelled immediately, instead of hanging for 10 minutes or so and failing without any error message.
+RUNNER_GRACEFUL_STOP_TIMEOUT=${RUNNER_GRACEFUL_STOP_TIMEOUT:-15}
+
+graceful_stop() {
+  log.notice "Executing actions-runner-controller's SIGTERM handler."
+  log.notice "Note that if this takes more time than terminationGracePeriodSeconds, the runner will be forcefully terminated by Kubernetes, which may result in the in-progress workflow job, if any, to fail."
+
+  log.notice "Ensuring dockerd is still running."
+  if ! docker ps -a; then
+    log.warning "Detected configuration error: dockerd should be running but is already nowhere. This is wrong. Ensure that your init system to NOT pass SIGTERM directly to dockerd!"
+  fi
+
+  # The below procedure atomically removes the runner from GitHub Actions service,
+  # to ensure that the runner is not running any job.
+  # This is required to not terminate the actions runner agent while running the job.
+  # If we didn't do this atomically, we might end up with a rare race where
+  # the runner agent is terminated while it was about to start a job.
+
+  # `pushd` is needed to run the config.sh successfully.
+  # Without this the author of this script ended up with errors like the below:
+  #   Cannot connect to server, because config files are missing. Skipping removing runner from the server.
+  #   Does not exist. Skipping Removing .credentials
+  #   Does not exist. Skipping Removing .runner
+  if ! pushd /runner; then
+    log.error "Failed to pushd ${RUNNER_HOME}"
+    exit 1
+  fi
+
+  # We need to wait for the registration first.
+  # Otherwise a direct runner pod deletion triggered while the runner entrypoint.sh is about to register itself with
+  # config.sh can result in this graceful stop process being skipped.
+  # In that case, the pod is eventually and forcefully terminated by ARC and K8s, so that
+  # a workflow job that started running after this graceful stop process failed might get cancelled prematurely.
+  log.notice "Waiting for the runner to register first."
+  while ! [ -f /runner/.runner ]; do
+    sleep 1
+  done
+  log.notice "Observed that the runner has been registered."
+
+  if ! /runner/config.sh remove --token "$RUNNER_TOKEN"; then
+    i=0
+    log.notice "Waiting for RUNNER_GRACEFUL_STOP_TIMEOUT=$RUNNER_GRACEFUL_STOP_TIMEOUT seconds until the runner agent to stop by itself."
+    while [[ $i -lt $RUNNER_GRACEFUL_STOP_TIMEOUT ]]; do
+      sleep 1
+      if ! pgrep Runner.Listener > /dev/null; then
+        log.notice "The runner agent stopped before RUNNER_GRACEFUL_STOP_TIMEOUT=$RUNNER_GRACEFUL_STOP_TIMEOUT"
+        break
+      fi
+      i=$((i+1))
+    done
+  fi
+
+  if ! popd; then
+    log.error "Failed to popd from ${RUNNER_HOME}"
+    exit 1
+  fi
+
+  if pgrep Runner.Listener > /dev/null; then
+    # The below procedure makes the runner correctly notify the Actions service of the cancellation of this runner.
+    # It enables you to see `Error: The operation was canceled.` in the workflow job log, in case a job was still running on this runner when the
+    # termination is requested.
+    #
+    # Note though, due to how Actions works, not all job steps get `Error: The operation was canceled.` in the job step logs.
+    # Jobs that were still in the first `Set up job` step seem to get `Error: A task was canceled.` instead.
+    #
+    # Anyway, without this, a runner pod being "forcefully" killed by any other controller (like cluster-autoscaler) can result in the workflow job
+    # hanging for 10 minutes or so.
+    # After 10 minutes, the Actions UI just shows the failure icon for the step,
+    # not even showing `Error: The operation was canceled.`, which is confusing.
+    runner_listener_pid=$(pgrep Runner.Listener)
+    log.notice "Sending SIGTERM to the actions runner agent ($runner_listener_pid)."
+    kill -TERM "$runner_listener_pid"
+
+    log.notice "SIGTERM sent. If the runner is still running a job, you'll probably see \"Error: The operation was canceled.\" in its log."
+    log.notice "Waiting for the actions runner agent to stop."
+    while pgrep Runner.Listener > /dev/null; do
+      sleep 1
+    done
+  fi
+
+  # This message is supposed to be output only after the runner agent output:
+  #   2022-08-27 02:04:37Z: Job test3 completed with result: Canceled
+  # because this graceful stopping logic is basically intended to let the runner agent have some time
+  # needed to "Cancel" it.
+  # Before we had this logic, the runner agent was unable to even output the Cancelled message, hence
+  # unable to gracefully stop, hence the workflow job hung seemingly forever.
+  log.notice "The actions runner process exited."
+  
+  if [ "$RUNNER_INIT_PID" != "" ]; then
+    log.notice "Holding on until runner init (pid $RUNNER_INIT_PID) exits, so that there will hopefully be no zombie processes remaining."
+    # We don't need to kill -TERM $RUNNER_INIT_PID as the init is supposed to exit by itself once the foreground process (=the runner agent) exits.
+    wait "$RUNNER_INIT_PID" || :
+  fi
+  
+  log.notice "Graceful stop completed."
+}
--- a/runner/hooks/job-completed.sh
+++ b/runner/hooks/job-completed.sh
@@ -1,8 +1,8 @@
 #!/usr/bin/env bash
 set -Eeuo pipefail

-# shellcheck source=runner/logger.bash
-source logger.bash
+# shellcheck source=runner/logger.sh
+source logger.sh

 log.debug "Running ARC Job Completed Hooks"

--- a/runner/hooks/job-started.sh
+++ b/runner/hooks/job-started.sh
@@ -1,8 +1,8 @@
 #!/usr/bin/env bash
 set -Eeuo pipefail

-# shellcheck source=runner/logger.bash
-source logger.bash
+# shellcheck source=runner/logger.sh
+source logger.sh

 log.debug "Running ARC Job Started Hooks"

--- a/runner/logger.bash
+++ b/runner/logger.bash
@@ -6,7 +6,7 @@
 # are not using any variables that need to be set, and are not using any pipes.

 # This logger implementation can be replaced with another logger implementation
-# by placing a script called `logger.bash` in `/usr/local/bin` of the image. The
+# by placing a script called `logger.sh` in `/usr/local/bin` of the image. The
 # only requirement for the script is that it defines the following functions:
 #
 # - `log.debug`
--- a/runner/startup.sh
+++ b/runner/startup.sh
@@ -1,72 +1,172 @@
 #!/bin/bash
-source logger.bash
+source logger.sh

-function wait_for_process () {
-    local max_time_wait=30
-    local process_name="$1"
-    local waited_sec=0
-    while ! pgrep "$process_name" >/dev/null && ((waited_sec < max_time_wait)); do
-        log.debug "Process $process_name is not running yet. Retrying in 1 seconds"
-        log.debug "Waited $waited_sec seconds of $max_time_wait seconds"
-        sleep 1
-        ((waited_sec=waited_sec+1))
-        if ((waited_sec >= max_time_wait)); then
-            return 1
-        fi
-    done
-    return 0
-}
+RUNNER_ASSETS_DIR=${RUNNER_ASSETS_DIR:-/runnertmp}
+RUNNER_HOME=${RUNNER_HOME:-/runner}

-sudo /bin/bash <<SCRIPT
-mkdir -p /etc/docker
+# Let GitHub runner execute these hooks. These environment variables are used by GitHub's Runner as described here
+# https://github.com/actions/runner/blob/main/docs/adrs/1751-runner-job-hooks.md
+# Scripts referenced in the ACTIONS_RUNNER_HOOK_ environment variables must end in .sh or .ps1
+# for it to become a valid hook script, otherwise GitHub will fail to run the hook
+export ACTIONS_RUNNER_HOOK_JOB_STARTED=/etc/arc/hooks/job-started.sh
+export ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/etc/arc/hooks/job-completed.sh

-if [ ! -f /etc/docker/daemon.json ]; then
-  echo "{}" > /etc/docker/daemon.json
+if [ -n "${STARTUP_DELAY_IN_SECONDS}" ]; then
+  log.notice "Delaying startup by ${STARTUP_DELAY_IN_SECONDS} seconds"
+  sleep "${STARTUP_DELAY_IN_SECONDS}"
 fi

-if [ -n "${MTU}" ]; then
-jq ".\"mtu\" = ${MTU}" /etc/docker/daemon.json > /tmp/.daemon.json && mv /tmp/.daemon.json /etc/docker/daemon.json
-# See https://docs.docker.com/engine/security/rootless/
-echo "environment=DOCKERD_ROOTLESS_ROOTLESSKIT_MTU=${MTU}" >> /etc/supervisor/conf.d/dockerd.conf
+if [ -z "${GITHUB_URL}" ]; then
+  log.debug 'Working with public GitHub'
+  GITHUB_URL="https://github.com/"
+else
+  length=${#GITHUB_URL}
+  last_char=${GITHUB_URL:length-1:1}
+
+  [[ $last_char != "/" ]] && GITHUB_URL="$GITHUB_URL/"; :
+  log.debug "Github endpoint URL ${GITHUB_URL}"
 fi

-if [ -n "${DOCKER_REGISTRY_MIRROR}" ]; then
-jq ".\"registry-mirrors\"[0] = \"${DOCKER_REGISTRY_MIRROR}\"" /etc/docker/daemon.json > /tmp/.daemon.json && mv /tmp/.daemon.json /etc/docker/daemon.json
+if [ -z "${RUNNER_NAME}" ]; then
+  log.error 'RUNNER_NAME must be set'
+  exit 1
 fi
-SCRIPT

-dump() {
-  local path=${1:?missing required <path> argument}
-  shift
-  printf -- "%s\n---\n" "${*//\{path\}/"$path"}" 1>&2
-  cat "$path" 1>&2
-  printf -- '---\n' 1>&2
-}
+if [ -n "${RUNNER_ORG}" ] && [ -n "${RUNNER_REPO}" ] && [ -n "${RUNNER_ENTERPRISE}" ]; then
+  ATTACH="${RUNNER_ORG}/${RUNNER_REPO}"
+elif [ -n "${RUNNER_ORG}" ]; then
+  ATTACH="${RUNNER_ORG}"
+elif [ -n "${RUNNER_REPO}" ]; then
+  ATTACH="${RUNNER_REPO}"
+elif [ -n "${RUNNER_ENTERPRISE}" ]; then
+  ATTACH="enterprises/${RUNNER_ENTERPRISE}"
+else
+  log.error 'At least one of RUNNER_ORG, RUNNER_REPO, or RUNNER_ENTERPRISE must be set'
+  exit 1
+fi

-for config in /etc/docker/daemon.json /etc/supervisor/conf.d/dockerd.conf; do
-  dump "$config" 'Using {path} with the following content:'
+if [ -z "${RUNNER_TOKEN}" ]; then
+  log.error 'RUNNER_TOKEN must be set'
+  exit 1
+fi
+
+if [ -z "${RUNNER_REPO}" ] && [ -n "${RUNNER_GROUP}" ];then
+  RUNNER_GROUPS=${RUNNER_GROUP}
+fi
+
+# Hack due to https://github.com/actions-runner-controller/actions-runner-controller/issues/252#issuecomment-758338483
+if [ ! -d "${RUNNER_HOME}" ]; then
+  log.error "$RUNNER_HOME should be an emptyDir mount. Please fix the pod spec."
+  exit 1
+fi
+
+# if this is not a testing environment
+if [[ "${UNITTEST:-}" == '' ]]; then
+  sudo chown -R runner:docker "$RUNNER_HOME"
+  # enable dotglob so we can copy a ".env" file to load in env vars as part of the service startup if one is provided
+  # loading a .env from the root of the service is part of the actions/runner logic
+  shopt -s dotglob
+  # use cp instead of mv to avoid issues when src and dst are on different devices
+  cp -r "$RUNNER_ASSETS_DIR"/* "$RUNNER_HOME"/
+  shopt -u dotglob
+fi
+
+if ! cd "${RUNNER_HOME}"; then
+  log.error "Failed to cd into ${RUNNER_HOME}"
+  exit 1
+fi
+
+# Past this point, all paths are relative to /runner
+
+config_args=()
+if [ "${RUNNER_FEATURE_FLAG_ONCE:-}" != "true" ] && [ "${RUNNER_EPHEMERAL}" == "true" ]; then
+  config_args+=(--ephemeral)
+  log.debug 'Passing --ephemeral to config.sh to enable the ephemeral runner.'
+fi
+if [ "${DISABLE_RUNNER_UPDATE:-}" == "true" ]; then
+  config_args+=(--disableupdate)
+  log.debug 'Passing --disableupdate to config.sh to disable automatic runner updates.'
+fi
+
+update-status "Registering"
+
+retries_left=10
+while [[ ${retries_left} -gt 0 ]]; do
+  log.debug 'Configuring the runner.'
+  ./config.sh --unattended --replace \
+    --name "${RUNNER_NAME}" \
+    --url "${GITHUB_URL}${ATTACH}" \
+    --token "${RUNNER_TOKEN}" \
+    --runnergroup "${RUNNER_GROUPS}" \
+    --labels "${RUNNER_LABELS}" \
+    --work "${RUNNER_WORKDIR}" "${config_args[@]}"
+
+  if [ -f .runner ]; then
+    log.debug 'Runner successfully configured.'
+    break
+  fi
+
+  log.debug 'Configuration failed. Retrying'
+  retries_left=$((retries_left - 1))
+  sleep 1
 done

-log.debug 'Starting supervisor daemon'
-sudo /usr/bin/supervisord -n >> /dev/null 2>&1 &
-
-log.debug 'Waiting for processes to be running...'
-processes=(dockerd)
-
-for process in "${processes[@]}"; do
-    if ! wait_for_process "$process"; then
-        log.error "$process is not running after max time"
-        dump /var/log/dockerd.err.log 'Dumping {path} to aid investigation'
-        dump /var/log/supervisor/supervisord.log 'Dumping {path} to aid investigation'
-        exit 1
-    else
-        log.debug "$process is running"
-    fi
-done
-
-if [ -n "${MTU}" ]; then
-  sudo ifconfig docker0 mtu "${MTU}" up
+if [ ! -f .runner ]; then
+  # we couldn't configure and register the runner; no point continuing
+  log.error 'Configuration failed!'
+  exit 2
 fi

-# Wait processes to be running
-entrypoint.sh
+cat .runner
+# Note: the `.runner` file's content should be something like the below:
+#
+# $ cat /runner/.runner
+# {
+# "agentId": 117, #=> corresponds to the ID of the runner
+# "agentName": "THE_RUNNER_POD_NAME",
+# "poolId": 1,
+# "poolName": "Default",
+# "serverUrl": "https://pipelines.actions.githubusercontent.com/SOME_RANDOM_ID",
+# "gitHubUrl": "https://github.com/USER/REPO",
+# "workFolder": "/some/work/dir" #=> corresponds to Runner.Spec.WorkDir
+# }
+#
+# Especially `agentId` is important, as other than listing all the runners in the repo,
+# this is the only chance we could get the exact runner ID which can be useful for further
+# GitHub API call like the below. Note that 117 is the agentId seen above.
+#   curl \
+#     -H "Accept: application/vnd.github.v3+json" \
+#     -H "Authorization: bearer ${GITHUB_TOKEN}" \
+#     https://api.github.com/repos/USER/REPO/actions/runners/117
+
+# Hack due to the DinD volumes
+if [ -z "${UNITTEST:-}" ] && [ -e ./externalstmp ]; then
+  mkdir -p ./externals
+  mv ./externalstmp/* ./externals/
+fi
+
+if [[ "${DISABLE_WAIT_FOR_DOCKER}" != "true" ]] && [[ "${DOCKER_ENABLED}" == "true" ]]; then
+    log.debug 'Docker enabled runner detected and Docker daemon wait is enabled'
+    log.debug 'Waiting until Docker is available or the timeout is reached'
+    timeout 120s bash -c 'until docker ps ;do sleep 1; done'
+else
+  log.notice 'Docker wait check skipped. Either Docker is disabled or the wait is disabled, continuing with entrypoint'
+fi
+
+# Unset entrypoint environment variables so they don't leak into the runner environment
+unset RUNNER_NAME RUNNER_REPO RUNNER_TOKEN STARTUP_DELAY_IN_SECONDS DISABLE_WAIT_FOR_DOCKER
+
+# Docker ignores PAM and thus never loads the system environment variables that
+# are meant to be set in every environment of every user. We emulate the PAM
+# behavior by reading the environment variables without interpreting them.
+#
+# https://github.com/actions-runner-controller/actions-runner-controller/issues/1135
+# https://github.com/actions/runner/issues/1703
+
+# /etc/environment may not exist when running unit tests depending on the platform being used
+# (e.g. Mac OS) so we just skip the mapping entirely
+if [ -z "${UNITTEST:-}" ]; then
+  mapfile -t env </etc/environment
+fi
+update-status "Idle"
+exec env -- "${env[@]}" ./run.sh
--- a/runner/update-status
+++ b/runner/update-status
@@ -2,8 +2,8 @@
 set -Eeuo pipefail

 if [[ ${1:-} == '' ]]; then
-  # shellcheck source=runner/logger.bash
-  source logger.bash
+  # shellcheck source=runner/logger.sh
+  source logger.sh
  log.error "Missing required argument -- '<phase>'"
  exit 64
 fi
@@ -26,6 +26,6 @@ if [[ ${RUNNER_STATUS_UPDATE_HOOK:-false} == true ]]; then
        --show-error \
        --silent \
        --request PATCH \
-        "${apiserver}/apis/actions.summerwind.dev/v1alpha1/namespaces/${namespace}/runners/${HOSTNAME}/status"
-        1>&-
+        "${apiserver}/apis/actions.summerwind.dev/v1alpha1/namespaces/${namespace}/runners/${HOSTNAME}/status" \
+        1>/dev/null
 fi
--- a/runner/wait.sh
+++ b/runner/wait.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+function wait_for_process () {
+    local max_time_wait=30
+    local process_name="$1"
+    local waited_sec=0
+    while ! pgrep "$process_name" >/dev/null && ((waited_sec < max_time_wait)); do
+        log.debug "Process $process_name is not running yet. Retrying in 1 seconds"
+        log.debug "Waited $waited_sec seconds of $max_time_wait seconds"
+        sleep 1
+        ((waited_sec=waited_sec+1))
+        if ((waited_sec >= max_time_wait)); then
+            return 1
+        fi
+    done
+    return 0
+}