Fix runners to do their best to gracefully stop on pod eviction (#1759)

Ref #1535
Ref #1581

Signed-off-by: Yusuke Kuoka <ykuoka@gmail.com>
This commit is contained in:
Yusuke Kuoka
2022-11-01 20:30:10 +09:00
committed by GitHub
parent 332548093a
commit c74ad6195f
30 changed files with 757 additions and 301 deletions

View File

@@ -102,9 +102,9 @@ RUN export ARCH=$(echo ${TARGETPLATFORM} | cut -d / -f2) \
&& curl -f -L -o /usr/local/bin/dumb-init https://github.com/Yelp/dumb-init/releases/download/v${DUMB_INIT_VERSION}/dumb-init_${DUMB_INIT_VERSION}_${ARCH} \
&& chmod +x /usr/local/bin/dumb-init
COPY entrypoint.sh logger.bash rootless-startup.sh update-status /usr/bin/
COPY entrypoint-dind-rootless.sh startup.sh logger.sh graceful-stop.sh update-status /usr/bin/
RUN chmod +x /usr/bin/rootless-startup.sh /usr/bin/entrypoint.sh
RUN chmod +x /usr/bin/entrypoint-dind-rootless.sh /usr/bin/startup.sh
# Copy the docker shim which propagates the docker MTU to underlying networks
# to replace the docker binary in the PATH.
@@ -140,5 +140,5 @@ RUN curl -fsSL https://get.docker.com/rootless | sh
RUN curl -L "https://github.com/docker/compose/releases/download/${COMPOSE_VERSION}/docker-compose-Linux-x86_64" -o /home/runner/bin/docker-compose ; \
chmod +x /home/runner/bin/docker-compose
ENTRYPOINT ["/usr/local/bin/dumb-init", "--"]
CMD ["rootless-startup.sh"]
ENTRYPOINT ["/bin/bash", "-c"]
CMD ["entrypoint-dind-rootless.sh"]

View File

@@ -99,9 +99,9 @@ RUN mkdir /opt/hostedtoolcache \
# We place the scripts in `/usr/bin` so that users who extend this image can
# override them with scripts of the same name placed in `/usr/local/bin`.
COPY entrypoint.sh logger.bash startup.sh update-status /usr/bin/
COPY entrypoint-dind.sh startup.sh logger.sh wait.sh graceful-stop.sh update-status /usr/bin/
COPY supervisor/ /etc/supervisor/conf.d/
RUN chmod +x /usr/bin/startup.sh /usr/bin/entrypoint.sh
RUN chmod +x /usr/bin/entrypoint-dind.sh /usr/bin/startup.sh
# Copy the docker shim which propagates the docker MTU to underlying networks
# to replace the docker binary in the PATH.
@@ -130,5 +130,5 @@ RUN echo "PATH=${PATH}" > /etc/environment \
# No group definition, as that makes it harder to run docker.
USER runner
ENTRYPOINT ["/usr/local/bin/dumb-init", "--"]
CMD ["startup.sh"]
ENTRYPOINT ["/bin/bash", "-c"]
CMD ["entrypoint-dind.sh"]

View File

@@ -117,7 +117,7 @@ RUN mkdir /opt/hostedtoolcache \
# We place the scripts in `/usr/bin` so that users who extend this image can
# override them with scripts of the same name placed in `/usr/local/bin`.
COPY entrypoint.sh logger.bash update-status /usr/bin/
COPY entrypoint.sh startup.sh logger.sh graceful-stop.sh update-status /usr/bin/
# Copy the docker shim which propagates the docker MTU to underlying networks
# to replace the docker binary in the PATH.
@@ -136,5 +136,5 @@ RUN echo "PATH=${PATH}" > /etc/environment \
USER runner
ENTRYPOINT ["/usr/local/bin/dumb-init", "--"]
ENTRYPOINT ["/bin/bash", "-c"]
CMD ["entrypoint.sh"]

View File

@@ -1,5 +1,7 @@
#!/bin/bash
source logger.bash
source logger.sh
source graceful-stop.sh
trap graceful_stop TERM
log.notice "Writing out Docker config file"
/bin/bash <<SCRIPT
@@ -21,7 +23,20 @@ fi
SCRIPT
log.notice "Starting Docker (rootless)"
dumb-init bash <<'SCRIPT' &
# Note that we don't want dockerd to be terminated before the runner agent,
# because it defeats the goal of the runner agent graceful stop logic implemented above.
# We can't rely on e.g. `dumb-init --single-child` for that, because with `--single-child` we can't even trap SIGTERM
# for not only dockerd but also the runner agent.
/home/runner/bin/dockerd-rootless.sh --config-file /home/runner/.config/docker/daemon.json >> /dev/null 2>&1 &
# Wait processes to be running
entrypoint.sh
startup.sh
SCRIPT
RUNNER_INIT_PID=$!
log.notice "Runner init started with pid $RUNNER_INIT_PID"
wait $RUNNER_INIT_PID
log.notice "Runner init exited. Exiting this process with code 0 so that the container and the pod is GC'ed Kubernetes soon."
trap - TERM

69
runner/entrypoint-dind.sh Executable file
View File

@@ -0,0 +1,69 @@
#!/bin/bash
# Entrypoint for the docker-in-docker runner image.
#
# Overall flow, as visible in this script:
#   1. Install `graceful_stop` (defined in graceful-stop.sh) as the SIGTERM handler.
#   2. As root, prepare /etc/docker/daemon.json (and optionally the supervisor dockerd config).
#   3. Start a background `dumb-init bash` child that launches supervisord/dockerd,
#      waits for dockerd to show up, and finally runs startup.sh.
#   4. Wait for that child, so that this shell stays alive to receive SIGTERM.
#
# Environment variables read here: MTU, DOCKER_REGISTRY_MIRROR (both optional).

# logger.sh is expected to provide the log.* functions used below (log.notice, log.debug, log.error).
source logger.sh
# graceful-stop.sh is expected to provide the graceful_stop function used by the trap.
source graceful-stop.sh
# Handle SIGTERM in this shell instead of letting it terminate the script right away.
trap graceful_stop TERM
# NOTE: the heredoc delimiter below is unquoted, so ${MTU} and ${DOCKER_REGISTRY_MIRROR}
# are expanded by THIS shell before the text is handed to the root bash.
# The root shell therefore sees the literal values, not the variable references.
# The jq calls write to a temporary file and then move it over daemon.json, because
# jq cannot safely read from and redirect into the same file.
sudo /bin/bash <<SCRIPT
mkdir -p /etc/docker
if [ ! -f /etc/docker/daemon.json ]; then
echo "{}" > /etc/docker/daemon.json
fi
if [ -n "${MTU}" ]; then
jq ".\"mtu\" = ${MTU}" /etc/docker/daemon.json > /tmp/.daemon.json && mv /tmp/.daemon.json /etc/docker/daemon.json
# See https://docs.docker.com/engine/security/rootless/
echo "environment=DOCKERD_ROOTLESS_ROOTLESSKIT_MTU=${MTU}" >> /etc/supervisor/conf.d/dockerd.conf
fi
if [ -n "${DOCKER_REGISTRY_MIRROR}" ]; then
jq ".\"registry-mirrors\"[0] = \"${DOCKER_REGISTRY_MIRROR}\"" /etc/docker/daemon.json > /tmp/.daemon.json && mv /tmp/.daemon.json /etc/docker/daemon.json
fi
SCRIPT
# NOTE: this heredoc delimiter IS quoted ('SCRIPT'), so nothing is expanded here; every
# variable and command substitution is evaluated by the child bash started by dumb-init.
# The child is a separate process, which is why it sources logger.sh (and wait.sh, expected
# to provide wait_for_process) again: functions defined in this shell are not inherited.
# The child runs in the background (trailing `&`) so that this shell can `wait` on it and
# still run the SIGTERM trap.
# Inside the child: the config files are dumped to stderr, supervisord is started in the
# background with its output discarded, the script exits 1 if dockerd is not observed by
# wait_for_process, the docker0 MTU is adjusted when MTU is set, and startup.sh runs last.
dumb-init bash <<'SCRIPT' &
source logger.sh
source wait.sh
dump() {
local path=${1:?missing required <path> argument}
shift
printf -- "%s\n---\n" "${*//\{path\}/"$path"}" 1>&2
cat "$path" 1>&2
printf -- '---\n' 1>&2
}
for config in /etc/docker/daemon.json /etc/supervisor/conf.d/dockerd.conf; do
dump "$config" 'Using {path} with the following content:'
done
log.debug 'Starting supervisor daemon'
sudo /usr/bin/supervisord -n >> /dev/null 2>&1 &
log.debug 'Waiting for processes to be running...'
processes=(dockerd)
for process in "${processes[@]}"; do
if ! wait_for_process "$process"; then
log.error "$process is not running after max time"
dump /var/log/dockerd.err.log 'Dumping {path} to aid investigation'
dump /var/log/supervisor/supervisord.log 'Dumping {path} to aid investigation'
exit 1
else
log.debug "$process is running"
fi
done
if [ -n "${MTU}" ]; then
sudo ifconfig docker0 mtu "${MTU}" up
fi
startup.sh
SCRIPT
# PID of the backgrounded dumb-init process; graceful_stop reads this variable to wait for it.
RUNNER_INIT_PID=$!
log.notice "Runner init started with pid $RUNNER_INIT_PID"
# Block until the runner init exits. In bash, a trapped signal arriving during `wait`
# makes `wait` return so that the trap handler (graceful_stop) can run.
wait $RUNNER_INIT_PID
log.notice "Runner init exited. Exiting this process with code 0 so that the container and the pod is GC'ed Kubernetes soon."
# Restore the default SIGTERM disposition; this is the last command, so its status
# becomes the exit status of the script.
trap - TERM

View File

@@ -1,172 +1,30 @@
#!/bin/bash
source logger.bash
source logger.sh
source graceful-stop.sh
trap graceful_stop TERM
RUNNER_ASSETS_DIR=${RUNNER_ASSETS_DIR:-/runnertmp}
RUNNER_HOME=${RUNNER_HOME:-/runner}
dumb-init bash <<'SCRIPT' &
source logger.sh
# Let GitHub runner execute these hooks. These environment variables are used by GitHub's Runner as described here
# https://github.com/actions/runner/blob/main/docs/adrs/1751-runner-job-hooks.md
# Scripts referenced in the ACTIONS_RUNNER_HOOK_ environment variables must end in .sh or .ps1
# for it to become a valid hook script, otherwise GitHub will fail to run the hook
export ACTIONS_RUNNER_HOOK_JOB_STARTED=/etc/arc/hooks/job-started.sh
export ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/etc/arc/hooks/job-completed.sh
startup.sh
SCRIPT
if [ -n "${STARTUP_DELAY_IN_SECONDS}" ]; then
log.notice "Delaying startup by ${STARTUP_DELAY_IN_SECONDS} seconds"
sleep "${STARTUP_DELAY_IN_SECONDS}"
RUNNER_INIT_PID=$!
log.notice "Runner init started with pid $RUNNER_INIT_PID"
wait $RUNNER_INIT_PID
log.notice "Runner init exited. Exiting this process with code 0 so that the container and the pod is GC'ed Kubernetes soon."
if [ -f /runner/.runner ]; then
# If the runner failed with the following error:
# √ Connected to GitHub
# Failed to create a session. The runner registration has been deleted from the server, please re-configure.
# Runner listener exit with terminated error, stop the service, no retry needed.
# Exiting runner...
# It might have failed to delete the .runner file.
# We use the existence of the .runner file as the indicator that the runner agent has not stopped yet.
# Remove it by ourselves now, so that the dockerd sidecar prestop won't hang waiting for the .runner file to appear.
echo "Removing the .runner file"
rm -f /runner/.runner
fi
if [ -z "${GITHUB_URL}" ]; then
log.debug 'Working with public GitHub'
GITHUB_URL="https://github.com/"
else
length=${#GITHUB_URL}
last_char=${GITHUB_URL:length-1:1}
[[ $last_char != "/" ]] && GITHUB_URL="$GITHUB_URL/"; :
log.debug "Github endpoint URL ${GITHUB_URL}"
fi
if [ -z "${RUNNER_NAME}" ]; then
log.error 'RUNNER_NAME must be set'
exit 1
fi
if [ -n "${RUNNER_ORG}" ] && [ -n "${RUNNER_REPO}" ] && [ -n "${RUNNER_ENTERPRISE}" ]; then
ATTACH="${RUNNER_ORG}/${RUNNER_REPO}"
elif [ -n "${RUNNER_ORG}" ]; then
ATTACH="${RUNNER_ORG}"
elif [ -n "${RUNNER_REPO}" ]; then
ATTACH="${RUNNER_REPO}"
elif [ -n "${RUNNER_ENTERPRISE}" ]; then
ATTACH="enterprises/${RUNNER_ENTERPRISE}"
else
log.error 'At least one of RUNNER_ORG, RUNNER_REPO, or RUNNER_ENTERPRISE must be set'
exit 1
fi
if [ -z "${RUNNER_TOKEN}" ]; then
log.error 'RUNNER_TOKEN must be set'
exit 1
fi
if [ -z "${RUNNER_REPO}" ] && [ -n "${RUNNER_GROUP}" ];then
RUNNER_GROUPS=${RUNNER_GROUP}
fi
# Hack due to https://github.com/actions-runner-controller/actions-runner-controller/issues/252#issuecomment-758338483
if [ ! -d "${RUNNER_HOME}" ]; then
log.error "$RUNNER_HOME should be an emptyDir mount. Please fix the pod spec."
exit 1
fi
# if this is not a testing environment
if [[ "${UNITTEST:-}" == '' ]]; then
sudo chown -R runner:docker "$RUNNER_HOME"
# enable dotglob so we can copy a ".env" file to load in env vars as part of the service startup if one is provided
# loading a .env from the root of the service is part of the actions/runner logic
shopt -s dotglob
# use cp instead of mv to avoid issues when src and dst are on different devices
cp -r "$RUNNER_ASSETS_DIR"/* "$RUNNER_HOME"/
shopt -u dotglob
fi
if ! cd "${RUNNER_HOME}"; then
log.error "Failed to cd into ${RUNNER_HOME}"
exit 1
fi
# past this point, all paths are relative to /runner
config_args=()
if [ "${RUNNER_FEATURE_FLAG_ONCE:-}" != "true" ] && [ "${RUNNER_EPHEMERAL}" == "true" ]; then
config_args+=(--ephemeral)
log.debug 'Passing --ephemeral to config.sh to enable the ephemeral runner.'
fi
if [ "${DISABLE_RUNNER_UPDATE:-}" == "true" ]; then
config_args+=(--disableupdate)
log.debug 'Passing --disableupdate to config.sh to disable automatic runner updates.'
fi
update-status "Registering"
retries_left=10
while [[ ${retries_left} -gt 0 ]]; do
log.debug 'Configuring the runner.'
./config.sh --unattended --replace \
--name "${RUNNER_NAME}" \
--url "${GITHUB_URL}${ATTACH}" \
--token "${RUNNER_TOKEN}" \
--runnergroup "${RUNNER_GROUPS}" \
--labels "${RUNNER_LABELS}" \
--work "${RUNNER_WORKDIR}" "${config_args[@]}"
if [ -f .runner ]; then
log.debug 'Runner successfully configured.'
break
fi
log.debug 'Configuration failed. Retrying'
retries_left=$((retries_left - 1))
sleep 1
done
if [ ! -f .runner ]; then
# we couldn't configure and register the runner; no point continuing
log.error 'Configuration failed!'
exit 2
fi
cat .runner
# Note: the `.runner` file's content should be something like the below:
#
# $ cat /runner/.runner
# {
# "agentId": 117, #=> corresponds to the ID of the runner
# "agentName": "THE_RUNNER_POD_NAME",
# "poolId": 1,
# "poolName": "Default",
# "serverUrl": "https://pipelines.actions.githubusercontent.com/SOME_RANDOM_ID",
# "gitHubUrl": "https://github.com/USER/REPO",
# "workFolder": "/some/work/dir" #=> corresponds to Runner.Spec.WorkDir
# }
#
# Especially `agentId` is important, as other than listing all the runners in the repo,
# this is the only chance we get to learn the exact runner ID, which can be useful for further
# GitHub API calls like the one below. Note that 117 is the agentId seen above.
# curl \
# -H "Accept: application/vnd.github.v3+json" \
# -H "Authorization: bearer ${GITHUB_TOKEN}" \
# https://api.github.com/repos/USER/REPO/actions/runners/117
# Hack due to the DinD volumes
if [ -z "${UNITTEST:-}" ] && [ -e ./externalstmp ]; then
mkdir -p ./externals
mv ./externalstmp/* ./externals/
fi
if [[ "${DISABLE_WAIT_FOR_DOCKER}" != "true" ]] && [[ "${DOCKER_ENABLED}" == "true" ]]; then
log.debug 'Docker enabled runner detected and Docker daemon wait is enabled'
log.debug 'Waiting until Docker is available or the timeout is reached'
timeout 120s bash -c 'until docker ps ;do sleep 1; done'
else
log.notice 'Docker wait check skipped. Either Docker is disabled or the wait is disabled, continuing with entrypoint'
fi
# Unset entrypoint environment variables so they don't leak into the runner environment
unset RUNNER_NAME RUNNER_REPO RUNNER_TOKEN STARTUP_DELAY_IN_SECONDS DISABLE_WAIT_FOR_DOCKER
# Docker ignores PAM and thus never loads the system environment variables that
# are meant to be set in every environment of every user. We emulate the PAM
# behavior by reading the environment variables without interpreting them.
#
# https://github.com/actions-runner-controller/actions-runner-controller/issues/1135
# https://github.com/actions/runner/issues/1703
# /etc/environment may not exist when running unit tests depending on the platform being used
# (e.g. Mac OS) so we just skip the mapping entirely
if [ -z "${UNITTEST:-}" ]; then
mapfile -t env </etc/environment
fi
update-status "Idle"
exec env -- "${env[@]}" ./run.sh
trap - TERM

99
runner/graceful-stop.sh Normal file
View File

@@ -0,0 +1,99 @@
#!/bin/bash
# This should be sufficiently shorter than terminationGracePeriodSeconds,
# so that the job is cancelled immediately, instead of hanging for 10 minutes or so and failing without any error message.
RUNNER_GRACEFUL_STOP_TIMEOUT=${RUNNER_GRACEFUL_STOP_TIMEOUT:-15}
#######################################
# SIGTERM handler that tries to stop the actions runner agent gracefully.
#
# Steps:
#   1. Warn if dockerd is already gone (it must outlive the runner agent).
#   2. Wait until the runner has registered (the .runner file exists).
#   3. Try to remove the runner from the Actions service with `config.sh remove`.
#      If that fails, give the agent up to RUNNER_GRACEFUL_STOP_TIMEOUT seconds
#      to stop by itself.
#   4. If the agent (Runner.Listener) is still alive, send it SIGTERM and wait
#      until it is gone.
#   5. Wait for the runner init process, if its PID is known.
#
# Globals (all read-only here):
#   RUNNER_HOME                   - runner directory; defaults to /runner
#   RUNNER_TOKEN                  - token passed to `config.sh remove`
#   RUNNER_GRACEFUL_STOP_TIMEOUT  - seconds to wait in step 3; defaults to 15
#   RUNNER_INIT_PID               - PID of the backgrounded runner init, may be empty
# Requires: log.notice, log.warning and log.error to be defined by the caller.
# Exits 1 (terminating the calling shell) if pushd/popd fails.
#
# NOTE: the waits in steps 2 and 4 have no timeout of their own; the only
# upper bound visible here is whatever kills this process from the outside.
#######################################
graceful_stop() {
  # Use the same directory that startup.sh registers the runner in, instead of a
  # hard-coded /runner, so that a customized RUNNER_HOME keeps working.
  local runner_home="${RUNNER_HOME:-/runner}"
  local stop_timeout="${RUNNER_GRACEFUL_STOP_TIMEOUT:-15}"
  local waited=0
  local -a listener_pids=()

  log.notice "Executing actions-runner-controller's SIGTERM handler."
  log.notice "Note that if this takes more time than terminationGracePeriodSeconds, the runner will be forcefully terminated by Kubernetes, which may result in the in-progress workflow job, if any, to fail."

  log.notice "Ensuring dockerd is still running."
  if ! docker ps -a; then
    log.warning "Detected configuration error: dockerd should be running but is already nowhere. This is wrong. Ensure that your init system to NOT pass SIGTERM directly to dockerd!"
  fi

  # The below procedure atomically removes the runner from GitHub Actions service,
  # to ensure that the runner is not running any job.
  # This is required to not terminate the actions runner agent while running the job.
  # If we didn't do this atomically, we might end up with a rare race where
  # the runner agent is terminated while it was about to start a job.

  # `pushd` is needed to run config.sh successfully.
  # Without this the author of this script ended up with errors like the below:
  #   Cannot connect to server, because config files are missing. Skipping removing runner from the server.
  #   Does not exist. Skipping Removing .credentials
  #   Does not exist. Skipping Removing .runner
  if ! pushd "${runner_home}"; then
    log.error "Failed to pushd ${runner_home}"
    exit 1
  fi

  # We need to wait for the registration first.
  # Otherwise a direct runner pod deletion triggered while the runner startup is about to register itself with
  # config.sh can result in this graceful stop process getting skipped.
  # In that case, the pod is eventually and forcefully terminated by ARC and K8s, and
  # a workflow job that started running after this graceful stop process failed might get cancelled prematurely.
  log.notice "Waiting for the runner to register first."
  while ! [ -f "${runner_home}/.runner" ]; do
    sleep 1
  done
  log.notice "Observed that the runner has been registered."

  if ! "${runner_home}/config.sh" remove --token "$RUNNER_TOKEN"; then
    # Removal failed (e.g. a job is in progress); give the agent a bounded amount
    # of time to finish and exit on its own before we signal it.
    log.notice "Waiting for RUNNER_GRACEFUL_STOP_TIMEOUT=$stop_timeout seconds until the runner agent to stop by itself."
    while [[ $waited -lt $stop_timeout ]]; do
      sleep 1
      if ! pgrep Runner.Listener > /dev/null; then
        log.notice "The runner agent stopped before RUNNER_GRACEFUL_STOP_TIMEOUT=$stop_timeout"
        break
      fi
      waited=$((waited + 1))
    done
  fi

  if ! popd; then
    log.error "Failed to popd from ${runner_home}"
    exit 1
  fi

  # Collect the listener PIDs once, into an array: pgrep may print several PIDs
  # (one per line), and the listener may also have exited in the meantime, in
  # which case there is nothing to signal.
  mapfile -t listener_pids < <(pgrep Runner.Listener)
  if ((${#listener_pids[@]} > 0)); then
    # The below procedure makes the runner correctly notify the Actions service of the cancellation of this runner.
    # It enables you to see `Error: The operation was canceled.` in the workflow job log, in case a job was still
    # running on this runner when the termination is requested.
    #
    # Note though, due to how Actions work, not all job steps get `Error: The operation was canceled.` in the job step logs.
    # Jobs that were still in the first `Set up job` step seem to get `Error: A task was canceled.`.
    #
    # Anyway, without this, a runner pod that is "forcefully" killed by any other controller (like cluster-autoscaler)
    # can result in the workflow job hanging for 10 minutes or so.
    # After 10 minutes, the Actions UI just shows the failure icon for the step,
    # not even showing `Error: The operation was canceled.`, which is confusing.
    log.notice "Sending SIGTERM to the actions runner agent (${listener_pids[*]})."
    kill -TERM "${listener_pids[@]}"
    log.notice "SIGTERM sent. If the runner is still running a job, you'll probably see \"Error: The operation was canceled.\" in its log."
    log.notice "Waiting for the actions runner agent to stop."
    while pgrep Runner.Listener > /dev/null; do
      sleep 1
    done
  fi

  # This message is supposed to be output only after the runner agent output:
  #   2022-08-27 02:04:37Z: Job test3 completed with result: Canceled
  # because this graceful stopping logic is basically intended to let the runner agent have some time
  # needed to "Cancel" it.
  # At the times we didn't have this logic, the runner agent was even unable to output the Cancelled message hence
  # unable to gracefully stop, hence the workflow job hung seemingly forever.
  log.notice "The actions runner process exited."

  if [ "$RUNNER_INIT_PID" != "" ]; then
    log.notice "Holding on until runner init (pid $RUNNER_INIT_PID) exits, so that there will hopefully be no zombie processes remaining."
    # We don't need to kill -TERM $RUNNER_INIT_PID as the init is supposed to exit by itself
    # once the foreground process (= the runner agent) exits.
    wait "$RUNNER_INIT_PID" || :
  fi

  log.notice "Graceful stop completed."
}

View File

@@ -1,8 +1,8 @@
#!/usr/bin/env bash
set -Eeuo pipefail
# shellcheck source=runner/logger.bash
source logger.bash
# shellcheck source=runner/logger.sh
source logger.sh
log.debug "Running ARC Job Completed Hooks"

View File

@@ -1,8 +1,8 @@
#!/usr/bin/env bash
set -Eeuo pipefail
# shellcheck source=runner/logger.bash
source logger.bash
# shellcheck source=runner/logger.sh
source logger.sh
log.debug "Running ARC Job Started Hooks"

View File

@@ -6,7 +6,7 @@
# are not using any variables that need to be set, and are not using any pipes.
# This logger implementation can be replaced with another logger implementation
# by placing a script called `logger.bash` in `/usr/local/bin` of the image. The
# by placing a script called `logger.sh` in `/usr/local/bin` of the image. The
# only requirement for the script is that it defines the following functions:
#
# - `log.debug`

View File

@@ -1,72 +1,172 @@
#!/bin/bash
source logger.bash
source logger.sh
# Poll once per second until a process matching the given pgrep pattern exists.
# Arguments: $1 - pattern handed to pgrep
# Returns:   0 as soon as pgrep finds a match, 1 after 30 unsuccessful polls
# Requires:  log.debug to be defined by the caller
wait_for_process() {
  local pattern="$1"
  local limit=30
  local elapsed=0

  until pgrep "$pattern" >/dev/null; do
    log.debug "Process $pattern is not running yet. Retrying in 1 seconds"
    log.debug "Waited $elapsed seconds of $limit seconds"
    sleep 1
    elapsed=$((elapsed + 1))
    # Give up once the budget is used up, without polling again.
    if ((elapsed >= limit)); then
      return 1
    fi
  done

  return 0
}
RUNNER_ASSETS_DIR=${RUNNER_ASSETS_DIR:-/runnertmp}
RUNNER_HOME=${RUNNER_HOME:-/runner}
sudo /bin/bash <<SCRIPT
mkdir -p /etc/docker
# Let GitHub runner execute these hooks. These environment variables are used by GitHub's Runner as described here
# https://github.com/actions/runner/blob/main/docs/adrs/1751-runner-job-hooks.md
# Scripts referenced in the ACTIONS_RUNNER_HOOK_ environment variables must end in .sh or .ps1
# for it to become a valid hook script, otherwise GitHub will fail to run the hook
export ACTIONS_RUNNER_HOOK_JOB_STARTED=/etc/arc/hooks/job-started.sh
export ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/etc/arc/hooks/job-completed.sh
if [ ! -f /etc/docker/daemon.json ]; then
echo "{}" > /etc/docker/daemon.json
if [ -n "${STARTUP_DELAY_IN_SECONDS}" ]; then
log.notice "Delaying startup by ${STARTUP_DELAY_IN_SECONDS} seconds"
sleep "${STARTUP_DELAY_IN_SECONDS}"
fi
if [ -n "${MTU}" ]; then
jq ".\"mtu\" = ${MTU}" /etc/docker/daemon.json > /tmp/.daemon.json && mv /tmp/.daemon.json /etc/docker/daemon.json
# See https://docs.docker.com/engine/security/rootless/
echo "environment=DOCKERD_ROOTLESS_ROOTLESSKIT_MTU=${MTU}" >> /etc/supervisor/conf.d/dockerd.conf
if [ -z "${GITHUB_URL}" ]; then
log.debug 'Working with public GitHub'
GITHUB_URL="https://github.com/"
else
length=${#GITHUB_URL}
last_char=${GITHUB_URL:length-1:1}
[[ $last_char != "/" ]] && GITHUB_URL="$GITHUB_URL/"; :
log.debug "Github endpoint URL ${GITHUB_URL}"
fi
if [ -n "${DOCKER_REGISTRY_MIRROR}" ]; then
jq ".\"registry-mirrors\"[0] = \"${DOCKER_REGISTRY_MIRROR}\"" /etc/docker/daemon.json > /tmp/.daemon.json && mv /tmp/.daemon.json /etc/docker/daemon.json
if [ -z "${RUNNER_NAME}" ]; then
log.error 'RUNNER_NAME must be set'
exit 1
fi
SCRIPT
# Print a header line followed by the content of a file, all on stderr.
# Arguments: $1   - path of the file to print (required; the shell aborts if missing)
#            $2.. - header text; every literal `{path}` in it is replaced by $1
# Output:    header, a `---` line, the file content, and a closing `---` line
dump() {
  local target=${1:?missing required <path> argument}
  shift
  {
    printf -- "%s\n---\n" "${*//\{path\}/"$target"}"
    cat "$target"
    printf -- '---\n'
  } 1>&2
}
if [ -n "${RUNNER_ORG}" ] && [ -n "${RUNNER_REPO}" ] && [ -n "${RUNNER_ENTERPRISE}" ]; then
ATTACH="${RUNNER_ORG}/${RUNNER_REPO}"
elif [ -n "${RUNNER_ORG}" ]; then
ATTACH="${RUNNER_ORG}"
elif [ -n "${RUNNER_REPO}" ]; then
ATTACH="${RUNNER_REPO}"
elif [ -n "${RUNNER_ENTERPRISE}" ]; then
ATTACH="enterprises/${RUNNER_ENTERPRISE}"
else
log.error 'At least one of RUNNER_ORG, RUNNER_REPO, or RUNNER_ENTERPRISE must be set'
exit 1
fi
for config in /etc/docker/daemon.json /etc/supervisor/conf.d/dockerd.conf; do
dump "$config" 'Using {path} with the following content:'
if [ -z "${RUNNER_TOKEN}" ]; then
log.error 'RUNNER_TOKEN must be set'
exit 1
fi
if [ -z "${RUNNER_REPO}" ] && [ -n "${RUNNER_GROUP}" ];then
RUNNER_GROUPS=${RUNNER_GROUP}
fi
# Hack due to https://github.com/actions-runner-controller/actions-runner-controller/issues/252#issuecomment-758338483
if [ ! -d "${RUNNER_HOME}" ]; then
log.error "$RUNNER_HOME should be an emptyDir mount. Please fix the pod spec."
exit 1
fi
# if this is not a testing environment
if [[ "${UNITTEST:-}" == '' ]]; then
sudo chown -R runner:docker "$RUNNER_HOME"
# enable dotglob so we can copy a ".env" file to load in env vars as part of the service startup if one is provided
# loading a .env from the root of the service is part of the actions/runner logic
shopt -s dotglob
# use cp instead of mv to avoid issues when src and dst are on different devices
cp -r "$RUNNER_ASSETS_DIR"/* "$RUNNER_HOME"/
shopt -u dotglob
fi
if ! cd "${RUNNER_HOME}"; then
log.error "Failed to cd into ${RUNNER_HOME}"
exit 1
fi
# past this point, all paths are relative to /runner
config_args=()
if [ "${RUNNER_FEATURE_FLAG_ONCE:-}" != "true" ] && [ "${RUNNER_EPHEMERAL}" == "true" ]; then
config_args+=(--ephemeral)
log.debug 'Passing --ephemeral to config.sh to enable the ephemeral runner.'
fi
if [ "${DISABLE_RUNNER_UPDATE:-}" == "true" ]; then
config_args+=(--disableupdate)
log.debug 'Passing --disableupdate to config.sh to disable automatic runner updates.'
fi
update-status "Registering"
retries_left=10
while [[ ${retries_left} -gt 0 ]]; do
log.debug 'Configuring the runner.'
./config.sh --unattended --replace \
--name "${RUNNER_NAME}" \
--url "${GITHUB_URL}${ATTACH}" \
--token "${RUNNER_TOKEN}" \
--runnergroup "${RUNNER_GROUPS}" \
--labels "${RUNNER_LABELS}" \
--work "${RUNNER_WORKDIR}" "${config_args[@]}"
if [ -f .runner ]; then
log.debug 'Runner successfully configured.'
break
fi
log.debug 'Configuration failed. Retrying'
retries_left=$((retries_left - 1))
sleep 1
done
log.debug 'Starting supervisor daemon'
sudo /usr/bin/supervisord -n >> /dev/null 2>&1 &
log.debug 'Waiting for processes to be running...'
processes=(dockerd)
for process in "${processes[@]}"; do
if ! wait_for_process "$process"; then
log.error "$process is not running after max time"
dump /var/log/dockerd.err.log 'Dumping {path} to aid investigation'
dump /var/log/supervisor/supervisord.log 'Dumping {path} to aid investigation'
exit 1
else
log.debug "$process is running"
fi
done
if [ -n "${MTU}" ]; then
sudo ifconfig docker0 mtu "${MTU}" up
if [ ! -f .runner ]; then
# we couldn't configure and register the runner; no point continuing
log.error 'Configuration failed!'
exit 2
fi
# Wait processes to be running
entrypoint.sh
cat .runner
# Note: the `.runner` file's content should be something like the below:
#
# $ cat /runner/.runner
# {
# "agentId": 117, #=> corresponds to the ID of the runner
# "agentName": "THE_RUNNER_POD_NAME",
# "poolId": 1,
# "poolName": "Default",
# "serverUrl": "https://pipelines.actions.githubusercontent.com/SOME_RANDOM_ID",
# "gitHubUrl": "https://github.com/USER/REPO",
# "workFolder": "/some/work/dir" #=> corresponds to Runner.Spec.WorkDir
# }
#
# Especially `agentId` is important, as other than listing all the runners in the repo,
# this is the only chance we get to learn the exact runner ID, which can be useful for further
# GitHub API calls like the one below. Note that 117 is the agentId seen above.
# curl \
# -H "Accept: application/vnd.github.v3+json" \
# -H "Authorization: bearer ${GITHUB_TOKEN}" \
# https://api.github.com/repos/USER/REPO/actions/runners/117
# Hack due to the DinD volumes
if [ -z "${UNITTEST:-}" ] && [ -e ./externalstmp ]; then
mkdir -p ./externals
mv ./externalstmp/* ./externals/
fi
if [[ "${DISABLE_WAIT_FOR_DOCKER}" != "true" ]] && [[ "${DOCKER_ENABLED}" == "true" ]]; then
log.debug 'Docker enabled runner detected and Docker daemon wait is enabled'
log.debug 'Waiting until Docker is available or the timeout is reached'
timeout 120s bash -c 'until docker ps ;do sleep 1; done'
else
log.notice 'Docker wait check skipped. Either Docker is disabled or the wait is disabled, continuing with entrypoint'
fi
# Unset entrypoint environment variables so they don't leak into the runner environment
unset RUNNER_NAME RUNNER_REPO RUNNER_TOKEN STARTUP_DELAY_IN_SECONDS DISABLE_WAIT_FOR_DOCKER
# Docker ignores PAM and thus never loads the system environment variables that
# are meant to be set in every environment of every user. We emulate the PAM
# behavior by reading the environment variables without interpreting them.
#
# https://github.com/actions-runner-controller/actions-runner-controller/issues/1135
# https://github.com/actions/runner/issues/1703
# /etc/environment may not exist when running unit tests depending on the platform being used
# (e.g. Mac OS) so we just skip the mapping entirely
if [ -z "${UNITTEST:-}" ]; then
mapfile -t env </etc/environment
fi
update-status "Idle"
exec env -- "${env[@]}" ./run.sh

View File

@@ -2,8 +2,8 @@
set -Eeuo pipefail
if [[ ${1:-} == '' ]]; then
# shellcheck source=runner/logger.bash
source logger.bash
# shellcheck source=runner/logger.sh
source logger.sh
log.error "Missing required argument -- '<phase>'"
exit 64
fi
@@ -26,6 +26,6 @@ if [[ ${RUNNER_STATUS_UPDATE_HOOK:-false} == true ]]; then
--show-error \
--silent \
--request PATCH \
"${apiserver}/apis/actions.summerwind.dev/v1alpha1/namespaces/${namespace}/runners/${HOSTNAME}/status"
1>&-
"${apiserver}/apis/actions.summerwind.dev/v1alpha1/namespaces/${namespace}/runners/${HOSTNAME}/status" \
1>/dev/null
fi

17
runner/wait.sh Normal file
View File

@@ -0,0 +1,17 @@
#!/bin/bash
# Wait for a process to appear, checking once per second.
# Arguments: $1 - pattern handed to pgrep
# Returns:   0 when pgrep reports a match, 1 when 30 checks in a row found nothing
# Requires:  log.debug to be defined by the caller
wait_for_process() {
  local proc="$1"
  local budget=30
  local spent

  for ((spent = 0; spent < budget; spent++)); do
    if pgrep "$proc" >/dev/null; then
      return 0
    fi
    log.debug "Process $proc is not running yet. Retrying in 1 seconds"
    log.debug "Waited $spent seconds of $budget seconds"
    sleep 1
  done

  # The whole budget was spent without ever seeing the process.
  return 1
}