Add DrainJobsMode (aka UpdateStrategy feature) (#2569)

This commit is contained in:
Bassem Dghaidi
2023-05-23 13:42:30 +02:00
committed by GitHub
parent 032443fcfd
commit 8afef51c8b
9 changed files with 450 additions and 40 deletions

View File

@@ -23,6 +23,14 @@ inputs:
arc-controller-namespace:
description: 'The namespace of the configured gha-runner-scale-set-controller'
required: true
wait-to-finish:
description: 'Wait for the workflow run to finish'
required: true
default: "true"
wait-to-running:
description: 'Wait for the workflow run to start running'
required: true
default: "false"
runs:
using: "composite"
@@ -118,7 +126,36 @@ runs:
| ${{steps.query_workflow.outputs.workflow_run_url}} |
EOF
- name: Wait for workflow to start running
if: inputs.wait-to-running == 'true' && inputs.wait-to-finish == 'false'
uses: actions/github-script@v6
with:
script: |
function sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms))
}
const owner = '${{inputs.repo-owner}}'
const repo = '${{inputs.repo-name}}'
const workflow_run_id = ${{steps.query_workflow.outputs.workflow_run}}
const workflow_job_id = ${{steps.query_workflow.outputs.workflow_job}}
let count = 0
while (count++<10) {
await sleep(30 * 1000);
let getRunResponse = await github.rest.actions.getWorkflowRun({
owner: owner,
repo: repo,
run_id: workflow_run_id
})
console.log(`${getRunResponse.data.html_url}: ${getRunResponse.data.status} (${getRunResponse.data.conclusion})`);
if (getRunResponse.data.status == 'in_progress') {
console.log(`Workflow run is in progress.`)
return
}
}
core.setFailed(`The triggered workflow run didn't start properly using ${{inputs.arc-name}}`)
- name: Wait for workflow to finish successfully
if: inputs.wait-to-finish == 'true'
uses: actions/github-script@v6
with:
script: |
@@ -151,10 +188,15 @@ runs:
}
core.setFailed(`The triggered workflow run didn't finish properly using ${{inputs.arc-name}}`)
- name: cleanup
if: inputs.wait-to-finish == 'true'
shell: bash
run: |
helm uninstall ${{ inputs.arc-name }} --namespace ${{inputs.arc-namespace}} --debug
kubectl wait --timeout=10s --for=delete AutoScalingRunnerSet -n ${{inputs.arc-name}} -l app.kubernetes.io/instance=${{ inputs.arc-name }}
- name: Gather logs and cleanup
shell: bash
if: always()
run: |
helm uninstall ${{ inputs.arc-name }} --namespace ${{inputs.arc-namespace}} --debug
kubectl wait --timeout=10s --for=delete AutoScalingRunnerSet -n ${{inputs.arc-name}} -l app.kubernetes.io/instance=${{ inputs.arc-name }}
kubectl logs deployment/arc-gha-runner-scale-set-controller -n ${{inputs.arc-controller-namespace}}

View File

@@ -710,3 +710,173 @@ jobs:
arc-name: ${{steps.install_arc.outputs.ARC_NAME}}
arc-namespace: "arc-runners"
arc-controller-namespace: "arc-systems"
update-strategy-tests:
runs-on: ubuntu-latest
timeout-minutes: 20
if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.id == github.repository_id
env:
WORKFLOW_FILE: "arc-test-sleepy-matrix.yaml"
steps:
- uses: actions/checkout@v3
with:
ref: ${{github.head_ref}}
- uses: ./.github/actions/setup-arc-e2e
id: setup
with:
app-id: ${{secrets.E2E_TESTS_ACCESS_APP_ID}}
app-pk: ${{secrets.E2E_TESTS_ACCESS_PK}}
image-name: ${{env.IMAGE_NAME}}
image-tag: ${{env.IMAGE_VERSION}}
target-org: ${{env.TARGET_ORG}}
- name: Install gha-runner-scale-set-controller
id: install_arc_controller
run: |
helm install arc \
--namespace "arc-systems" \
--create-namespace \
--set image.repository=${{ env.IMAGE_NAME }} \
--set image.tag=${{ env.IMAGE_VERSION }} \
--set flags.updateStrategy="eventual" \
./charts/gha-runner-scale-set-controller \
--debug
count=0
while true; do
POD_NAME=$(kubectl get pods -n arc-systems -l app.kubernetes.io/name=gha-runner-scale-set-controller -o name)
if [ -n "$POD_NAME" ]; then
echo "Pod found: $POD_NAME"
break
fi
if [ "$count" -ge 60 ]; then
echo "Timeout waiting for controller pod with label app.kubernetes.io/name=gha-runner-scale-set-controller"
exit 1
fi
sleep 1
count=$((count+1))
done
kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l app.kubernetes.io/name=gha-runner-scale-set-controller
kubectl get pod -n arc-systems
kubectl describe deployment arc-gha-runner-scale-set-controller -n arc-systems
- name: Install gha-runner-scale-set
id: install_arc
run: |
ARC_NAME=${{github.job}}-$(date +'%M%S')$((($RANDOM + 100) % 100 + 1))
helm install "$ARC_NAME" \
--namespace "arc-runners" \
--create-namespace \
--set githubConfigUrl="https://github.com/${{ env.TARGET_ORG }}/${{env.TARGET_REPO}}" \
--set githubConfigSecret.github_token="${{ steps.setup.outputs.token }}" \
./charts/gha-runner-scale-set \
--debug
echo "ARC_NAME=$ARC_NAME" >> $GITHUB_OUTPUT
count=0
while true; do
POD_NAME=$(kubectl get pods -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME -o name)
if [ -n "$POD_NAME" ]; then
echo "Pod found: $POD_NAME"
break
fi
if [ "$count" -ge 60 ]; then
echo "Timeout waiting for listener pod with label actions.github.com/scale-set-name=$ARC_NAME"
exit 1
fi
sleep 1
count=$((count+1))
done
kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME
kubectl get pod -n arc-systems
- name: Trigger long running jobs and wait for runners to pick them up
uses: ./.github/actions/execute-assert-arc-e2e
timeout-minutes: 10
with:
auth-token: ${{ steps.setup.outputs.token }}
repo-owner: ${{ env.TARGET_ORG }}
repo-name: ${{env.TARGET_REPO}}
workflow-file: ${{env.WORKFLOW_FILE}}
arc-name: ${{steps.install_arc.outputs.ARC_NAME}}
arc-namespace: "arc-runners"
arc-controller-namespace: "arc-systems"
wait-to-running: "true"
wait-to-finish: "false"
- name: Upgrade the gha-runner-scale-set
shell: bash
run: |
helm upgrade --install "${{ steps.install_arc.outputs.ARC_NAME }}" \
--namespace "arc-runners" \
--create-namespace \
--set githubConfigUrl="https://github.com/${{ env.TARGET_ORG }}/${{ env.TARGET_REPO }}" \
--set githubConfigSecret.github_token="${{ steps.setup.outputs.token }}" \
--set template.spec.containers[0].name="runner" \
--set template.spec.containers[0].image="ghcr.io/actions/actions-runner:latest" \
--set template.spec.containers[0].command={"/home/runner/run.sh"} \
--set template.spec.containers[0].env[0].name="TEST" \
--set template.spec.containers[0].env[0].value="E2E TESTS" \
./charts/gha-runner-scale-set \
--debug
- name: Assert that the listener is deleted while jobs are running
shell: bash
run: |
count=0
while true; do
LISTENER_COUNT="$(kubectl get pods -l actions.github.com/scale-set-name=${{ steps.install_arc.outputs.ARC_NAME }} -n arc-systems --field-selector=status.phase=Running -o=jsonpath='{.items}' | jq 'length')"
RUNNERS_COUNT="$(kubectl get pods -l app.kubernetes.io/component=runner -n arc-runners --field-selector=status.phase=Running -o=jsonpath='{.items}' | jq 'length')"
RESOURCES="$(kubectl get pods -A)"
if [ "$LISTENER_COUNT" -eq 0 ]; then
echo "Listener has been deleted"
echo "$RESOURCES"
exit 0
fi
if [ "$count" -ge 60 ]; then
echo "Timeout waiting for listener to be deleted"
echo "$RESOURCES"
exit 1
fi
echo "Waiting for listener to be deleted"
echo "Listener count: $LISTENER_COUNT target: 0 | Runners count: $RUNNERS_COUNT target: 3"
sleep 1
count=$((count+1))
done
- name: Assert that the listener goes back up after the jobs are done
shell: bash
run: |
count=0
while true; do
LISTENER_COUNT="$(kubectl get pods -l actions.github.com/scale-set-name=${{ steps.install_arc.outputs.ARC_NAME }} -n arc-systems --field-selector=status.phase=Running -o=jsonpath='{.items}' | jq 'length')"
RUNNERS_COUNT="$(kubectl get pods -l app.kubernetes.io/component=runner -n arc-runners --field-selector=status.phase=Running -o=jsonpath='{.items}' | jq 'length')"
RESOURCES="$(kubectl get pods -A)"
if [ "$LISTENER_COUNT" -eq 1 ]; then
echo "Listener is up!"
echo "$RESOURCES"
exit 0
fi
if [ "$count" -ge 120 ]; then
echo "Timeout waiting for listener to be recreated"
echo "$RESOURCES"
exit 1
fi
echo "Waiting for listener to be recreated"
echo "Listener count: $LISTENER_COUNT target: 1 | Runners count: $RUNNERS_COUNT target: 0"
sleep 1
count=$((count+1))
done
- name: Gather logs and cleanup
shell: bash
if: always()
run: |
helm uninstall "${{ steps.install_arc.outputs.ARC_NAME }}" --namespace "arc-runners" --debug
kubectl wait --timeout=10s --for=delete AutoScalingRunnerSet -n "${{ steps.install_arc.outputs.ARC_NAME }}" -l app.kubernetes.io/instance="${{ steps.install_arc.outputs.ARC_NAME }}"
kubectl logs deployment/arc-gha-runner-scale-set-controller -n "arc-systems"