Add DrainJobsMode (aka UpdateStrategy feature) (#2569)

This commit is contained in:
Bassem Dghaidi
2023-05-23 13:42:30 +02:00
committed by GitHub
parent 032443fcfd
commit 8afef51c8b
9 changed files with 450 additions and 40 deletions

View File

@@ -710,3 +710,173 @@ jobs:
arc-name: ${{steps.install_arc.outputs.ARC_NAME}}
arc-namespace: "arc-runners"
arc-controller-namespace: "arc-systems"
update-strategy-tests:
runs-on: ubuntu-latest
timeout-minutes: 20
if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.id == github.repository_id
env:
WORKFLOW_FILE: "arc-test-sleepy-matrix.yaml"
steps:
- uses: actions/checkout@v3
with:
ref: ${{github.head_ref}}
- uses: ./.github/actions/setup-arc-e2e
id: setup
with:
app-id: ${{secrets.E2E_TESTS_ACCESS_APP_ID}}
app-pk: ${{secrets.E2E_TESTS_ACCESS_PK}}
image-name: ${{env.IMAGE_NAME}}
image-tag: ${{env.IMAGE_VERSION}}
target-org: ${{env.TARGET_ORG}}
- name: Install gha-runner-scale-set-controller
id: install_arc_controller
run: |
helm install arc \
--namespace "arc-systems" \
--create-namespace \
--set image.repository=${{ env.IMAGE_NAME }} \
--set image.tag=${{ env.IMAGE_VERSION }} \
--set flags.updateStrategy="eventual" \
./charts/gha-runner-scale-set-controller \
--debug
count=0
while true; do
POD_NAME=$(kubectl get pods -n arc-systems -l app.kubernetes.io/name=gha-runner-scale-set-controller -o name)
if [ -n "$POD_NAME" ]; then
echo "Pod found: $POD_NAME"
break
fi
if [ "$count" -ge 60 ]; then
echo "Timeout waiting for controller pod with label app.kubernetes.io/name=gha-runner-scale-set-controller"
exit 1
fi
sleep 1
count=$((count+1))
done
kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l app.kubernetes.io/name=gha-runner-scale-set-controller
kubectl get pod -n arc-systems
kubectl describe deployment arc-gha-runner-scale-set-controller -n arc-systems
- name: Install gha-runner-scale-set
id: install_arc
run: |
ARC_NAME=${{github.job}}-$(date +'%M%S')$((($RANDOM + 100) % 100 + 1))
helm install "$ARC_NAME" \
--namespace "arc-runners" \
--create-namespace \
--set githubConfigUrl="https://github.com/${{ env.TARGET_ORG }}/${{env.TARGET_REPO}}" \
--set githubConfigSecret.github_token="${{ steps.setup.outputs.token }}" \
./charts/gha-runner-scale-set \
--debug
echo "ARC_NAME=$ARC_NAME" >> $GITHUB_OUTPUT
count=0
while true; do
POD_NAME=$(kubectl get pods -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME -o name)
if [ -n "$POD_NAME" ]; then
echo "Pod found: $POD_NAME"
break
fi
if [ "$count" -ge 60 ]; then
echo "Timeout waiting for listener pod with label actions.github.com/scale-set-name=$ARC_NAME"
exit 1
fi
sleep 1
count=$((count+1))
done
kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME
kubectl get pod -n arc-systems
- name: Trigger long running jobs and wait for runners to pick them up
uses: ./.github/actions/execute-assert-arc-e2e
timeout-minutes: 10
with:
auth-token: ${{ steps.setup.outputs.token }}
repo-owner: ${{ env.TARGET_ORG }}
repo-name: ${{env.TARGET_REPO}}
workflow-file: ${{env.WORKFLOW_FILE}}
arc-name: ${{steps.install_arc.outputs.ARC_NAME}}
arc-namespace: "arc-runners"
arc-controller-namespace: "arc-systems"
wait-to-running: "true"
wait-to-finish: "false"
- name: Upgrade the gha-runner-scale-set
shell: bash
run: |
helm upgrade --install "${{ steps.install_arc.outputs.ARC_NAME }}" \
--namespace "arc-runners" \
--create-namespace \
--set githubConfigUrl="https://github.com/${{ env.TARGET_ORG }}/${{ env.TARGET_REPO }}" \
--set githubConfigSecret.github_token="${{ steps.setup.outputs.token }}" \
--set template.spec.containers[0].name="runner" \
--set template.spec.containers[0].image="ghcr.io/actions/actions-runner:latest" \
--set template.spec.containers[0].command={"/home/runner/run.sh"} \
--set template.spec.containers[0].env[0].name="TEST" \
--set template.spec.containers[0].env[0].value="E2E TESTS" \
./charts/gha-runner-scale-set \
--debug
- name: Assert that the listener is deleted while jobs are running
shell: bash
run: |
count=0
while true; do
LISTENER_COUNT="$(kubectl get pods -l actions.github.com/scale-set-name=${{ steps.install_arc.outputs.ARC_NAME }} -n arc-systems --field-selector=status.phase=Running -o=jsonpath='{.items}' | jq 'length')"
RUNNERS_COUNT="$(kubectl get pods -l app.kubernetes.io/component=runner -n arc-runners --field-selector=status.phase=Running -o=jsonpath='{.items}' | jq 'length')"
RESOURCES="$(kubectl get pods -A)"
if [ "$LISTENER_COUNT" -eq 0 ]; then
echo "Listener has been deleted"
echo "$RESOURCES"
exit 0
fi
if [ "$count" -ge 60 ]; then
echo "Timeout waiting for listener to be deleted"
echo "$RESOURCES"
exit 1
fi
echo "Waiting for listener to be deleted"
echo "Listener count: $LISTENER_COUNT target: 0 | Runners count: $RUNNERS_COUNT target: 3"
sleep 1
count=$((count+1))
done
- name: Assert that the listener goes back up after the jobs are done
shell: bash
run: |
count=0
while true; do
LISTENER_COUNT="$(kubectl get pods -l actions.github.com/scale-set-name=${{ steps.install_arc.outputs.ARC_NAME }} -n arc-systems --field-selector=status.phase=Running -o=jsonpath='{.items}' | jq 'length')"
RUNNERS_COUNT="$(kubectl get pods -l app.kubernetes.io/component=runner -n arc-runners --field-selector=status.phase=Running -o=jsonpath='{.items}' | jq 'length')"
RESOURCES="$(kubectl get pods -A)"
if [ "$LISTENER_COUNT" -eq 1 ]; then
echo "Listener is up!"
echo "$RESOURCES"
exit 0
fi
if [ "$count" -ge 120 ]; then
echo "Timeout waiting for listener to be recreated"
echo "$RESOURCES"
exit 1
fi
echo "Waiting for listener to be recreated"
echo "Listener count: $LISTENER_COUNT target: 1 | Runners count: $RUNNERS_COUNT target: 0"
sleep 1
count=$((count+1))
done
- name: Gather logs and cleanup
shell: bash
if: always()
run: |
helm uninstall "${{ steps.install_arc.outputs.ARC_NAME }}" --namespace "arc-runners" --debug
kubectl wait --timeout=10s --for=delete AutoScalingRunnerSet -n "${{ steps.install_arc.outputs.ARC_NAME }}" -l app.kubernetes.io/instance="${{ steps.install_arc.outputs.ARC_NAME }}"
kubectl logs deployment/arc-gha-runner-scale-set-controller -n "arc-systems"