mirror of
https://github.com/actions/actions-runner-controller.git
synced 2025-12-10 11:41:27 +00:00
Compare commits
325 Commits
325 commits, `fea1457f12` through `eb0a4a9603` (abbreviated SHAs only; the author and date columns of the commit table were not captured in this mirror).
@@ -11,3 +11,4 @@ charts
*.md
*.txt
*.sh
test/e2e/.docker-build
36 .github/ISSUE_TEMPLATE/bug_report.md vendored
@@ -1,36 +0,0 @@
---
name: Bug report
about: Create a report to help us improve
title: ''
assignees: ''

---

**Describe the bug**
A clear and concise description of what the bug is.

**Checks**

- [ ] My actions-runner-controller version (v0.x.y) does support the feature
- [ ] I'm using an unreleased version of the controller I built from HEAD of the default branch

**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error

**Expected behavior**
A clear and concise description of what you expected to happen.

**Screenshots**
If applicable, add screenshots to help explain your problem.

**Environment (please complete the following information):**
- Controller Version [e.g. 0.18.2]
- Deployment Method [e.g. Helm and Kustomize ]
- Helm Chart Version [e.g. 0.11.0, if applicable]

**Additional context**
Add any other context about the problem here.
177 .github/ISSUE_TEMPLATE/bug_report.yml vendored Normal file
@@ -0,0 +1,177 @@
name: Bug Report
description: File a bug report
title: "Bug"
labels: ["bug"]
body:
  - type: input
    id: controller-version
    attributes:
      label: Controller Version
      description: Refer to semver-like release tags for controller versions. Any release tags prefixed with `actions-runner-controller-` are for chart releases
      placeholder: ex. 0.18.2 or git commit ID
    validations:
      required: true
  - type: input
    id: chart-version
    attributes:
      label: Helm Chart Version
      description: Run `helm list` and see what's shown under CHART VERSION. Any release tags prefixed with `actions-runner-controller-` are for chart releases
      placeholder: ex. 0.11.0
  - type: input
    id: cert-manager-version
    attributes:
      label: CertManager Version
      description: Run `kubectl get po -o yaml $CERT_MANAGER_POD` and see the image tag, or run `helm list` and see what's shown under APP VERSION for your cert-manager Helm release.
      placeholder: ex. 1.8
  - type: dropdown
    id: deployment-method
    attributes:
      label: Deployment Method
      description: Which deployment method did you use to install ARC?
      options:
        - Helm
        - Kustomize
        - ArgoCD
        - Other
    validations:
      required: true
  - type: textarea
    id: cert-manager
    attributes:
      label: cert-manager installation
      description: Confirm that you've installed cert-manager correctly by answering a few questions
      placeholder: |
        - Did you follow https://github.com/actions-runner-controller/actions-runner-controller#installation? If not, describe the installation process so that we can reproduce your environment.
        - Are you sure you've installed cert-manager from an official source?
        (Note that we won't provide user support for cert-manager itself. Make sure cert-manager is fully working before testing ARC or reporting a bug.)
    validations:
      required: true
  - type: checkboxes
    id: checks
    attributes:
      label: Checks
      description: Please check the boxes below before submitting
      options:
        - label: This isn't a question or user support case (For Q&A and community support, go to [Discussions](https://github.com/actions-runner-controller/actions-runner-controller/discussions). It might also be a good idea to contract with any of the contributors and maintainers if your business is critical and you therefore need priority support.)
          required: true
        - label: I've read [releasenotes](https://github.com/actions-runner-controller/actions-runner-controller/tree/master/docs/releasenotes) before submitting this issue and I'm sure it's not due to any recently-introduced backward-incompatible changes
          required: true
        - label: My actions-runner-controller version (v0.x.y) does support the feature
          required: true
        - label: I've already upgraded ARC (including the CRDs, see charts/actions-runner-controller/docs/UPGRADING.md for details) to the latest and it didn't fix the issue
          required: true
  - type: textarea
    id: resource-definitions
    attributes:
      label: Resource Definitions
      description: "Add copy(s) of your resource definition(s) (RunnerDeployment or RunnerSet, and HorizontalRunnerAutoscaler. If RunnerSet, also include the StorageClass being used)"
      render: yaml
      placeholder: |
        apiVersion: actions.summerwind.dev/v1alpha1
        kind: RunnerDeployment
        metadata:
          name: example
        spec:
          #snip
        ---
        apiVersion: actions.summerwind.dev/v1alpha1
        kind: RunnerSet
        metadata:
          name: example
        spec:
          #snip
        ---
        apiVersion: storage.k8s.io/v1
        kind: StorageClass
        metadata:
          name: example
        provisioner: ...
        reclaimPolicy: ...
        volumeBindingMode: ...
        ---
        apiVersion: actions.summerwind.dev/v1alpha1
        kind: HorizontalRunnerAutoscaler
        metadata:
          name:
        spec:
          #snip
    validations:
      required: true
  - type: textarea
    id: reproduction-steps
    attributes:
      label: To Reproduce
      description: "Steps to reproduce the behavior"
      render: markdown
      placeholder: |
        1. Go to '...'
        2. Click on '....'
        3. Scroll down to '....'
        4. See error
    validations:
      required: true
  - type: textarea
    id: actual-behavior
    attributes:
      label: Describe the bug
      description: Also tell us, what did happen?
      placeholder: A clear and concise description of what happened.
    validations:
      required: true
  - type: textarea
    id: expected-behavior
    attributes:
      label: Describe the expected behavior
      description: Also tell us, what did you expect to happen?
      placeholder: A clear and concise description of what the expected behavior is.
    validations:
      required: true
  - type: textarea
    id: controller-logs
    attributes:
      label: Controller Logs
      description: "NEVER EVER OMIT THIS! Include logs from `actions-runner-controller`'s controller-manager pod"
      render: shell
      placeholder: |
        PROVIDE THE LOGS VIA A GIST LINK (https://gist.github.com/), NOT DIRECTLY IN THIS TEXT AREA

        To grab controller logs:

        # Set NS according to your setup
        NS=actions-runner-system

        # Grab the pod name and set it to $POD_NAME
        kubectl -n $NS get po

        kubectl -n $NS logs $POD_NAME > arc.log
    validations:
      required: true
  - type: textarea
    id: runner-pod-logs
    attributes:
      label: Runner Pod Logs
      description: "Include logs from runner pod(s)"
      render: shell
      placeholder: |
        PROVIDE THE LOGS VIA A GIST LINK (https://gist.github.com/), NOT DIRECTLY IN THIS TEXT AREA

        To grab the runner pod logs:

        # Set NS according to your setup. It should match your RunnerDeployment's metadata.namespace.
        NS=default

        # Grab the name of the problematic runner pod and set it to $POD_NAME
        kubectl -n $NS get po

        kubectl -n $NS logs $POD_NAME -c runner > runnerpod_runner.log
        kubectl -n $NS logs $POD_NAME -c docker > runnerpod_docker.log
    validations:
      required: true
  - type: textarea
    id: additional-context
    attributes:
      label: Additional Context
      description: |
        Add any other context about the problem here.

        Tip: You can attach images or log files by clicking this area to highlight it and then dragging files in.
15 .github/ISSUE_TEMPLATE/config.yml vendored Normal file
@@ -0,0 +1,15 @@
# Blank issues are mainly for maintainers who are known to write complete issue descriptions without needing to follow a form
blank_issues_enabled: true
contact_links:
  - name: Sponsor ARC Maintainers
    about: If your business relies on the continued maintenance of actions-runner-controller, please consider sponsoring the project and the maintainers.
    url: https://github.com/actions-runner-controller/actions-runner-controller/tree/master/CODEOWNERS
  - name: Ideas and Feature Requests
    about: Wanna request a feature? Create a discussion and collect :+1:s first.
    url: https://github.com/actions-runner-controller/actions-runner-controller/discussions/new?category=ideas
  - name: Questions and User Support
    about: Need support using ARC? We use Discussions as the place to provide community support.
    url: https://github.com/actions-runner-controller/actions-runner-controller/discussions/new?category=questions
  - name: Need Paid Support?
    about: Consider contracting with any of the actions-runner-controller maintainers and contributors.
    url: https://github.com/actions-runner-controller/actions-runner-controller/tree/master/CODEOWNERS
@@ -29,23 +29,23 @@ runs:
      shell: bash

    - name: Set up QEMU
      uses: docker/setup-qemu-action@v1
      uses: docker/setup-qemu-action@v2

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v1
      uses: docker/setup-buildx-action@v2
      with:
        version: latest

    - name: Login to DockerHub
      if: ${{ github.ref == 'master' && github.event.pull_request.merged == true }}
      uses: docker/login-action@v1
      if: ${{ github.event_name == 'release' || github.event_name == 'push' && github.ref == 'refs/heads/master' }}
      uses: docker/login-action@v2
      with:
        username: ${{ inputs.username }}
        password: ${{ inputs.password }}

    - name: Login to GitHub Container Registry
      uses: docker/login-action@v1
      if: ${{ github.ref == 'master' && github.event.pull_request.merged == true }}
      if: ${{ github.event_name == 'release' || github.event_name == 'push' && github.ref == 'refs/heads/master' }}
      uses: docker/login-action@v2
      with:
        registry: ghcr.io
        username: ${{ inputs.ghcr_username }}
25 .github/lock.yml vendored
@@ -1,25 +0,0 @@
# Configuration for Lock Threads
# Repo: https://github.com/dessant/lock-threads-app
# App: https://github.com/apps/lock

# Number of days of inactivity before a closed issue or pull request is locked
daysUntilLock: 7

# Skip issues and pull requests created before a given timestamp. Timestamp must
# follow ISO 8601 (`YYYY-MM-DD`). Set to `false` to disable
skipCreatedBefore: false

# Issues and pull requests with these labels will be ignored. Set to `[]` to disable
exemptLabels: []

# Label to add before locking, such as `outdated`. Set to `false` to disable
lockLabel: false

# Comment to post before locking. Set to `false` to disable
lockComment: >
  This thread has been automatically locked since there has not been
  any recent activity after it was closed. Please open a new issue for
  related bugs.

# Assign `resolved` as the reason for locking. Set to `false` to disable
setLockReason: true
24 .github/renovate.json5 vendored
@@ -13,11 +13,29 @@
    {
      // use https://github.com/actions/runner/releases
      "fileMatch": [
        ".github/workflows/runners.yml"
      ],
        ".github/workflows/runners.yaml"
      ],
      "matchStrings": ["RUNNER_VERSION: +(?<currentValue>.*?)\\n"],
      "depNameTemplate": "actions/runner",
      "datasourceTemplate": "github-releases"
    },
    {
      "fileMatch": [
        "runner/Makefile",
        "Makefile"
      ],
      "matchStrings": ["RUNNER_VERSION \\?= +(?<currentValue>.*?)\\n"],
      "depNameTemplate": "actions/runner",
      "datasourceTemplate": "github-releases"
    },
    {
      "fileMatch": [
        "runner/actions-runner.dockerfile",
        "runner/actions-runner-dind.dockerfile"
      ],
      "matchStrings": ["RUNNER_VERSION=+(?<currentValue>.*?)\\n"],
      "depNameTemplate": "actions/runner",
      "datasourceTemplate": "github-releases"
    }
  ]
}
}
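The three regex managers above all track `actions/runner` releases but match differently formatted version pins. As a quick, hedged illustration (file paths are taken from the `fileMatch` lists above; the grep pattern is mine, not part of the config), these are the kinds of lines they rewrite:

```sh
# List the pinned runner versions Renovate's regex managers would update.
# Run from the repository root; the file list mirrors the fileMatch patterns.
grep -EnH 'RUNNER_VERSION( \?=|:|=)' \
  .github/workflows/runners.yaml \
  Makefile runner/Makefile \
  runner/actions-runner.dockerfile runner/actions-runner-dind.dockerfile
# Example matches seen elsewhere in this diff:
#   .github/workflows/runners.yaml ->  RUNNER_VERSION: 2.294.0
#   Makefile                       ->  RUNNER_VERSION ?= 2.294.0
```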
67 .github/stale.yml vendored
@@ -1,67 +0,0 @@
# Configuration for probot-stale - https://github.com/probot/stale

# Number of days of inactivity before an Issue or Pull Request becomes stale
daysUntilStale: 30

# Number of days of inactivity before an Issue or Pull Request with the stale label is closed.
# Set to false to disable. If disabled, issues still need to be closed manually, but will remain marked as stale.
daysUntilClose: 14

# Only issues or pull requests with all of these labels are check if stale. Defaults to `[]` (disabled)
onlyLabels: []

# Issues or Pull Requests with these labels will never be considered stale. Set to `[]` to disable
exemptLabels:
  - pinned
  - security
  - enhancement
  - refactor
  - documentation
  - chore
  - bug
  - dependencies
  - needs-investigation

# Set to true to ignore issues in a project (defaults to false)
exemptProjects: false

# Set to true to ignore issues in a milestone (defaults to false)
exemptMilestones: false

# Set to true to ignore issues with an assignee (defaults to false)
exemptAssignees: false

# Label to use when marking as stale
staleLabel: stale

# Comment to post when marking as stale. Set to `false` to disable
markComment: >
  This issue has been automatically marked as stale because it has not had
  recent activity. It will be closed if no further activity occurs. Thank you
  for your contributions.

# Comment to post when removing the stale label.
# unmarkComment: >
#   Your comment here.

# Comment to post when closing a stale Issue or Pull Request.
# closeComment: >
#   Your comment here.

# Limit the number of actions per hour, from 1-30. Default is 30
limitPerRun: 30

# Limit to only `issues` or `pulls`
# only: issues

# Optionally, specify configuration settings that are specific to just 'issues' or 'pulls':
# pulls:
#   daysUntilStale: 30
#   markComment: >
#     This pull request has been automatically marked as stale because it has not had
#     recent activity. It will be closed if no further activity occurs. Thank you
#     for your contributions.

# issues:
#   exemptLabels:
#     - confirmed
@@ -1,26 +1,28 @@
name: Publish Controller Image
name: Publish ARC

on:
  release:
    types: [published]
    types:
      - published

# https://docs.github.com/en/rest/overview/permissions-required-for-github-apps
permissions:
  contents: write
  packages: write

jobs:
  build:
    runs-on: ubuntu-latest
  release-controller:
    name: Release
    runs-on: ubuntu-latest
    env:
      DOCKERHUB_USERNAME: ${{ secrets.DOCKER_USER }}
    steps:
      - name: Set outputs
        id: vars
        run: echo ::set-output name=sha_short::${GITHUB_SHA::7}

      - name: Checkout
        uses: actions/checkout@v2
        uses: actions/checkout@v3

      - uses: actions/setup-go@v2
      - uses: actions/setup-go@v3
        with:
          go-version: '^1.17.7'
          go-version: '1.18.2'

      - name: Install tools
        run: |
@@ -39,25 +41,20 @@ jobs:
      - name: Upload artifacts
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: make github-release
        run: |
          make github-release

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v1

      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@v1
      - name: Setup Docker Environment
        id: vars
        uses: ./.github/actions/setup-docker-environment
        with:
          version: latest

      - name: Login to DockerHub
        uses: docker/login-action@v1
        with:
          username: ${{ secrets.DOCKER_USER }}
          username: ${{ env.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
          ghcr_username: ${{ github.actor }}
          ghcr_password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and Push
        uses: docker/build-push-action@v2
        uses: docker/build-push-action@v3
        with:
          file: Dockerfile
          platforms: linux/amd64,linux/arm64
@@ -66,4 +63,8 @@ jobs:
            ${{ env.DOCKERHUB_USERNAME }}/actions-runner-controller:latest
            ${{ env.DOCKERHUB_USERNAME }}/actions-runner-controller:${{ env.VERSION }}
            ${{ env.DOCKERHUB_USERNAME }}/actions-runner-controller:${{ env.VERSION }}-${{ steps.vars.outputs.sha_short }}

            ghcr.io/actions-runner-controller/actions-runner-controller:latest
            ghcr.io/actions-runner-controller/actions-runner-controller:${{ env.VERSION }}
            ghcr.io/actions-runner-controller/actions-runner-controller:${{ env.VERSION }}-${{ steps.vars.outputs.sha_short }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
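The `sha_short` output in the workflow above is produced with Bash substring expansion. As a standalone illustration (the SHA value below is hypothetical, used only to show the expansion):

```sh
# Bash substring expansion ${var::7} keeps the first 7 characters of the value
GITHUB_SHA=fea1457f12aabbccddeeff00112233445566aabb   # example value only
echo "${GITHUB_SHA::7}"                               # prints: fea1457
```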
58 .github/workflows/publish-canary.yaml vendored Normal file
@@ -0,0 +1,58 @@
name: Publish Canary Image

on:
  push:
    branches:
      - master
    paths-ignore:
      - '**.md'
      - '.github/ISSUE_TEMPLATE/**'
      - '.github/workflows/validate-chart.yaml'
      - '.github/workflows/publish-chart.yaml'
      - '.github/workflows/publish-arc.yaml'
      - '.github/workflows/runners.yaml'
      - '.github/workflows/validate-entrypoint.yaml'
      - '.github/renovate.*'
      - 'runner/**'
      - '.gitignore'
      - 'PROJECT'
      - 'LICENSE'
      - 'Makefile'

# https://docs.github.com/en/rest/overview/permissions-required-for-github-apps
permissions:
  contents: read
  packages: write

jobs:
  canary-build:
    name: Build and Publish Canary Image
    runs-on: ubuntu-latest
    env:
      DOCKERHUB_USERNAME: ${{ secrets.DOCKER_USER }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Setup Docker Environment
        id: vars
        uses: ./.github/actions/setup-docker-environment
        with:
          username: ${{ env.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
          ghcr_username: ${{ github.actor }}
          ghcr_password: ${{ secrets.GITHUB_TOKEN }}

      # Considered unstable builds
      # See Issue #285, PR #286, and PR #323 for more information
      - name: Build and Push
        uses: docker/build-push-action@v3
        with:
          file: Dockerfile
          platforms: linux/amd64,linux/arm64
          push: true
          tags: |
            ${{ env.DOCKERHUB_USERNAME }}/actions-runner-controller:canary
            ghcr.io/actions-runner-controller/actions-runner-controller:canary
          cache-from: type=gha,scope=arc-canary
          cache-to: type=gha,mode=max,scope=arc-canary
@@ -1,4 +1,4 @@
name: Publish helm chart
name: Publish Helm Chart

on:
  push:
@@ -6,7 +6,7 @@ on:
      - master
    paths:
      - 'charts/**'
      - '.github/workflows/on-push-master-publish-chart.yml'
      - '.github/workflows/publish-chart.yaml'
      - '!charts/actions-runner-controller/docs/**'
      - '!**.md'
  workflow_dispatch:
@@ -15,20 +15,23 @@ env:
  KUBE_SCORE_VERSION: 1.10.0
  HELM_VERSION: v3.8.0

permissions:
  contents: read

jobs:
  lint-chart:
    runs-on: ubuntu-latest
    name: Lint Chart
    runs-on: ubuntu-latest
    outputs:
      publish-chart: ${{ steps.publish-chart-step.outputs.publish }}
    steps:
      - name: Checkout
        uses: actions/checkout@v2
        uses: actions/checkout@v3
        with:
          fetch-depth: 0

      - name: Set up Helm
        uses: azure/setup-helm@v2.0
        uses: azure/setup-helm@v3.0
        with:
          version: ${{ env.HELM_VERSION }}

@@ -49,12 +52,12 @@ jobs:
            --enable-optional-test container-security-context-readonlyrootfilesystem

      # python is a requirement for the chart-testing action below (supports yamllint among other tests)
      - uses: actions/setup-python@v2
      - uses: actions/setup-python@v4
        with:
          python-version: 3.7
          python-version: '3.7'

      - name: Set up chart-testing
        uses: helm/chart-testing-action@v2.2.0
        uses: helm/chart-testing-action@v2.2.1

      - name: Run chart-testing (list-changed)
        id: list-changed
@@ -65,22 +68,23 @@ jobs:
          fi

      - name: Run chart-testing (lint)
        run: ct lint --config charts/.ci/ct-config.yaml
        run: |
          ct lint --config charts/.ci/ct-config.yaml

      - name: Create kind cluster
        uses: helm/kind-action@v1.2.0
        if: steps.list-changed.outputs.changed == 'true'
        uses: helm/kind-action@v1.3.0

      # We need cert-manager already installed in the cluster because we assume the CRDs exist
      - name: Install cert-manager
        if: steps.list-changed.outputs.changed == 'true'
        run: |
          helm repo add jetstack https://charts.jetstack.io --force-update
          helm install cert-manager jetstack/cert-manager --set installCRDs=true --wait
        if: steps.list-changed.outputs.changed == 'true'

      - name: Run chart-testing (install)
        run: ct install --config charts/.ci/ct-config.yaml
        if: steps.list-changed.outputs.changed == 'true'
        run: ct install --config charts/.ci/ct-config.yaml

      # WARNING: This relies on the latest release being at the top of the JSON from GitHub and a clean chart.yaml
      - name: Check if Chart Publish is Needed
@@ -99,12 +103,15 @@ jobs:
  publish-chart:
    if: needs.lint-chart.outputs.publish-chart == 'true'
    needs: lint-chart
    runs-on: ubuntu-latest
    name: Publish Chart
    runs-on: ubuntu-latest
    permissions:
      contents: write # for helm/chart-releaser-action to push chart release and create a release

    steps:
      - name: Checkout
        uses: actions/checkout@v2
        uses: actions/checkout@v3
        with:
          fetch-depth: 0

@@ -114,7 +121,7 @@ jobs:
          git config user.email "$GITHUB_ACTOR@users.noreply.github.com"

      - name: Run chart-releaser
        uses: helm/chart-releaser-action@v1.3.0
        uses: helm/chart-releaser-action@v1.4.0
        env:
          CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
32 .github/workflows/run-codeql.yaml vendored Normal file
@@ -0,0 +1,32 @@
name: Run CodeQL

on:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master
  schedule:
    - cron: '30 1 * * 0'

jobs:
  analyze:
    name: Analyze
    runs-on: ubuntu-latest
    permissions:
      security-events: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Initialize CodeQL
        uses: github/codeql-action/init@v2
        with:
          languages: go

      - name: Autobuild
        uses: github/codeql-action/autobuild@v2

      - name: Perform CodeQL Analysis
        uses: github/codeql-action/analyze@v2
25 .github/workflows/run-stale.yaml vendored Normal file
@@ -0,0 +1,25 @@
name: Run Stale Bot
on:
  schedule:
    - cron: '30 1 * * *'

permissions:
  contents: read

jobs:
  stale:
    name: Run Stale
    runs-on: ubuntu-latest
    permissions:
      issues: write # for actions/stale to close stale issues
      pull-requests: write # for actions/stale to close stale PRs
    steps:
      - uses: actions/stale@v5
        with:
          stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.'
          # turn off stale for both issues and PRs
          days-before-stale: -1
          # turn stale back on for issues only
          days-before-issue-stale: 30
          days-before-issue-close: 14
          exempt-issue-labels: 'pinned,security,enhancement,refactor,documentation,chore,bug,dependencies,needs-investigation'
@@ -2,30 +2,41 @@ name: Runners

on:
  pull_request:
    types:
    types:
      - opened
      - synchronize
      - reopened
      - closed
    branches:
      - 'master'
    paths:
      - 'runner/**'
      - .github/workflows/runners.yml
      - '!runner/Makefile'
      - '.github/workflows/runners.yaml'
      - '!**.md'
  # We must do a trigger on a push: instead of a types: closed so GitHub Secrets
  # are available to the workflow run
  push:
    branches:
      - 'master'
    paths:
      - 'runner/**'
      - '!runner/Makefile'
      - '.github/workflows/runners.yaml'
      - '!**.md'

env:
  RUNNER_VERSION: 2.287.1
  RUNNER_VERSION: 2.294.0
  DOCKER_VERSION: 20.10.12
  RUNNER_CONTAINER_HOOKS_VERSION: 0.1.2
  DOCKERHUB_USERNAME: summerwind

jobs:
  build:
  build-runners:
    name: Build ${{ matrix.name }}-${{ matrix.os-name }}-${{ matrix.os-version }}
    runs-on: ubuntu-latest
    permissions:
      packages: write
      contents: read
    name: Build ${{ matrix.name }}-${{ matrix.os-name }}-${{ matrix.os-version }}
    strategy:
      fail-fast: false
      matrix:
@@ -33,35 +44,34 @@ jobs:
        - name: actions-runner
          os-name: ubuntu
          os-version: 20.04
          dockerfile: Dockerfile
        - name: actions-runner-dind
          os-name: ubuntu
          os-version: 20.04
          dockerfile: Dockerfile.dindrunner

    steps:
      - name: Checkout
        uses: actions/checkout@v2
        uses: actions/checkout@v3

      - name: Setup Docker Environment
        id: vars
        uses: ./.github/actions/setup-docker-environment
        with:
        with:
          username: ${{ env.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
          ghcr_username: ${{ github.actor }}
          ghcr_password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and Push Versioned Tags
        uses: docker/build-push-action@v2
        uses: docker/build-push-action@v3
        with:
          context: ./runner
          file: ./runner/${{ matrix.dockerfile }}
          file: ./runner/${{ matrix.name }}.dockerfile
          platforms: linux/amd64,linux/arm64
          push: ${{ github.ref == 'master' && github.event.pull_request.merged == true }}
          push: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
          build-args: |
            RUNNER_VERSION=${{ env.RUNNER_VERSION }}
            DOCKER_VERSION=${{ env.DOCKER_VERSION }}
            RUNNER_CONTAINER_HOOKS_VERSION=${{ env.RUNNER_CONTAINER_HOOKS_VERSION }}
          tags: |
            ${{ env.DOCKERHUB_USERNAME }}/${{ matrix.name }}:v${{ env.RUNNER_VERSION }}-${{ matrix.os-name }}-${{ matrix.os-version }}
            ${{ env.DOCKERHUB_USERNAME }}/${{ matrix.name }}:v${{ env.RUNNER_VERSION }}-${{ matrix.os-name }}-${{ matrix.os-version }}-${{ steps.vars.outputs.sha_short }}
@@ -69,5 +79,5 @@ jobs:
            ghcr.io/${{ github.repository }}/${{ matrix.name }}:latest
            ghcr.io/${{ github.repository }}/${{ matrix.name }}:v${{ env.RUNNER_VERSION }}-${{ matrix.os-name }}-${{ matrix.os-version }}
            ghcr.io/${{ github.repository }}/${{ matrix.name }}:v${{ env.RUNNER_VERSION }}-${{ matrix.os-name }}-${{ matrix.os-version }}-${{ steps.vars.outputs.sha_short }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
          cache-from: type=gha,scope=build-${{ matrix.name }}
          cache-to: type=gha,mode=max,scope=build-${{ matrix.name }}
39 .github/workflows/test.yaml vendored
@@ -1,39 +0,0 @@
name: CI

on:
  pull_request:
    branches:
      - master
    paths-ignore:
      - .github/workflows/runners.yml
      - .github/workflows/on-push-lint-charts.yml
      - .github/workflows/on-push-master-publish-chart.yml
      - .github/workflows/release.yml
      - .github/workflows/test-entrypoint.yml
      - .github/workflows/wip.yml
      - 'runner/**'
      - '**.md'
      - '.gitignore'

jobs:
  test:
    runs-on: ubuntu-latest
    name: Test
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      - uses: actions/setup-go@v2
        with:
          go-version: '^1.17.7'
      - run: go version
      - name: Install kubebuilder
        run: |
          curl -L -O https://github.com/kubernetes-sigs/kubebuilder/releases/download/v2.3.2/kubebuilder_2.3.2_linux_amd64.tar.gz
          tar zxvf kubebuilder_2.3.2_linux_amd64.tar.gz
          sudo mv kubebuilder_2.3.2_linux_amd64 /usr/local/kubebuilder
      - name: Run tests
        run: make test
      - name: Verify manifests are up-to-date
        run: |
          make manifests
          git diff --exit-code
60 .github/workflows/validate-arc.yaml vendored Normal file
@@ -0,0 +1,60 @@
name: Validate ARC

on:
  pull_request:
    branches:
      - master
    paths-ignore:
      - '**.md'
      - '.github/ISSUE_TEMPLATE/**'
      - '.github/workflows/publish-canary.yaml'
      - '.github/workflows/validate-chart.yaml'
      - '.github/workflows/publish-chart.yaml'
      - '.github/workflows/runners.yaml'
      - '.github/workflows/publish-arc.yaml'
      - '.github/workflows/validate-entrypoint.yaml'
      - '.github/renovate.*'
      - 'runner/**'
      - '.gitignore'
      - 'PROJECT'
      - 'LICENSE'
      - 'Makefile'

permissions:
  contents: read

jobs:
  test-controller:
    name: Test ARC
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Set-up Go
        uses: actions/setup-go@v3
        with:
          go-version: '1.18.2'
          check-latest: false

      - uses: actions/cache@v3
        with:
          path: ~/go/pkg/mod
          key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }}
          restore-keys: |
            ${{ runner.os }}-go-

      - name: Install kubebuilder
        run: |
          curl -L -O https://github.com/kubernetes-sigs/kubebuilder/releases/download/v2.3.2/kubebuilder_2.3.2_linux_amd64.tar.gz
          tar zxvf kubebuilder_2.3.2_linux_amd64.tar.gz
          sudo mv kubebuilder_2.3.2_linux_amd64 /usr/local/kubebuilder

      - name: Run tests
        run: |
          make test

      - name: Verify manifests are up-to-date
        run: |
          make manifests
          git diff --exit-code
@@ -1,10 +1,10 @@
name: Lint and Test Charts
name: Validate Helm Chart

on:
  push:
    paths:
      - 'charts/**'
      - '.github/workflows/on-push-lint-charts.yml'
      - '.github/workflows/validate-chart.yaml'
      - '!charts/actions-runner-controller/docs/**'
      - '!**.md'
  workflow_dispatch:
@@ -12,18 +12,21 @@ env:
  KUBE_SCORE_VERSION: 1.10.0
  HELM_VERSION: v3.8.0

permissions:
  contents: read

jobs:
  lint-test:
    runs-on: ubuntu-latest
  validate-chart:
    name: Lint Chart
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v2
        uses: actions/checkout@v3
        with:
          fetch-depth: 0

      - name: Set up Helm
        uses: azure/setup-helm@v2.0
        uses: azure/setup-helm@v3.0
        with:
          version: ${{ env.HELM_VERSION }}

@@ -44,12 +47,12 @@ jobs:
            --enable-optional-test container-security-context-readonlyrootfilesystem

      # python is a requirement for the chart-testing action below (supports yamllint among other tests)
      - uses: actions/setup-python@v2
      - uses: actions/setup-python@v4
        with:
          python-version: 3.7
          python-version: '3.7'

      - name: Set up chart-testing
        uses: helm/chart-testing-action@v2.2.0
        uses: helm/chart-testing-action@v2.2.1

      - name: Run chart-testing (list-changed)
        id: list-changed
@@ -60,18 +63,20 @@ jobs:
          fi

      - name: Run chart-testing (lint)
        run: ct lint --config charts/.ci/ct-config.yaml
        run: |
          ct lint --config charts/.ci/ct-config.yaml

      - name: Create kind cluster
        uses: helm/kind-action@v1.2.0
        uses: helm/kind-action@v1.3.0
        if: steps.list-changed.outputs.changed == 'true'

      # We need cert-manager already installed in the cluster because we assume the CRDs exist
      - name: Install cert-manager
        if: steps.list-changed.outputs.changed == 'true'
        run: |
          helm repo add jetstack https://charts.jetstack.io --force-update
          helm install cert-manager jetstack/cert-manager --set installCRDs=true --wait
        if: steps.list-changed.outputs.changed == 'true'

      - name: Run chart-testing (install)
        run: ct install --config charts/.ci/ct-config.yaml
        run: |
          ct install --config charts/.ci/ct-config.yaml
@@ -1,4 +1,4 @@
name: Unit tests for entrypoint
name: Validate Runners

on:
  pull_request:
@@ -9,14 +9,17 @@ on:
      - 'test/entrypoint/**'
      - '!**.md'

permissions:
  contents: read

jobs:
  test:
    runs-on: ubuntu-latest
  test-runner-entrypoint:
    name: Test entrypoint
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      - name: Run unit tests for entrypoint.sh
        uses: actions/checkout@v3

      - name: Run tests
        run: |
          cd test/entrypoint
          bash entrypoint_unittest.sh
          make acceptance/runner/entrypoint
51 .github/workflows/wip.yml vendored
@@ -1,51 +0,0 @@
name: Publish Canary Image

on:
  push:
    branches:
      - master
    paths-ignore:
      - .github/workflows/runners.yml
      - .github/workflows/on-push-lint-charts.yml
      - .github/workflows/on-push-master-publish-chart.yml
      - .github/workflows/release.yml
      - .github/workflows/test-entrypoint.yml
      - "runner/**"
      - "**.md"
      - ".gitignore"

jobs:
  build:
    runs-on: ubuntu-latest
    name: Build and Publish Canary Image
    env:
      DOCKERHUB_USERNAME: ${{ secrets.DOCKER_USER }}
    steps:
      - name: Checkout
        uses: actions/checkout@v2

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v1

      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@v1
        with:
          version: latest

      - name: Login to DockerHub
        uses: docker/login-action@v1
        with:
          username: ${{ secrets.DOCKER_USER }}
          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}

      # Considered unstable builds
      # See Issue #285, PR #286, and PR #323 for more information
      - name: Build and Push
        uses: docker/build-push-action@v2
        with:
          file: Dockerfile
          platforms: linux/amd64,linux/arm64
          push: true
          tags: |
            ${{ env.DOCKERHUB_USERNAME }}/actions-runner-controller:canary
2 CODEOWNERS Normal file
@@ -0,0 +1,2 @@
# actions-runner-controller maintainers
* @mumoshu @toast-gear
43 Dockerfile
@@ -1,29 +1,44 @@
# Build the manager binary
FROM golang:1.17 as builder

ARG TARGETPLATFORM
FROM --platform=$BUILDPLATFORM golang:1.18.3 as builder

WORKDIR /workspace

ENV GO111MODULE=on \
    CGO_ENABLED=0
# Make it runnable on a distroless image/without libc
ENV CGO_ENABLED=0

# Copy the Go Modules manifests
COPY go.mod go.sum ./

# cache deps before building and copying source so that we don't need to re-download as much
# and so that source changes don't invalidate our downloaded layer
# and so that source changes don't invalidate our downloaded layer.
#
# Also, we need to do this before setting TARGETPLATFORM/TARGETOS/TARGETARCH/TARGETVARIANT
# so that go mod cache is shared across platforms.
RUN go mod download

# Copy the go source
COPY . .
# COPY . .

# Usage:
#   docker buildx build --tag repo/img:tag -f ./Dockerfile . --platform linux/amd64,linux/arm64,linux/arm/v7
#
# With the above command,
# TARGETOS can be "linux", TARGETARCH can be "amd64", "arm64", and "arm", TARGETVARIANT can be "v7".

ARG TARGETPLATFORM TARGETOS TARGETARCH TARGETVARIANT

# We intentionally avoid `--mount=type=cache,mode=0777,target=/go/pkg/mod` in the `go mod download` and the `go build` runs
# to avoid https://github.com/moby/buildkit/issues/2334
# We can use docker layer cache so the build is fast enough anyway
# We also use per-platform GOCACHE for the same reason.
env GOCACHE /build/${TARGETPLATFORM}/root/.cache/go-build

# Build
RUN export GOOS=$(echo ${TARGETPLATFORM} | cut -d / -f1) && \
    export GOARCH=$(echo ${TARGETPLATFORM} | cut -d / -f2) && \
    GOARM=$(echo ${TARGETPLATFORM} | cut -d / -f3 | cut -c2-) && \
    go build -a -o manager main.go && \
    go build -a -o github-webhook-server ./cmd/githubwebhookserver
RUN --mount=target=. \
    --mount=type=cache,mode=0777,target=${GOCACHE} \
    export GOOS=${TARGETOS} GOARCH=${TARGETARCH} GOARM=${TARGETVARIANT#v} && \
    go build -o /out/manager main.go && \
    go build -o /out/github-webhook-server ./cmd/githubwebhookserver

# Use distroless as minimal base image to package the manager binary
# Refer to https://github.com/GoogleContainerTools/distroless for more details
@@ -31,8 +46,8 @@ FROM gcr.io/distroless/static:nonroot

WORKDIR /

COPY --from=builder /workspace/manager .
COPY --from=builder /workspace/github-webhook-server .
COPY --from=builder /out/manager .
COPY --from=builder /out/github-webhook-server .

USER nonroot:nonroot
21 Makefile
@@ -5,6 +5,7 @@ else
endif
DOCKER_USER ?= $(shell echo ${NAME} | cut -d / -f1)
VERSION ?= latest
RUNNER_VERSION ?= 2.294.0
TARGETPLATFORM ?= $(shell arch)
RUNNER_NAME ?= ${DOCKER_USER}/actions-runner
RUNNER_TAG ?= ${VERSION}
@@ -12,9 +13,8 @@ TEST_REPO ?= ${DOCKER_USER}/actions-runner-controller
TEST_ORG ?=
TEST_ORG_REPO ?=
TEST_EPHEMERAL ?= false
SYNC_PERIOD ?= 5m
SYNC_PERIOD ?= 1m
USE_RUNNERSET ?=
RUNNER_FEATURE_FLAG_EPHEMERAL ?=
KUBECONTEXT ?= kind-acceptance
CLUSTER ?= acceptance
CERT_MANAGER_VERSION ?= v1.1.1
@@ -56,6 +56,7 @@ GO_TEST_ARGS ?= -short
# Run tests
test: generate fmt vet manifests
    go test $(GO_TEST_ARGS) ./... -coverprofile cover.out
    go test -fuzz=Fuzz -fuzztime=10s -run=Fuzz* ./controllers

test-with-deps: kube-apiserver etcd kubectl
    # See https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/envtest#pkg-constants
@@ -109,13 +110,9 @@ vet:
generate: controller-gen
    $(CONTROLLER_GEN) object:headerFile=./hack/boilerplate.go.txt paths="./..."

# Build the docker image
docker-build:
    docker build -t ${NAME}:${VERSION} .
    docker build -t ${RUNNER_NAME}:${RUNNER_TAG} --build-arg TARGETPLATFORM=${TARGETPLATFORM} runner

docker-buildx:
    export DOCKER_CLI_EXPERIMENTAL=enabled
    export DOCKER_CLI_EXPERIMENTAL=enabled ;\
    export DOCKER_BUILDKIT=1
    @if ! docker buildx ls | grep -q container-builder; then\
        docker buildx create --platform ${PLATFORMS} --name container-builder --use;\
    fi
@@ -191,12 +188,14 @@ acceptance/deploy:
    TEST_ORG=${TEST_ORG} TEST_ORG_REPO=${TEST_ORG_REPO} SYNC_PERIOD=${SYNC_PERIOD} \
    USE_RUNNERSET=${USE_RUNNERSET} \
    TEST_EPHEMERAL=${TEST_EPHEMERAL} \
    RUNNER_FEATURE_FLAG_EPHEMERAL=${RUNNER_FEATURE_FLAG_EPHEMERAL} \
    acceptance/deploy.sh

acceptance/tests:
    acceptance/checks.sh

acceptance/runner/entrypoint:
    cd test/entrypoint/ && bash test.sh

# We use -count=1 instead of `go clean -testcache`
# See https://terratest.gruntwork.io/docs/testing-best-practices/avoid-test-caching/
.PHONY: e2e
@@ -223,7 +222,7 @@ ifeq (, $(wildcard $(GOBIN)/controller-gen))
    CONTROLLER_GEN_TMP_DIR=$$(mktemp -d) ;\
    cd $$CONTROLLER_GEN_TMP_DIR ;\
    go mod init tmp ;\
    go get sigs.k8s.io/controller-tools/cmd/controller-gen@v0.7.0 ;\
    go install sigs.k8s.io/controller-tools/cmd/controller-gen@v0.7.0 ;\
    rm -rf $$CONTROLLER_GEN_TMP_DIR ;\
    }
endif
@@ -243,7 +242,7 @@ ifeq (, $(wildcard $(GOBIN)/yq))
    YQ_TMP_DIR=$$(mktemp -d) ;\
    cd $$YQ_TMP_DIR ;\
    go mod init tmp ;\
    go get github.com/mikefarah/yq/v3@3.4.0 ;\
    go install github.com/mikefarah/yq/v3@3.4.0 ;\
    rm -rf $$YQ_TMP_DIR ;\
    }
endif
22 SECURITY.md Normal file
@@ -0,0 +1,22 @@
# Security Policy

## Sponsoring the project

This project is maintained by a small team of two and therefore lacks the resources to provide security fixes in a timely manner.

If you have an important business that relies on this project, please consider sponsoring the project so that the maintainer(s) can commit to providing such service.

Please refer to https://github.com/sponsors/actions-runner-controller for available tiers.

## Supported Versions

| Version | Supported |
| ------- | ------------------ |
| 0.23.0 | :white_check_mark: |
| < 0.23.0| :x: |

## Reporting a Vulnerability

To report a security issue, please email ykuoka+arcsecurity(at)gmail.com with a description of the issue, the steps you took to create the issue, affected versions, and, if known, mitigations for the issue.

A maintainer will try to respond within 5 working days. If the issue is confirmed as a vulnerability, a Security Advisory will be opened. This project tries to follow a 90 day disclosure timeline.
@@ -1,10 +1,93 @@
# Troubleshooting

* [Invalid header field value](#invalid-header-field-value)
* [Runner coming up before network available](#runner-coming-up-before-network-available)
* [Deployment fails on GKE due to webhooks](#deployment-fails-on-gke-due-to-webhooks)
* [Tools](#tools)
* [Installation](#installation)
  * [InternalError when calling webhook: context deadline exceeded](#internalerror-when-calling-webhook-context-deadline-exceeded)
  * [Invalid header field value](#invalid-header-field-value)
  * [Helm chart install failure: certificate signed by unknown authority](#helm-chart-install-failure-certificate-signed-by-unknown-authority)
* [Operations](#operations)
  * [Stuck runner kind or backing pod](#stuck-runner-kind-or-backing-pod)
  * [Delay in jobs being allocated to runners](#delay-in-jobs-being-allocated-to-runners)
  * [Runner coming up before network available](#runner-coming-up-before-network-available)
  * [Outgoing network action hangs indefinitely](#outgoing-network-action-hangs-indefinitely)
  * [Unable to scale to zero with TotalNumberOfQueuedAndInProgressWorkflowRuns](#unable-to-scale-to-zero-with-totalnumberofqueuedandinprogressworkflowruns)

## Invalid header field value
## Tools

A list of tools which are helpful for troubleshooting

* https://github.com/rewanthtammana/kubectl-fields Kubernetes resources hierarchy parsing tool
* https://github.com/stern/stern Multi pod and container log tailing for Kubernetes (example below)
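For example, stern can follow the controller's logs across pod restarts during troubleshooting (a hedged sketch; the namespace and pod-name prefix below assume the default `actions-runner-system` install used elsewhere in this guide):

```sh
# Tail all containers of the ARC controller-manager pods, following restarts
stern --namespace actions-runner-system actions-runner-controller
```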
## Installation

Troubleshooting runbooks that relate to ARC installation problems

### InternalError when calling webhook: context deadline exceeded

**Problem**

This issue can come up for various reasons, such as leftovers from previous installations, or the control plane being unable to reach the cluster IP of the K8s Service associated with ARC's admission webhook server.

```
Internal error occurred: failed calling webhook "mutate.runnerdeployment.actions.summerwind.dev":
Post "https://actions-runner-controller-webhook.actions-runner-system.svc:443/mutate-actions-summerwind-dev-v1alpha1-runnerdeployment?timeout=10s": context deadline exceeded
```

**Solution**

First, try the common fix of checking for webhook leftovers from previous installations:

1. ```bash
   kubectl get validatingwebhookconfiguration -A
   kubectl get mutatingwebhookconfiguration -A
   ```
2. If you see any webhooks related to actions-runner-controller, delete them:
   ```bash
   kubectl delete mutatingwebhookconfiguration actions-runner-controller-mutating-webhook-configuration
   kubectl delete validatingwebhookconfiguration actions-runner-controller-validating-webhook-configuration
   ```

If that didn't work, your K8s control plane is probably unable to reach the cluster IP of the Service that fronts ARC's admission webhook server (a quick reachability check is sketched below). Typical causes:

1. You're running the apiserver as a binary and you didn't make service cluster IPs available to the host network.
2. You're running the apiserver in a pod, but your pod network (i.e. the CNI plugin installation and config) is broken, so pods on the control-plane nodes (like kube-apiserver) can't reach ARC's admission webhook server pod(s), which likely run on data-plane nodes.
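If you need to confirm whether the webhook Service is reachable at all, a rough check like the following can help (a sketch only; the Service name and namespace are taken from the error message above, and `curlimages/curl` is just a convenient throwaway image):

```sh
# Does the webhook Service exist, and does it have ready endpoints?
kubectl -n actions-runner-system get svc actions-runner-controller-webhook
kubectl -n actions-runner-system get endpoints actions-runner-controller-webhook

# Can the webhook be reached from inside the cluster? Any HTTP status code
# printed (even 404) means TCP connectivity and TLS termination are working;
# 000 means the connection itself failed.
kubectl run webhook-probe --rm -it --restart=Never --image=curlimages/curl -- \
  curl -sk -o /dev/null -w '%{http_code}\n' \
  https://actions-runner-controller-webhook.actions-runner-system.svc:443/
```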
Another reason could be due to GKEs firewall settings you may run into the following errors when trying to deploy runners on a private GKE cluster:
|
||||
|
||||
To fix this, you may either:
|
||||
|
||||
1. Configure the webhook to use another port, such as 443 or 10250, [each of
|
||||
which allow traffic by default](https://cloud.google.com/kubernetes-engine/docs/how-to/private-clusters#add_firewall_rules).
|
||||
|
||||
```sh
|
||||
# With helm, you'd set `webhookPort` to the port number of your choice
|
||||
# See https://github.com/actions-runner-controller/actions-runner-controller/pull/1410/files for more information
|
||||
helm upgrade --install --namespace actions-runner-system --create-namespace \
|
||||
--wait actions-runner-controller actions-runner-controller/actions-runner-controller \
|
||||
--set webhookPort=10250
|
||||
```
|
||||
|
||||
2. Set up a firewall rule to allow the master node to connect to the default
|
||||
webhook port. The exact way to do this may vary, but the following script
|
||||
should point you in the right direction:
|
||||
|
||||
```sh
|
||||
# 1) Retrieve the network tag automatically given to the worker nodes
|
||||
# NOTE: this only works if you have only one cluster in your GCP project. You will have to manually inspect the result of this command to find the tag for the cluster you want to target
|
||||
WORKER_NODES_TAG=$(gcloud compute instances list --format='text(tags.items[0])' --filter='metadata.kubelet-config:*' | grep tags | awk '{print $2}' | sort | uniq)
|
||||
|
||||
# 2) Take note of the VPC network in which you deployed your cluster
|
||||
# NOTE this only works if you have only one network in which you deploy your clusters
|
||||
NETWORK=$(gcloud compute instances list --format='text(networkInterfaces[0].network)' --filter='metadata.kubelet-config:*' | grep networks | awk -F'/' '{print $NF}' | sort | uniq)
|
||||
|
||||
# 3) Get the master source ip block
|
||||
SOURCE=$(gcloud container clusters describe <cluster-name> --region <region> | grep masterIpv4CidrBlock| cut -d ':' -f 2 | tr -d ' ')
|
||||
|
||||
gcloud compute firewall-rules create k8s-cert-manager --source-ranges $SOURCE --target-tags $WORKER_NODES_TAG --allow TCP:9443 --network $NETWORK
|
||||
```
|
||||
|
||||
### Invalid header field value
|
||||
|
||||
**Problem**
|
||||
|
||||
@@ -23,7 +106,88 @@ Your base64'ed PAT token has a new line at the end, it needs to be created witho
|
||||
* `echo -n $TOKEN | base64`
|
||||
* Create the secret as described in the docs using the shell and documented flags (for example, as shown below)
|
||||
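For example, you can avoid the trailing-newline problem entirely by letting `kubectl` handle the encoding. This is a sketch assuming the default ARC namespace and the token-based auth secret; adjust the names to your setup:

```bash
# Creates the auth secret without any base64 round-trip of your own.
kubectl create secret generic controller-manager \
  -n actions-runner-system \
  --from-literal=github_token=${GITHUB_TOKEN}
```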
|
||||
## Runner coming up before network available
|
||||
### Helm chart install failure: certificate signed by unknown authority
|
||||
|
||||
**Problem**
|
||||
|
||||
```
|
||||
Error: UPGRADE FAILED: failed to create resource: Internal error occurred: failed calling webhook "webhook.cert-manager.io": failed to call webhook: Post "https://cert-manager-webhook.cert-manager.svc:443/mutate?timeout=10s": x509: certificate signed by unknown authority
|
||||
```
|
||||
|
||||
Apparently, it's failing while `helm` is creating one of the resources defined in the ARC chart, and the cause is that cert-manager's webhook is not working correctly due to a missing or invalid CA certificate.
|
||||
|
||||
If you tail the logs of `cert-manager-cainjector`, you'll see it failing with an error like:
|
||||
|
||||
```
|
||||
$ kubectl -n cert-manager logs cert-manager-cainjector-7cdbb9c945-g6bt4
|
||||
I0703 03:31:55.159339 1 start.go:91] "starting" version="v1.1.1" revision="3ac7418070e22c87fae4b22603a6b952f797ae96"
|
||||
I0703 03:31:55.615061 1 leaderelection.go:243] attempting to acquire leader lease kube-system/cert-manager-cainjector-leader-election...
|
||||
I0703 03:32:10.738039 1 leaderelection.go:253] successfully acquired lease kube-system/cert-manager-cainjector-leader-election
|
||||
I0703 03:32:10.739941 1 recorder.go:52] cert-manager/controller-runtime/manager/events "msg"="Normal" "message"="cert-manager-cainjector-7cdbb9c945-g6bt4_88e4bc70-eded-4343-a6fb-0ddd6434eb55 became leader" "object"={"kind":"ConfigMap","namespace":"kube-system","name":"cert-manager-cainjector-leader-election","uid":"942a021e-364c-461a-978c-f54a95723cdc","apiVersion":"v1","resourceVersion":"1576"} "reason"="LeaderElection"
|
||||
E0703 03:32:11.192128 1 start.go:119] cert-manager/ca-injector "msg"="manager goroutine exited" "error"=null
|
||||
I0703 03:32:12.339197 1 request.go:645] Throttling request took 1.047437675s, request: GET:https://10.96.0.1:443/apis/storage.k8s.io/v1beta1?timeout=32s
|
||||
E0703 03:32:13.143790 1 start.go:151] cert-manager/ca-injector "msg"="Error registering certificate based controllers. Retrying after 5 seconds." "error"="no matches for kind \"MutatingWebhookConfiguration\" in version \"admissionregistration.k8s.io/v1beta1\""
|
||||
Error: error registering secret controller: no matches for kind "MutatingWebhookConfiguration" in version "admissionregistration.k8s.io/v1beta1"
|
||||
```
|
||||
|
||||
**Solution**
|
||||
|
||||
Your cluster is running Kubernetes 1.22 or greater, which no longer serves the legacy `admissionregistration.k8s.io/v1beta1` API, and your `cert-manager` is not up-to-date, so it is still trying to use that legacy API.
|
||||
|
||||
In many cases, downgrading Kubernetes is not an option. So, just upgrade `cert-manager` to a more recent version that supports the specific Kubernetes version you're using.
|
||||
|
||||
See https://cert-manager.io/docs/installation/supported-releases/ for the list of available cert-manager versions.
|
||||
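As one possible upgrade path, assuming you originally installed cert-manager with Helm from the `jetstack` repo (a sketch; pick a version from the supported-releases page above that matches your Kubernetes version, `v1.8.2` is only an example):

```bash
# Upgrade cert-manager in place; installCRDs=true also updates its CRDs.
helm repo update
helm upgrade --install cert-manager jetstack/cert-manager \
  --namespace cert-manager \
  --version v1.8.2 \
  --set installCRDs=true
```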
|
||||
## Operations
|
||||
|
||||
Troubleshooting runbooks that relate to ARC operational problems.
|
||||
|
||||
### Stuck runner kind or backing pod
|
||||
|
||||
**Problem**
|
||||
|
||||
Sometimes either the runner kind (`kubectl get runners`) or its underlying pod can get stuck in a terminating state for various reasons. You can get the kind unstuck by removing its finalizer, using something like this:
|
||||
|
||||
**Solution**
|
||||
|
||||
Remove the finalizer from the relevant runner kind or pod:
|
||||
|
||||
```
|
||||
# Get all kind runners and remove the finalizer
|
||||
$ kubectl get runners --no-headers | awk {'print $1'} | xargs kubectl patch runner --type merge -p '{"metadata":{"finalizers":null}}'
|
||||
|
||||
# Get all pods that are stuck terminating and remove the finalizer
|
||||
$ kubectl get pods | grep Terminating | awk {'print $1'} | xargs kubectl patch pod -p '{"metadata":{"finalizers":null}}'
|
||||
```
|
||||
|
||||
_Note the code assumes you have already selected the namespace your runners are in and that they
|
||||
are in a namespace not shared with anything else_
|
||||
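If your runners do share a namespace with other workloads, it may be safer to patch one resource at a time instead of using the bulk commands above. The names below are placeholders:

```bash
# Remove the finalizer from a single stuck runner and its pod.
kubectl patch runner <runner-name> --type merge -p '{"metadata":{"finalizers":null}}'
kubectl patch pod <pod-name> -p '{"metadata":{"finalizers":null}}'
```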
|
||||
### Delay in jobs being allocated to runners
|
||||
|
||||
**Problem**
|
||||
|
||||
ARC isn't involved in actually allocating jobs to a runner; ARC is responsible for orchestrating runners and the runner lifecycle. Why some people see large delays in job allocation is not clear, however it has been [reported](https://github.com/actions-runner-controller/actions-runner-controller/issues/1387#issuecomment-1122593984) that this is somehow caused by the self-update process.
|
||||
|
||||
**Solution**
|
||||
|
||||
Disable the self-update process in your runner manifests
|
||||
|
||||
```yaml
|
||||
apiVersion: actions.summerwind.dev/v1alpha1
|
||||
kind: RunnerDeployment
|
||||
metadata:
|
||||
name: example-runnerdeployment-with-sleep
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
...
|
||||
env:
|
||||
- name: DISABLE_RUNNER_UPDATE
|
||||
value: "true"
|
||||
```
|
||||
|
||||
### Runner coming up before network available
|
||||
|
||||
**Problem**
|
||||
|
||||
@@ -61,40 +225,48 @@ metadata:
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
...
|
||||
env:
|
||||
# This runner's entrypoint script will have a 5 seconds delay
|
||||
# as a first action within the entrypoint script
|
||||
- name: STARTUP_DELAY_IN_SECONDS
|
||||
value: "5"
|
||||
```
|
||||
|
||||
## Deployment fails on GKE due to webhooks
|
||||
## Outgoing network action hangs indefinitely
|
||||
|
||||
**Problem**
|
||||
|
||||
Due to GKEs firewall settings you may run into the following errors when trying to deploy runners on a private GKE cluster:
|
||||
Some outgoing network actions hang indefinitely. This can happen when your cluster does not give Docker the standard MTU of 1500. You can check this by running `ip link` in a pod that encounters the problem and reading the outgoing interface's MTU value (see the sketch below). If it is smaller than 1500, then try the following.
|
||||
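A minimal way to do that check, assuming the pod image ships `iproute2` and that `<affected-pod>` is a pod showing the problem:

```bash
# Look at the "mtu" value of the pod's default interface; anything below 1500
# suggests lowering dockerMTU to match it.
kubectl exec -it <affected-pod> -- ip link show eth0
```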
|
||||
```
|
||||
Internal error occurred: failed calling webhook "mutate.runner.actions.summerwind.dev":
|
||||
Post https://webhook-service.actions-runner-system.svc:443/mutate-actions-summerwind-dev-v1alpha1-runner?timeout=10s:
|
||||
context deadline exceeded
|
||||
**Solution**
|
||||
|
||||
Add a `dockerMTU` key in your runner's spec with the value you read on the outgoing interface. For instance:
|
||||
|
||||
```yaml
|
||||
apiVersion: actions.summerwind.dev/v1alpha1
|
||||
kind: RunnerDeployment
|
||||
metadata:
|
||||
name: github-runner
|
||||
namespace: github-system
|
||||
spec:
|
||||
replicas: 6
|
||||
template:
|
||||
spec:
|
||||
dockerMTU: 1400
|
||||
repository: $username/$repo
|
||||
env: []
|
||||
```
|
||||
|
||||
**Solution**<br />
|
||||
There may be more places you need to tweak for MTU.
|
||||
Please consult issues like #651 for more information.
|
||||
|
||||
To fix this, you need to set up a firewall rule to allow the master node to connect to the webhook port.
|
||||
The exact way to do this may vary, but the following script should point you in the right direction:
|
||||
## Unable to scale to zero with TotalNumberOfQueuedAndInProgressWorkflowRuns
|
||||
|
||||
```
|
||||
# 1) Retrieve the network tag automatically given to the worker nodes
|
||||
# NOTE: this only works if you have only one cluster in your GCP project. You will have to manually inspect the result of this command to find the tag for the cluster you want to target
|
||||
WORKER_NODES_TAG=$(gcloud compute instances list --format='text(tags.items[0])' --filter='metadata.kubelet-config:*' | grep tags | awk '{print $2}' | sort | uniq)
|
||||
**Problem**
|
||||
|
||||
# 2) Take note of the VPC network in which you deployed your cluster
|
||||
# NOTE this only works if you have only one network in which you deploy your clusters
|
||||
NETWORK=$(gcloud compute instances list --format='text(networkInterfaces[0].network)' --filter='metadata.kubelet-config:*' | grep networks | awk -F'/' '{print $NF}' | sort | uniq)
|
||||
HRA doesn't scale the RunnerDeployment to zero, even though you configured HRA correctly with the pull-based scaling metric `TotalNumberOfQueuedAndInProgressWorkflowRuns` and set `minReplicas: 0`.
|
||||
|
||||
# 3) Get the master source ip block
|
||||
SOURCE=$(gcloud container clusters describe <cluster-name> --region <region> | grep masterIpv4CidrBlock| cut -d ':' -f 2 | tr -d ' ')
|
||||
gcloud compute firewall-rules create k8s-cert-manager --source-ranges $SOURCE --target-tags $WORKER_NODES_TAG --allow TCP:9443 --network $NETWORK
|
||||
```
|
||||
**Solution**
|
||||
|
||||
You very likely have some dangling workflow jobs stuck in `queued` or `in_progress` as seen in [#1057](https://github.com/actions-runner-controller/actions-runner-controller/issues/1057#issuecomment-1133439061).
|
||||
|
||||
Manually call [the "list workflow runs" API](https://docs.github.com/en/rest/actions/workflow-runs#list-workflow-runs-for-a-repository), and [remove the dangling workflow job(s)](https://docs.github.com/en/rest/actions/workflow-runs#delete-a-workflow-run).
|
||||
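For example, with the `gh` CLI against the REST endpoints linked above (a sketch; `OWNER/REPO` and `RUN_ID` are placeholders, and deleting a run is irreversible):

```bash
# List runs that are still queued or in progress.
gh api "repos/OWNER/REPO/actions/runs?status=queued" --jq '.workflow_runs[] | {id, name, status}'
gh api "repos/OWNER/REPO/actions/runs?status=in_progress" --jq '.workflow_runs[] | {id, name, status}'

# Delete a dangling run by its id.
gh api -X DELETE "repos/OWNER/REPO/actions/runs/RUN_ID"
```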
|
||||
97
acceptance/argotunnel.sh
Executable file
@@ -0,0 +1,97 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# See https://developers.cloudflare.com/cloudflare-one/tutorials/many-cfd-one-tunnel/
|
||||
|
||||
kubectl create ns tunnel || :
|
||||
|
||||
kubectl -n tunnel delete secret tunnel-credentials || :
|
||||
|
||||
kubectl -n tunnel create secret generic tunnel-credentials \
|
||||
--from-file=credentials.json=$HOME/.cloudflared/${TUNNEL_ID}.json || :
|
||||
|
||||
cat <<MANIFEST | kubectl -n tunnel ${OP} -f -
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: cloudflared
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app: cloudflared
|
||||
replicas: 2 # You could also consider elastic scaling for this deployment
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: cloudflared
|
||||
spec:
|
||||
containers:
|
||||
- name: cloudflared
|
||||
image: cloudflare/cloudflared:latest
|
||||
args:
|
||||
- tunnel
|
||||
# Points cloudflared to the config file, which configures what
|
||||
# cloudflared will actually do. This file is created by a ConfigMap
|
||||
# below.
|
||||
- --config
|
||||
- /etc/cloudflared/config/config.yaml
|
||||
- run
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
# Cloudflared has a /ready endpoint which returns 200 if and only if
|
||||
# it has an active connection to the edge.
|
||||
path: /ready
|
||||
port: 2000
|
||||
failureThreshold: 1
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/cloudflared/config
|
||||
readOnly: true
|
||||
# Each tunnel has an associated "credentials file" which authorizes machines
|
||||
# to run the tunnel. cloudflared will read this file from its local filesystem,
|
||||
# and it'll be stored in a k8s secret.
|
||||
- name: creds
|
||||
mountPath: /etc/cloudflared/creds
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: creds
|
||||
secret:
|
||||
secretName: tunnel-credentials
|
||||
# Create a config.yaml file from the ConfigMap below.
|
||||
- name: config
|
||||
configMap:
|
||||
name: cloudflared
|
||||
items:
|
||||
- key: config.yaml
|
||||
path: config.yaml
|
||||
---
|
||||
# This ConfigMap is just a way to define the cloudflared config.yaml file in k8s.
|
||||
# It's useful to define it in k8s, rather than as a stand-alone .yaml file, because
|
||||
# this lets you use various k8s templating solutions (e.g. Helm charts) to
|
||||
# parameterize your config, instead of just using string literals.
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: cloudflared
|
||||
data:
|
||||
config.yaml: |
|
||||
# Name of the tunnel you want to run
|
||||
tunnel: ${TUNNEL_NAME}
|
||||
credentials-file: /etc/cloudflared/creds/credentials.json
|
||||
# Serves the metrics server under /metrics and the readiness server under /ready
|
||||
metrics: 0.0.0.0:2000
|
||||
# Autoupdates applied in a k8s pod will be lost when the pod is removed or restarted, so
|
||||
# autoupdate doesn't make sense in Kubernetes. However, outside of Kubernetes, we strongly
|
||||
# recommend using autoupdate.
|
||||
no-autoupdate: true
|
||||
ingress:
|
||||
# The first rule proxies traffic to the httpbin sample Service defined in app.yaml
|
||||
- hostname: ${TUNNEL_HOSTNAME}
|
||||
service: http://actions-runner-controller-github-webhook-server.actions-runner-system:80
|
||||
# This rule matches any traffic which didn't match a previous rule, and responds with HTTP 404.
|
||||
- service: http_status:404
|
||||
MANIFEST
|
||||
|
||||
kubectl -n tunnel delete po -l app=cloudflared || :
|
||||
@@ -6,7 +6,7 @@ tpe=${ACCEPTANCE_TEST_SECRET_TYPE}
|
||||
|
||||
VALUES_FILE=${VALUES_FILE:-$(dirname $0)/values.yaml}
|
||||
|
||||
kubectl delete secret controller-manager || :
|
||||
kubectl delete secret -n actions-runner-system controller-manager || :
|
||||
|
||||
if [ "${tpe}" == "token" ]; then
|
||||
if ! kubectl get secret controller-manager -n actions-runner-system >/dev/null; then
|
||||
@@ -18,8 +18,8 @@ elif [ "${tpe}" == "app" ]; then
|
||||
kubectl create secret generic controller-manager \
|
||||
-n actions-runner-system \
|
||||
--from-literal=github_app_id=${APP_ID:?must not be empty} \
|
||||
--from-literal=github_app_installation_id=${INSTALLATION_ID:?must not be empty} \
|
||||
--from-file=github_app_private_key=${PRIVATE_KEY_FILE_PATH:?must not be empty}
|
||||
--from-literal=github_app_installation_id=${APP_INSTALLATION_ID:?must not be empty} \
|
||||
--from-file=github_app_private_key=${APP_PRIVATE_KEY_FILE:?must not be empty}
|
||||
else
|
||||
echo "ACCEPTANCE_TEST_SECRET_TYPE must be set to either \"token\" or \"app\"" 1>&2
|
||||
exit 1
|
||||
@@ -37,7 +37,10 @@ fi
|
||||
|
||||
tool=${ACCEPTANCE_TEST_DEPLOYMENT_TOOL}
|
||||
|
||||
TEST_ID=${TEST_ID:-default}
|
||||
|
||||
if [ "${tool}" == "helm" ]; then
|
||||
set -v
|
||||
helm upgrade --install actions-runner-controller \
|
||||
charts/actions-runner-controller \
|
||||
-n actions-runner-system \
|
||||
@@ -46,56 +49,33 @@ if [ "${tool}" == "helm" ]; then
|
||||
--set authSecret.create=false \
|
||||
--set image.repository=${NAME} \
|
||||
--set image.tag=${VERSION} \
|
||||
--set podAnnotations.test-id=${TEST_ID} \
|
||||
--set githubWebhookServer.podAnnotations.test-id=${TEST_ID} \
|
||||
--set imagePullSecrets[0].name=${IMAGE_PULL_SECRET} \
|
||||
--set image.actionsRunnerImagePullSecrets[0].name=${IMAGE_PULL_SECRET} \
|
||||
--set githubWebhookServer.imagePullSecrets[0].name=${IMAGE_PULL_SECRET} \
|
||||
-f ${VALUES_FILE}
|
||||
set +v
|
||||
# To prevent `CustomResourceDefinition.apiextensions.k8s.io "runners.actions.summerwind.dev" is invalid: metadata.annotations: Too long: must have at most 262144 bytes`
|
||||
# errors
|
||||
kubectl create -f charts/actions-runner-controller/crds || kubectl replace -f charts/actions-runner-controller/crds
|
||||
kubectl -n actions-runner-system wait deploy/actions-runner-controller --for condition=available --timeout 60s
|
||||
# This wait fails due to timeout when it's already in crashloopback and this update doesn't change the image tag.
|
||||
# That's why we add `|| :`. With that we prevent stopping the script in case of timeout and
|
||||
# proceed to delete (possibly in crashloopback and/or running with outdated image) pods so that they are recreated by K8s.
|
||||
kubectl -n actions-runner-system wait deploy/actions-runner-controller --for condition=available --timeout 60s || :
|
||||
else
|
||||
kubectl apply \
|
||||
-n actions-runner-system \
|
||||
-f release/actions-runner-controller.yaml
|
||||
kubectl -n actions-runner-system wait deploy/controller-manager --for condition=available --timeout 120s
|
||||
kubectl -n actions-runner-system wait deploy/controller-manager --for condition=available --timeout 120s || :
|
||||
fi
|
||||
|
||||
# Restart all ARC pods
|
||||
kubectl -n actions-runner-system delete po -l app.kubernetes.io/name=actions-runner-controller
|
||||
|
||||
echo Waiting for all ARC pods to be up and running after restart
|
||||
|
||||
kubectl -n actions-runner-system wait deploy/actions-runner-controller --for condition=available --timeout 120s
|
||||
|
||||
# Adhocly wait for some time until actions-runner-controller's admission webhook gets ready
|
||||
sleep 20
|
||||
|
||||
RUNNER_LABEL=${RUNNER_LABEL:-self-hosted}
|
||||
|
||||
if [ -n "${TEST_REPO}" ]; then
|
||||
if [ "${USE_RUNNERSET}" -ne "false" ]; then
|
||||
cat acceptance/testdata/repo.runnerset.yaml | envsubst | kubectl apply -f -
|
||||
cat acceptance/testdata/repo.runnerset.hra.yaml | envsubst | kubectl apply -f -
|
||||
else
|
||||
echo 'Deploying runnerdeployment and hra. Set USE_RUNNERSET if you want to deploy runnerset instead.'
|
||||
cat acceptance/testdata/repo.runnerdeploy.yaml | envsubst | kubectl apply -f -
|
||||
cat acceptance/testdata/repo.hra.yaml | envsubst | kubectl apply -f -
|
||||
fi
|
||||
else
|
||||
echo 'Skipped deploying runnerdeployment and hra. Set TEST_REPO to "yourorg/yourrepo" to deploy.'
|
||||
fi
|
||||
|
||||
if [ -n "${TEST_ORG}" ]; then
|
||||
cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ENTERPRISE= TEST_REPO= NAME=org-runnerdeploy envsubst | kubectl apply -f -
|
||||
|
||||
if [ -n "${TEST_ORG_GROUP}" ]; then
|
||||
cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ENTERPRISE= TEST_REPO= TEST_GROUP=${TEST_ORG_GROUP} NAME=orggroup-runnerdeploy envsubst | kubectl apply -f -
|
||||
else
|
||||
echo 'Skipped deploying enterprise runnerdeployment. Set TEST_ORG_GROUP to deploy.'
|
||||
fi
|
||||
else
|
||||
echo 'Skipped deploying organizational runnerdeployment. Set TEST_ORG to deploy.'
|
||||
fi
|
||||
|
||||
if [ -n "${TEST_ENTERPRISE}" ]; then
|
||||
cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ORG= TEST_REPO= NAME=enterprise-runnerdeploy envsubst | kubectl apply -f -
|
||||
|
||||
if [ -n "${TEST_ENTERPRISE_GROUP}" ]; then
|
||||
cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ORG= TEST_REPO= TEST_GROUP=${TEST_ENTERPRISE_GROUP} NAME=enterprisegroup-runnerdeploy envsubst | kubectl apply -f -
|
||||
else
|
||||
echo 'Skipped deploying enterprise runnerdeployment. Set TEST_ENTERPRISE_GROUP to deploy.'
|
||||
fi
|
||||
else
|
||||
echo 'Skipped deploying enterprise runnerdeployment. Set TEST_ENTERPRISE to deploy.'
|
||||
fi
|
||||
|
||||
58
acceptance/deploy_runners.sh
Executable file
@@ -0,0 +1,58 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
OP=${OP:-apply}
|
||||
|
||||
RUNNER_LABEL=${RUNNER_LABEL:-self-hosted}
|
||||
|
||||
if [ -n "${TEST_REPO}" ]; then
|
||||
if [ "${USE_RUNNERSET}" != "false" ]; then
|
||||
cat acceptance/testdata/runnerset.envsubst.yaml | TEST_ENTERPRISE= TEST_ORG= RUNNER_MIN_REPLICAS=${REPO_RUNNER_MIN_REPLICAS} NAME=repo-runnerset envsubst | kubectl ${OP} -f -
|
||||
else
|
||||
echo "Running ${OP} runnerdeployment and hra. Set USE_RUNNERSET if you want to deploy runnerset instead."
|
||||
cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ENTERPRISE= TEST_ORG= RUNNER_MIN_REPLICAS=${REPO_RUNNER_MIN_REPLICAS} NAME=repo-runnerdeploy envsubst | kubectl ${OP} -f -
|
||||
fi
|
||||
else
|
||||
echo "Skipped ${OP} for runnerdeployment and hra. Set TEST_REPO to "yourorg/yourrepo" to deploy."
|
||||
fi
|
||||
|
||||
if [ -n "${TEST_ORG}" ]; then
|
||||
if [ "${USE_RUNNERSET}" != "false" ]; then
|
||||
cat acceptance/testdata/runnerset.envsubst.yaml | TEST_ENTERPRISE= TEST_REPO= RUNNER_MIN_REPLICAS=${ORG_RUNNER_MIN_REPLICAS} NAME=org-runnerset envsubst | kubectl ${OP} -f -
|
||||
else
|
||||
cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ENTERPRISE= TEST_REPO= RUNNER_MIN_REPLICAS=${ORG_RUNNER_MIN_REPLICAS} NAME=org-runnerdeploy envsubst | kubectl ${OP} -f -
|
||||
fi
|
||||
|
||||
if [ -n "${TEST_ORG_GROUP}" ]; then
|
||||
if [ "${USE_RUNNERSET}" != "false" ]; then
|
||||
cat acceptance/testdata/runnerset.envsubst.yaml | TEST_ENTERPRISE= TEST_REPO= RUNNER_MIN_REPLICAS=${ORG_RUNNER_MIN_REPLICAS} TEST_GROUP=${TEST_ORG_GROUP} NAME=orggroup-runnerset envsubst | kubectl ${OP} -f -
|
||||
else
|
||||
cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ENTERPRISE= TEST_REPO= RUNNER_MIN_REPLICAS=${ORG_RUNNER_MIN_REPLICAS} TEST_GROUP=${TEST_ORG_GROUP} NAME=orggroup-runnerdeploy envsubst | kubectl ${OP} -f -
|
||||
fi
|
||||
else
|
||||
echo "Skipped ${OP} on enterprise runnerdeployment. Set TEST_ORG_GROUP to ${OP}."
|
||||
fi
|
||||
else
|
||||
echo "Skipped ${OP} on organizational runnerdeployment. Set TEST_ORG to ${OP}."
|
||||
fi
|
||||
|
||||
if [ -n "${TEST_ENTERPRISE}" ]; then
|
||||
if [ "${USE_RUNNERSET}" != "false" ]; then
|
||||
cat acceptance/testdata/runnerset.envsubst.yaml | TEST_ORG= TEST_REPO= RUNNER_MIN_REPLICAS=${ENTERPRISE_RUNNER_MIN_REPLICAS} NAME=enterprise-runnerset envsubst | kubectl ${OP} -f -
|
||||
else
|
||||
cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ORG= TEST_REPO= RUNNER_MIN_REPLICAS=${ENTERPRISE_RUNNER_MIN_REPLICAS} NAME=enterprise-runnerdeploy envsubst | kubectl ${OP} -f -
|
||||
fi
|
||||
|
||||
if [ -n "${TEST_ENTERPRISE_GROUP}" ]; then
|
||||
if [ "${USE_RUNNERSET}" != "false" ]; then
|
||||
cat acceptance/testdata/runnerset.envsubst.yaml | TEST_ORG= TEST_REPO= RUNNER_MIN_REPLICAS=${ENTERPRISE_RUNNER_MIN_REPLICAS} TEST_GROUP=${TEST_ENTERPRISE_GROUP} NAME=enterprisegroup-runnerset envsubst | kubectl ${OP} -f -
|
||||
else
|
||||
cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ORG= TEST_REPO= RUNNER_MIN_REPLICAS=${ENTERPRISE_RUNNER_MIN_REPLICAS} TEST_GROUP=${TEST_ENTERPRISE_GROUP} NAME=enterprisegroup-runnerdeploy envsubst | kubectl ${OP} -f -
|
||||
fi
|
||||
else
|
||||
echo "Skipped ${OP} on enterprise runnerdeployment. Set TEST_ENTERPRISE_GROUP to ${OP}."
|
||||
fi
|
||||
else
|
||||
echo "Skipped ${OP} on enterprise runnerdeployment. Set TEST_ENTERPRISE to ${OP}."
|
||||
fi
|
||||
36
acceptance/testdata/org.hra.yaml
vendored
@@ -1,36 +0,0 @@
|
||||
apiVersion: actions.summerwind.dev/v1alpha1
|
||||
kind: HorizontalRunnerAutoscaler
|
||||
metadata:
|
||||
name: org
|
||||
spec:
|
||||
scaleTargetRef:
|
||||
name: org-runnerdeploy
|
||||
scaleUpTriggers:
|
||||
- githubEvent:
|
||||
checkRun:
|
||||
types: ["created"]
|
||||
status: "queued"
|
||||
amount: 1
|
||||
duration: "1m"
|
||||
scheduledOverrides:
|
||||
- startTime: "2021-05-11T16:05:00+09:00"
|
||||
endTime: "2021-05-11T16:40:00+09:00"
|
||||
minReplicas: 2
|
||||
- startTime: "2021-05-01T00:00:00+09:00"
|
||||
endTime: "2021-05-03T00:00:00+09:00"
|
||||
recurrenceRule:
|
||||
frequency: Weekly
|
||||
untilTime: "2022-05-01T00:00:00+09:00"
|
||||
minReplicas: 0
|
||||
minReplicas: 0
|
||||
maxReplicas: 5
|
||||
# Used to test that HRA is working for org runners
|
||||
metrics:
|
||||
- type: PercentageRunnersBusy
|
||||
scaleUpThreshold: '0.75'
|
||||
scaleDownThreshold: '0.3'
|
||||
scaleUpFactor: '2'
|
||||
scaleDownFactor: '0.5'
|
||||
- type: TotalNumberOfQueuedAndInProgressWorkflowRuns
|
||||
repositoryNames:
|
||||
- ${TEST_ORG_REPO}
|
||||
44
acceptance/testdata/org.runnerdeploy.yaml
vendored
@@ -1,44 +0,0 @@
|
||||
apiVersion: actions.summerwind.dev/v1alpha1
|
||||
kind: RunnerDeployment
|
||||
metadata:
|
||||
name: org-runnerdeploy
|
||||
spec:
|
||||
# replicas: 1
|
||||
template:
|
||||
spec:
|
||||
organization: ${TEST_ORG}
|
||||
|
||||
#
|
||||
# Custom runner image
|
||||
#
|
||||
image: ${RUNNER_NAME}:${RUNNER_TAG}
|
||||
imagePullPolicy: IfNotPresent
|
||||
|
||||
# Whether to pass --ephemeral (true) or --once (false, deprecated)
|
||||
env:
|
||||
- name: RUNNER_FEATURE_FLAG_EPHEMERAL
|
||||
value: "${RUNNER_FEATURE_FLAG_EPHEMERAL}"
|
||||
|
||||
#
|
||||
# dockerd within runner container
|
||||
#
|
||||
## Replace `mumoshu/actions-runner-dind:dev` with your dind image
|
||||
#dockerdWithinRunnerContainer: true
|
||||
#image: mumoshu/actions-runner-dind:dev
|
||||
|
||||
#
|
||||
# Set the MTU used by dockerd-managed network interfaces (including docker-build-ubuntu)
|
||||
#
|
||||
#dockerMTU: 1450
|
||||
|
||||
#Runner group
|
||||
# labels:
|
||||
# - "mylabel 1"
|
||||
# - "mylabel 2"
|
||||
labels:
|
||||
- "${RUNNER_LABEL}"
|
||||
|
||||
#
|
||||
# Non-standard working directory
|
||||
#
|
||||
# workDir: "/"
|
||||
25
acceptance/testdata/repo.hra.yaml
vendored
@@ -1,25 +0,0 @@
|
||||
apiVersion: actions.summerwind.dev/v1alpha1
|
||||
kind: HorizontalRunnerAutoscaler
|
||||
metadata:
|
||||
name: actions-runner-aos-autoscaler
|
||||
spec:
|
||||
scaleTargetRef:
|
||||
name: example-runnerdeploy
|
||||
scaleUpTriggers:
|
||||
- githubEvent:
|
||||
checkRun:
|
||||
types: ["created"]
|
||||
status: "queued"
|
||||
amount: 1
|
||||
duration: "1m"
|
||||
minReplicas: 0
|
||||
maxReplicas: 5
|
||||
metrics:
|
||||
- type: PercentageRunnersBusy
|
||||
scaleUpThreshold: '0.75'
|
||||
scaleDownThreshold: '0.3'
|
||||
scaleUpFactor: '2'
|
||||
scaleDownFactor: '0.5'
|
||||
- type: TotalNumberOfQueuedAndInProgressWorkflowRuns
|
||||
repositoryNames:
|
||||
- ${TEST_REPO}
|
||||
44
acceptance/testdata/repo.runnerdeploy.yaml
vendored
@@ -1,44 +0,0 @@
|
||||
apiVersion: actions.summerwind.dev/v1alpha1
|
||||
kind: RunnerDeployment
|
||||
metadata:
|
||||
name: example-runnerdeploy
|
||||
spec:
|
||||
# replicas: 1
|
||||
template:
|
||||
spec:
|
||||
repository: ${TEST_REPO}
|
||||
|
||||
#
|
||||
# Custom runner image
|
||||
#
|
||||
image: ${RUNNER_NAME}:${RUNNER_TAG}
|
||||
imagePullPolicy: IfNotPresent
|
||||
|
||||
# Whether to pass --ephemeral (true) or --once (false, deprecated)
|
||||
env:
|
||||
- name: RUNNER_FEATURE_FLAG_EPHEMERAL
|
||||
value: "${RUNNER_FEATURE_FLAG_EPHEMERAL}"
|
||||
|
||||
#
|
||||
# dockerd within runner container
|
||||
#
|
||||
## Replace `mumoshu/actions-runner-dind:dev` with your dind image
|
||||
#dockerdWithinRunnerContainer: true
|
||||
#image: mumoshu/actions-runner-dind:dev
|
||||
|
||||
#
|
||||
# Set the MTU used by dockerd-managed network interfaces (including docker-build-ubuntu)
|
||||
#
|
||||
#dockerMTU: 1450
|
||||
|
||||
#Runner group
|
||||
# labels:
|
||||
# - "mylabel 1"
|
||||
# - "mylabel 2"
|
||||
labels:
|
||||
- "${RUNNER_LABEL}"
|
||||
|
||||
#
|
||||
# Non-standard working directory
|
||||
#
|
||||
# workDir: "/"
|
||||
29
acceptance/testdata/repo.runnerset.hra.yaml
vendored
@@ -1,29 +0,0 @@
|
||||
apiVersion: actions.summerwind.dev/v1alpha1
|
||||
kind: HorizontalRunnerAutoscaler
|
||||
metadata:
|
||||
name: example-runnerset
|
||||
spec:
|
||||
scaleTargetRef:
|
||||
kind: RunnerSet
|
||||
name: example-runnerset
|
||||
scaleUpTriggers:
|
||||
- githubEvent:
|
||||
checkRun:
|
||||
types: ["created"]
|
||||
status: "queued"
|
||||
amount: 1
|
||||
duration: "1m"
|
||||
# RunnerSet doesn't support scale from/to zero yet
|
||||
minReplicas: 1
|
||||
maxReplicas: 5
|
||||
# This should be less than 600(seconds, the default) for faster testing
|
||||
scaleDownDelaySecondsAfterScaleOut: 60
|
||||
metrics:
|
||||
- type: PercentageRunnersBusy
|
||||
scaleUpThreshold: '0.75'
|
||||
scaleDownThreshold: '0.3'
|
||||
scaleUpFactor: '2'
|
||||
scaleDownFactor: '0.5'
|
||||
- type: TotalNumberOfQueuedAndInProgressWorkflowRuns
|
||||
repositoryNames:
|
||||
- ${TEST_REPO}
|
||||
59
acceptance/testdata/repo.runnerset.yaml
vendored
@@ -1,59 +0,0 @@
|
||||
apiVersion: actions.summerwind.dev/v1alpha1
|
||||
kind: RunnerSet
|
||||
metadata:
|
||||
name: example-runnerset
|
||||
spec:
|
||||
# MANDATORY because it is based on StatefulSet: Results in a below error when omitted:
|
||||
# missing required field "selector" in dev.summerwind.actions.v1alpha1.RunnerSet.spec
|
||||
selector:
|
||||
matchLabels:
|
||||
app: example-runnerset
|
||||
|
||||
# MANDATORY because it is based on StatefulSet: Results in a below error when omitted:
|
||||
# missing required field "serviceName" in dev.summerwind.actions.v1alpha1.RunnerSet.spec]
|
||||
serviceName: example-runnerset
|
||||
|
||||
#replicas: 1
|
||||
|
||||
# From my limited testing, `ephemeral: true` is more reliable.
|
||||
  # Sometimes, updating already deployed runners from `ephemeral: false` to `ephemeral: true` seems to
|
||||
# result in queued jobs hanging forever.
|
||||
ephemeral: ${TEST_EPHEMERAL}
|
||||
|
||||
repository: ${TEST_REPO}
|
||||
#
|
||||
# Custom runner image
|
||||
#
|
||||
image: ${RUNNER_NAME}:${RUNNER_TAG}
|
||||
#
|
||||
# dockerd within runner container
|
||||
#
|
||||
## Replace `mumoshu/actions-runner-dind:dev` with your dind image
|
||||
#dockerdWithinRunnerContainer: true
|
||||
#
|
||||
# Set the MTU used by dockerd-managed network interfaces (including docker-build-ubuntu)
|
||||
#
|
||||
#dockerMTU: 1450
|
||||
#Runner group
|
||||
# labels:
|
||||
# - "mylabel 1"
|
||||
# - "mylabel 2"
|
||||
labels:
|
||||
- "${RUNNER_LABEL}"
|
||||
#
|
||||
# Non-standard working directory
|
||||
#
|
||||
# workDir: "/"
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: example-runnerset
|
||||
spec:
|
||||
containers:
|
||||
- name: runner
|
||||
imagePullPolicy: IfNotPresent
|
||||
env:
|
||||
- name: RUNNER_FEATURE_FLAG_EPHEMERAL
|
||||
value: "${RUNNER_FEATURE_FLAG_EPHEMERAL}"
|
||||
#- name: docker
|
||||
# #image: mumoshu/actions-runner-dind:dev
|
||||
14
acceptance/testdata/runnerdeploy.envsubst.yaml
vendored
@@ -17,10 +17,7 @@ spec:
|
||||
image: ${RUNNER_NAME}:${RUNNER_TAG}
|
||||
imagePullPolicy: IfNotPresent
|
||||
|
||||
# Whether to pass --ephemeral (true) or --once (false, deprecated)
|
||||
env:
|
||||
- name: RUNNER_FEATURE_FLAG_EPHEMERAL
|
||||
value: "${RUNNER_FEATURE_FLAG_EPHEMERAL}"
|
||||
ephemeral: ${TEST_EPHEMERAL}
|
||||
|
||||
#
|
||||
# dockerd within runner container
|
||||
@@ -28,6 +25,7 @@ spec:
|
||||
## Replace `mumoshu/actions-runner-dind:dev` with your dind image
|
||||
#dockerdWithinRunnerContainer: true
|
||||
#image: mumoshu/actions-runner-dind:dev
|
||||
dockerdWithinRunnerContainer: ${RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER}
|
||||
|
||||
#
|
||||
# Set the MTU used by dockerd-managed network interfaces (including docker-build-ubuntu)
|
||||
@@ -54,8 +52,10 @@ spec:
|
||||
scaleTargetRef:
|
||||
name: ${NAME}
|
||||
scaleUpTriggers:
|
||||
- githubEvent: {}
|
||||
- githubEvent:
|
||||
workflowJob: {}
|
||||
amount: 1
|
||||
duration: "1m"
|
||||
minReplicas: 0
|
||||
duration: "10m"
|
||||
minReplicas: ${RUNNER_MIN_REPLICAS}
|
||||
maxReplicas: 10
|
||||
scaleDownDelaySecondsAfterScaleOut: ${RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT}
|
||||
|
||||
263
acceptance/testdata/runnerset.envsubst.yaml
vendored
Normal file
@@ -0,0 +1,263 @@
|
||||
---
|
||||
apiVersion: storage.k8s.io/v1
|
||||
kind: StorageClass
|
||||
metadata:
|
||||
name: ${NAME}-runner-work-dir
|
||||
labels:
|
||||
content: ${NAME}-runner-work-dir
|
||||
provisioner: rancher.io/local-path
|
||||
reclaimPolicy: Delete
|
||||
volumeBindingMode: WaitForFirstConsumer
|
||||
---
|
||||
apiVersion: storage.k8s.io/v1
|
||||
kind: StorageClass
|
||||
metadata:
|
||||
name: ${NAME}
|
||||
# In kind environments, the provider writes:
|
||||
# /var/lib/docker/volumes/KIND_NODE_CONTAINER_VOL_ID/_data/local-path-provisioner/PV_NAME
|
||||
# It can be hundreds of gigabytes depending on what you cache in the test workflow. Beware to not encounter `no space left on device` errors!
|
||||
# If you did encounter no-space errors, try:
|
||||
# docker system prune
|
||||
# docker buildx prune #=> frees up /var/lib/docker/volumes/buildx_buildkit_container-builder0_state
|
||||
# sudo rm -rf /var/lib/docker/volumes/KIND_NODE_CONTAINER_VOL_ID/_data/local-path-provisioner #=> frees up local-path-provisioner's data
|
||||
provisioner: rancher.io/local-path
|
||||
reclaimPolicy: Retain
|
||||
volumeBindingMode: WaitForFirstConsumer
|
||||
---
|
||||
apiVersion: storage.k8s.io/v1
|
||||
kind: StorageClass
|
||||
metadata:
|
||||
name: ${NAME}-var-lib-docker
|
||||
labels:
|
||||
content: ${NAME}-var-lib-docker
|
||||
provisioner: rancher.io/local-path
|
||||
reclaimPolicy: Retain
|
||||
volumeBindingMode: WaitForFirstConsumer
|
||||
---
|
||||
apiVersion: storage.k8s.io/v1
|
||||
kind: StorageClass
|
||||
metadata:
|
||||
name: ${NAME}-cache
|
||||
labels:
|
||||
content: ${NAME}-cache
|
||||
provisioner: rancher.io/local-path
|
||||
reclaimPolicy: Retain
|
||||
volumeBindingMode: WaitForFirstConsumer
|
||||
---
|
||||
apiVersion: storage.k8s.io/v1
|
||||
kind: StorageClass
|
||||
metadata:
|
||||
name: ${NAME}-runner-tool-cache
|
||||
labels:
|
||||
content: ${NAME}-runner-tool-cache
|
||||
provisioner: rancher.io/local-path
|
||||
reclaimPolicy: Retain
|
||||
volumeBindingMode: WaitForFirstConsumer
|
||||
---
|
||||
apiVersion: actions.summerwind.dev/v1alpha1
|
||||
kind: RunnerSet
|
||||
metadata:
|
||||
name: ${NAME}
|
||||
spec:
|
||||
# MANDATORY because it is based on StatefulSet: Results in a below error when omitted:
|
||||
# missing required field "selector" in dev.summerwind.actions.v1alpha1.RunnerSet.spec
|
||||
selector:
|
||||
matchLabels:
|
||||
app: ${NAME}
|
||||
|
||||
# MANDATORY because it is based on StatefulSet: Results in a below error when omitted:
|
||||
# missing required field "serviceName" in dev.summerwind.actions.v1alpha1.RunnerSet.spec]
|
||||
serviceName: ${NAME}
|
||||
|
||||
#replicas: 1
|
||||
|
||||
# From my limited testing, `ephemeral: true` is more reliable.
|
||||
  # Sometimes, updating already deployed runners from `ephemeral: false` to `ephemeral: true` seems to
|
||||
# result in queued jobs hanging forever.
|
||||
ephemeral: ${TEST_EPHEMERAL}
|
||||
|
||||
enterprise: ${TEST_ENTERPRISE}
|
||||
group: ${TEST_GROUP}
|
||||
organization: ${TEST_ORG}
|
||||
repository: ${TEST_REPO}
|
||||
|
||||
#
|
||||
# Custom runner image
|
||||
#
|
||||
image: ${RUNNER_NAME}:${RUNNER_TAG}
|
||||
|
||||
#
|
||||
# dockerd within runner container
|
||||
#
|
||||
## Replace `mumoshu/actions-runner-dind:dev` with your dind image
|
||||
#dockerdWithinRunnerContainer: true
|
||||
dockerdWithinRunnerContainer: ${RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER}
|
||||
|
||||
#
|
||||
# Set the MTU used by dockerd-managed network interfaces (including docker-build-ubuntu)
|
||||
#
|
||||
#dockerMTU: 1450
|
||||
#Runner group
|
||||
# labels:
|
||||
# - "mylabel 1"
|
||||
# - "mylabel 2"
|
||||
labels:
|
||||
- "${RUNNER_LABEL}"
|
||||
#
|
||||
# Non-standard working directory
|
||||
#
|
||||
# workDir: "/"
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: ${NAME}
|
||||
spec:
|
||||
containers:
|
||||
- name: runner
|
||||
imagePullPolicy: IfNotPresent
|
||||
env:
|
||||
- name: RUNNER_FEATURE_FLAG_EPHEMERAL
|
||||
value: "${RUNNER_FEATURE_FLAG_EPHEMERAL}"
|
||||
- name: GOMODCACHE
|
||||
value: "/home/runner/.cache/go-mod"
|
||||
# PV-backed runner work dir
|
||||
volumeMounts:
|
||||
# Comment out the ephemeral work volume if you're going to test the kubernetes container mode
|
||||
# The volume and mount with the same names will be created by workVolumeClaimTemplate and the kubernetes container mode support.
|
||||
# - name: work
|
||||
# mountPath: /runner/_work
|
||||
# Cache docker image layers, in case dockerdWithinRunnerContainer=true
|
||||
- name: var-lib-docker
|
||||
mountPath: /var/lib/docker
|
||||
# Cache go modules and builds
|
||||
# - name: gocache
|
||||
# # Run `goenv | grep GOCACHE` to verify the path is correct for your env
|
||||
# mountPath: /home/runner/.cache/go-build
|
||||
# - name: gomodcache
|
||||
# # Run `goenv | grep GOMODCACHE` to verify the path is correct for your env
|
||||
# # mountPath: /home/runner/go/pkg/mod
|
||||
- name: cache
|
||||
# go: could not create module cache: stat /home/runner/.cache/go-mod: permission denied
|
||||
mountPath: "/home/runner/.cache"
|
||||
- name: runner-tool-cache
|
||||
# This corresponds to our runner image's default setting of RUNNER_TOOL_CACHE=/opt/hostedtoolcache.
|
||||
#
|
||||
# In case you customize the envvar in both runner and docker containers of the runner pod spec,
|
||||
# You'd need to change this mountPath accordingly.
|
||||
#
|
||||
# The tool cache directory is defined in actions/toolkit's tool-cache module:
|
||||
# https://github.com/actions/toolkit/blob/2f164000dcd42fb08287824a3bc3030dbed33687/packages/tool-cache/src/tool-cache.ts#L621-L638
|
||||
#
|
||||
# Many setup-* actions like setup-go utilizes the tool-cache module to download and cache installed binaries:
|
||||
# https://github.com/actions/setup-go/blob/56a61c9834b4a4950dbbf4740af0b8a98c73b768/src/installer.ts#L144
|
||||
mountPath: "/opt/hostedtoolcache"
|
||||
# Valid only when dockerdWithinRunnerContainer=false
|
||||
- name: docker
|
||||
# PV-backed runner work dir
|
||||
volumeMounts:
|
||||
- name: work
|
||||
mountPath: /runner/_work
|
||||
# Cache docker image layers, in case dockerdWithinRunnerContainer=false
|
||||
- name: var-lib-docker
|
||||
mountPath: /var/lib/docker
|
||||
# image: mumoshu/actions-runner-dind:dev
|
||||
|
||||
# For buildx cache
|
||||
- name: cache
|
||||
mountPath: "/home/runner/.cache"
|
||||
# Comment out the ephemeral work volume if you're going to test the kubernetes container mode
|
||||
# volumes:
|
||||
# - name: work
|
||||
# ephemeral:
|
||||
# volumeClaimTemplate:
|
||||
# spec:
|
||||
# accessModes:
|
||||
# - ReadWriteOnce
|
||||
# storageClassName: "${NAME}-runner-work-dir"
|
||||
# resources:
|
||||
# requests:
|
||||
# storage: 10Gi
|
||||
volumeClaimTemplates:
|
||||
- metadata:
|
||||
name: vol1
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 10Mi
|
||||
storageClassName: ${NAME}
|
||||
## Dunno which provider supports auto-provisioning with selector.
|
||||
## At least the rancher local path provider stopped with:
|
||||
## waiting for a volume to be created, either by external provisioner "rancher.io/local-path" or manually created by system administrator
|
||||
# selector:
|
||||
# matchLabels:
|
||||
# runnerset-volume-id: ${NAME}-vol1
|
||||
- metadata:
|
||||
name: vol2
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 10Mi
|
||||
storageClassName: ${NAME}
|
||||
# selector:
|
||||
# matchLabels:
|
||||
# runnerset-volume-id: ${NAME}-vol2
|
||||
- metadata:
|
||||
name: var-lib-docker
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 10Mi
|
||||
storageClassName: ${NAME}-var-lib-docker
|
||||
- metadata:
|
||||
name: cache
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 10Mi
|
||||
storageClassName: ${NAME}-cache
|
||||
- metadata:
|
||||
name: runner-tool-cache
|
||||
# It turns out labels doesn't distinguish PVs across PVCs and the
|
||||
# end result is PVs are reused by wrong PVCs.
|
||||
# The correct way seems to be to differentiate storage class per pvc template.
|
||||
# labels:
|
||||
# id: runner-tool-cache
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 10Mi
|
||||
storageClassName: ${NAME}-runner-tool-cache
|
||||
---
|
||||
apiVersion: actions.summerwind.dev/v1alpha1
|
||||
kind: HorizontalRunnerAutoscaler
|
||||
metadata:
|
||||
name: ${NAME}
|
||||
spec:
|
||||
scaleTargetRef:
|
||||
kind: RunnerSet
|
||||
name: ${NAME}
|
||||
scaleUpTriggers:
|
||||
- githubEvent:
|
||||
workflowJob: {}
|
||||
amount: 1
|
||||
duration: "10m"
|
||||
minReplicas: ${RUNNER_MIN_REPLICAS}
|
||||
maxReplicas: 10
|
||||
scaleDownDelaySecondsAfterScaleOut: ${RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT}
|
||||
# Comment out the whole metrics if you'd like to solely test webhook-based scaling
|
||||
metrics:
|
||||
- type: PercentageRunnersBusy
|
||||
scaleUpThreshold: '0.75'
|
||||
scaleDownThreshold: '0.25'
|
||||
scaleUpFactor: '2'
|
||||
scaleDownFactor: '0.5'
|
||||
@@ -1,7 +1,14 @@
|
||||
# Set actions-runner-controller settings for testing
|
||||
githubAPICacheDuration: 10s
|
||||
logLevel: "-4"
|
||||
imagePullSecrets:
|
||||
- name:
|
||||
image:
|
||||
actionsRunnerImagePullSecrets:
|
||||
- name:
|
||||
githubWebhookServer:
|
||||
logLevel: debug
|
||||
imagePullSecrets:
|
||||
- name:
|
||||
logLevel: "-4"
|
||||
enabled: true
|
||||
labels: {}
|
||||
replicaCount: 1
|
||||
|
||||
@@ -72,10 +72,12 @@ type GitHubEventScaleUpTriggerSpec struct {
|
||||
CheckRun *CheckRunSpec `json:"checkRun,omitempty"`
|
||||
PullRequest *PullRequestSpec `json:"pullRequest,omitempty"`
|
||||
Push *PushSpec `json:"push,omitempty"`
|
||||
WorkflowJob *WorkflowJobSpec `json:"workflowJob,omitempty"`
|
||||
}
|
||||
|
||||
// https://docs.github.com/en/actions/reference/events-that-trigger-workflows#check_run
|
||||
type CheckRunSpec struct {
|
||||
// One of: created, rerequested, or completed
|
||||
Types []string `json:"types,omitempty"`
|
||||
Status string `json:"status,omitempty"`
|
||||
|
||||
@@ -90,6 +92,10 @@ type CheckRunSpec struct {
|
||||
Repositories []string `json:"repositories,omitempty"`
|
||||
}
|
||||
|
||||
// https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#workflow_job
|
||||
type WorkflowJobSpec struct {
|
||||
}
|
||||
|
||||
// https://docs.github.com/en/actions/reference/events-that-trigger-workflows#pull_request
|
||||
type PullRequestSpec struct {
|
||||
Types []string `json:"types,omitempty"`
|
||||
@@ -107,6 +113,9 @@ type CapacityReservation struct {
|
||||
Name string `json:"name,omitempty"`
|
||||
ExpirationTime metav1.Time `json:"expirationTime,omitempty"`
|
||||
Replicas int `json:"replicas,omitempty"`
|
||||
|
||||
// +optional
|
||||
EffectiveTime metav1.Time `json:"effectiveTime,omitempty"`
|
||||
}
|
||||
|
||||
type ScaleTargetRef struct {
|
||||
|
||||
@@ -18,8 +18,10 @@ package v1alpha1
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
"k8s.io/apimachinery/pkg/util/validation/field"
|
||||
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
@@ -71,6 +73,9 @@ type RunnerConfig struct {
|
||||
VolumeSizeLimit *resource.Quantity `json:"volumeSizeLimit,omitempty"`
|
||||
// +optional
|
||||
VolumeStorageMedium *string `json:"volumeStorageMedium,omitempty"`
|
||||
|
||||
// +optional
|
||||
ContainerMode string `json:"containerMode,omitempty"`
|
||||
}
|
||||
|
||||
// RunnerPodSpec defines the desired pod spec fields of the runner pod
|
||||
@@ -135,6 +140,9 @@ type RunnerPodSpec struct {
|
||||
// +optional
|
||||
Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
|
||||
|
||||
// +optional
|
||||
PriorityClassName string `json:"priorityClassName,omitempty"`
|
||||
|
||||
// +optional
|
||||
TerminationGracePeriodSeconds *int64 `json:"terminationGracePeriodSeconds,omitempty"`
|
||||
|
||||
@@ -145,7 +153,7 @@ type RunnerPodSpec struct {
|
||||
HostAliases []corev1.HostAlias `json:"hostAliases,omitempty"`
|
||||
|
||||
// +optional
|
||||
TopologySpreadConstraints []corev1.TopologySpreadConstraint `json:"topologySpreadConstraint,omitempty"`
|
||||
TopologySpreadConstraints []corev1.TopologySpreadConstraint `json:"topologySpreadConstraints,omitempty"`
|
||||
|
||||
// RuntimeClassName is the container runtime configuration that containers should run under.
|
||||
// More info: https://kubernetes.io/docs/concepts/containers/runtime-class
|
||||
@@ -153,11 +161,38 @@ type RunnerPodSpec struct {
|
||||
RuntimeClassName *string `json:"runtimeClassName,omitempty"`
|
||||
|
||||
// +optional
|
||||
DnsConfig []corev1.PodDNSConfig `json:"dnsConfig,omitempty"`
|
||||
DnsConfig *corev1.PodDNSConfig `json:"dnsConfig,omitempty"`
|
||||
|
||||
// +optional
|
||||
WorkVolumeClaimTemplate *WorkVolumeClaimTemplate `json:"workVolumeClaimTemplate,omitempty"`
|
||||
}
|
||||
|
||||
func (rs *RunnerSpec) Validate(rootPath *field.Path) field.ErrorList {
|
||||
var (
|
||||
errList field.ErrorList
|
||||
err error
|
||||
)
|
||||
|
||||
err = rs.validateRepository()
|
||||
if err != nil {
|
||||
errList = append(errList, field.Invalid(rootPath.Child("repository"), rs.Repository, err.Error()))
|
||||
}
|
||||
|
||||
err = rs.validateWorkVolumeClaimTemplate()
|
||||
if err != nil {
|
||||
errList = append(errList, field.Invalid(rootPath.Child("workVolumeClaimTemplate"), rs.WorkVolumeClaimTemplate, err.Error()))
|
||||
}
|
||||
|
||||
err = rs.validateIsServiceAccountNameSet()
|
||||
if err != nil {
|
||||
errList = append(errList, field.Invalid(rootPath.Child("serviceAccountName"), rs.ServiceAccountName, err.Error()))
|
||||
}
|
||||
|
||||
return errList
|
||||
}
|
||||
|
||||
// ValidateRepository validates repository field.
|
||||
func (rs *RunnerSpec) ValidateRepository() error {
|
||||
func (rs *RunnerSpec) validateRepository() error {
|
||||
	// Enterprise, Organization and Repository are mutually exclusive.
|
||||
foundCount := 0
|
||||
if len(rs.Organization) > 0 {
|
||||
@@ -179,8 +214,34 @@ func (rs *RunnerSpec) ValidateRepository() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (rs *RunnerSpec) validateWorkVolumeClaimTemplate() error {
|
||||
if rs.ContainerMode != "kubernetes" {
|
||||
return nil
|
||||
}
|
||||
|
||||
if rs.WorkVolumeClaimTemplate == nil {
|
||||
return errors.New("Spec.ContainerMode: kubernetes must have workVolumeClaimTemplate field specified")
|
||||
}
|
||||
|
||||
return rs.WorkVolumeClaimTemplate.validate()
|
||||
}
|
||||
|
||||
func (rs *RunnerSpec) validateIsServiceAccountNameSet() error {
|
||||
if rs.ContainerMode != "kubernetes" {
|
||||
return nil
|
||||
}
|
||||
|
||||
if rs.ServiceAccountName == "" {
|
||||
return errors.New("service account name is required if container mode is kubernetes")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// RunnerStatus defines the observed state of Runner
|
||||
type RunnerStatus struct {
|
||||
// Turns true only if the runner pod is ready.
|
||||
// +optional
|
||||
Ready bool `json:"ready"`
|
||||
// +optional
|
||||
Registration RunnerStatusRegistration `json:"registration"`
|
||||
// +optional
|
||||
@@ -204,6 +265,51 @@ type RunnerStatusRegistration struct {
|
||||
ExpiresAt metav1.Time `json:"expiresAt"`
|
||||
}
|
||||
|
||||
type WorkVolumeClaimTemplate struct {
|
||||
StorageClassName string `json:"storageClassName"`
|
||||
AccessModes []corev1.PersistentVolumeAccessMode `json:"accessModes"`
|
||||
Resources corev1.ResourceRequirements `json:"resources"`
|
||||
}
|
||||
|
||||
func (w *WorkVolumeClaimTemplate) validate() error {
|
||||
if w.AccessModes == nil || len(w.AccessModes) == 0 {
|
||||
return errors.New("Access mode should have at least one mode specified")
|
||||
}
|
||||
|
||||
for _, accessMode := range w.AccessModes {
|
||||
switch accessMode {
|
||||
case corev1.ReadWriteOnce, corev1.ReadWriteMany:
|
||||
default:
|
||||
return fmt.Errorf("Access mode %v is not supported", accessMode)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (w *WorkVolumeClaimTemplate) V1Volume() corev1.Volume {
|
||||
return corev1.Volume{
|
||||
Name: "work",
|
||||
VolumeSource: corev1.VolumeSource{
|
||||
Ephemeral: &corev1.EphemeralVolumeSource{
|
||||
VolumeClaimTemplate: &corev1.PersistentVolumeClaimTemplate{
|
||||
Spec: corev1.PersistentVolumeClaimSpec{
|
||||
AccessModes: w.AccessModes,
|
||||
StorageClassName: &w.StorageClassName,
|
||||
Resources: w.Resources,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (w *WorkVolumeClaimTemplate) V1VolumeMount(mountPath string) corev1.VolumeMount {
|
||||
return corev1.VolumeMount{
|
||||
MountPath: mountPath,
|
||||
Name: "work",
|
||||
}
|
||||
}
|
||||
|
||||
// +kubebuilder:object:root=true
|
||||
// +kubebuilder:subresource:status
|
||||
// +kubebuilder:printcolumn:JSONPath=".spec.enterprise",name=Enterprise,type=string
|
||||
|
||||
@@ -66,15 +66,7 @@ func (r *Runner) ValidateDelete() error {
|
||||
|
||||
// Validate validates resource spec.
|
||||
func (r *Runner) Validate() error {
|
||||
var (
|
||||
errList field.ErrorList
|
||||
err error
|
||||
)
|
||||
|
||||
err = r.Spec.ValidateRepository()
|
||||
if err != nil {
|
||||
errList = append(errList, field.Invalid(field.NewPath("spec", "repository"), r.Spec.Repository, err.Error()))
|
||||
}
|
||||
errList := r.Spec.Validate(field.NewPath("spec"))
|
||||
|
||||
if len(errList) > 0 {
|
||||
return apierrors.NewInvalid(r.GroupVersionKind().GroupKind(), r.Name, errList)
|
||||
|
||||
@@ -31,6 +31,14 @@ type RunnerDeploymentSpec struct {
|
||||
// +nullable
|
||||
Replicas *int `json:"replicas,omitempty"`
|
||||
|
||||
// EffectiveTime is the time the upstream controller requested to sync Replicas.
|
||||
// It is usually populated by the webhook-based autoscaler via HRA.
|
||||
// The value is inherited to RunnerRepicaSet(s) and used to prevent ephemeral runners from unnecessarily recreated.
|
||||
//
|
||||
// +optional
|
||||
// +nullable
|
||||
EffectiveTime *metav1.Time `json:"effectiveTime"`
|
||||
|
||||
// +optional
|
||||
// +nullable
|
||||
Selector *metav1.LabelSelector `json:"selector"`
|
||||
|
||||
@@ -26,7 +26,7 @@ import (
|
||||
)
|
||||
|
||||
// log is for logging in this package.
|
||||
var runenrDeploymentLog = logf.Log.WithName("runnerdeployment-resource")
|
||||
var runnerDeploymentLog = logf.Log.WithName("runnerdeployment-resource")
|
||||
|
||||
func (r *RunnerDeployment) SetupWebhookWithManager(mgr ctrl.Manager) error {
|
||||
return ctrl.NewWebhookManagedBy(mgr).
|
||||
@@ -49,13 +49,13 @@ var _ webhook.Validator = &RunnerDeployment{}
|
||||
|
||||
// ValidateCreate implements webhook.Validator so a webhook will be registered for the type
|
||||
func (r *RunnerDeployment) ValidateCreate() error {
|
||||
runenrDeploymentLog.Info("validate resource to be created", "name", r.Name)
|
||||
runnerDeploymentLog.Info("validate resource to be created", "name", r.Name)
|
||||
return r.Validate()
|
||||
}
|
||||
|
||||
// ValidateUpdate implements webhook.Validator so a webhook will be registered for the type
|
||||
func (r *RunnerDeployment) ValidateUpdate(old runtime.Object) error {
|
||||
runenrDeploymentLog.Info("validate resource to be updated", "name", r.Name)
|
||||
runnerDeploymentLog.Info("validate resource to be updated", "name", r.Name)
|
||||
return r.Validate()
|
||||
}
|
||||
|
||||
@@ -66,15 +66,7 @@ func (r *RunnerDeployment) ValidateDelete() error {
|
||||
|
||||
// Validate validates resource spec.
|
||||
func (r *RunnerDeployment) Validate() error {
|
||||
var (
|
||||
errList field.ErrorList
|
||||
err error
|
||||
)
|
||||
|
||||
err = r.Spec.Template.Spec.ValidateRepository()
|
||||
if err != nil {
|
||||
errList = append(errList, field.Invalid(field.NewPath("spec", "template", "spec", "repository"), r.Spec.Template.Spec.Repository, err.Error()))
|
||||
}
|
||||
errList := r.Spec.Template.Spec.Validate(field.NewPath("spec", "template", "spec"))
|
||||
|
||||
if len(errList) > 0 {
|
||||
return apierrors.NewInvalid(r.GroupVersionKind().GroupKind(), r.Name, errList)
|
||||
|
||||
@@ -26,6 +26,15 @@ type RunnerReplicaSetSpec struct {
|
||||
// +nullable
|
||||
Replicas *int `json:"replicas,omitempty"`
|
||||
|
||||
// EffectiveTime is the time the upstream controller requested to sync Replicas.
|
||||
// It is usually populated by the webhook-based autoscaler via HRA and RunnerDeployment.
|
||||
// The value is used to prevent runnerreplicaset controller from unnecessarily recreating ephemeral runners
|
||||
// based on potentially outdated Replicas value.
|
||||
//
|
||||
// +optional
|
||||
// +nullable
|
||||
EffectiveTime *metav1.Time `json:"effectiveTime"`
|
||||
|
||||
// +optional
|
||||
// +nullable
|
||||
Selector *metav1.LabelSelector `json:"selector"`
|
||||
|
||||
@@ -66,15 +66,7 @@ func (r *RunnerReplicaSet) ValidateDelete() error {
|
||||
|
||||
// Validate validates resource spec.
|
||||
func (r *RunnerReplicaSet) Validate() error {
|
||||
var (
|
||||
errList field.ErrorList
|
||||
err error
|
||||
)
|
||||
|
||||
err = r.Spec.Template.Spec.ValidateRepository()
|
||||
if err != nil {
|
||||
errList = append(errList, field.Invalid(field.NewPath("spec", "template", "spec", "repository"), r.Spec.Template.Spec.Repository, err.Error()))
|
||||
}
|
||||
errList := r.Spec.Template.Spec.Validate(field.NewPath("spec", "template", "spec"))
|
||||
|
||||
if len(errList) > 0 {
|
||||
return apierrors.NewInvalid(r.GroupVersionKind().GroupKind(), r.Name, errList)
|
||||
|
||||
@@ -25,6 +25,20 @@ import (
|
||||
type RunnerSetSpec struct {
|
||||
RunnerConfig `json:",inline"`
|
||||
|
||||
// EffectiveTime is the time the upstream controller requested to sync Replicas.
|
||||
// It is usually populated by the webhook-based autoscaler via HRA.
|
||||
// It is used to prevent ephemeral runners from unnecessarily recreated.
|
||||
//
|
||||
// +optional
|
||||
// +nullable
|
||||
EffectiveTime *metav1.Time `json:"effectiveTime,omitempty"`
|
||||
|
||||
// +optional
|
||||
ServiceAccountName string `json:"serviceAccountName,omitempty"`
|
||||
|
||||
// +optional
|
||||
WorkVolumeClaimTemplate *WorkVolumeClaimTemplate `json:"workVolumeClaimTemplate,omitempty"`
|
||||
|
||||
appsv1.StatefulSetSpec `json:",inline"`
|
||||
}
|
||||
|
||||
|
||||
@@ -47,6 +47,7 @@ func (in *CacheEntry) DeepCopy() *CacheEntry {
|
||||
func (in *CapacityReservation) DeepCopyInto(out *CapacityReservation) {
|
||||
*out = *in
|
||||
in.ExpirationTime.DeepCopyInto(&out.ExpirationTime)
|
||||
in.EffectiveTime.DeepCopyInto(&out.EffectiveTime)
|
||||
}
|
||||
|
||||
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CapacityReservation.
|
||||
@@ -107,6 +108,11 @@ func (in *GitHubEventScaleUpTriggerSpec) DeepCopyInto(out *GitHubEventScaleUpTri
|
||||
*out = new(PushSpec)
|
||||
**out = **in
|
||||
}
|
||||
if in.WorkflowJob != nil {
|
||||
in, out := &in.WorkflowJob, &out.WorkflowJob
|
||||
*out = new(WorkflowJobSpec)
|
||||
**out = **in
|
||||
}
|
||||
}
|
||||
|
||||
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GitHubEventScaleUpTriggerSpec.
|
||||
@@ -498,6 +504,10 @@ func (in *RunnerDeploymentSpec) DeepCopyInto(out *RunnerDeploymentSpec) {
|
||||
*out = new(int)
|
||||
**out = **in
|
||||
}
|
||||
if in.EffectiveTime != nil {
|
||||
in, out := &in.EffectiveTime, &out.EffectiveTime
|
||||
*out = (*in).DeepCopy()
|
||||
}
|
||||
if in.Selector != nil {
|
||||
in, out := &in.Selector, &out.Selector
|
||||
*out = new(metav1.LabelSelector)
|
||||
@@ -728,10 +738,13 @@ func (in *RunnerPodSpec) DeepCopyInto(out *RunnerPodSpec) {
|
||||
}
|
||||
if in.DnsConfig != nil {
|
||||
in, out := &in.DnsConfig, &out.DnsConfig
|
||||
*out = make([]v1.PodDNSConfig, len(*in))
|
||||
for i := range *in {
|
||||
(*in)[i].DeepCopyInto(&(*out)[i])
|
||||
}
|
||||
*out = new(v1.PodDNSConfig)
|
||||
(*in).DeepCopyInto(*out)
|
||||
}
|
||||
if in.WorkVolumeClaimTemplate != nil {
|
||||
in, out := &in.WorkVolumeClaimTemplate, &out.WorkVolumeClaimTemplate
|
||||
*out = new(WorkVolumeClaimTemplate)
|
||||
(*in).DeepCopyInto(*out)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -812,6 +825,10 @@ func (in *RunnerReplicaSetSpec) DeepCopyInto(out *RunnerReplicaSetSpec) {
|
||||
*out = new(int)
|
||||
**out = **in
|
||||
}
|
||||
if in.EffectiveTime != nil {
|
||||
in, out := &in.EffectiveTime, &out.EffectiveTime
|
||||
*out = (*in).DeepCopy()
|
||||
}
|
||||
if in.Selector != nil {
|
||||
in, out := &in.Selector, &out.Selector
|
||||
*out = new(metav1.LabelSelector)
|
||||
@@ -923,6 +940,15 @@ func (in *RunnerSetList) DeepCopyObject() runtime.Object {
|
||||
func (in *RunnerSetSpec) DeepCopyInto(out *RunnerSetSpec) {
|
||||
*out = *in
|
||||
in.RunnerConfig.DeepCopyInto(&out.RunnerConfig)
|
||||
if in.EffectiveTime != nil {
|
||||
in, out := &in.EffectiveTime, &out.EffectiveTime
|
||||
*out = (*in).DeepCopy()
|
||||
}
|
||||
if in.WorkVolumeClaimTemplate != nil {
|
||||
in, out := &in.WorkVolumeClaimTemplate, &out.WorkVolumeClaimTemplate
|
||||
*out = new(WorkVolumeClaimTemplate)
|
||||
(*in).DeepCopyInto(*out)
|
||||
}
|
||||
in.StatefulSetSpec.DeepCopyInto(&out.StatefulSetSpec)
|
||||
}
|
||||
|
||||
@@ -1109,3 +1135,39 @@ func (in *ScheduledOverride) DeepCopy() *ScheduledOverride {
|
||||
in.DeepCopyInto(out)
|
||||
return out
|
||||
}
|
||||
|
||||
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
|
||||
func (in *WorkVolumeClaimTemplate) DeepCopyInto(out *WorkVolumeClaimTemplate) {
|
||||
*out = *in
|
||||
if in.AccessModes != nil {
|
||||
in, out := &in.AccessModes, &out.AccessModes
|
||||
*out = make([]v1.PersistentVolumeAccessMode, len(*in))
|
||||
copy(*out, *in)
|
||||
}
|
||||
in.Resources.DeepCopyInto(&out.Resources)
|
||||
}
|
||||
|
||||
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkVolumeClaimTemplate.
|
||||
func (in *WorkVolumeClaimTemplate) DeepCopy() *WorkVolumeClaimTemplate {
|
||||
if in == nil {
|
||||
return nil
|
||||
}
|
||||
out := new(WorkVolumeClaimTemplate)
|
||||
in.DeepCopyInto(out)
|
||||
return out
|
||||
}
|
||||
|
||||
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
|
||||
func (in *WorkflowJobSpec) DeepCopyInto(out *WorkflowJobSpec) {
|
||||
*out = *in
|
||||
}
|
||||
|
||||
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkflowJobSpec.
|
||||
func (in *WorkflowJobSpec) DeepCopy() *WorkflowJobSpec {
|
||||
if in == nil {
|
||||
return nil
|
||||
}
|
||||
out := new(WorkflowJobSpec)
|
||||
in.DeepCopyInto(out)
|
||||
return out
|
||||
}
|
||||
|
||||
@@ -15,10 +15,10 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.15.3
version: 0.20.0

# Used as the default manager tag value when no tag property is provided in the values.yaml
appVersion: 0.20.4
appVersion: 0.25.0

home: https://github.com/actions-runner-controller/actions-runner-controller

@@ -4,18 +4,18 @@ All additional docs are kept in the `docs/` folder, this README is solely for do

## Values

**_The values are documented as of HEAD, to review the configuration options for your chart version ensure you view this file at the relevent [tag](https://github.com/actions-runner-controller/actions-runner-controller/tags)_**
**_The values are documented as of HEAD, to review the configuration options for your chart version ensure you view this file at the relevant [tag](https://github.com/actions-runner-controller/actions-runner-controller/tags)_**

> _Default values are the defaults set in the charts values.yaml, some properties have default configurations in the code for when the property is omitted or invalid_
> _Default values are the defaults set in the charts `values.yaml`, some properties have default configurations in the code for when the property is omitted or invalid_

| Key | Description | Default |
|-----|-------------|---------|
| `labels` | Set labels to apply to all resources in the chart | |
| `replicaCount` | Set the number of controller pods | 1 |
| `webhookPort` | Set the containerPort for the webhook Pod | 9443 |
| `syncPeriod` | Set the period in which the controller reconciles the desired runners count | 10m |
| `enableLeaderElection` | Enable election configuration | true |
| `leaderElectionId` | Set the election ID for the controller group | |
| `githubAPICacheDuration` | Set the cache period for API calls | |
| `githubEnterpriseServerURL` | Set the URL for a self-hosted GitHub Enterprise Server | |
| `githubURL` | Override GitHub URL to be used for GitHub API calls | |
| `githubUploadURL` | Override GitHub Upload URL to be used for GitHub API calls | |
@@ -33,6 +33,7 @@ All additional docs are kept in the `docs/` folder, this README is solely for do
| `authSecret.github_basicauth_username` | Username for GitHub basic auth to use instead of PAT or GitHub APP in case it's running behind a proxy API | |
| `authSecret.github_basicauth_password` | Password for GitHub basic auth to use instead of PAT or GitHub APP in case it's running behind a proxy API | |
| `dockerRegistryMirror` | The default Docker Registry Mirror used by runners. | |
| `hostNetwork` | The "hostNetwork" of the controller container | false |
| `image.repository` | The "repository/image" of the controller container | summerwind/actions-runner-controller |
| `image.tag` | The tag of the controller container | |
| `image.actionsRunnerRepositoryAndTag` | The "repository/image" of the actions runner container | summerwind/actions-runner:latest |
@@ -49,7 +50,7 @@ All additional docs are kept in the `docs/` folder, this README is solely for do
| `imagePullSecrets` | Specifies the secret to be used when pulling the controller pod containers | |
| `fullnameOverride` | Override the full resource names | |
| `nameOverride` | Override the resource name prefix | |
| `serviceAccont.annotations` | Set annotations to the service account | |
| `serviceAccount.annotations` | Set annotations to the service account | |
| `serviceAccount.create` | Deploy the controller pod under a service account | true |
| `podAnnotations` | Set annotations for the controller pod | |
| `podLabels` | Set labels for the controller pod | |

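Not part of the chart README itself, but as a hedged sketch of how these documented keys are typically consumed (it assumes the chart is installed from the `actions-runner-controller` Helm repository referenced later in this document; the namespace and the override values are illustrative only):

```shell
# Illustrative only: override two of the documented values at upgrade time
helm upgrade --install actions-runner-controller \
  actions-runner-controller/actions-runner-controller \
  --namespace actions-runner-system --create-namespace \
  --set syncPeriod=5m \
  --set replicaCount=2
```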
@@ -49,6 +49,9 @@ spec:
|
||||
items:
|
||||
description: CapacityReservation specifies the number of replicas temporarily added to the scale target until ExpirationTime.
|
||||
properties:
|
||||
effectiveTime:
|
||||
format: date-time
|
||||
type: string
|
||||
expirationTime:
|
||||
format: date-time
|
||||
type: string
|
||||
@@ -138,6 +141,7 @@ spec:
|
||||
status:
|
||||
type: string
|
||||
types:
|
||||
description: 'One of: created, rerequested, or completed'
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
@@ -157,6 +161,9 @@ spec:
|
||||
push:
|
||||
description: PushSpec is the condition for triggering scale-up on push event Also see https://docs.github.com/en/actions/reference/events-that-trigger-workflows#push
|
||||
type: object
|
||||
workflowJob:
|
||||
description: https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#workflow_job
|
||||
type: object
|
||||
type: object
|
||||
type: object
|
||||
type: array
|
||||
|
||||
(Four file diffs suppressed because they are too large.)
@@ -18,20 +18,23 @@ Due to the above you can't just do a `helm upgrade` to release the latest versio

## Steps

1. Upgrade CRDs
1. Upgrade CRDs. This isn't optional: the CRDs you are using must be those that correspond to the version of the controller you are installing (see the verification snippet below).

```shell
# REMEMBER TO UPDATE THE CHART_VERSION TO THE RELEVANT CHART VERSION!!!!
CHART_VERSION=0.14.0
# REMEMBER TO UPDATE THE CHART_VERSION TO THE RELEVANT CHART VERSION!!!!
CHART_VERSION=0.18.0

curl -L https://github.com/actions-runner-controller/actions-runner-controller/releases/download/actions-runner-controller-${CHART_VERSION}/actions-runner-controller-${CHART_VERSION}.tgz | tar zxv --strip 1 actions-runner-controller/crds

kubectl apply -f crds/
kubectl replace -f crds/
```
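Not an upstream step, but a quick, hedged way to sanity-check that the replace took effect; it assumes the `actions.summerwind.dev` API group and CRD names used by ARC, which appear elsewhere in this diff:

```shell
# Confirm the ARC CRDs are present after the replace
kubectl get crds | grep actions.summerwind.dev

# Spot-check that a replaced CRD now carries the new fields (e.g. effectiveTime)
kubectl get crd horizontalrunnerautoscalers.actions.summerwind.dev -o yaml | grep -n effectiveTime
```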

2. Upgrade the Helm release

```shell
# helm repo [command]
helm repo update

# helm upgrade [RELEASE] [CHART] [flags]
helm upgrade actions-runner-controller \
  actions-runner-controller/actions-runner-controller \

@@ -14,6 +14,7 @@ spec:
|
||||
metadata:
|
||||
{{- with .Values.podAnnotations }}
|
||||
annotations:
|
||||
kubectl.kubernetes.io/default-logs-container: "manager"
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
labels:
|
||||
@@ -43,7 +44,9 @@ spec:
|
||||
{{- if .Values.leaderElectionId }}
|
||||
- "--leader-election-id={{ .Values.leaderElectionId }}"
|
||||
{{- end }}
|
||||
- "--port={{ .Values.webhookPort }}"
|
||||
- "--sync-period={{ .Values.syncPeriod }}"
|
||||
- "--default-scale-down-delay={{ .Values.defaultScaleDownDelay }}"
|
||||
- "--docker-image={{ .Values.image.dindSidecarRepositoryAndTag }}"
|
||||
- "--runner-image={{ .Values.image.actionsRunnerRepositoryAndTag }}"
|
||||
{{- range .Values.image.actionsRunnerImagePullSecrets }}
|
||||
@@ -104,17 +107,16 @@ spec:
|
||||
key: github_app_private_key
|
||||
name: {{ include "actions-runner-controller.secretName" . }}
|
||||
optional: true
|
||||
{{- if .Values.authSecret.github_basicauth_username }}
|
||||
{{- if .Values.authSecret.github_basicauth_username }}
|
||||
- name: GITHUB_BASICAUTH_USERNAME
|
||||
value: {{ .Values.authSecret.github_basicauth_username }}
|
||||
{{- end }}
|
||||
{{- if .Values.authSecret.github_basicauth_password }}
|
||||
- name: GITHUB_BASICAUTH_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: github_basicauth_password
|
||||
name: {{ include "actions-runner-controller.secretName" . }}
|
||||
{{- end }}
|
||||
optional: true
|
||||
{{- end }}
|
||||
{{- range $key, $val := .Values.env }}
|
||||
- name: {{ $key }}
|
||||
@@ -124,7 +126,7 @@ spec:
|
||||
name: manager
|
||||
imagePullPolicy: {{ .Values.image.pullPolicy }}
|
||||
ports:
|
||||
- containerPort: 9443
|
||||
- containerPort: {{ .Values.webhookPort }}
|
||||
name: webhook-server
|
||||
protocol: TCP
|
||||
{{- if not .Values.metrics.proxy.enabled }}
|
||||
@@ -199,3 +201,6 @@ spec:
|
||||
topologySpreadConstraints:
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- if .Values.hostNetwork }}
|
||||
hostNetwork: {{ .Values.hostNetwork }}
|
||||
{{- end }}
|
||||
|
||||
@@ -15,6 +15,7 @@ spec:
|
||||
metadata:
|
||||
{{- with .Values.githubWebhookServer.podAnnotations }}
|
||||
annotations:
|
||||
kubectl.kubernetes.io/default-logs-container: "github-webhook-server"
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
labels:
|
||||
@@ -94,17 +95,16 @@ spec:
|
||||
key: github_app_private_key
|
||||
name: {{ include "actions-runner-controller.githubWebhookServerSecretName" . }}
|
||||
optional: true
|
||||
{{- if .Values.authSecret.github_basicauth_username }}
|
||||
{{- if .Values.authSecret.github_basicauth_username }}
|
||||
- name: GITHUB_BASICAUTH_USERNAME
|
||||
value: {{ .Values.authSecret.github_basicauth_username }}
|
||||
{{- end }}
|
||||
{{- if .Values.authSecret.github_basicauth_password }}
|
||||
- name: GITHUB_BASICAUTH_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: github_basicauth_password
|
||||
name: {{ include "actions-runner-controller.secretName" . }}
|
||||
{{- end }}
|
||||
optional: true
|
||||
{{- end }}
|
||||
{{- range $key, $val := .Values.githubWebhookServer.env }}
|
||||
- name: {{ $key }}
|
||||
|
||||
@@ -1,16 +1,17 @@
|
||||
{{- if .Values.githubWebhookServer.ingress.enabled -}}
|
||||
{{- $fullName := include "actions-runner-controller-github-webhook-server.fullname" . -}}
|
||||
{{- $svcPort := (index .Values.githubWebhookServer.service.ports 0).port -}}
|
||||
{{- if .Capabilities.APIVersions.Has "networking.k8s.io/v1" }}
|
||||
{{- if .Capabilities.APIVersions.Has "networking.k8s.io/v1/Ingress" }}
|
||||
apiVersion: networking.k8s.io/v1
|
||||
{{- else if .Capabilities.APIVersions.Has "networking.k8s.io/v1beta1" }}
|
||||
{{- else if .Capabilities.APIVersions.Has "networking.k8s.io/v1beta1/Ingress" }}
|
||||
apiVersion: networking.k8s.io/v1beta1
|
||||
{{- else if .Capabilities.APIVersions.Has "extensions/v1beta1" }}
|
||||
{{- else if .Capabilities.APIVersions.Has "extensions/v1beta1/Ingress" }}
|
||||
apiVersion: extensions/v1beta1
|
||||
{{- end }}
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: {{ $fullName }}
|
||||
namespace: {{ .Release.Namespace }}
|
||||
labels:
|
||||
{{- include "actions-runner-controller.labels" . | nindent 4 }}
|
||||
{{- with .Values.githubWebhookServer.ingress.annotations }}
|
||||
@@ -36,13 +37,16 @@ spec:
|
||||
- host: {{ .host | quote }}
|
||||
http:
|
||||
paths:
|
||||
{{- if .extraPaths }}
|
||||
{{- toYaml .extraPaths | nindent 10 }}
|
||||
{{- end }}
|
||||
{{- range .paths }}
|
||||
- path: {{ .path }}
|
||||
{{- if $.Capabilities.APIVersions.Has "networking.k8s.io/v1" }}
|
||||
{{- if $.Capabilities.APIVersions.Has "networking.k8s.io/v1/Ingress" }}
|
||||
pathType: {{ .pathType }}
|
||||
{{- end }}
|
||||
backend:
|
||||
{{- if $.Capabilities.APIVersions.Has "networking.k8s.io/v1" }}
|
||||
{{- if $.Capabilities.APIVersions.Has "networking.k8s.io/v1/Ingress" }}
|
||||
service:
|
||||
name: {{ $fullName }}
|
||||
port:
|
||||
|
||||
@@ -12,5 +12,17 @@ data:
|
||||
{{- if .Values.githubWebhookServer.secret.github_webhook_secret_token }}
|
||||
github_webhook_secret_token: {{ .Values.githubWebhookServer.secret.github_webhook_secret_token | toString | b64enc }}
|
||||
{{- end }}
|
||||
{{- if .Values.githubWebhookServer.secret.github_app_id }}
|
||||
github_app_id: {{ .Values.githubWebhookServer.secret.github_app_id | toString | b64enc }}
|
||||
{{- end }}
|
||||
{{- if .Values.githubWebhookServer.secret.github_app_installation_id }}
|
||||
github_app_installation_id: {{ .Values.githubWebhookServer.secret.github_app_installation_id | toString | b64enc }}
|
||||
{{- end }}
|
||||
{{- if .Values.githubWebhookServer.secret.github_app_private_key }}
|
||||
github_app_private_key: {{ .Values.githubWebhookServer.secret.github_app_private_key | toString | b64enc }}
|
||||
{{- end }}
|
||||
{{- if .Values.githubWebhookServer.secret.github_token }}
|
||||
github_token: {{ .Values.githubWebhookServer.secret.github_token | toString | b64enc }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
@@ -195,6 +195,28 @@ rules:
|
||||
verbs:
|
||||
- create
|
||||
- patch
|
||||
- apiGroups:
|
||||
- ""
|
||||
resources:
|
||||
- persistentvolumeclaims
|
||||
verbs:
|
||||
- delete
|
||||
- get
|
||||
- list
|
||||
- patch
|
||||
- update
|
||||
- watch
|
||||
- apiGroups:
|
||||
- ""
|
||||
resources:
|
||||
- persistentvolumes
|
||||
verbs:
|
||||
- delete
|
||||
- get
|
||||
- list
|
||||
- patch
|
||||
- update
|
||||
- watch
|
||||
- apiGroups:
|
||||
- coordination.k8s.io
|
||||
resources:
|
||||
@@ -228,3 +250,11 @@ rules:
|
||||
- patch
|
||||
- update
|
||||
- watch
|
||||
- apiGroups:
|
||||
- ""
|
||||
resources:
|
||||
- secrets
|
||||
verbs:
|
||||
- get
|
||||
- list
|
||||
- watch
|
||||
|
||||
@@ -12,6 +12,11 @@ metadata:
|
||||
webhooks:
|
||||
- admissionReviewVersions:
|
||||
- v1beta1
|
||||
{{- if .Values.scope.singleNamespace }}
|
||||
namespaceSelector:
|
||||
matchLabels:
|
||||
name: {{ default .Release.Namespace .Values.scope.watchNamespace }}
|
||||
{{- end }}
|
||||
clientConfig:
|
||||
{{- if .Values.admissionWebHooks.caBundle }}
|
||||
caBundle: {{ quote .Values.admissionWebHooks.caBundle }}
|
||||
@@ -35,6 +40,11 @@ webhooks:
|
||||
sideEffects: None
|
||||
- admissionReviewVersions:
|
||||
- v1beta1
|
||||
{{- if .Values.scope.singleNamespace }}
|
||||
namespaceSelector:
|
||||
matchLabels:
|
||||
name: {{ default .Release.Namespace .Values.scope.watchNamespace }}
|
||||
{{- end }}
|
||||
clientConfig:
|
||||
{{- if .Values.admissionWebHooks.caBundle }}
|
||||
caBundle: {{ .Values.admissionWebHooks.caBundle }}
|
||||
@@ -58,6 +68,11 @@ webhooks:
|
||||
sideEffects: None
|
||||
- admissionReviewVersions:
|
||||
- v1beta1
|
||||
{{- if .Values.scope.singleNamespace }}
|
||||
namespaceSelector:
|
||||
matchLabels:
|
||||
name: {{ default .Release.Namespace .Values.scope.watchNamespace }}
|
||||
{{- end }}
|
||||
clientConfig:
|
||||
{{- if .Values.admissionWebHooks.caBundle }}
|
||||
caBundle: {{ .Values.admissionWebHooks.caBundle }}
|
||||
@@ -81,6 +96,11 @@ webhooks:
|
||||
sideEffects: None
|
||||
- admissionReviewVersions:
|
||||
- v1beta1
|
||||
{{- if .Values.scope.singleNamespace }}
|
||||
namespaceSelector:
|
||||
matchLabels:
|
||||
name: {{ default .Release.Namespace .Values.scope.watchNamespace }}
|
||||
{{- end }}
|
||||
clientConfig:
|
||||
{{- if .Values.admissionWebHooks.caBundle }}
|
||||
caBundle: {{ .Values.admissionWebHooks.caBundle }}
|
||||
@@ -117,6 +137,11 @@ metadata:
|
||||
webhooks:
|
||||
- admissionReviewVersions:
|
||||
- v1beta1
|
||||
{{- if .Values.scope.singleNamespace }}
|
||||
namespaceSelector:
|
||||
matchLabels:
|
||||
name: {{ default .Release.Namespace .Values.scope.watchNamespace }}
|
||||
{{- end }}
|
||||
clientConfig:
|
||||
{{- if .Values.admissionWebHooks.caBundle }}
|
||||
caBundle: {{ .Values.admissionWebHooks.caBundle }}
|
||||
@@ -140,6 +165,11 @@ webhooks:
|
||||
sideEffects: None
|
||||
- admissionReviewVersions:
|
||||
- v1beta1
|
||||
{{- if .Values.scope.singleNamespace }}
|
||||
namespaceSelector:
|
||||
matchLabels:
|
||||
name: {{ default .Release.Namespace .Values.scope.watchNamespace }}
|
||||
{{- end }}
|
||||
clientConfig:
|
||||
{{- if .Values.admissionWebHooks.caBundle }}
|
||||
caBundle: {{ .Values.admissionWebHooks.caBundle }}
|
||||
@@ -163,6 +193,11 @@ webhooks:
|
||||
sideEffects: None
|
||||
- admissionReviewVersions:
|
||||
- v1beta1
|
||||
{{- if .Values.scope.singleNamespace }}
|
||||
namespaceSelector:
|
||||
matchLabels:
|
||||
name: {{ default .Release.Namespace .Values.scope.watchNamespace }}
|
||||
{{- end }}
|
||||
clientConfig:
|
||||
{{- if .Values.admissionWebHooks.caBundle }}
|
||||
caBundle: {{ .Values.admissionWebHooks.caBundle }}
|
||||
|
||||
@@ -13,7 +13,7 @@ spec:
|
||||
type: {{ .Values.service.type }}
|
||||
ports:
|
||||
- port: 443
|
||||
targetPort: 9443
|
||||
targetPort: {{ .Values.webhookPort }}
|
||||
protocol: TCP
|
||||
name: https
|
||||
selector:
|
||||
|
||||
@@ -6,13 +6,16 @@ labels: {}
|
||||
|
||||
replicaCount: 1
|
||||
|
||||
syncPeriod: 10m
|
||||
webhookPort: 9443
|
||||
syncPeriod: 1m
|
||||
defaultScaleDownDelay: 10m
|
||||
|
||||
enableLeaderElection: true
|
||||
# Specifies the controller id for leader election.
|
||||
# Must be unique if more than one controller installed onto the same namespace.
|
||||
#leaderElectionId: "actions-runner-controller"
|
||||
|
||||
# DEPRECATED: This has been removed as unnecessary in #1192
|
||||
# The controller tries its best not to repeat the duplicate GitHub API call
|
||||
# within this duration.
|
||||
# Defaults to syncPeriod - 10s.
|
||||
@@ -106,7 +109,7 @@ metrics:
|
||||
enabled: true
|
||||
image:
|
||||
repository: quay.io/brancz/kube-rbac-proxy
|
||||
tag: v0.11.0
|
||||
tag: v0.13.0
|
||||
|
||||
resources:
|
||||
{}
|
||||
@@ -165,6 +168,10 @@ admissionWebHooks:
|
||||
{}
|
||||
#caBundle: "Ci0tLS0tQk...<base64-encoded PEM bundle containing the CA that signed the webhook's serving certificate>...tLS0K"
|
||||
|
||||
# There may be alternatives to setting `hostNetwork: true`, see
|
||||
# https://github.com/actions-runner-controller/actions-runner-controller/issues/1005#issuecomment-993097155
|
||||
#hostNetwork: true
|
||||
|
||||
githubWebhookServer:
|
||||
enabled: false
|
||||
replicaCount: 1
|
||||
@@ -176,6 +183,13 @@ githubWebhookServer:
|
||||
name: "github-webhook-server"
|
||||
### GitHub Webhook Configuration
|
||||
github_webhook_secret_token: ""
|
||||
### GitHub Apps Configuration
|
||||
## NOTE: IDs MUST be strings, use quotes
|
||||
#github_app_id: ""
|
||||
#github_app_installation_id: ""
|
||||
#github_app_private_key: |
|
||||
### GitHub PAT Configuration
|
||||
#github_token: ""
|
||||
imagePullSecrets: []
|
||||
nameOverride: ""
|
||||
fullnameOverride: ""
|
||||
@@ -217,6 +231,20 @@ githubWebhookServer:
|
||||
paths: []
|
||||
# - path: /*
|
||||
# pathType: ImplementationSpecific
|
||||
# Extra paths that are not automatically connected to the server. This is useful when working with annotation based services.
|
||||
extraPaths: []
|
||||
# - path: /*
|
||||
# backend:
|
||||
# serviceName: ssl-redirect
|
||||
# servicePort: use-annotation
|
||||
## for Kubernetes >=1.19 (when "networking.k8s.io/v1" is used)
|
||||
# - path: /*
|
||||
# pathType: Prefix
|
||||
# backend:
|
||||
# service:
|
||||
# name: ssl-redirect
|
||||
# port:
|
||||
# name: use-annotation
|
||||
tls: []
|
||||
# - secretName: chart-example-tls
|
||||
# hosts:
|
||||
|
||||
@@ -29,15 +29,14 @@ import (
|
||||
actionsv1alpha1 "github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1"
|
||||
"github.com/actions-runner-controller/actions-runner-controller/controllers"
|
||||
"github.com/actions-runner-controller/actions-runner-controller/github"
|
||||
"github.com/actions-runner-controller/actions-runner-controller/logging"
|
||||
"github.com/kelseyhightower/envconfig"
|
||||
zaplib "go.uber.org/zap"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
|
||||
_ "k8s.io/client-go/plugin/pkg/client/auth/exec"
|
||||
_ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
|
||||
_ "k8s.io/client-go/plugin/pkg/client/auth/oidc"
|
||||
ctrl "sigs.k8s.io/controller-runtime"
|
||||
"sigs.k8s.io/controller-runtime/pkg/log/zap"
|
||||
// +kubebuilder:scaffold:imports
|
||||
)
|
||||
|
||||
@@ -47,11 +46,6 @@ var (
|
||||
)
|
||||
|
||||
const (
|
||||
logLevelDebug = "debug"
|
||||
logLevelInfo = "info"
|
||||
logLevelWarn = "warn"
|
||||
logLevelError = "error"
|
||||
|
||||
webhookSecretTokenEnvName = "GITHUB_WEBHOOK_SECRET_TOKEN"
|
||||
)
|
||||
|
||||
@@ -78,6 +72,7 @@ func main() {
|
||||
enableLeaderElection bool
|
||||
syncPeriod time.Duration
|
||||
logLevel string
|
||||
queueLimit int
|
||||
|
||||
ghClient *github.Client
|
||||
)
|
||||
@@ -97,7 +92,8 @@ func main() {
|
||||
flag.BoolVar(&enableLeaderElection, "enable-leader-election", false,
|
||||
"Enable leader election for controller manager. Enabling this will ensure there is only one active controller manager.")
|
||||
flag.DurationVar(&syncPeriod, "sync-period", 10*time.Minute, "Determines the minimum frequency at which K8s resources managed by this controller are reconciled. When you use autoscaling, set this to a lower value like 10 minutes, because it corresponds to the minimum time to react to demand changes")
|
||||
flag.StringVar(&logLevel, "log-level", logLevelDebug, `The verbosity of the logging. Valid values are "debug", "info", "warn", "error". Defaults to "debug".`)
|
||||
flag.StringVar(&logLevel, "log-level", logging.LogLevelDebug, `The verbosity of the logging. Valid values are "debug", "info", "warn", "error". Defaults to "debug".`)
|
||||
flag.IntVar(&queueLimit, "queue-limit", controllers.DefaultQueueLimit, `The maximum length of the scale operation queue. The scale operation is enqueued for every matching webhook event, and the server returns a 500 HTTP status when the queue is already full on the enqueue attempt.`)
|
||||
flag.StringVar(&webhookSecretToken, "github-webhook-secret-token", "", "The personal access token of GitHub.")
|
||||
flag.StringVar(&c.Token, "github-token", c.Token, "The personal access token of GitHub.")
|
||||
flag.Int64Var(&c.AppID, "github-app-id", c.AppID, "The application ID of GitHub App.")
|
||||
@@ -126,23 +122,9 @@ func main() {
|
||||
setupLog.Info("-watch-namespace is %q. Only HorizontalRunnerAutoscalers in %q are watched, cached, and considered as scale targets.")
|
||||
}
|
||||
|
||||
logger := zap.New(func(o *zap.Options) {
|
||||
switch logLevel {
|
||||
case logLevelDebug:
|
||||
o.Development = true
|
||||
lvl := zaplib.NewAtomicLevelAt(-2) // maps to logr's V(2)
|
||||
o.Level = &lvl
|
||||
case logLevelInfo:
|
||||
lvl := zaplib.NewAtomicLevelAt(zaplib.InfoLevel)
|
||||
o.Level = &lvl
|
||||
case logLevelWarn:
|
||||
lvl := zaplib.NewAtomicLevelAt(zaplib.WarnLevel)
|
||||
o.Level = &lvl
|
||||
case logLevelError:
|
||||
lvl := zaplib.NewAtomicLevelAt(zaplib.ErrorLevel)
|
||||
o.Level = &lvl
|
||||
}
|
||||
})
|
||||
logger := logging.NewLogger(logLevel)
|
||||
|
||||
ctrl.SetLogger(logger)
|
||||
|
||||
// In order to support runner groups with custom visibility (selected repositories), we need to perform some GitHub API calls.
|
||||
// Let the user define if they want to opt-in supporting this option by providing the proper GitHub authentication parameters
|
||||
@@ -150,6 +132,8 @@ func main() {
|
||||
// That is, all runner groups managed by ARC are assumed to be visible to any repositories,
|
||||
// which is wrong when you have one or more non-default runner groups in your organization or enterprise.
|
||||
if len(c.Token) > 0 || (c.AppID > 0 && c.AppInstallationID > 0 && c.AppPrivateKey != "") || (len(c.BasicauthUsername) > 0 && len(c.BasicauthPassword) > 0) {
|
||||
c.Log = &logger
|
||||
|
||||
ghClient, err = c.NewClient()
|
||||
if err != nil {
|
||||
fmt.Fprintln(os.Stderr, "Error: Client creation failed.", err)
|
||||
@@ -160,8 +144,6 @@ func main() {
|
||||
setupLog.Info("GitHub client is not initialized. Runner groups with custom visibility are not supported. If needed, please provide GitHub authentication. This will incur in extra GitHub API calls")
|
||||
}
|
||||
|
||||
ctrl.SetLogger(logger)
|
||||
|
||||
mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
|
||||
Scheme: scheme,
|
||||
SyncPeriod: &syncPeriod,
|
||||
@@ -184,6 +166,7 @@ func main() {
|
||||
SecretKeyBytes: []byte(webhookSecretToken),
|
||||
Namespace: watchNamespace,
|
||||
GitHubClient: ghClient,
|
||||
QueueLimit: queueLimit,
|
||||
}
|
||||
|
||||
if err = hraGitHubWebhook.SetupWithManager(mgr); err != nil {
|
||||
|
||||
@@ -49,6 +49,9 @@ spec:
|
||||
items:
|
||||
description: CapacityReservation specifies the number of replicas temporarily added to the scale target until ExpirationTime.
|
||||
properties:
|
||||
effectiveTime:
|
||||
format: date-time
|
||||
type: string
|
||||
expirationTime:
|
||||
format: date-time
|
||||
type: string
|
||||
@@ -138,6 +141,7 @@ spec:
|
||||
status:
|
||||
type: string
|
||||
types:
|
||||
description: 'One of: created, rerequested, or completed'
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
@@ -157,6 +161,9 @@ spec:
|
||||
push:
|
||||
description: PushSpec is the condition for triggering scale-up on push event Also see https://docs.github.com/en/actions/reference/events-that-trigger-workflows#push
|
||||
type: object
|
||||
workflowJob:
|
||||
description: https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#workflow_job
|
||||
type: object
|
||||
type: object
|
||||
type: object
|
||||
type: array
|
||||
|
||||
(Four file diffs suppressed because they are too large.)
@@ -8,6 +8,7 @@ spec:
|
||||
conversion:
|
||||
strategy: Webhook
|
||||
webhook:
|
||||
conversionReviewVersions: ["v1","v1beta1"]
|
||||
clientConfig:
|
||||
# this is "\n" used as a placeholder, otherwise it will be rejected by the apiserver for being blank,
|
||||
# but we're going to set it later using the cert-manager (or potentially a patch if not using cert-manager)
|
||||
|
||||
@@ -20,19 +20,20 @@ bases:
|
||||
- ../webhook
|
||||
# [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 'WEBHOOK' components are required.
|
||||
- ../certmanager
|
||||
# [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'.
|
||||
# [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'.
|
||||
#- ../prometheus
|
||||
|
||||
patchesStrategicMerge:
|
||||
# Protect the /metrics endpoint by putting it behind auth.
|
||||
# Only one of manager_auth_proxy_patch.yaml and
|
||||
# manager_prometheus_metrics_patch.yaml should be enabled.
|
||||
# Protect the /metrics endpoint by putting it behind auth.
|
||||
# Only one of manager_auth_proxy_patch.yaml and
|
||||
# manager_prometheus_metrics_patch.yaml should be enabled.
|
||||
- manager_auth_proxy_patch.yaml
|
||||
# If you want your controller-manager to expose the /metrics
|
||||
# endpoint w/o any authn/z, uncomment the following line and
|
||||
# comment manager_auth_proxy_patch.yaml.
|
||||
# Only one of manager_auth_proxy_patch.yaml and
|
||||
# manager_prometheus_metrics_patch.yaml should be enabled.
|
||||
|
||||
# If you want your controller-manager to expose the /metrics
|
||||
# endpoint w/o any authn/z, uncomment the following line and
|
||||
# comment manager_auth_proxy_patch.yaml.
|
||||
# Only one of manager_auth_proxy_patch.yaml and
|
||||
# manager_prometheus_metrics_patch.yaml should be enabled.
|
||||
#- manager_prometheus_metrics_patch.yaml
|
||||
|
||||
# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in crd/kustomization.yaml
|
||||
|
||||
@@ -23,4 +23,3 @@ spec:
|
||||
args:
|
||||
- "--metrics-addr=127.0.0.1:8080"
|
||||
- "--enable-leader-election"
|
||||
- "--sync-period=10m"
|
||||
|
||||
config/github-webhook-server/deployment.yaml (new file)
@@ -0,0 +1,37 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: github-webhook-server
|
||||
app.kubernetes.io/part-of: actions-runner-controller
|
||||
name: github-webhook-server
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/component: github-webhook-server
|
||||
app.kubernetes.io/part-of: actions-runner-controller
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: github-webhook-server
|
||||
app.kubernetes.io/part-of: actions-runner-controller
|
||||
spec:
|
||||
containers:
|
||||
- name: github-webhook-server
|
||||
image: controller:latest
|
||||
command:
|
||||
- '/github-webhook-server'
|
||||
env:
|
||||
- name: GITHUB_WEBHOOK_SECRET_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: github_webhook_secret_token
|
||||
name: github-webhook-server
|
||||
optional: true
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: http
|
||||
protocol: TCP
|
||||
serviceAccountName: github-webhook-server
|
||||
terminationGracePeriodSeconds: 10
|
||||
@@ -0,0 +1,23 @@
|
||||
# This patch injects an HTTP proxy sidecar container that performs RBAC
|
||||
# authorization against the Kubernetes API using SubjectAccessReviews.
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: github-webhook-server
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: kube-rbac-proxy
|
||||
image: quay.io/brancz/kube-rbac-proxy:v0.10.0
|
||||
args:
|
||||
- '--secure-listen-address=0.0.0.0:8443'
|
||||
- '--upstream=http://127.0.0.1:8080/'
|
||||
- '--logtostderr=true'
|
||||
- '--v=10'
|
||||
ports:
|
||||
- containerPort: 8443
|
||||
name: https
|
||||
- name: github-webhook-server
|
||||
args:
|
||||
- '--metrics-addr=127.0.0.1:8080'
|
||||
config/github-webhook-server/kustomization.yaml (new file)
@@ -0,0 +1,15 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
images:
|
||||
- name: controller
|
||||
newName: summerwind/actions-runner-controller
|
||||
newTag: latest
|
||||
|
||||
resources:
|
||||
- deployment.yaml
|
||||
- rbac.yaml
|
||||
- service.yaml
|
||||
|
||||
patchesStrategicMerge:
|
||||
- gh-webhook-server-auth-proxy-patch.yaml
|
||||
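As a hedged usage note (not part of the diff): a kustomization like the one above can be rendered or applied straight from the new directory; the path is the one this change introduces, and the commands are only an illustration.

```shell
# Apply the new github-webhook-server manifests with kubectl's built-in kustomize support
kubectl apply -k config/github-webhook-server

# Or render them first for review
kubectl kustomize config/github-webhook-server | less
```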
config/github-webhook-server/rbac.yaml (new file)
@@ -0,0 +1,113 @@
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: github-webhook-server
|
||||
app.kubernetes.io/part-of: actions-runner-controller
|
||||
name: github-webhook-server
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: github-webhook-server
|
||||
app.kubernetes.io/part-of: actions-runner-controller
|
||||
name: github-webhook-server
|
||||
rules:
|
||||
- apiGroups:
|
||||
- actions.summerwind.dev
|
||||
resources:
|
||||
- horizontalrunnerautoscalers
|
||||
verbs:
|
||||
- get
|
||||
- list
|
||||
- patch
|
||||
- update
|
||||
- watch
|
||||
- apiGroups:
|
||||
- actions.summerwind.dev
|
||||
resources:
|
||||
- horizontalrunnerautoscalers/finalizers
|
||||
verbs:
|
||||
- create
|
||||
- delete
|
||||
- get
|
||||
- list
|
||||
- patch
|
||||
- update
|
||||
- watch
|
||||
- apiGroups:
|
||||
- actions.summerwind.dev
|
||||
resources:
|
||||
- horizontalrunnerautoscalers/status
|
||||
verbs:
|
||||
- get
|
||||
- patch
|
||||
- update
|
||||
- apiGroups:
|
||||
- actions.summerwind.dev
|
||||
resources:
|
||||
- runnersets
|
||||
verbs:
|
||||
- get
|
||||
- list
|
||||
- watch
|
||||
- apiGroups:
|
||||
- actions.summerwind.dev
|
||||
resources:
|
||||
- runnerdeployments
|
||||
verbs:
|
||||
- create
|
||||
- delete
|
||||
- get
|
||||
- list
|
||||
- patch
|
||||
- update
|
||||
- watch
|
||||
- apiGroups:
|
||||
- actions.summerwind.dev
|
||||
resources:
|
||||
- runnerdeployments/finalizers
|
||||
verbs:
|
||||
- create
|
||||
- delete
|
||||
- get
|
||||
- list
|
||||
- patch
|
||||
- update
|
||||
- watch
|
||||
- apiGroups:
|
||||
- actions.summerwind.dev
|
||||
resources:
|
||||
- runnerdeployments/status
|
||||
verbs:
|
||||
- get
|
||||
- patch
|
||||
- update
|
||||
- apiGroups:
|
||||
- authentication.k8s.io
|
||||
resources:
|
||||
- tokenreviews
|
||||
verbs:
|
||||
- create
|
||||
- apiGroups:
|
||||
- authorization.k8s.io
|
||||
resources:
|
||||
- subjectaccessreviews
|
||||
verbs:
|
||||
- create
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: github-webhook-server
|
||||
app.kubernetes.io/part-of: actions-runner-controller
|
||||
name: github-webhook-server
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: github-webhook-server
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: github-webhook-server
|
||||
config/github-webhook-server/service.yaml (new file)
@@ -0,0 +1,16 @@
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: github-webhook-server
|
||||
app.kubernetes.io/part-of: actions-runner-controller
|
||||
name: github-webhook-server
|
||||
spec:
|
||||
ports:
|
||||
- port: 80
|
||||
targetPort: http
|
||||
protocol: TCP
|
||||
name: http
|
||||
selector:
|
||||
app.kubernetes.io/component: github-webhook-server
|
||||
app.kubernetes.io/part-of: actions-runner-controller
|
||||
@@ -202,6 +202,29 @@ rules:
|
||||
verbs:
|
||||
- create
|
||||
- patch
|
||||
- apiGroups:
|
||||
- ""
|
||||
resources:
|
||||
- persistentvolumeclaims
|
||||
verbs:
|
||||
- create
|
||||
- delete
|
||||
- get
|
||||
- list
|
||||
- patch
|
||||
- update
|
||||
- watch
|
||||
- apiGroups:
|
||||
- ""
|
||||
resources:
|
||||
- persistentvolumes
|
||||
verbs:
|
||||
- delete
|
||||
- get
|
||||
- list
|
||||
- patch
|
||||
- update
|
||||
- watch
|
||||
- apiGroups:
|
||||
- ""
|
||||
resources:
|
||||
@@ -226,3 +249,12 @@ rules:
|
||||
- patch
|
||||
- update
|
||||
- watch
|
||||
- apiGroups:
|
||||
- ""
|
||||
resources:
|
||||
- secrets
|
||||
verbs:
|
||||
- delete
|
||||
- get
|
||||
- list
|
||||
- watch
|
||||
|
||||
@@ -7,10 +7,11 @@ import (
|
||||
"math"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1"
|
||||
"github.com/google/go-github/v39/github"
|
||||
"github.com/google/go-github/v45/github"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
)
|
||||
|
||||
const (
|
||||
@@ -20,47 +21,6 @@ const (
|
||||
defaultScaleDownFactor = 0.7
|
||||
)
|
||||
|
||||
func getValueAvailableAt(now time.Time, from, to *time.Time, reservedValue int) *int {
|
||||
if to != nil && now.After(*to) {
|
||||
return nil
|
||||
}
|
||||
|
||||
if from != nil && now.Before(*from) {
|
||||
return nil
|
||||
}
|
||||
|
||||
return &reservedValue
|
||||
}
|
||||
|
||||
func (r *HorizontalRunnerAutoscalerReconciler) fetchSuggestedReplicasFromCache(hra v1alpha1.HorizontalRunnerAutoscaler) *int {
|
||||
var entry *v1alpha1.CacheEntry
|
||||
|
||||
for i := range hra.Status.CacheEntries {
|
||||
ent := hra.Status.CacheEntries[i]
|
||||
|
||||
if ent.Key != v1alpha1.CacheEntryKeyDesiredReplicas {
|
||||
continue
|
||||
}
|
||||
|
||||
if !time.Now().Before(ent.ExpirationTime.Time) {
|
||||
continue
|
||||
}
|
||||
|
||||
entry = &ent
|
||||
|
||||
break
|
||||
}
|
||||
|
||||
if entry != nil {
|
||||
v := getValueAvailableAt(time.Now(), nil, &entry.ExpirationTime.Time, entry.Value)
|
||||
if v != nil {
|
||||
return v
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *HorizontalRunnerAutoscalerReconciler) suggestDesiredReplicas(st scaleTarget, hra v1alpha1.HorizontalRunnerAutoscaler) (*int, error) {
|
||||
if hra.Spec.MinReplicas == nil {
|
||||
return nil, fmt.Errorf("horizontalrunnerautoscaler %s/%s is missing minReplicas", hra.Namespace, hra.Name)
|
||||
@@ -71,10 +31,8 @@ func (r *HorizontalRunnerAutoscalerReconciler) suggestDesiredReplicas(st scaleTa
|
||||
metrics := hra.Spec.Metrics
|
||||
numMetrics := len(metrics)
|
||||
if numMetrics == 0 {
|
||||
if len(hra.Spec.ScaleUpTriggers) == 0 {
|
||||
return r.suggestReplicasByQueuedAndInProgressWorkflowRuns(st, hra, nil)
|
||||
}
|
||||
|
||||
// We don't default to anything since ARC 0.23.0
|
||||
// See https://github.com/actions-runner-controller/actions-runner-controller/issues/728
|
||||
return nil, nil
|
||||
} else if numMetrics > 2 {
|
||||
return nil, fmt.Errorf("too many autoscaling metrics configured: It must be 0 to 2, but got %d", numMetrics)
|
||||
@@ -182,7 +140,29 @@ func (r *HorizontalRunnerAutoscalerReconciler) suggestReplicasByQueuedAndInProgr
|
||||
if len(allJobs) == 0 {
|
||||
fallback_cb()
|
||||
} else {
|
||||
JOB:
|
||||
for _, job := range allJobs {
|
||||
runnerLabels := make(map[string]struct{}, len(st.labels))
|
||||
for _, l := range st.labels {
|
||||
runnerLabels[l] = struct{}{}
|
||||
}
|
||||
|
||||
if len(job.Labels) == 0 {
|
||||
// This shouldn't usually happen
|
||||
r.Log.Info("Detected job with no labels, which is not supported by ARC. Skipping anyway.", "labels", job.Labels, "run_id", job.GetRunID(), "job_id", job.GetID())
|
||||
continue JOB
|
||||
}
|
||||
|
||||
for _, l := range job.Labels {
|
||||
if l == "self-hosted" {
|
||||
continue
|
||||
}
|
||||
|
||||
if _, ok := runnerLabels[l]; !ok {
|
||||
continue JOB
|
||||
}
|
||||
}
|
||||
|
||||
switch job.GetStatus() {
|
||||
case "completed":
|
||||
// We add a case for `completed` so it is not counted in `unknown`.
|
||||
@@ -336,22 +316,52 @@ func (r *HorizontalRunnerAutoscalerReconciler) suggestReplicasByPercentageRunner
|
||||
numRunners int
|
||||
numRunnersRegistered int
|
||||
numRunnersBusy int
|
||||
numTerminatingBusy int
|
||||
)
|
||||
|
||||
numRunners = len(runnerMap)
|
||||
|
||||
busyTerminatingRunnerPods := map[string]struct{}{}
|
||||
|
||||
kindLabel := LabelKeyRunnerDeploymentName
|
||||
if hra.Spec.ScaleTargetRef.Kind == "RunnerSet" {
|
||||
kindLabel = LabelKeyRunnerSetName
|
||||
}
|
||||
|
||||
var runnerPodList corev1.PodList
|
||||
if err := r.Client.List(ctx, &runnerPodList, client.InNamespace(hra.Namespace), client.MatchingLabels(map[string]string{
|
||||
kindLabel: hra.Spec.ScaleTargetRef.Name,
|
||||
})); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, p := range runnerPodList.Items {
|
||||
if p.Annotations[AnnotationKeyUnregistrationFailureMessage] != "" {
|
||||
busyTerminatingRunnerPods[p.Name] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
for _, runner := range runners {
|
||||
if _, ok := runnerMap[*runner.Name]; ok {
|
||||
numRunnersRegistered++
|
||||
|
||||
if runner.GetBusy() {
|
||||
numRunnersBusy++
|
||||
} else if _, ok := busyTerminatingRunnerPods[*runner.Name]; ok {
|
||||
numTerminatingBusy++
|
||||
}
|
||||
|
||||
delete(busyTerminatingRunnerPods, *runner.Name)
|
||||
}
|
||||
}
|
||||
|
||||
// Remaining busyTerminatingRunnerPods are runners that were not on the ListRunners API response yet
|
||||
for range busyTerminatingRunnerPods {
|
||||
numTerminatingBusy++
|
||||
}
|
||||
|
||||
var desiredReplicas int
|
||||
fractionBusy := float64(numRunnersBusy) / float64(desiredReplicasBefore)
|
||||
fractionBusy := float64(numRunnersBusy+numTerminatingBusy) / float64(desiredReplicasBefore)
|
||||
if fractionBusy >= scaleUpThreshold {
|
||||
if scaleUpAdjustment > 0 {
|
||||
desiredReplicas = desiredReplicasBefore + scaleUpAdjustment
|
||||
@@ -380,6 +390,7 @@ func (r *HorizontalRunnerAutoscalerReconciler) suggestReplicasByPercentageRunner
|
||||
"num_runners", numRunners,
|
||||
"num_runners_registered", numRunnersRegistered,
|
||||
"num_runners_busy", numRunnersBusy,
|
||||
"num_terminating_busy", numTerminatingBusy,
|
||||
"namespace", hra.Namespace,
|
||||
"kind", st.kind,
|
||||
"name", st.st,
|
||||
|
||||
@@ -41,8 +41,12 @@ func TestDetermineDesiredReplicas_RepositoryRunner(t *testing.T) {
|
||||
|
||||
metav1Now := metav1.Now()
|
||||
testcases := []struct {
|
||||
repo string
|
||||
org string
|
||||
description string
|
||||
|
||||
repo string
|
||||
org string
|
||||
labels []string
|
||||
|
||||
fixed *int
|
||||
max *int
|
||||
min *int
|
||||
@@ -68,6 +72,19 @@ func TestDetermineDesiredReplicas_RepositoryRunner(t *testing.T) {
|
||||
workflowRuns_in_progress: `{"total_count": 2, "workflow_runs":[{"status":"in_progress"}, {"status":"in_progress"}]}"`,
|
||||
want: 3,
|
||||
},
|
||||
// Explicitly specified the default `self-hosted` label, which is ignored by the simulator,
|
||||
// as we assume that GitHub Actions automatically associates the `self-hosted` label to every self-hosted runner.
|
||||
// 3 demanded, max at 3
|
||||
{
|
||||
repo: "test/valid",
|
||||
labels: []string{"self-hosted"},
|
||||
min: intPtr(2),
|
||||
max: intPtr(3),
|
||||
workflowRuns: `{"total_count": 4, "workflow_runs":[{"status":"queued"}, {"status":"in_progress"}, {"status":"in_progress"}, {"status":"completed"}]}"`,
|
||||
workflowRuns_queued: `{"total_count": 1, "workflow_runs":[{"status":"queued"}]}"`,
|
||||
workflowRuns_in_progress: `{"total_count": 2, "workflow_runs":[{"status":"in_progress"}, {"status":"in_progress"}]}"`,
|
||||
want: 3,
|
||||
},
|
||||
// 2 demanded, max at 3, currently 3, delay scaling down due to grace period
|
||||
{
|
||||
repo: "test/valid",
|
||||
@@ -152,9 +169,40 @@ func TestDetermineDesiredReplicas_RepositoryRunner(t *testing.T) {
|
||||
want: 3,
|
||||
},
|
||||
|
||||
// Job-level autoscaling
|
||||
// 5 requested from 3 workflows
|
||||
{
|
||||
description: "Job-level autoscaling with no explicit runner label (runners have implicit self-hosted, requested self-hosted, 5 jobs from 3 workflows)",
|
||||
repo: "test/valid",
|
||||
min: intPtr(2),
|
||||
max: intPtr(10),
|
||||
workflowRuns: `{"total_count": 4, "workflow_runs":[{"id": 1, "status":"queued"}, {"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}, {"status":"completed"}]}"`,
|
||||
workflowRuns_queued: `{"total_count": 1, "workflow_runs":[{"id": 1, "status":"queued"}]}"`,
|
||||
workflowRuns_in_progress: `{"total_count": 2, "workflow_runs":[{"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}]}"`,
|
||||
workflowJobs: map[int]string{
|
||||
1: `{"jobs": [{"status":"queued", "labels":["self-hosted"]}, {"status":"queued", "labels":["self-hosted"]}]}`,
|
||||
2: `{"jobs": [{"status": "in_progress", "labels":["self-hosted"]}, {"status":"completed", "labels":["self-hosted"]}]}`,
|
||||
3: `{"jobs": [{"status": "in_progress", "labels":["self-hosted"]}, {"status":"queued", "labels":["self-hosted"]}]}`,
|
||||
},
|
||||
want: 5,
|
||||
},
|
||||
|
||||
{
|
||||
description: "Skipped job-level autoscaling with no explicit runner label (runners have implicit self-hosted, requested self-hosted+custom, 0 jobs from 3 workflows)",
|
||||
repo: "test/valid",
|
||||
min: intPtr(2),
|
||||
max: intPtr(10),
|
||||
workflowRuns: `{"total_count": 4, "workflow_runs":[{"id": 1, "status":"queued"}, {"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}, {"status":"completed"}]}"`,
|
||||
workflowRuns_queued: `{"total_count": 1, "workflow_runs":[{"id": 1, "status":"queued"}]}"`,
|
||||
workflowRuns_in_progress: `{"total_count": 2, "workflow_runs":[{"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}]}"`,
|
||||
workflowJobs: map[int]string{
|
||||
1: `{"jobs": [{"status":"queued", "labels":["self-hosted", "custom"]}, {"status":"queued", "labels":["self-hosted", "custom"]}]}`,
|
||||
2: `{"jobs": [{"status": "in_progress", "labels":["self-hosted", "custom"]}, {"status":"completed", "labels":["self-hosted", "custom"]}]}`,
|
||||
3: `{"jobs": [{"status": "in_progress", "labels":["self-hosted", "custom"]}, {"status":"queued", "labels":["self-hosted", "custom"]}]}`,
|
||||
},
|
||||
want: 2,
|
||||
},
|
||||
|
||||
{
|
||||
description: "Skipped job-level autoscaling with no label (runners have implicit self-hosted, jobs had no labels, 0 jobs from 3 workflows)",
|
||||
repo: "test/valid",
|
||||
min: intPtr(2),
|
||||
max: intPtr(10),
|
||||
@@ -166,6 +214,91 @@ func TestDetermineDesiredReplicas_RepositoryRunner(t *testing.T) {
|
||||
2: `{"jobs": [{"status": "in_progress"}, {"status":"completed"}]}`,
|
||||
3: `{"jobs": [{"status": "in_progress"}, {"status":"queued"}]}`,
|
||||
},
|
||||
want: 2,
|
||||
},
|
||||
|
||||
{
|
||||
description: "Skipped job-level autoscaling with default runner label (runners have self-hosted only, requested self-hosted+custom, 0 jobs from 3 workflows)",
|
||||
repo: "test/valid",
|
||||
labels: []string{"self-hosted"},
|
||||
min: intPtr(2),
|
||||
max: intPtr(10),
|
||||
workflowRuns: `{"total_count": 4, "workflow_runs":[{"id": 1, "status":"queued"}, {"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}, {"status":"completed"}]}"`,
|
||||
workflowRuns_queued: `{"total_count": 1, "workflow_runs":[{"id": 1, "status":"queued"}]}"`,
|
||||
workflowRuns_in_progress: `{"total_count": 2, "workflow_runs":[{"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}]}"`,
|
||||
workflowJobs: map[int]string{
|
||||
1: `{"jobs": [{"status":"queued", "labels":["self-hosted", "custom"]}, {"status":"queued", "labels":["self-hosted", "custom"]}]}`,
|
||||
2: `{"jobs": [{"status": "in_progress", "labels":["self-hosted", "custom"]}, {"status":"completed", "labels":["self-hosted", "custom"]}]}`,
|
||||
3: `{"jobs": [{"status": "in_progress", "labels":["self-hosted", "custom"]}, {"status":"queued", "labels":["self-hosted", "custom"]}]}`,
|
||||
},
|
||||
want: 2,
|
||||
},
|
||||
|
||||
{
|
||||
description: "Skipped job-level autoscaling with custom runner label (runners have custom2, requested self-hosted+custom, 0 jobs from 5 workflows)",
|
||||
repo: "test/valid",
|
||||
labels: []string{"custom2"},
|
||||
min: intPtr(2),
|
||||
max: intPtr(10),
|
||||
workflowRuns: `{"total_count": 4, "workflow_runs":[{"id": 1, "status":"queued"}, {"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}, {"status":"completed"}]}"`,
|
||||
workflowRuns_queued: `{"total_count": 1, "workflow_runs":[{"id": 1, "status":"queued"}]}"`,
|
||||
workflowRuns_in_progress: `{"total_count": 2, "workflow_runs":[{"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}]}"`,
|
||||
workflowJobs: map[int]string{
|
||||
1: `{"jobs": [{"status":"queued", "labels":["self-hosted", "custom"]}, {"status":"queued", "labels":["self-hosted", "custom"]}]}`,
|
||||
2: `{"jobs": [{"status": "in_progress", "labels":["self-hosted", "custom"]}, {"status":"completed", "labels":["self-hosted", "custom"]}]}`,
|
||||
3: `{"jobs": [{"status": "in_progress", "labels":["self-hosted", "custom"]}, {"status":"queued", "labels":["self-hosted", "custom"]}]}`,
|
||||
},
|
||||
want: 2,
|
||||
},
|
||||
|
||||
{
|
||||
description: "Skipped job-level autoscaling with default runner label (runners have self-hosted, requested managed-runner-label, 0 jobs from 3 runs)",
|
||||
repo: "test/valid",
|
||||
labels: []string{"self-hosted"},
|
||||
min: intPtr(2),
|
||||
max: intPtr(10),
|
||||
workflowRuns: `{"total_count": 4, "workflow_runs":[{"id": 1, "status":"queued"}, {"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}, {"status":"completed"}]}"`,
|
||||
workflowRuns_queued: `{"total_count": 1, "workflow_runs":[{"id": 1, "status":"queued"}]}"`,
|
||||
workflowRuns_in_progress: `{"total_count": 2, "workflow_runs":[{"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}]}"`,
|
||||
workflowJobs: map[int]string{
|
||||
1: `{"jobs": [{"status":"queued", "labels":["managed-runner-label"]}, {"status":"queued", "labels":["managed-runner-label"]}]}`,
|
||||
2: `{"jobs": [{"status": "in_progress", "labels":["managed-runner-label"]}, {"status":"completed", "labels":["managed-runner-label"]}]}`,
|
||||
3: `{"jobs": [{"status": "in_progress", "labels":["managed-runner-label"]}, {"status":"queued", "labels":["managed-runner-label"]}]}`,
|
||||
},
|
||||
want: 2,
|
||||
},
|
||||
|
||||
{
|
||||
description: "Job-level autoscaling with default + custom runner label (runners have self-hosted+custom, requested self-hosted+custom, 5 jobs from 3 workflows)",
|
||||
repo: "test/valid",
|
||||
labels: []string{"self-hosted", "custom"},
|
||||
min: intPtr(2),
|
||||
max: intPtr(10),
|
||||
workflowRuns: `{"total_count": 4, "workflow_runs":[{"id": 1, "status":"queued"}, {"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}, {"status":"completed"}]}"`,
|
||||
workflowRuns_queued: `{"total_count": 1, "workflow_runs":[{"id": 1, "status":"queued"}]}"`,
|
||||
workflowRuns_in_progress: `{"total_count": 2, "workflow_runs":[{"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}]}"`,
|
||||
workflowJobs: map[int]string{
|
||||
1: `{"jobs": [{"status":"queued", "labels":["self-hosted", "custom"]}, {"status":"queued", "labels":["self-hosted", "custom"]}]}`,
|
||||
2: `{"jobs": [{"status": "in_progress", "labels":["self-hosted", "custom"]}, {"status":"completed", "labels":["self-hosted", "custom"]}]}`,
|
||||
3: `{"jobs": [{"status": "in_progress", "labels":["self-hosted", "custom"]}, {"status":"queued", "labels":["self-hosted", "custom"]}]}`,
|
||||
},
|
||||
want: 5,
|
||||
},
|
||||
|
||||
{
|
||||
description: "Job-level autoscaling with custom runner label (runners have custom, requested self-hosted+custom, 5 jobs from 3 workflows)",
|
||||
repo: "test/valid",
|
||||
labels: []string{"custom"},
|
||||
min: intPtr(2),
|
||||
max: intPtr(10),
|
||||
workflowRuns: `{"total_count": 4, "workflow_runs":[{"id": 1, "status":"queued"}, {"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}, {"status":"completed"}]}"`,
|
||||
workflowRuns_queued: `{"total_count": 1, "workflow_runs":[{"id": 1, "status":"queued"}]}"`,
|
||||
workflowRuns_in_progress: `{"total_count": 2, "workflow_runs":[{"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}]}"`,
|
||||
workflowJobs: map[int]string{
|
||||
1: `{"jobs": [{"status":"queued", "labels":["self-hosted", "custom"]}, {"status":"queued", "labels":["self-hosted", "custom"]}]}`,
|
||||
2: `{"jobs": [{"status": "in_progress", "labels":["self-hosted", "custom"]}, {"status":"completed", "labels":["self-hosted", "custom"]}]}`,
|
||||
3: `{"jobs": [{"status": "in_progress", "labels":["self-hosted", "custom"]}, {"status":"queued", "labels":["self-hosted", "custom"]}]}`,
|
||||
},
|
||||
want: 5,
|
||||
},
|
||||
}
|
||||
@@ -181,7 +314,12 @@ func TestDetermineDesiredReplicas_RepositoryRunner(t *testing.T) {
|
||||
_ = clientgoscheme.AddToScheme(scheme)
|
||||
_ = v1alpha1.AddToScheme(scheme)
|
||||
|
||||
t.Run(fmt.Sprintf("case %d", i), func(t *testing.T) {
|
||||
testName := fmt.Sprintf("case %d", i)
|
||||
if tc.description != "" {
|
||||
testName = tc.description
|
||||
}
|
||||
|
||||
t.Run(testName, func(t *testing.T) {
|
||||
server := fake.NewServer(
|
||||
fake.WithListRepositoryWorkflowRunsResponse(200, tc.workflowRuns, tc.workflowRuns_queued, tc.workflowRuns_in_progress),
|
||||
fake.WithListWorkflowJobsResponse(200, tc.workflowJobs),
|
||||
@@ -191,9 +329,10 @@ func TestDetermineDesiredReplicas_RepositoryRunner(t *testing.T) {
|
||||
client := newGithubClient(server)
|
||||
|
||||
h := &HorizontalRunnerAutoscalerReconciler{
|
||||
Log: log,
|
||||
GitHubClient: client,
|
||||
Scheme: scheme,
|
||||
Log: log,
|
||||
GitHubClient: client,
|
||||
Scheme: scheme,
|
||||
DefaultScaleDownDelay: DefaultScaleDownDelay,
|
||||
}
|
||||
|
||||
rd := v1alpha1.RunnerDeployment{
|
||||
@@ -206,6 +345,7 @@ func TestDetermineDesiredReplicas_RepositoryRunner(t *testing.T) {
|
||||
Spec: v1alpha1.RunnerSpec{
|
||||
RunnerConfig: v1alpha1.RunnerConfig{
|
||||
Repository: tc.repo,
|
||||
Labels: tc.labels,
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -220,6 +360,11 @@ func TestDetermineDesiredReplicas_RepositoryRunner(t *testing.T) {
|
||||
Spec: v1alpha1.HorizontalRunnerAutoscalerSpec{
|
||||
MaxReplicas: tc.max,
|
||||
MinReplicas: tc.min,
|
||||
Metrics: []v1alpha1.MetricSpec{
|
||||
{
|
||||
Type: "TotalNumberOfQueuedAndInProgressWorkflowRuns",
|
||||
},
|
||||
},
|
||||
},
|
||||
Status: v1alpha1.HorizontalRunnerAutoscalerStatus{
|
||||
DesiredReplicas: tc.sReplicas,
|
||||
@@ -234,7 +379,7 @@ func TestDetermineDesiredReplicas_RepositoryRunner(t *testing.T) {
|
||||
|
||||
st := h.scaleTargetFromRD(context.Background(), rd)
|
||||
|
||||
got, _, _, err := h.computeReplicasWithCache(log, metav1Now.Time, st, hra, minReplicas)
|
||||
got, err := h.computeReplicasWithCache(log, metav1Now.Time, st, hra, minReplicas)
|
||||
if err != nil {
|
||||
if tc.err == "" {
|
||||
t.Fatalf("unexpected error: expected none, got %v", err)
|
||||
@@ -258,8 +403,12 @@ func TestDetermineDesiredReplicas_OrganizationalRunner(t *testing.T) {
|
||||
|
||||
metav1Now := metav1.Now()
|
||||
testcases := []struct {
|
||||
repos []string
|
||||
org string
|
||||
description string
|
||||
|
||||
repos []string
|
||||
org string
|
||||
labels []string
|
||||
|
||||
fixed *int
|
||||
max *int
|
||||
min *int
|
||||
@@ -399,9 +548,43 @@ func TestDetermineDesiredReplicas_OrganizationalRunner(t *testing.T) {
|
||||
err: "validating autoscaling metrics: spec.autoscaling.metrics[].repositoryNames is required and must have one more more entries for organizational runner deployment",
|
||||
},
|
||||
|
||||
// Job-level autoscaling
|
||||
// 5 requested from 3 workflows
|
||||
{
|
||||
description: "Job-level autoscaling (runners have implicit self-hosted, requested self-hosted, 5 jobs from 3 runs)",
|
||||
org: "test",
|
||||
repos: []string{"valid"},
|
||||
min: intPtr(2),
|
||||
max: intPtr(10),
|
||||
workflowRuns: `{"total_count": 4, "workflow_runs":[{"id": 1, "status":"queued"}, {"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}, {"status":"completed"}]}"`,
|
||||
workflowRuns_queued: `{"total_count": 1, "workflow_runs":[{"id": 1, "status":"queued"}]}"`,
|
||||
workflowRuns_in_progress: `{"total_count": 2, "workflow_runs":[{"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}, {"status":"completed"}]}"`,
|
||||
workflowJobs: map[int]string{
|
||||
1: `{"jobs": [{"status":"queued", "labels":["self-hosted"]}, {"status":"queued", "labels":["self-hosted"]}]}`,
|
||||
2: `{"jobs": [{"status": "in_progress", "labels":["self-hosted"]}, {"status":"completed", "labels":["self-hosted"]}]}`,
|
||||
3: `{"jobs": [{"status": "in_progress", "labels":["self-hosted"]}, {"status":"queued", "labels":["self-hosted"]}]}`,
|
||||
},
|
||||
want: 5,
|
||||
},
|
||||
|
||||
{
|
||||
description: "Job-level autoscaling (runners have explicit self-hosted, requested self-hosted, 5 jobs from 3 runs)",
|
||||
org: "test",
|
||||
repos: []string{"valid"},
|
||||
labels: []string{"self-hosted"},
|
||||
min: intPtr(2),
|
||||
max: intPtr(10),
|
||||
workflowRuns: `{"total_count": 4, "workflow_runs":[{"id": 1, "status":"queued"}, {"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}, {"status":"completed"}]}"`,
|
||||
workflowRuns_queued: `{"total_count": 1, "workflow_runs":[{"id": 1, "status":"queued"}]}"`,
|
||||
workflowRuns_in_progress: `{"total_count": 2, "workflow_runs":[{"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}, {"status":"completed"}]}"`,
|
||||
workflowJobs: map[int]string{
|
||||
1: `{"jobs": [{"status":"queued", "labels":["self-hosted"]}, {"status":"queued", "labels":["self-hosted"]}]}`,
|
||||
2: `{"jobs": [{"status": "in_progress", "labels":["self-hosted"]}, {"status":"completed", "labels":["self-hosted"]}]}`,
|
||||
3: `{"jobs": [{"status": "in_progress", "labels":["self-hosted"]}, {"status":"queued", "labels":["self-hosted"]}]}`,
|
||||
},
|
||||
want: 5,
|
||||
},
|
||||
|
||||
{
|
||||
description: "Skipped job-level autoscaling (jobs lack labels, 0 requested from 3 workflows)",
|
||||
org: "test",
|
||||
repos: []string{"valid"},
|
||||
min: intPtr(2),
|
||||
@@ -414,8 +597,97 @@ func TestDetermineDesiredReplicas_OrganizationalRunner(t *testing.T) {
|
||||
2: `{"jobs": [{"status": "in_progress"}, {"status":"completed"}]}`,
|
||||
3: `{"jobs": [{"status": "in_progress"}, {"status":"queued"}]}`,
|
||||
},
|
||||
want: 2,
|
||||
},
|
||||
|
||||
{
|
||||
description: "Skipped job-level autoscaling (runners have valid and implicit self-hosted, requested self-hosted+custom, 0 jobs from 3 runs)",
|
||||
org: "test",
|
||||
repos: []string{"valid"},
|
||||
min: intPtr(2),
|
||||
max: intPtr(10),
|
||||
workflowRuns: `{"total_count": 4, "workflow_runs":[{"id": 1, "status":"queued"}, {"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}, {"status":"completed"}]}"`,
|
||||
workflowRuns_queued: `{"total_count": 1, "workflow_runs":[{"id": 1, "status":"queued"}]}"`,
|
||||
workflowRuns_in_progress: `{"total_count": 2, "workflow_runs":[{"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}, {"status":"completed"}]}"`,
|
||||
workflowJobs: map[int]string{
|
||||
1: `{"jobs": [{"status":"queued", "labels":["self-hosted", "custom"]}, {"status":"queued", "labels":["self-hosted", "custom"]}]}`,
|
||||
2: `{"jobs": [{"status": "in_progress", "labels":["self-hosted", "custom"]}, {"status":"completed", "labels":["self-hosted", "custom"]}]}`,
|
||||
3: `{"jobs": [{"status": "in_progress", "labels":["self-hosted", "custom"]}, {"status":"queued", "labels":["self-hosted", "custom"]}]}`,
|
||||
},
|
||||
want: 2,
|
||||
},
|
||||
|
||||
{
|
||||
description: "Skipped job-level autoscaling (runners have self-hosted, requested self-hosted+custom, 0 jobs from 3 workflows)",
|
||||
org: "test",
|
||||
repos: []string{"valid"},
|
||||
labels: []string{"self-hosted"},
|
||||
min: intPtr(2),
|
||||
max: intPtr(10),
|
||||
workflowRuns: `{"total_count": 4, "workflow_runs":[{"id": 1, "status":"queued"}, {"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}, {"status":"completed"}]}"`,
|
||||
workflowRuns_queued: `{"total_count": 1, "workflow_runs":[{"id": 1, "status":"queued"}]}"`,
|
||||
workflowRuns_in_progress: `{"total_count": 2, "workflow_runs":[{"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}, {"status":"completed"}]}"`,
|
||||
workflowJobs: map[int]string{
|
||||
1: `{"jobs": [{"status":"queued", "labels":["self-hosted", "custom"]}, {"status":"queued", "labels":["self-hosted", "custom"]}]}`,
|
||||
2: `{"jobs": [{"status": "in_progress", "labels":["self-hosted", "custom"]}, {"status":"completed", "labels":["self-hosted", "custom"]}]}`,
|
||||
3: `{"jobs": [{"status": "in_progress", "labels":["self-hosted", "custom"]}, {"status":"queued", "labels":["self-hosted", "custom"]}]}`,
|
||||
},
|
||||
want: 2,
|
||||
},
|
||||
|
||||
{
|
||||
description: "Job-level autoscaling (runners have custom, requested self-hosted+custom, 5 requested from 3 workflows)",
|
||||
org: "test",
|
||||
repos: []string{"valid"},
|
||||
labels: []string{"custom"},
|
||||
min: intPtr(2),
|
||||
max: intPtr(10),
|
||||
workflowRuns: `{"total_count": 4, "workflow_runs":[{"id": 1, "status":"queued"}, {"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}, {"status":"completed"}]}"`,
|
||||
workflowRuns_queued: `{"total_count": 1, "workflow_runs":[{"id": 1, "status":"queued"}]}"`,
|
||||
workflowRuns_in_progress: `{"total_count": 2, "workflow_runs":[{"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}, {"status":"completed"}]}"`,
|
||||
workflowJobs: map[int]string{
|
||||
1: `{"jobs": [{"status":"queued", "labels":["self-hosted", "custom"]}, {"status":"queued", "labels":["self-hosted", "custom"]}]}`,
|
||||
2: `{"jobs": [{"status": "in_progress", "labels":["self-hosted", "custom"]}, {"status":"completed", "labels":["self-hosted", "custom"]}]}`,
|
||||
3: `{"jobs": [{"status": "in_progress", "labels":["self-hosted", "custom"]}, {"status":"queued", "labels":["self-hosted", "custom"]}]}`,
|
||||
},
|
||||
want: 5,
|
||||
},
|
||||
|
||||
{
|
||||
description: "Job-level autoscaling (runners have custom, requested custom, 5 requested from 3 workflows)",
|
||||
org: "test",
|
||||
repos: []string{"valid"},
|
||||
labels: []string{"custom"},
|
||||
min: intPtr(2),
|
||||
max: intPtr(10),
|
||||
workflowRuns: `{"total_count": 4, "workflow_runs":[{"id": 1, "status":"queued"}, {"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}, {"status":"completed"}]}"`,
|
||||
workflowRuns_queued: `{"total_count": 1, "workflow_runs":[{"id": 1, "status":"queued"}]}"`,
|
||||
workflowRuns_in_progress: `{"total_count": 2, "workflow_runs":[{"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}, {"status":"completed"}]}"`,
|
||||
workflowJobs: map[int]string{
|
||||
1: `{"jobs": [{"status":"queued", "labels":["custom"]}, {"status":"queued", "labels":["custom"]}]}`,
|
||||
2: `{"jobs": [{"status": "in_progress", "labels":["custom"]}, {"status":"completed", "labels":["custom"]}]}`,
|
||||
3: `{"jobs": [{"status": "in_progress", "labels":["custom"]}, {"status":"queued", "labels":["custom"]}]}`,
|
||||
},
|
||||
want: 5,
|
||||
},
|
||||
|
||||
{
|
||||
description: "Skipped job-level autoscaling (specified custom2, 0 requested from 3 workflows)",
|
||||
org: "test",
|
||||
repos: []string{"valid"},
|
||||
labels: []string{"custom2"},
|
||||
min: intPtr(2),
|
||||
max: intPtr(10),
|
||||
workflowRuns: `{"total_count": 4, "workflow_runs":[{"id": 1, "status":"queued"}, {"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}, {"status":"completed"}]}"`,
|
||||
workflowRuns_queued: `{"total_count": 1, "workflow_runs":[{"id": 1, "status":"queued"}]}"`,
|
||||
workflowRuns_in_progress: `{"total_count": 2, "workflow_runs":[{"id": 2, "status":"in_progress"}, {"id": 3, "status":"in_progress"}, {"status":"completed"}]}"`,
|
||||
workflowJobs: map[int]string{
|
||||
1: `{"jobs": [{"status":"queued", "labels":["self-hosted", "custom"]}, {"status":"queued", "labels":["self-hosted", "custom"]}]}`,
|
||||
2: `{"jobs": [{"status": "in_progress", "labels":["self-hosted", "custom"]}, {"status":"completed", "labels":["self-hosted", "custom"]}]}`,
|
||||
3: `{"jobs": [{"status": "in_progress", "labels":["self-hosted", "custom"]}, {"status":"queued", "labels":["self-hosted", "custom"]}]}`,
|
||||
},
|
||||
want: 2,
|
||||
},
|
||||
}
|
||||
|
||||
for i := range testcases {
|
||||
@@ -429,7 +701,12 @@ func TestDetermineDesiredReplicas_OrganizationalRunner(t *testing.T) {
|
||||
_ = clientgoscheme.AddToScheme(scheme)
|
||||
_ = v1alpha1.AddToScheme(scheme)
|
||||
|
||||
t.Run(fmt.Sprintf("case %d", i), func(t *testing.T) {
|
||||
testName := fmt.Sprintf("case %d", i)
|
||||
if tc.description != "" {
|
||||
testName = tc.description
|
||||
}
|
||||
|
||||
t.Run(testName, func(t *testing.T) {
|
||||
t.Helper()
|
||||
|
||||
server := fake.NewServer(
|
||||
@@ -441,9 +718,10 @@ func TestDetermineDesiredReplicas_OrganizationalRunner(t *testing.T) {
|
||||
client := newGithubClient(server)
|
||||
|
||||
h := &HorizontalRunnerAutoscalerReconciler{
|
||||
Log: log,
|
||||
Scheme: scheme,
|
||||
GitHubClient: client,
|
||||
Log: log,
|
||||
Scheme: scheme,
|
||||
GitHubClient: client,
|
||||
DefaultScaleDownDelay: DefaultScaleDownDelay,
|
||||
}
|
||||
|
||||
rd := v1alpha1.RunnerDeployment{
|
||||
@@ -465,6 +743,7 @@ func TestDetermineDesiredReplicas_OrganizationalRunner(t *testing.T) {
|
||||
Spec: v1alpha1.RunnerSpec{
|
||||
RunnerConfig: v1alpha1.RunnerConfig{
|
||||
Organization: tc.org,
|
||||
Labels: tc.labels,
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -502,7 +781,7 @@ func TestDetermineDesiredReplicas_OrganizationalRunner(t *testing.T) {
|
||||
|
||||
st := h.scaleTargetFromRD(context.Background(), rd)
|
||||
|
||||
got, _, _, err := h.computeReplicasWithCache(log, metav1Now.Time, st, hra, minReplicas)
|
||||
got, err := h.computeReplicasWithCache(log, metav1Now.Time, st, hra, minReplicas)
|
||||
if err != nil {
|
||||
if tc.err == "" {
|
||||
t.Fatalf("unexpected error: expected none, got %v", err)
|
||||
|
||||
72
controllers/constants.go
Normal file
@@ -0,0 +1,72 @@
|
||||
package controllers
|
||||
|
||||
import "time"
|
||||
|
||||
const (
|
||||
LabelKeyRunnerSetName = "runnerset-name"
|
||||
LabelKeyRunner = "actions-runner"
|
||||
)
|
||||
|
||||
const (
|
||||
// This name requires at least one slash to work.
|
||||
// See https://github.com/google/knative-gcp/issues/378
|
||||
runnerPodFinalizerName = "actions.summerwind.dev/runner-pod"
|
||||
runnerLinkedResourcesFinalizerName = "actions.summerwind.dev/linked-resources"
|
||||
|
||||
annotationKeyPrefix = "actions-runner/"
|
||||
|
||||
AnnotationKeyLastRegistrationCheckTime = "actions-runner-controller/last-registration-check-time"
|
||||
|
||||
// AnnotationKeyUnregistrationFailureMessage is the annotation that is added onto the pod once it has failed to be unregistered from GitHub due to e.g. a 422 error
|
||||
AnnotationKeyUnregistrationFailureMessage = annotationKeyPrefix + "unregistration-failure-message"
|
||||
|
||||
// AnnotationKeyUnregistrationCompleteTimestamp is the annotation that is added onto the pod once the previously started unregistration process has been completed.
|
||||
AnnotationKeyUnregistrationCompleteTimestamp = annotationKeyPrefix + "unregistration-complete-timestamp"
|
||||
|
||||
// AnnotationKeyRunnerCompletionWaitStartTimestamp is the annotation that is added onto the pod when
|
||||
// ARC decided to wait for the pod to complete by itself, without the need for ARC to unregister the corresponding runner.
|
||||
AnnotationKeyRunnerCompletionWaitStartTimestamp = annotationKeyPrefix + "runner-completion-wait-start-timestamp"
|
||||
|
||||
// AnnotationKeyUnregistrationStartTimestamp is the annotation that contains the time at which the requested unregistration process was started
|
||||
AnnotationKeyUnregistrationStartTimestamp = annotationKeyPrefix + "unregistration-start-timestamp"
|
||||
|
||||
// AnnotationKeyUnregistrationRequestTimestamp is the annotation that contains the time that the unregistration has been requested.
|
||||
// This doesn't immediately start the unregistration. Instead, ARC will first check if the runner has already been registered.
|
||||
// If not, ARC will wait for the registration to complete first, and only after that will it start the unregistration process.
|
||||
// This is crucial to avoid a race between ARC marking the runner pod for deletion while the actions-runner registers itself to GitHub, leaving the assigned job
|
||||
// hanging forever.
|
||||
AnnotationKeyUnregistrationRequestTimestamp = annotationKeyPrefix + "unregistration-request-timestamp"
|
||||
|
||||
AnnotationKeyRunnerID = annotationKeyPrefix + "id"
|
||||
|
||||
// This can be any value but a larger value can make an unregistration timeout longer than configured in practice.
|
||||
DefaultUnregistrationRetryDelay = time.Minute
|
||||
|
||||
// RetryDelayOnCreateRegistrationError is the delay between retry attempts for runner registration token creation.
|
||||
// Usually, a retry in this case happens when e.g. your PAT has no access to a certain scope of runners, like when you're using a repository admin's token
|
||||
// for creating a broader-scoped runner token, like an organization or enterprise runner token.
|
||||
// Such a permission issue will never be fixed automatically, so we don't need to retry very often, hence this value.
|
||||
RetryDelayOnCreateRegistrationError = 3 * time.Minute
|
||||
|
||||
// registrationTimeout is the duration until a pod times out after it becomes Ready and Running.
|
||||
// A pod that is timed out can be terminated if needed.
|
||||
registrationTimeout = 10 * time.Minute
|
||||
|
||||
// DefaultRunnerPodRecreationDelayAfterWebhookScale is the delay until syncing the runners with the desired replicas
|
||||
// after a webhook-based scale up.
|
||||
// This is used to prevent ARC from recreating completed runner pods that would soon be deleted without ever being used.
|
||||
// In other words, this is used as a timer to wait for the completed runner to emit the next `workflow_job` webhook event to decrease the desired replicas.
|
||||
// So if we set 30 seconds for this, you are basically saying that you expect GitHub and your installation of ARC to
|
||||
// emit and propagate a workflow_job completion event down to the RunnerSet or RunnerReplicaSet, via ARC's github webhook server and HRA, in approximately 30 seconds.
|
||||
// In case it actually takes more than DefaultRunnerPodRecreationDelayAfterWebhookScale for the workflow_job completion event to arrive,
|
||||
// ARC will recreate the completed runner(s), assuming something went wrong in either GitHub, your K8s cluster, or ARC, so ARC needs to resync anyway.
|
||||
//
|
||||
// See https://github.com/actions-runner-controller/actions-runner-controller/pull/1180
|
||||
DefaultRunnerPodRecreationDelayAfterWebhookScale = 10 * time.Minute
|
||||
|
||||
EnvVarRunnerName = "RUNNER_NAME"
|
||||
EnvVarRunnerToken = "RUNNER_TOKEN"
|
||||
|
||||
// defaultRunnerHookPath is the path to the hook script used when "containerMode: kubernetes" is specified
|
||||
defaultRunnerHookPath = "/runner/k8s/index.js"
|
||||
)
|
||||
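The constants above are mostly consumed by the runner and pod controllers elsewhere in the repository. As a rough sketch of how one of the annotation keys might be read off a runner pod (illustrative only: the helper name, the corev1 import for k8s.io/api/core/v1, and the RFC3339 timestamp format are assumptions, not taken from this diff):

// unregistrationRequestedAt is a hypothetical helper that reads the
// unregistration-request annotation from a runner pod, assuming the
// timestamp is stored in RFC3339 form.
func unregistrationRequestedAt(pod *corev1.Pod) (time.Time, bool) {
	v, ok := pod.Annotations[AnnotationKeyUnregistrationRequestTimestamp]
	if !ok {
		return time.Time{}, false
	}
	t, err := time.Parse(time.RFC3339, v)
	if err != nil {
		return time.Time{}, false
	}
	return t, true
}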
207
controllers/horizontal_runner_autoscaler_batch_scale.go
Normal file
@@ -0,0 +1,207 @@
|
||||
package controllers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1"
|
||||
"github.com/go-logr/logr"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
)
|
||||
|
||||
type batchScaler struct {
|
||||
Ctx context.Context
|
||||
Client client.Client
|
||||
Log logr.Logger
|
||||
interval time.Duration
|
||||
|
||||
queue chan *ScaleTarget
|
||||
workerStart sync.Once
|
||||
}
|
||||
|
||||
func newBatchScaler(ctx context.Context, client client.Client, log logr.Logger) *batchScaler {
|
||||
return &batchScaler{
|
||||
Ctx: ctx,
|
||||
Client: client,
|
||||
Log: log,
|
||||
interval: 3 * time.Second,
|
||||
}
|
||||
}
|
||||
|
||||
type batchScaleOperation struct {
|
||||
namespacedName types.NamespacedName
|
||||
scaleOps []scaleOperation
|
||||
}
|
||||
|
||||
type scaleOperation struct {
|
||||
trigger v1alpha1.ScaleUpTrigger
|
||||
log logr.Logger
|
||||
}
|
||||
|
||||
// Add the scale target to the unbounded queue, blocking until the target is successfully added to the queue.
|
||||
// All the targets in the queue are dequeued every 3 seconds, grouped by the HRA, and applied.
|
||||
// In the happy path, batchScaler updates each HRA only once, even if the HRA had two or more associated webhook events in the 3-second interval,
|
||||
// which results in fewer K8s API calls and fewer HRA update conflicts when your ARC installation receives a lot of webhook events
|
||||
func (s *batchScaler) Add(st *ScaleTarget) {
|
||||
if st == nil {
|
||||
return
|
||||
}
|
||||
|
||||
s.workerStart.Do(func() {
|
||||
var expBackoff = []time.Duration{time.Second, 2 * time.Second, 4 * time.Second, 8 * time.Second, 16 * time.Second}
|
||||
|
||||
s.queue = make(chan *ScaleTarget)
|
||||
|
||||
log := s.Log
|
||||
|
||||
go func() {
|
||||
log.Info("Starting batch worker")
|
||||
defer log.Info("Stopped batch worker")
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-s.Ctx.Done():
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
log.V(2).Info("Batch worker is dequeueing operations")
|
||||
|
||||
batches := map[types.NamespacedName]batchScaleOperation{}
|
||||
after := time.After(s.interval)
|
||||
var ops uint
|
||||
|
||||
batch:
|
||||
for {
|
||||
select {
|
||||
case <-after:
|
||||
after = nil
|
||||
break batch
|
||||
case st := <-s.queue:
|
||||
nsName := types.NamespacedName{
|
||||
Namespace: st.HorizontalRunnerAutoscaler.Namespace,
|
||||
Name: st.HorizontalRunnerAutoscaler.Name,
|
||||
}
|
||||
b, ok := batches[nsName]
|
||||
if !ok {
|
||||
b = batchScaleOperation{
|
||||
namespacedName: nsName,
|
||||
}
|
||||
}
|
||||
b.scaleOps = append(b.scaleOps, scaleOperation{
|
||||
log: *st.log,
|
||||
trigger: st.ScaleUpTrigger,
|
||||
})
|
||||
batches[nsName] = b
|
||||
ops++
|
||||
}
|
||||
}
|
||||
|
||||
log.V(2).Info("Batch worker dequeued operations", "ops", ops, "batches", len(batches))
|
||||
|
||||
retry:
|
||||
for i := 0; ; i++ {
|
||||
failed := map[types.NamespacedName]batchScaleOperation{}
|
||||
|
||||
for nsName, b := range batches {
|
||||
b := b
|
||||
if err := s.batchScale(context.Background(), b); err != nil {
|
||||
log.V(2).Info("Failed to scale due to error", "error", err)
|
||||
failed[nsName] = b
|
||||
} else {
|
||||
log.V(2).Info("Successfully ran batch scale", "hra", b.namespacedName)
|
||||
}
|
||||
}
|
||||
|
||||
if len(failed) == 0 {
|
||||
break retry
|
||||
}
|
||||
|
||||
batches = failed
|
||||
|
||||
delay := 16 * time.Second
|
||||
if i < len(expBackoff) {
|
||||
delay = expBackoff[i]
|
||||
}
|
||||
time.Sleep(delay)
|
||||
}
|
||||
}
|
||||
}()
|
||||
})
|
||||
|
||||
s.queue <- st
|
||||
}
|
||||
|
||||
func (s *batchScaler) batchScale(ctx context.Context, batch batchScaleOperation) error {
|
||||
var hra v1alpha1.HorizontalRunnerAutoscaler
|
||||
|
||||
if err := s.Client.Get(ctx, batch.namespacedName, &hra); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
copy := hra.DeepCopy()
|
||||
|
||||
copy.Spec.CapacityReservations = getValidCapacityReservations(copy)
|
||||
|
||||
var added, completed int
|
||||
|
||||
for _, scale := range batch.scaleOps {
|
||||
amount := 1
|
||||
|
||||
if scale.trigger.Amount != 0 {
|
||||
amount = scale.trigger.Amount
|
||||
}
|
||||
|
||||
scale.log.V(2).Info("Adding capacity reservation", "amount", amount)
|
||||
|
||||
if amount > 0 {
|
||||
now := time.Now()
|
||||
copy.Spec.CapacityReservations = append(copy.Spec.CapacityReservations, v1alpha1.CapacityReservation{
|
||||
EffectiveTime: metav1.Time{Time: now},
|
||||
ExpirationTime: metav1.Time{Time: now.Add(scale.trigger.Duration.Duration)},
|
||||
Replicas: amount,
|
||||
})
|
||||
|
||||
added += amount
|
||||
} else if amount < 0 {
|
||||
var reservations []v1alpha1.CapacityReservation
|
||||
|
||||
var found bool
|
||||
|
||||
for _, r := range copy.Spec.CapacityReservations {
|
||||
if !found && r.Replicas+amount == 0 {
|
||||
found = true
|
||||
} else {
|
||||
reservations = append(reservations, r)
|
||||
}
|
||||
}
|
||||
|
||||
copy.Spec.CapacityReservations = reservations
|
||||
|
||||
completed += amount
|
||||
}
|
||||
}
|
||||
|
||||
before := len(hra.Spec.CapacityReservations)
|
||||
expired := before - len(copy.Spec.CapacityReservations)
|
||||
after := len(copy.Spec.CapacityReservations)
|
||||
|
||||
s.Log.V(1).Info(
|
||||
fmt.Sprintf("Updating hra %s for capacityReservations update", hra.Name),
|
||||
"before", before,
|
||||
"expired", expired,
|
||||
"added", added,
|
||||
"completed", completed,
|
||||
"after", after,
|
||||
)
|
||||
|
||||
if err := s.Client.Update(ctx, copy); err != nil {
|
||||
return fmt.Errorf("updating horizontalrunnerautoscaler to add capacity reservation: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
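Taken together, newBatchScaler and Add coalesce many webhook-driven scale requests per HRA into a single HRA update every few seconds. A minimal sketch of how a caller could wire it up (names such as k8sClient, hra, and scaleUpTrigger are placeholders; this is not the literal ARC wiring, which lives in the webhook handler shown further below):

// Assumes ctx is a context.Context, k8sClient a client.Client, and log a logr.Logger.
scaler := newBatchScaler(ctx, k8sClient, log)

// One ScaleTarget per eligible webhook event; Add blocks until the target is enqueued,
// and the background worker flushes the queue roughly every 3 seconds,
// grouping the accumulated operations by HRA before updating each one once.
scaler.Add(&ScaleTarget{
	HorizontalRunnerAutoscaler: hra,            // the HRA matched for this event
	ScaleUpTrigger:             scaleUpTrigger, // carries the amount and reservation duration
	log:                        &log,
})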
@@ -23,14 +23,14 @@ import (
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
"sigs.k8s.io/controller-runtime/pkg/reconcile"
|
||||
|
||||
"github.com/go-logr/logr"
|
||||
gogithub "github.com/google/go-github/v39/github"
|
||||
gogithub "github.com/google/go-github/v45/github"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/client-go/tools/record"
|
||||
ctrl "sigs.k8s.io/controller-runtime"
|
||||
@@ -46,6 +46,8 @@ const (
|
||||
|
||||
keyPrefixEnterprise = "enterprises/"
|
||||
keyRunnerGroup = "/group/"
|
||||
|
||||
DefaultQueueLimit = 100
|
||||
)
|
||||
|
||||
// HorizontalRunnerAutoscalerGitHubWebhook autoscales a HorizontalRunnerAutoscaler and the RunnerDeployment on each
|
||||
@@ -68,6 +70,15 @@ type HorizontalRunnerAutoscalerGitHubWebhook struct {
|
||||
// Set to empty for letting it watch for all namespaces.
|
||||
Namespace string
|
||||
Name string
|
||||
|
||||
// QueueLimit is the maximum length of the bounded queue of scale targets and their associated operations
|
||||
// A scale target is enqueued for each eligible webhook event received, so that it is processed asynchronously.
|
||||
QueueLimit int
|
||||
|
||||
worker *worker
|
||||
workerInit sync.Once
|
||||
workerStart sync.Once
|
||||
batchCh chan *ScaleTarget
|
||||
}
|
||||
|
||||
func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) Reconcile(_ context.Context, request reconcile.Request) (reconcile.Result, error) {
|
||||
@@ -242,18 +253,23 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) Handle(w http.Respons
|
||||
enterpriseSlug,
|
||||
labels,
|
||||
)
|
||||
|
||||
if target != nil {
|
||||
if e.GetAction() == "queued" {
|
||||
target.Amount = 1
|
||||
} else if e.GetAction() == "completed" {
|
||||
// A negative amount is processed in the tryScale func as a scale-down request,
|
||||
// that erases the oldest CapacityReservation with the same amount.
|
||||
// If the first CapacityReservation was with Replicas=1, this negative scale target erases that,
|
||||
// so that the resulting desired replicas decreases by 1.
|
||||
target.Amount = -1
|
||||
}
|
||||
if target == nil {
|
||||
break
|
||||
}
|
||||
|
||||
if e.GetAction() == "queued" {
|
||||
target.Amount = 1
|
||||
break
|
||||
} else if e.GetAction() == "completed" && e.GetWorkflowJob().GetConclusion() != "skipped" {
|
||||
// A negative amount is processed in the tryScale func as a scale-down request,
|
||||
// that erases the oldest CapacityReservation with the same amount.
|
||||
// If the first CapacityReservation was with Replicas=1, this negative scale target erases that,
|
||||
// so that the resulting desired replicas decreases by 1.
|
||||
target.Amount = -1
|
||||
break
|
||||
}
|
||||
// If the conclusion is "skipped", we will ignore it and fall through to the default case.
|
||||
fallthrough
|
||||
default:
|
||||
ok = true
|
||||
|
||||
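The reworked switch above maps workflow_job actions onto capacity-reservation deltas: "queued" reserves one replica's worth of capacity, a non-skipped "completed" releases the oldest matching reservation, and everything else (including skipped conclusions) falls through to the no-op default. A compact, purely illustrative restatement of that decision (the helper is hypothetical, not part of the diff):

// deltaForWorkflowJob summarizes the scaling decision for a workflow_job event.
func deltaForWorkflowJob(action, conclusion string) int {
	switch {
	case action == "queued":
		return 1 // reserve capacity for the incoming job
	case action == "completed" && conclusion != "skipped":
		return -1 // release the oldest reservation of the same size
	default:
		return 0 // ignore other actions and skipped jobs
	}
}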
@@ -307,9 +323,19 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) Handle(w http.Respons
|
||||
return
|
||||
}
|
||||
|
||||
if err := autoscaler.tryScale(context.TODO(), target); err != nil {
|
||||
log.Error(err, "could not scale up")
|
||||
autoscaler.workerInit.Do(func() {
|
||||
batchScaler := newBatchScaler(context.Background(), autoscaler.Client, autoscaler.Log)
|
||||
|
||||
queueLimit := autoscaler.QueueLimit
|
||||
if queueLimit == 0 {
|
||||
queueLimit = DefaultQueueLimit
|
||||
}
|
||||
autoscaler.worker = newWorker(context.Background(), queueLimit, batchScaler.Add)
|
||||
})
|
||||
|
||||
target.log = &log
|
||||
if ok := autoscaler.worker.Add(target); !ok {
|
||||
log.Error(err, "Could not scale up due to queue full")
|
||||
return
|
||||
}
|
||||
|
||||
@@ -351,9 +377,7 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) findHRAsByKey(ctx con
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, d := range hraList.Items {
|
||||
hras = append(hras, d)
|
||||
}
|
||||
hras = append(hras, hraList.Items...)
|
||||
}
|
||||
|
||||
return hras, nil
|
||||
@@ -380,6 +404,8 @@ func matchTriggerConditionAgainstEvent(types []string, eventAction *string) bool
|
||||
type ScaleTarget struct {
|
||||
v1alpha1.HorizontalRunnerAutoscaler
|
||||
v1alpha1.ScaleUpTrigger
|
||||
|
||||
log *logr.Logger
|
||||
}
|
||||
|
||||
func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) searchScaleTargets(hras []v1alpha1.HorizontalRunnerAutoscaler, f func(v1alpha1.ScaleUpTrigger) bool) []ScaleTarget {
|
||||
@@ -498,6 +524,7 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) getScaleUpTargetWithF
|
||||
if autoscaler.GitHubClient != nil {
|
||||
simu := &simulator.Simulator{
|
||||
Client: autoscaler.GitHubClient,
|
||||
Log: log,
|
||||
}
|
||||
// Get available organization runner groups and enterprise runner groups for a repository
|
||||
// These are the sum of runner groups with repository access = All repositories and runner groups
|
||||
@@ -614,11 +641,7 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) getManagedRunnerGroup
|
||||
return nil, fmt.Errorf("unsupported scale target kind: %v", kind)
|
||||
}
|
||||
|
||||
if g == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
if e == "" && o == "" {
|
||||
if g != "" && e == "" && o == "" {
|
||||
autoscaler.Log.V(1).Info(
|
||||
"invalid runner group config in scale target: spec.group must be set along with either spec.enterprise or spec.organization",
|
||||
"scaleTargetKind", kind,
|
||||
@@ -631,6 +654,16 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) getManagedRunnerGroup
|
||||
}
|
||||
|
||||
if e != enterprise && o != org {
|
||||
autoscaler.Log.V(1).Info(
|
||||
"Skipped scale target irrelevant to event",
|
||||
"eventOrganization", org,
|
||||
"eventEnterprise", enterprise,
|
||||
"scaleTargetKind", kind,
|
||||
"scaleTargetGroup", g,
|
||||
"scaleTargetEnterprise", e,
|
||||
"scaleTargetOrganization", o,
|
||||
)
|
||||
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -659,16 +692,29 @@ HRA:
|
||||
|
||||
if len(hra.Spec.ScaleUpTriggers) > 1 {
|
||||
autoscaler.Log.V(1).Info("Skipping this HRA as it has too many ScaleUpTriggers to be used in workflow_job based scaling", "hra", hra.Name)
|
||||
continue
|
||||
}
|
||||
|
||||
if len(hra.Spec.ScaleUpTriggers) == 0 {
|
||||
autoscaler.Log.V(1).Info("Skipping this HRA as it has no ScaleUpTriggers configured", "hra", hra.Name)
|
||||
continue
|
||||
}
|
||||
|
||||
scaleUpTrigger := hra.Spec.ScaleUpTriggers[0]
|
||||
|
||||
if scaleUpTrigger.GitHubEvent == nil {
|
||||
autoscaler.Log.V(1).Info("Skipping this HRA as it has no `githubEvent` scale trigger configured", "hra", hra.Name)
|
||||
|
||||
continue
|
||||
}
|
||||
|
||||
var duration metav1.Duration
|
||||
if scaleUpTrigger.GitHubEvent.WorkflowJob == nil {
|
||||
autoscaler.Log.V(1).Info("Skipping this HRA as it has no `githubEvent.workflowJob` scale trigger configured", "hra", hra.Name)
|
||||
|
||||
if len(hra.Spec.ScaleUpTriggers) > 0 {
|
||||
duration = hra.Spec.ScaleUpTriggers[0].Duration
|
||||
continue
|
||||
}
|
||||
|
||||
duration := scaleUpTrigger.Duration
|
||||
if duration.Duration <= 0 {
|
||||
// Try to release the reserved capacity after at least 10 minutes by default,
|
||||
// so that we don't end up with the reserved capacity remaining forever in case GitHub somehow stops sending us "completed" workflow_job events.
|
||||
@@ -748,55 +794,6 @@ HRA:
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) tryScale(ctx context.Context, target *ScaleTarget) error {
|
||||
if target == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
copy := target.HorizontalRunnerAutoscaler.DeepCopy()
|
||||
|
||||
amount := 1
|
||||
|
||||
if target.ScaleUpTrigger.Amount != 0 {
|
||||
amount = target.ScaleUpTrigger.Amount
|
||||
}
|
||||
|
||||
capacityReservations := getValidCapacityReservations(copy)
|
||||
|
||||
if amount > 0 {
|
||||
copy.Spec.CapacityReservations = append(capacityReservations, v1alpha1.CapacityReservation{
|
||||
ExpirationTime: metav1.Time{Time: time.Now().Add(target.ScaleUpTrigger.Duration.Duration)},
|
||||
Replicas: amount,
|
||||
})
|
||||
} else if amount < 0 {
|
||||
var reservations []v1alpha1.CapacityReservation
|
||||
|
||||
var found bool
|
||||
|
||||
for _, r := range capacityReservations {
|
||||
if !found && r.Replicas+amount == 0 {
|
||||
found = true
|
||||
} else {
|
||||
reservations = append(reservations, r)
|
||||
}
|
||||
}
|
||||
|
||||
copy.Spec.CapacityReservations = reservations
|
||||
}
|
||||
|
||||
autoscaler.Log.Info(
|
||||
"Patching hra for capacityReservations update",
|
||||
"before", target.HorizontalRunnerAutoscaler.Spec.CapacityReservations,
|
||||
"after", copy.Spec.CapacityReservations,
|
||||
)
|
||||
|
||||
if err := autoscaler.Client.Patch(ctx, copy, client.MergeFrom(&target.HorizontalRunnerAutoscaler)); err != nil {
|
||||
return fmt.Errorf("patching horizontalrunnerautoscaler to add capacity reservation: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func getValidCapacityReservations(autoscaler *v1alpha1.HorizontalRunnerAutoscaler) []v1alpha1.CapacityReservation {
|
||||
var capacityReservations []v1alpha1.CapacityReservation
|
||||
|
||||
@@ -823,6 +820,7 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) SetupWithManager(mgr
|
||||
hra := rawObj.(*v1alpha1.HorizontalRunnerAutoscaler)
|
||||
|
||||
if hra.Spec.ScaleTargetRef.Name == "" {
|
||||
autoscaler.Log.V(1).Info(fmt.Sprintf("scale target ref name not set for hra %s", hra.Name))
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ package controllers
|
||||
import (
|
||||
"github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1"
|
||||
"github.com/actions-runner-controller/actions-runner-controller/pkg/actionsglob"
|
||||
"github.com/google/go-github/v39/github"
|
||||
"github.com/google/go-github/v45/github"
|
||||
)
|
||||
|
||||
func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) MatchCheckRunEvent(event *github.CheckRunEvent) func(scaleUpTrigger v1alpha1.ScaleUpTrigger) bool {
|
||||
|
||||
@@ -2,7 +2,7 @@ package controllers
|
||||
|
||||
import (
|
||||
"github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1"
|
||||
"github.com/google/go-github/v39/github"
|
||||
"github.com/google/go-github/v45/github"
|
||||
)
|
||||
|
||||
func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) MatchPullRequestEvent(event *github.PullRequestEvent) func(scaleUpTrigger v1alpha1.ScaleUpTrigger) bool {
|
||||
|
||||
@@ -2,7 +2,7 @@ package controllers
|
||||
|
||||
import (
|
||||
"github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1"
|
||||
"github.com/google/go-github/v39/github"
|
||||
"github.com/google/go-github/v45/github"
|
||||
)
|
||||
|
||||
func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) MatchPushEvent(event *github.PushEvent) func(scaleUpTrigger v1alpha1.ScaleUpTrigger) bool {
|
||||
@@ -15,10 +15,6 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) MatchPushEvent(event
|
||||
|
||||
push := g.Push
|
||||
|
||||
if push == nil {
|
||||
return false
|
||||
}
|
||||
|
||||
return true
|
||||
return push != nil
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@ import (
|
||||
|
||||
actionsv1alpha1 "github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1"
|
||||
"github.com/go-logr/logr"
|
||||
"github.com/google/go-github/v39/github"
|
||||
"github.com/google/go-github/v45/github"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
|
||||
@@ -138,6 +138,13 @@ func TestWebhookWorkflowJob(t *testing.T) {
|
||||
ScaleTargetRef: actionsv1alpha1.ScaleTargetRef{
|
||||
Name: "test-name",
|
||||
},
|
||||
ScaleUpTriggers: []actionsv1alpha1.ScaleUpTrigger{
|
||||
{
|
||||
GitHubEvent: &actionsv1alpha1.GitHubEventScaleUpTriggerSpec{
|
||||
WorkflowJob: &actionsv1alpha1.WorkflowJobSpec{},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
@@ -177,6 +184,13 @@ func TestWebhookWorkflowJob(t *testing.T) {
|
||||
ScaleTargetRef: actionsv1alpha1.ScaleTargetRef{
|
||||
Name: "test-name",
|
||||
},
|
||||
ScaleUpTriggers: []actionsv1alpha1.ScaleUpTrigger{
|
||||
{
|
||||
GitHubEvent: &actionsv1alpha1.GitHubEventScaleUpTriggerSpec{
|
||||
WorkflowJob: &actionsv1alpha1.WorkflowJobSpec{},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
@@ -217,6 +231,13 @@ func TestWebhookWorkflowJob(t *testing.T) {
|
||||
ScaleTargetRef: actionsv1alpha1.ScaleTargetRef{
|
||||
Name: "test-name",
|
||||
},
|
||||
ScaleUpTriggers: []actionsv1alpha1.ScaleUpTrigger{
|
||||
{
|
||||
GitHubEvent: &actionsv1alpha1.GitHubEventScaleUpTriggerSpec{
|
||||
WorkflowJob: &actionsv1alpha1.WorkflowJobSpec{},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
@@ -277,6 +298,13 @@ func TestWebhookWorkflowJobWithSelfHostedLabel(t *testing.T) {
|
||||
ScaleTargetRef: actionsv1alpha1.ScaleTargetRef{
|
||||
Name: "test-name",
|
||||
},
|
||||
ScaleUpTriggers: []actionsv1alpha1.ScaleUpTrigger{
|
||||
{
|
||||
GitHubEvent: &actionsv1alpha1.GitHubEventScaleUpTriggerSpec{
|
||||
WorkflowJob: &actionsv1alpha1.WorkflowJobSpec{},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
@@ -316,6 +344,13 @@ func TestWebhookWorkflowJobWithSelfHostedLabel(t *testing.T) {
|
||||
ScaleTargetRef: actionsv1alpha1.ScaleTargetRef{
|
||||
Name: "test-name",
|
||||
},
|
||||
ScaleUpTriggers: []actionsv1alpha1.ScaleUpTrigger{
|
||||
{
|
||||
GitHubEvent: &actionsv1alpha1.GitHubEventScaleUpTriggerSpec{
|
||||
WorkflowJob: &actionsv1alpha1.WorkflowJobSpec{},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
@@ -356,6 +391,13 @@ func TestWebhookWorkflowJobWithSelfHostedLabel(t *testing.T) {
|
||||
ScaleTargetRef: actionsv1alpha1.ScaleTargetRef{
|
||||
Name: "test-name",
|
||||
},
|
||||
ScaleUpTriggers: []actionsv1alpha1.ScaleUpTrigger{
|
||||
{
|
||||
GitHubEvent: &actionsv1alpha1.GitHubEventScaleUpTriggerSpec{
|
||||
WorkflowJob: &actionsv1alpha1.WorkflowJobSpec{},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
55
controllers/horizontal_runner_autoscaler_webhook_worker.go
Normal file
@@ -0,0 +1,55 @@
|
||||
package controllers
|
||||
|
||||
import (
|
||||
"context"
|
||||
)
|
||||
|
||||
// worker holds a non-blocking bounded queue of scale targets; it dequeues scale targets and executes the scale operations one by one.
|
||||
type worker struct {
|
||||
scaleTargetQueue chan *ScaleTarget
|
||||
work func(*ScaleTarget)
|
||||
done chan struct{}
|
||||
}
|
||||
|
||||
func newWorker(ctx context.Context, queueLimit int, work func(*ScaleTarget)) *worker {
|
||||
w := &worker{
|
||||
scaleTargetQueue: make(chan *ScaleTarget, queueLimit),
|
||||
work: work,
|
||||
done: make(chan struct{}),
|
||||
}
|
||||
|
||||
go func() {
|
||||
defer close(w.done)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case t := <-w.scaleTargetQueue:
|
||||
work(t)
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
return w
|
||||
}
|
||||
|
||||
// Add adds the scale target to the bounded queue, returning true on a successful enqueue and false otherwise.
|
||||
// When false is returned, the queue is already full, so the enqueue operation must be retried later.
|
||||
// If the enqueue was triggered by an external source and there's no intermediate queue that we can use,
|
||||
// you must instruct the source to resend the original request later.
|
||||
// In case you're building a webhook server around this worker, this means that you must return an HTTP error to the webhook sender,
|
||||
// so that (hopefully) the sender can resend the webhook event later, or at least the human operator can notice or be notified about the
|
||||
// webhook delivery failure so that a manual retry can be done later.
|
||||
func (w *worker) Add(st *ScaleTarget) bool {
|
||||
select {
|
||||
case w.scaleTargetQueue <- st:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func (w *worker) Done() chan struct{} {
|
||||
return w.done
|
||||
}
|
||||
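As the comment on Add notes, a full queue is the caller's signal to apply backpressure. The webhook handler change earlier in this diff only logs the failure and returns; one way to surface it to the sender so GitHub can redeliver the event later would be a sketch like the following (an assumption for illustration, not necessarily what ARC does; w here is the handler's http.ResponseWriter and the 503 status is an arbitrary choice):

// Sketch: rejecting a webhook delivery when the bounded queue is full.
if ok := worker.Add(target); !ok {
	http.Error(w, "scale target queue is full, please retry later", http.StatusServiceUnavailable)
	return
}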
@@ -0,0 +1,36 @@
|
||||
package controllers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestWorker_Add(t *testing.T) {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
w := newWorker(ctx, 2, func(st *ScaleTarget) {})
|
||||
require.True(t, w.Add(&ScaleTarget{}))
|
||||
require.True(t, w.Add(&ScaleTarget{}))
|
||||
require.False(t, w.Add(&ScaleTarget{}))
|
||||
}
|
||||
|
||||
func TestWorker_Work(t *testing.T) {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
var count int
|
||||
|
||||
w := newWorker(ctx, 1, func(st *ScaleTarget) {
|
||||
count++
|
||||
cancel()
|
||||
})
|
||||
require.True(t, w.Add(&ScaleTarget{}))
|
||||
require.False(t, w.Add(&ScaleTarget{}))
|
||||
|
||||
<-w.Done()
|
||||
|
||||
require.Equal(t, count, 1)
|
||||
}
|
||||
@@ -47,13 +47,13 @@ const (
|
||||
// HorizontalRunnerAutoscalerReconciler reconciles a HorizontalRunnerAutoscaler object
|
||||
type HorizontalRunnerAutoscalerReconciler struct {
|
||||
client.Client
|
||||
GitHubClient *github.Client
|
||||
Log logr.Logger
|
||||
Recorder record.EventRecorder
|
||||
Scheme *runtime.Scheme
|
||||
|
||||
CacheDuration time.Duration
|
||||
Name string
|
||||
GitHubClient *github.Client
|
||||
Log logr.Logger
|
||||
Recorder record.EventRecorder
|
||||
Scheme *runtime.Scheme
|
||||
CacheDuration time.Duration
|
||||
DefaultScaleDownDelay time.Duration
|
||||
Name string
|
||||
}
|
||||
|
||||
const defaultReplicas = 1
|
||||
@@ -99,11 +99,33 @@ func (r *HorizontalRunnerAutoscalerReconciler) Reconcile(ctx context.Context, re
|
||||
return r.reconcile(ctx, req, log, hra, st, func(newDesiredReplicas int) error {
|
||||
currentDesiredReplicas := getIntOrDefault(rd.Spec.Replicas, defaultReplicas)
|
||||
|
||||
ephemeral := rd.Spec.Template.Spec.Ephemeral == nil || *rd.Spec.Template.Spec.Ephemeral
|
||||
|
||||
var effectiveTime *time.Time
|
||||
|
||||
for _, r := range hra.Spec.CapacityReservations {
|
||||
t := r.EffectiveTime
|
||||
if effectiveTime == nil || effectiveTime.Before(t.Time) {
|
||||
effectiveTime = &t.Time
|
||||
}
|
||||
}
|
||||
|
||||
// Please add more conditions under which we can in-place update the newest runnerreplicaset without disruption
|
||||
if currentDesiredReplicas != newDesiredReplicas {
|
||||
copy := rd.DeepCopy()
|
||||
copy.Spec.Replicas = &newDesiredReplicas
|
||||
|
||||
if ephemeral && effectiveTime != nil {
|
||||
copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime}
|
||||
}
|
||||
|
||||
if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rd)); err != nil {
|
||||
return fmt.Errorf("patching runnerdeployment to have %d replicas: %w", newDesiredReplicas, err)
|
||||
}
|
||||
} else if ephemeral && effectiveTime != nil {
|
||||
copy := rd.DeepCopy()
|
||||
copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime}
|
||||
|
||||
if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rd)); err != nil {
|
||||
return fmt.Errorf("patching runnerdeployment to have %d replicas: %w", newDesiredReplicas, err)
|
||||
}
|
||||
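The loop added above simply keeps the newest EffectiveTime across all capacity reservations so that ephemeral runner deployments can be resynced from that point in time. The same logic, factored into a standalone helper purely for illustration (the function name is made up, not part of the diff):

// latestEffectiveTime returns the most recent EffectiveTime among the
// reservations, or nil when there are none.
func latestEffectiveTime(reservations []v1alpha1.CapacityReservation) *time.Time {
	var latest *time.Time
	for _, r := range reservations {
		t := r.EffectiveTime
		if latest == nil || latest.Before(t.Time) {
			latest = &t.Time
		}
	}
	return latest
}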
@@ -137,6 +159,7 @@ func (r *HorizontalRunnerAutoscalerReconciler) Reconcile(ctx context.Context, re
|
||||
org: rs.Spec.Organization,
|
||||
repo: rs.Spec.Repository,
|
||||
replicas: replicas,
|
||||
labels: rs.Spec.RunnerConfig.Labels,
|
||||
getRunnerMap: func() (map[string]struct{}, error) {
|
||||
// return the list of runners in namespace. Horizontal Runner Autoscaler should only be responsible for scaling resources in its own ns.
|
||||
var runnerPodList corev1.PodList
|
||||
@@ -180,15 +203,38 @@ func (r *HorizontalRunnerAutoscalerReconciler) Reconcile(ctx context.Context, re
|
||||
}
|
||||
currentDesiredReplicas := getIntOrDefault(replicas, defaultReplicas)
|
||||
|
||||
ephemeral := rs.Spec.Ephemeral == nil || *rs.Spec.Ephemeral
|
||||
|
||||
var effectiveTime *time.Time
|
||||
|
||||
for _, r := range hra.Spec.CapacityReservations {
|
||||
t := r.EffectiveTime
|
||||
if effectiveTime == nil || effectiveTime.Before(t.Time) {
|
||||
effectiveTime = &t.Time
|
||||
}
|
||||
}
|
||||
|
||||
if currentDesiredReplicas != newDesiredReplicas {
|
||||
copy := rs.DeepCopy()
|
||||
v := int32(newDesiredReplicas)
|
||||
copy.Spec.Replicas = &v
|
||||
|
||||
if ephemeral && effectiveTime != nil {
|
||||
copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime}
|
||||
}
|
||||
|
||||
if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rs)); err != nil {
|
||||
return fmt.Errorf("patching runnerset to have %d replicas: %w", newDesiredReplicas, err)
|
||||
}
|
||||
} else if ephemeral && effectiveTime != nil {
|
||||
copy := rs.DeepCopy()
|
||||
copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime}
|
||||
|
||||
if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rs)); err != nil {
|
||||
return fmt.Errorf("patching runnerset to have %d replicas: %w", newDesiredReplicas, err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
})
|
||||
}
|
||||
@@ -206,6 +252,7 @@ func (r *HorizontalRunnerAutoscalerReconciler) scaleTargetFromRD(ctx context.Con
|
||||
org: rd.Spec.Template.Spec.Organization,
|
||||
repo: rd.Spec.Template.Spec.Repository,
|
||||
replicas: rd.Spec.Replicas,
|
||||
labels: rd.Spec.Template.Spec.RunnerConfig.Labels,
|
||||
getRunnerMap: func() (map[string]struct{}, error) {
|
||||
// return the list of runners in namespace. Horizontal Runner Autoscaler should only be responsible for scaling resources in its own ns.
|
||||
var runnerList v1alpha1.RunnerList
|
||||
@@ -248,6 +295,7 @@ type scaleTarget struct {
|
||||
st, kind string
|
||||
enterprise, repo, org string
|
||||
replicas *int
|
||||
labels []string
|
||||
|
||||
getRunnerMap func() (map[string]struct{}, error)
|
||||
}
|
||||
@@ -262,7 +310,7 @@ func (r *HorizontalRunnerAutoscalerReconciler) reconcile(ctx context.Context, re
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
|
||||
newDesiredReplicas, computedReplicas, computedReplicasFromCache, err := r.computeReplicasWithCache(log, now, st, hra, minReplicas)
|
||||
newDesiredReplicas, err := r.computeReplicasWithCache(log, now, st, hra, minReplicas)
|
||||
if err != nil {
|
||||
r.Recorder.Event(&hra, corev1.EventTypeNormal, "RunnerAutoscalingFailure", err.Error())
|
||||
|
||||
@@ -287,24 +335,6 @@ func (r *HorizontalRunnerAutoscalerReconciler) reconcile(ctx context.Context, re
|
||||
updated.Status.DesiredReplicas = &newDesiredReplicas
|
||||
}
|
||||
|
||||
if computedReplicasFromCache == nil {
|
||||
cacheEntries := getValidCacheEntries(updated, now)
|
||||
|
||||
var cacheDuration time.Duration
|
||||
|
||||
if r.CacheDuration > 0 {
|
||||
cacheDuration = r.CacheDuration
|
||||
} else {
|
||||
cacheDuration = 10 * time.Minute
|
||||
}
|
||||
|
||||
updated.Status.CacheEntries = append(cacheEntries, v1alpha1.CacheEntry{
|
||||
Key: v1alpha1.CacheEntryKeyDesiredReplicas,
|
||||
Value: computedReplicas,
|
||||
ExpirationTime: metav1.Time{Time: time.Now().Add(cacheDuration)},
|
||||
})
|
||||
}
|
||||
|
||||
var overridesSummary string
|
||||
|
||||
if (active != nil && upcoming == nil) || (active != nil && upcoming != nil && active.Period.EndTime.Before(upcoming.Period.StartTime)) {
|
||||
@@ -339,18 +369,6 @@ func (r *HorizontalRunnerAutoscalerReconciler) reconcile(ctx context.Context, re
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
func getValidCacheEntries(hra *v1alpha1.HorizontalRunnerAutoscaler, now time.Time) []v1alpha1.CacheEntry {
|
||||
var cacheEntries []v1alpha1.CacheEntry
|
||||
|
||||
for _, ent := range hra.Status.CacheEntries {
|
||||
if ent.ExpirationTime.After(now) {
|
||||
cacheEntries = append(cacheEntries, ent)
|
||||
}
|
||||
}
|
||||
|
||||
return cacheEntries
|
||||
}
|
||||
|
||||
func (r *HorizontalRunnerAutoscalerReconciler) SetupWithManager(mgr ctrl.Manager) error {
|
||||
name := "horizontalrunnerautoscaler-controller"
|
||||
if r.Name != "" {
|
||||
@@ -443,32 +461,18 @@ func (r *HorizontalRunnerAutoscalerReconciler) getMinReplicas(log logr.Logger, n
|
||||
return minReplicas, active, upcoming, nil
|
||||
}
|
||||
|
||||
func (r *HorizontalRunnerAutoscalerReconciler) computeReplicasWithCache(log logr.Logger, now time.Time, st scaleTarget, hra v1alpha1.HorizontalRunnerAutoscaler, minReplicas int) (int, int, *int, error) {
|
||||
func (r *HorizontalRunnerAutoscalerReconciler) computeReplicasWithCache(log logr.Logger, now time.Time, st scaleTarget, hra v1alpha1.HorizontalRunnerAutoscaler, minReplicas int) (int, error) {
|
||||
var suggestedReplicas int
|
||||
|
||||
suggestedReplicasFromCache := r.fetchSuggestedReplicasFromCache(hra)
|
||||
v, err := r.suggestDesiredReplicas(st, hra)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
var cached *int
|
||||
|
||||
if suggestedReplicasFromCache != nil {
|
||||
cached = suggestedReplicasFromCache
|
||||
|
||||
if cached == nil {
|
||||
suggestedReplicas = minReplicas
|
||||
} else {
|
||||
suggestedReplicas = *cached
|
||||
}
|
||||
if v == nil {
|
||||
suggestedReplicas = minReplicas
|
||||
} else {
|
||||
v, err := r.suggestDesiredReplicas(st, hra)
|
||||
if err != nil {
|
||||
return 0, 0, nil, err
|
||||
}
|
||||
|
||||
if v == nil {
|
||||
suggestedReplicas = minReplicas
|
||||
} else {
|
||||
suggestedReplicas = *v
|
||||
}
|
||||
suggestedReplicas = *v
|
||||
}
|
||||
|
||||
var reserved int
|
||||
@@ -496,7 +500,7 @@ func (r *HorizontalRunnerAutoscalerReconciler) computeReplicasWithCache(log logr
|
||||
if hra.Spec.ScaleDownDelaySecondsAfterScaleUp != nil {
|
||||
scaleDownDelay = time.Duration(*hra.Spec.ScaleDownDelaySecondsAfterScaleUp) * time.Second
|
||||
} else {
|
||||
scaleDownDelay = DefaultScaleDownDelay
|
||||
scaleDownDelay = r.DefaultScaleDownDelay
|
||||
}
|
||||
|
||||
var scaleDownDelayUntil *time.Time
|
||||
@@ -527,8 +531,8 @@ func (r *HorizontalRunnerAutoscalerReconciler) computeReplicasWithCache(log logr
|
||||
"min", minReplicas,
|
||||
}
|
||||
|
||||
if cached != nil {
|
||||
kvs = append(kvs, "cached", *cached)
|
||||
if maxReplicas := hra.Spec.MaxReplicas; maxReplicas != nil {
|
||||
kvs = append(kvs, "max", *maxReplicas)
|
||||
}
|
||||
|
||||
if scaleDownDelayUntil != nil {
|
||||
@@ -536,13 +540,9 @@ func (r *HorizontalRunnerAutoscalerReconciler) computeReplicasWithCache(log logr
|
||||
kvs = append(kvs, "scale_down_delay_until", scaleDownDelayUntil)
|
||||
}
|
||||
|
||||
if maxReplicas := hra.Spec.MaxReplicas; maxReplicas != nil {
|
||||
kvs = append(kvs, "max", *maxReplicas)
|
||||
}
|
||||
|
||||
log.V(1).Info(fmt.Sprintf("Calculated desired replicas of %d", newDesiredReplicas),
|
||||
kvs...,
|
||||
)
|
||||
|
||||
return newDesiredReplicas, suggestedReplicas, suggestedReplicasFromCache, nil
|
||||
return newDesiredReplicas, nil
|
||||
}
|
||||
|
||||
@@ -1,50 +0,0 @@
|
||||
package controllers
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
actionsv1alpha1 "github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1"
|
||||
"github.com/google/go-cmp/cmp"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
)
|
||||
|
||||
func TestGetValidCacheEntries(t *testing.T) {
|
||||
now := time.Now()
|
||||
|
||||
hra := &actionsv1alpha1.HorizontalRunnerAutoscaler{
|
||||
Status: actionsv1alpha1.HorizontalRunnerAutoscalerStatus{
|
||||
CacheEntries: []actionsv1alpha1.CacheEntry{
|
||||
{
|
||||
Key: "foo",
|
||||
Value: 1,
|
||||
ExpirationTime: metav1.Time{Time: now.Add(-time.Second)},
|
||||
},
|
||||
{
|
||||
Key: "foo",
|
||||
Value: 2,
|
||||
ExpirationTime: metav1.Time{Time: now},
|
||||
},
|
||||
{
|
||||
Key: "foo",
|
||||
Value: 3,
|
||||
ExpirationTime: metav1.Time{Time: now.Add(time.Second)},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
revs := getValidCacheEntries(hra, now)
|
||||
|
||||
counts := map[string]int{}
|
||||
|
||||
for _, r := range revs {
|
||||
counts[r.Key] += r.Value
|
||||
}
|
||||
|
||||
want := map[string]int{"foo": 3}
|
||||
|
||||
if d := cmp.Diff(want, counts); d != "" {
|
||||
t.Errorf("%s", d)
|
||||
}
|
||||
}
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
"time"
|
||||
|
||||
github2 "github.com/actions-runner-controller/actions-runner-controller/github"
|
||||
"github.com/google/go-github/v39/github"
|
||||
"github.com/google/go-github/v45/github"
|
||||
|
||||
"github.com/actions-runner-controller/actions-runner-controller/github/fake"
|
||||
|
||||
@@ -108,8 +108,9 @@ func SetupIntegrationTest(ctx2 context.Context) *testEnvironment {
|
||||
RunnerImage: "example/runner:test",
|
||||
DockerImage: "example/docker:test",
|
||||
Name: controllerName("runner"),
|
||||
RegistrationRecheckInterval: time.Millisecond,
|
||||
RegistrationRecheckJitter: time.Millisecond,
|
||||
RegistrationRecheckInterval: time.Millisecond * 100,
|
||||
RegistrationRecheckJitter: time.Millisecond * 10,
|
||||
UnregistrationRetryDelay: 1 * time.Second,
|
||||
}
|
||||
err = runnerController.SetupWithManager(mgr)
|
||||
Expect(err).NotTo(HaveOccurred(), "failed to setup runner controller")
|
||||
@@ -268,7 +269,6 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {
|
||||
|
||||
ExpectRunnerSetsCountEventuallyEquals(ctx, ns.Name, 1)
|
||||
ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 2)
|
||||
ExpectHRAStatusCacheEntryLengthEventuallyEquals(ctx, ns.Name, name, 1)
|
||||
}
|
||||
|
||||
{
|
||||
@@ -371,7 +371,6 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {
|
||||
|
||||
ExpectRunnerSetsCountEventuallyEquals(ctx, ns.Name, 1)
|
||||
ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 3)
|
||||
ExpectHRAStatusCacheEntryLengthEventuallyEquals(ctx, ns.Name, name, 1)
|
||||
}
|
||||
|
||||
{
|
||||
@@ -538,6 +537,106 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {
|
||||
}
|
||||
})
|
||||
|
||||
It("should create and scale organization's repository runners on workflow_job event", func() {
|
||||
name := "example-runnerdeploy"
|
||||
|
||||
{
|
||||
rd := &actionsv1alpha1.RunnerDeployment{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: name,
|
||||
Namespace: ns.Name,
|
||||
},
|
||||
Spec: actionsv1alpha1.RunnerDeploymentSpec{
|
||||
Replicas: intPtr(1),
|
||||
Selector: &metav1.LabelSelector{
|
||||
MatchLabels: map[string]string{
|
||||
"foo": "bar",
|
||||
},
|
||||
},
|
||||
Template: actionsv1alpha1.RunnerTemplate{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
},
|
||||
},
|
||||
Spec: actionsv1alpha1.RunnerSpec{
|
||||
RunnerConfig: actionsv1alpha1.RunnerConfig{
|
||||
Repository: "test/valid",
|
||||
Image: "bar",
|
||||
Group: "baz",
|
||||
},
|
||||
RunnerPodSpec: actionsv1alpha1.RunnerPodSpec{
|
||||
Env: []corev1.EnvVar{
|
||||
{Name: "FOO", Value: "FOOVALUE"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
ExpectCreate(ctx, rd, "test RunnerDeployment")
|
||||
ExpectRunnerSetsCountEventuallyEquals(ctx, ns.Name, 1)
|
||||
ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 1)
|
||||
env.ExpectRegisteredNumberCountEventuallyEquals(1, "count of fake list runners")
|
||||
}
|
||||
|
||||
// Scale-up to 1 replica via ScaleUpTriggers.GitHubEvent.WorkflowJob based scaling
|
||||
{
|
||||
hra := &actionsv1alpha1.HorizontalRunnerAutoscaler{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: name,
|
||||
Namespace: ns.Name,
|
||||
},
|
||||
Spec: actionsv1alpha1.HorizontalRunnerAutoscalerSpec{
|
||||
ScaleTargetRef: actionsv1alpha1.ScaleTargetRef{
|
||||
Name: name,
|
||||
},
|
||||
MinReplicas: intPtr(1),
|
||||
MaxReplicas: intPtr(5),
|
||||
ScaleDownDelaySecondsAfterScaleUp: intPtr(1),
|
||||
ScaleUpTriggers: []actionsv1alpha1.ScaleUpTrigger{
|
||||
{
|
||||
GitHubEvent: &actionsv1alpha1.GitHubEventScaleUpTriggerSpec{
|
||||
WorkflowJob: &actionsv1alpha1.WorkflowJobSpec{},
|
||||
},
|
||||
Amount: 1,
|
||||
Duration: metav1.Duration{Duration: time.Minute},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
ExpectCreate(ctx, hra, "test HorizontalRunnerAutoscaler")
|
||||
|
||||
ExpectRunnerSetsCountEventuallyEquals(ctx, ns.Name, 1)
|
||||
ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 1)
|
||||
env.ExpectRegisteredNumberCountEventuallyEquals(1, "count of fake list runners")
|
||||
}
|
||||
|
||||
// Scale-up to 2 replicas on first workflow_job.queued webhook event
|
||||
{
|
||||
env.SendWorkflowJobEvent("test", "valid", "queued", []string{"self-hosted"})
|
||||
ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 2, "runners after first webhook event")
|
||||
env.ExpectRegisteredNumberCountEventuallyEquals(2, "count of fake list runners")
|
||||
}
|
||||
|
||||
// Scale-up to 3 replicas on second workflow_job.queued webhook event
|
||||
{
|
||||
env.SendWorkflowJobEvent("test", "valid", "queued", []string{"self-hosted"})
|
||||
ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 3, "runners after second webhook event")
|
||||
env.ExpectRegisteredNumberCountEventuallyEquals(3, "count of fake list runners")
|
||||
}
|
||||
|
||||
// Do not scale-up on third workflow_job.queued webhook event
|
||||
// repo "example" doesn't match our Spec
|
||||
{
|
||||
env.SendWorkflowJobEvent("test", "example", "queued", []string{"self-hosted"})
|
||||
ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 3, "runners after third webhook event")
|
||||
env.ExpectRegisteredNumberCountEventuallyEquals(3, "count of fake list runners")
|
||||
}
|
||||
})
|
||||
|
||||
It("should create and scale organization's repository runners only on check_run event", func() {
|
||||
name := "example-runnerdeploy"
|
||||
|
||||
@@ -582,9 +681,7 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {
|
||||
env.ExpectRegisteredNumberCountEventuallyEquals(1, "count of fake list runners")
|
||||
}
|
||||
|
||||
// Scale-up to 3 replicas by the default TotalNumberOfQueuedAndInProgressWorkflowRuns-based scaling
|
||||
// See workflowRunsFor3Replicas_queued and workflowRunsFor3Replicas_in_progress for GitHub List-Runners API responses
|
||||
// used while testing.
|
||||
// Scale-up to 1 replica via ScaleUpTriggers.GitHubEvent.CheckRun based scaling
|
||||
{
|
||||
hra := &actionsv1alpha1.HorizontalRunnerAutoscaler{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
@@ -1131,9 +1228,11 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {
|
||||
ScaleDownDelaySecondsAfterScaleUp: intPtr(1),
|
||||
ScaleUpTriggers: []actionsv1alpha1.ScaleUpTrigger{
|
||||
{
|
||||
GitHubEvent: &actionsv1alpha1.GitHubEventScaleUpTriggerSpec{},
|
||||
Amount: 1,
|
||||
Duration: metav1.Duration{Duration: time.Minute},
|
||||
GitHubEvent: &actionsv1alpha1.GitHubEventScaleUpTriggerSpec{
|
||||
WorkflowJob: &actionsv1alpha1.WorkflowJobSpec{},
|
||||
},
|
||||
Amount: 1,
|
||||
Duration: metav1.Duration{Duration: time.Minute},
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -1151,7 +1250,7 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {
|
||||
|
||||
// Scale-up to 2 replicas on first workflow_job webhook event
|
||||
{
|
||||
env.SendWorkflowJobEvent("test", "valid", "pending", "created", []string{"self-hosted"})
|
||||
env.SendWorkflowJobEvent("test", "valid", "queued", []string{"self-hosted"})
|
||||
ExpectRunnerSetsCountEventuallyEquals(ctx, ns.Name, 1, "runner sets after webhook")
|
||||
ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 2, "runners after first webhook event")
|
||||
env.ExpectRegisteredNumberCountEventuallyEquals(2, "count of fake list runners")
|
||||
@@ -1213,9 +1312,11 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {
|
||||
ScaleDownDelaySecondsAfterScaleUp: intPtr(1),
|
||||
ScaleUpTriggers: []actionsv1alpha1.ScaleUpTrigger{
|
||||
{
|
||||
GitHubEvent: &actionsv1alpha1.GitHubEventScaleUpTriggerSpec{},
|
||||
Amount: 1,
|
||||
Duration: metav1.Duration{Duration: time.Minute},
|
||||
GitHubEvent: &actionsv1alpha1.GitHubEventScaleUpTriggerSpec{
|
||||
WorkflowJob: &actionsv1alpha1.WorkflowJobSpec{},
|
||||
},
|
||||
Amount: 1,
|
||||
Duration: metav1.Duration{Duration: time.Minute},
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -1233,7 +1334,7 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {
|
||||
|
||||
// Scale-up to 2 replicas on first workflow_job webhook event
|
||||
{
|
||||
env.SendWorkflowJobEvent("test", "valid", "pending", "created", []string{"custom-label"})
|
||||
env.SendWorkflowJobEvent("test", "valid", "queued", []string{"custom-label"})
|
||||
ExpectRunnerSetsCountEventuallyEquals(ctx, ns.Name, 1, "runner sets after webhook")
|
||||
ExpectRunnerSetsManagedReplicasCountEventuallyEquals(ctx, ns.Name, 2, "runners after first webhook event")
|
||||
env.ExpectRegisteredNumberCountEventuallyEquals(2, "count of fake list runners")
|
||||
@@ -1243,21 +1344,6 @@ var _ = Context("INTEGRATION: Inside of a new namespace", func() {
|
||||
})
|
||||
})
|
||||
|
||||
func ExpectHRAStatusCacheEntryLengthEventuallyEquals(ctx context.Context, ns string, name string, value int, optionalDescriptions ...interface{}) {
|
||||
EventuallyWithOffset(
|
||||
1,
|
||||
func() int {
|
||||
var hra actionsv1alpha1.HorizontalRunnerAutoscaler
|
||||
|
||||
err := k8sClient.Get(ctx, types.NamespacedName{Namespace: ns, Name: name}, &hra)
|
||||
|
||||
ExpectWithOffset(1, err).NotTo(HaveOccurred(), "failed to get test HRA resource")
|
||||
|
||||
return len(hra.Status.CacheEntries)
|
||||
},
|
||||
time.Second*5, time.Millisecond*500).Should(Equal(value), optionalDescriptions...)
|
||||
}
|
||||
|
||||
func ExpectHRADesiredReplicasEquals(ctx context.Context, ns, name string, desired int, optionalDescriptions ...interface{}) {
|
||||
var rd actionsv1alpha1.HorizontalRunnerAutoscaler
|
||||
|
||||
@@ -1281,7 +1367,7 @@ func (env *testEnvironment) ExpectRegisteredNumberCountEventuallyEquals(want int
|
||||
|
||||
return len(rs)
|
||||
},
|
||||
time.Second*5, time.Millisecond*500).Should(Equal(want), optionalDescriptions...)
|
||||
time.Second*10, time.Millisecond*500).Should(Equal(want), optionalDescriptions...)
|
||||
}
|
||||
|
||||
func (env *testEnvironment) SendOrgPullRequestEvent(org, repo, branch, action string) {
|
||||
@@ -1329,6 +1415,30 @@ func (env *testEnvironment) SendOrgCheckRunEvent(org, repo, status, action strin
|
||||
ExpectWithOffset(1, resp.StatusCode).To(Equal(200))
|
||||
}
|
||||
|
||||
func (env *testEnvironment) SendWorkflowJobEvent(org, repo, statusAndAction string, labels []string) {
|
||||
resp, err := sendWebhook(env.webhookServer, "workflow_job", &github.WorkflowJobEvent{
|
||||
WorkflowJob: &github.WorkflowJob{
|
||||
Status: &statusAndAction,
|
||||
Labels: labels,
|
||||
},
|
||||
Org: &github.Organization{
|
||||
Login: github.String(org),
|
||||
},
|
||||
Repo: &github.Repository{
|
||||
Name: github.String(repo),
|
||||
Owner: &github.User{
|
||||
Login: github.String(org),
|
||||
Type: github.String("Organization"),
|
||||
},
|
||||
},
|
||||
Action: github.String(statusAndAction),
|
||||
})
|
||||
|
||||
ExpectWithOffset(1, err).NotTo(HaveOccurred(), "failed to send workflow_job event")
|
||||
|
||||
ExpectWithOffset(1, resp.StatusCode).To(Equal(200))
|
||||
}
|
||||
|
||||
func (env *testEnvironment) SendUserPullRequestEvent(owner, repo, branch, action string) {
|
||||
resp, err := sendWebhook(env.webhookServer, "pull_request", &github.PullRequestEvent{
|
||||
PullRequest: &github.PullRequest{
|
||||
@@ -1371,28 +1481,6 @@ func (env *testEnvironment) SendUserCheckRunEvent(owner, repo, status, action st
|
||||
ExpectWithOffset(1, resp.StatusCode).To(Equal(200))
|
||||
}
|
||||
|
||||
func (env *testEnvironment) SendWorkflowJobEvent(owner, repo, status, action string, labels []string) {
|
||||
resp, err := sendWebhook(env.webhookServer, "workflow_job", &github.WorkflowJobEvent{
|
||||
Org: &github.Organization{
|
||||
Name: github.String(owner),
|
||||
},
|
||||
WorkflowJob: &github.WorkflowJob{
|
||||
Labels: labels,
|
||||
},
|
||||
Action: github.String("queued"),
|
||||
Repo: &github.Repository{
|
||||
Name: github.String(repo),
|
||||
Owner: &github.User{
|
||||
Login: github.String(owner),
|
||||
Type: github.String("Organization"),
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
ExpectWithOffset(1, err).NotTo(HaveOccurred(), "failed to send check_run event")
|
||||
|
||||
ExpectWithOffset(1, resp.StatusCode).To(Equal(200))
|
||||
}
|
||||
func (env *testEnvironment) SyncRunnerRegistrations() {
|
||||
var runnerList actionsv1alpha1.RunnerList
|
||||
|
||||
|
||||
controllers/new_runner_pod_test.go (new file, 1122 lines; diff suppressed because it is too large)

controllers/persistent_volume_claim_controller.go (new file, 74 lines)
@@ -0,0 +1,74 @@
/*
Copyright 2022 The actions-runner-controller authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
    "context"

    "github.com/go-logr/logr"

    "k8s.io/apimachinery/pkg/runtime"
    "k8s.io/client-go/tools/record"
    ctrl "sigs.k8s.io/controller-runtime"
    "sigs.k8s.io/controller-runtime/pkg/client"

    corev1 "k8s.io/api/core/v1"
)

// RunnerPersistentVolumeClaimReconciler reconciles a PersistentVolumeClaim object
type RunnerPersistentVolumeClaimReconciler struct {
    client.Client
    Log      logr.Logger
    Recorder record.EventRecorder
    Scheme   *runtime.Scheme
    Name     string
}

// +kubebuilder:rbac:groups=core,resources=persistentvolumeclaims,verbs=get;list;watch;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=persistentvolumes,verbs=get;list;watch;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch

func (r *RunnerPersistentVolumeClaimReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
    log := r.Log.WithValues("pvc", req.NamespacedName)

    var pvc corev1.PersistentVolumeClaim
    if err := r.Get(ctx, req.NamespacedName, &pvc); err != nil {
        return ctrl.Result{}, client.IgnoreNotFound(err)
    }

    res, err := syncPVC(ctx, r.Client, log, req.Namespace, &pvc)

    if res == nil {
        res = &ctrl.Result{}
    }

    return *res, err
}

func (r *RunnerPersistentVolumeClaimReconciler) SetupWithManager(mgr ctrl.Manager) error {
    name := "runnerpersistentvolumeclaim-controller"
    if r.Name != "" {
        name = r.Name
    }

    r.Recorder = mgr.GetEventRecorderFor(name)

    return ctrl.NewControllerManagedBy(mgr).
        For(&corev1.PersistentVolumeClaim{}).
        Named(name).
        Complete(r)
}
controllers/persistent_volume_controller.go (new file, 72 lines)
@@ -0,0 +1,72 @@
/*
Copyright 2022 The actions-runner-controller authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
    "context"

    "github.com/go-logr/logr"

    "k8s.io/apimachinery/pkg/runtime"
    "k8s.io/client-go/tools/record"
    ctrl "sigs.k8s.io/controller-runtime"
    "sigs.k8s.io/controller-runtime/pkg/client"

    corev1 "k8s.io/api/core/v1"
)

// RunnerPersistentVolumeReconciler reconciles a PersistentVolume object
type RunnerPersistentVolumeReconciler struct {
    client.Client
    Log      logr.Logger
    Recorder record.EventRecorder
    Scheme   *runtime.Scheme
    Name     string
}

// +kubebuilder:rbac:groups=core,resources=persistentvolumes,verbs=get;list;watch;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch

func (r *RunnerPersistentVolumeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
    log := r.Log.WithValues("pv", req.NamespacedName)

    var pv corev1.PersistentVolume
    if err := r.Get(ctx, req.NamespacedName, &pv); err != nil {
        return ctrl.Result{}, client.IgnoreNotFound(err)
    }

    res, err := syncPV(ctx, r.Client, log, req.Namespace, &pv)
    if res == nil {
        res = &ctrl.Result{}
    }

    return *res, err
}

func (r *RunnerPersistentVolumeReconciler) SetupWithManager(mgr ctrl.Manager) error {
    name := "runnerpersistentvolume-controller"
    if r.Name != "" {
        name = r.Name
    }

    r.Recorder = mgr.GetEventRecorderFor(name)

    return ctrl.NewControllerManagedBy(mgr).
        For(&corev1.PersistentVolume{}).
        Named(name).
        Complete(r)
}
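Both of these new reconcilers follow the standard controller-runtime pattern, so registering them with the shared manager presumably looks like the sketch below. The setupLog variable, the import alias for the controllers package, and the os.Exit error handling are assumptions for illustration and are not part of this diff.

// Sketch: wiring the new reconcilers into an existing controller-runtime manager (e.g. in main.go).
setupLog := ctrl.Log.WithName("setup")
if err := (&controllers.RunnerPersistentVolumeClaimReconciler{
    Client: mgr.GetClient(),
    Log:    ctrl.Log.WithName("controllers").WithName("RunnerPersistentVolumeClaim"),
    Scheme: mgr.GetScheme(),
}).SetupWithManager(mgr); err != nil {
    setupLog.Error(err, "unable to create controller", "controller", "RunnerPersistentVolumeClaim")
    os.Exit(1)
}
if err := (&controllers.RunnerPersistentVolumeReconciler{
    Client: mgr.GetClient(),
    Log:    ctrl.Log.WithName("controllers").WithName("RunnerPersistentVolume"),
    Scheme: mgr.GetScheme(),
}).SetupWithManager(mgr); err != nil {
    setupLog.Error(err, "unable to create controller", "controller", "RunnerPersistentVolume")
    os.Exit(1)
}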
@@ -59,9 +59,9 @@ func (t *PodRunnerTokenInjector) Handle(ctx context.Context, req admission.Reque
|
||||
return newEmptyResponse()
|
||||
}
|
||||
|
||||
enterprise, okEnterprise := getEnv(runnerContainer, "RUNNER_ENTERPRISE")
|
||||
repo, okRepo := getEnv(runnerContainer, "RUNNER_REPO")
|
||||
org, okOrg := getEnv(runnerContainer, "RUNNER_ORG")
|
||||
enterprise, okEnterprise := getEnv(runnerContainer, EnvVarEnterprise)
|
||||
repo, okRepo := getEnv(runnerContainer, EnvVarRepo)
|
||||
org, okOrg := getEnv(runnerContainer, EnvVarOrg)
|
||||
if !okRepo || !okOrg || !okEnterprise {
|
||||
return newEmptyResponse()
|
||||
}
|
||||
@@ -78,9 +78,7 @@ func (t *PodRunnerTokenInjector) Handle(ctx context.Context, req admission.Reque
|
||||
|
||||
updated.Annotations[AnnotationKeyTokenExpirationDate] = ts
|
||||
|
||||
if pod.Spec.RestartPolicy != corev1.RestartPolicyOnFailure {
|
||||
updated.Spec.RestartPolicy = corev1.RestartPolicyOnFailure
|
||||
}
|
||||
forceRunnerPodRestartPolicyNever(updated)
|
||||
|
||||
buf, err := json.Marshal(updated)
|
||||
if err != nil {
|
||||
|
||||
@@ -20,13 +20,12 @@ import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/actions-runner-controller/actions-runner-controller/hash"
|
||||
"github.com/go-logr/logr"
|
||||
gogithub "github.com/google/go-github/v39/github"
|
||||
"k8s.io/apimachinery/pkg/util/wait"
|
||||
|
||||
kerrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
@@ -50,12 +49,11 @@ const (
|
||||
|
||||
retryDelayOnGitHubAPIRateLimitError = 30 * time.Second
|
||||
|
||||
// This is an annotation internal to actions-runner-controller and can change in backward-incompatible ways
|
||||
annotationKeyRegistrationOnly = "actions-runner-controller/registration-only"
|
||||
|
||||
EnvVarOrg = "RUNNER_ORG"
|
||||
EnvVarRepo = "RUNNER_REPO"
|
||||
EnvVarEnterprise = "RUNNER_ENTERPRISE"
|
||||
EnvVarEphemeral = "RUNNER_EPHEMERAL"
|
||||
EnvVarTrue = "true"
|
||||
)
|
||||
|
||||
// RunnerReconciler reconciles a Runner object
|
||||
@@ -72,12 +70,15 @@ type RunnerReconciler struct {
|
||||
Name string
|
||||
RegistrationRecheckInterval time.Duration
|
||||
RegistrationRecheckJitter time.Duration
|
||||
|
||||
UnregistrationRetryDelay time.Duration
|
||||
}
|
||||
|
||||
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runners,verbs=get;list;watch;create;update;patch;delete
|
||||
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runners/finalizers,verbs=get;list;watch;create;update;patch;delete
|
||||
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runners/status,verbs=get;update;patch
|
||||
// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;create;update;patch;delete
|
||||
// +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch;delete
|
||||
// +kubebuilder:rbac:groups=core,resources=pods/finalizers,verbs=get;list;watch;create;update;patch;delete
|
||||
// +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch
|
||||
|
||||
@@ -89,12 +90,6 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
|
||||
return ctrl.Result{}, client.IgnoreNotFound(err)
|
||||
}
|
||||
|
||||
err := runner.Validate()
|
||||
if err != nil {
|
||||
log.Info("Failed to validate runner spec", "error", err.Error())
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
if runner.ObjectMeta.DeletionTimestamp.IsZero() {
|
||||
finalizers, added := addFinalizer(runner.ObjectMeta.Finalizers, finalizerName)
|
||||
|
||||
@@ -111,35 +106,17 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
|
||||
}
|
||||
} else {
|
||||
// Request to remove a runner. DeletionTimestamp was set in the runner - we need to unregister runner
|
||||
return r.processRunnerDeletion(runner, ctx, log)
|
||||
}
|
||||
|
||||
registrationOnly := metav1.HasAnnotation(runner.ObjectMeta, annotationKeyRegistrationOnly)
|
||||
if registrationOnly && runner.Status.Phase != "" {
|
||||
// At this point we are sure that the registration-only runner has successfully configured and
|
||||
// is of `offline` status, because we set runner.Status.Phase to that of the runner pod only after
|
||||
// successful registration.
|
||||
|
||||
var pod corev1.Pod
|
||||
if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
|
||||
if !kerrors.IsNotFound(err) {
|
||||
log.Info(fmt.Sprintf("Retrying soon as we failed to get registration-only runner pod: %v", err))
|
||||
|
||||
return ctrl.Result{Requeue: true}, nil
|
||||
}
|
||||
} else if err := r.Delete(ctx, &pod); err != nil {
|
||||
if !kerrors.IsNotFound(err) {
|
||||
log.Info(fmt.Sprintf("Retrying soon as we failed to delete registration-only runner pod: %v", err))
|
||||
|
||||
log.Info(fmt.Sprintf("Retrying soon as we failed to get runner pod: %v", err))
|
||||
return ctrl.Result{Requeue: true}, nil
|
||||
}
|
||||
// Pod was not found
|
||||
return r.processRunnerDeletion(runner, ctx, log, nil)
|
||||
}
|
||||
|
||||
log.Info("Successfully deleted registration-only runner pod to free node and cluster resource")
|
||||
|
||||
// Return here to not recreate the deleted pod, because recreating it is the waste of cluster and node resource,
|
||||
// and also defeats the original purpose of scale-from/to-zero we're trying to implement by using the registration-only runner.
|
||||
return ctrl.Result{}, nil
|
||||
return r.processRunnerDeletion(runner, ctx, log, &pod)
|
||||
}
|
||||
|
||||
var pod corev1.Pod
|
||||
@@ -151,15 +128,67 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
|
||||
return r.processRunnerCreation(ctx, runner, log)
|
||||
}
|
||||
|
||||
// Pod already exists
|
||||
|
||||
if !pod.ObjectMeta.DeletionTimestamp.IsZero() {
|
||||
return r.processRunnerPodDeletion(ctx, runner, log, pod)
|
||||
phase := string(pod.Status.Phase)
|
||||
if phase == "" {
|
||||
phase = "Created"
|
||||
}
|
||||
|
||||
ready := runnerPodReady(&pod)
|
||||
|
||||
if runner.Status.Phase != phase || runner.Status.Ready != ready {
|
||||
if pod.Status.Phase == corev1.PodRunning {
|
||||
// Seeing this message, you can expect the runner to become `Running` soon.
|
||||
log.V(1).Info(
|
||||
"Runner appears to have been registered and running.",
|
||||
"podCreationTimestamp", pod.CreationTimestamp,
|
||||
)
|
||||
}
|
||||
|
||||
updated := runner.DeepCopy()
|
||||
updated.Status.Phase = phase
|
||||
updated.Status.Ready = ready
|
||||
updated.Status.Reason = pod.Status.Reason
|
||||
updated.Status.Message = pod.Status.Message
|
||||
|
||||
if err := r.Status().Patch(ctx, updated, client.MergeFrom(&runner)); err != nil {
|
||||
log.Error(err, "Failed to update runner status for Phase/Reason/Message")
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
}
|
||||
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
func runnerPodReady(pod *corev1.Pod) bool {
|
||||
for _, c := range pod.Status.Conditions {
|
||||
if c.Type != corev1.PodReady {
|
||||
continue
|
||||
}
|
||||
|
||||
return c.Status == corev1.ConditionTrue
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func runnerContainerExitCode(pod *corev1.Pod) *int32 {
|
||||
for _, status := range pod.Status.ContainerStatuses {
|
||||
if status.Name != containerName {
|
||||
continue
|
||||
}
|
||||
|
||||
if status.State.Terminated != nil {
|
||||
return &status.State.Terminated.ExitCode
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func runnerPodOrContainerIsStopped(pod *corev1.Pod) bool {
|
||||
// If pod has ended up succeeded we need to restart it
|
||||
// Happens e.g. when dind is in runner and run completes
|
||||
stopped := pod.Status.Phase == corev1.PodSucceeded
|
||||
stopped := pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed
|
||||
|
||||
if !stopped {
|
||||
if pod.Status.Phase == corev1.PodRunning {
|
||||
@@ -168,338 +197,55 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
|
||||
continue
|
||||
}
|
||||
|
||||
if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 {
|
||||
if status.State.Terminated != nil {
|
||||
stopped = true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
restart := stopped
|
||||
|
||||
if registrationOnly && stopped {
|
||||
restart = false
|
||||
|
||||
log.Info(
|
||||
"Observed that registration-only runner for scaling-from-zero has successfully stopped. " +
|
||||
"Unlike other pods, this one will be recreated only when runner spec changes.",
|
||||
)
|
||||
}
|
||||
|
||||
if updated, err := r.updateRegistrationToken(ctx, runner); err != nil {
|
||||
return ctrl.Result{}, err
|
||||
} else if updated {
|
||||
return ctrl.Result{Requeue: true}, nil
|
||||
}
|
||||
|
||||
newPod, err := r.newPod(runner)
|
||||
if err != nil {
|
||||
log.Error(err, "Could not create pod")
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
|
||||
if registrationOnly {
|
||||
newPod.Spec.Containers[0].Env = append(
|
||||
newPod.Spec.Containers[0].Env,
|
||||
corev1.EnvVar{
|
||||
Name: "RUNNER_REGISTRATION_ONLY",
|
||||
Value: "true",
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
var registrationRecheckDelay time.Duration
|
||||
|
||||
// all checks done below only decide whether a restart is needed
|
||||
// if a restart was already decided before, there is no need for the checks
|
||||
// saving API calls and scary log messages
|
||||
if !restart {
|
||||
registrationCheckInterval := time.Minute
|
||||
if r.RegistrationRecheckInterval > 0 {
|
||||
registrationCheckInterval = r.RegistrationRecheckInterval
|
||||
}
|
||||
|
||||
// We want to call ListRunners GitHub Actions API only once per runner per minute.
|
||||
// This if block, in conjunction with:
|
||||
// return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil
|
||||
// achieves that.
|
||||
if lastCheckTime := runner.Status.LastRegistrationCheckTime; lastCheckTime != nil {
|
||||
nextCheckTime := lastCheckTime.Add(registrationCheckInterval)
|
||||
now := time.Now()
|
||||
|
||||
// Requeue scheduled by RequeueAfter can happen a bit earlier (like dozens of milliseconds)
|
||||
// so to avoid excessive, ineffective retries, we heuristically ignore the remaining delay in case it is
// shorter than 1s
|
||||
requeueAfter := nextCheckTime.Sub(now) - time.Second
|
||||
if requeueAfter > 0 {
|
||||
log.Info(
|
||||
fmt.Sprintf("Skipped registration check because it's deferred until %s. Retrying in %s at latest", nextCheckTime, requeueAfter),
|
||||
"lastRegistrationCheckTime", lastCheckTime,
|
||||
"registrationCheckInterval", registrationCheckInterval,
|
||||
)
|
||||
|
||||
// Without RequeueAfter, the controller may not retry as scheduled. Instead, it must wait until the
// next sync period passes, which can be much later than nextCheckTime.
//
// We need to requeue on this reconciliation even though we have already scheduled the initial
// requeue previously with `return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil`.
// Apparently, the workqueue used by controller-runtime deduplicates and resets the delay on
// other requeues, so the initial scheduled requeue may have been reset due to a requeue on
// spec/status change.
|
||||
return ctrl.Result{RequeueAfter: requeueAfter}, nil
|
||||
}
|
||||
}
|
||||
|
||||
notFound := false
|
||||
offline := false
|
||||
|
||||
runnerBusy, err := r.GitHubClient.IsRunnerBusy(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
|
||||
|
||||
currentTime := time.Now()
|
||||
|
||||
if err != nil {
|
||||
var notFoundException *github.RunnerNotFound
|
||||
var offlineException *github.RunnerOffline
|
||||
if errors.As(err, ¬FoundException) {
|
||||
notFound = true
|
||||
} else if errors.As(err, &offlineException) {
|
||||
offline = true
|
||||
} else {
|
||||
var e *gogithub.RateLimitError
|
||||
if errors.As(err, &e) {
|
||||
// We log the underlying error when we failed calling GitHub API to list or unregisters,
|
||||
// or the runner is still busy.
|
||||
log.Error(
|
||||
err,
|
||||
fmt.Sprintf(
|
||||
"Failed to check if runner is busy due to Github API rate limit. Retrying in %s to avoid excessive GitHub API calls",
|
||||
retryDelayOnGitHubAPIRateLimitError,
|
||||
),
|
||||
)
|
||||
|
||||
return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
|
||||
}
|
||||
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
}
|
||||
|
||||
// See the `newPod` function called above for more information
|
||||
// about when this hash changes.
|
||||
curHash := pod.Labels[LabelKeyPodTemplateHash]
|
||||
newHash := newPod.Labels[LabelKeyPodTemplateHash]
|
||||
|
||||
if !runnerBusy && curHash != newHash {
|
||||
restart = true
|
||||
}
|
||||
|
||||
registrationTimeout := 10 * time.Minute
|
||||
durationAfterRegistrationTimeout := currentTime.Sub(pod.CreationTimestamp.Add(registrationTimeout))
|
||||
registrationDidTimeout := durationAfterRegistrationTimeout > 0
|
||||
|
||||
if notFound {
|
||||
if registrationDidTimeout {
|
||||
log.Info(
"Runner failed to register itself to GitHub in a timely manner. "+
|
||||
"Recreating the pod to see if it resolves the issue. "+
|
||||
"CAUTION: If you see this a lot, you should investigate the root cause. "+
|
||||
"See https://github.com/actions-runner-controller/actions-runner-controller/issues/288",
|
||||
"podCreationTimestamp", pod.CreationTimestamp,
|
||||
"currentTime", currentTime,
|
||||
"configuredRegistrationTimeout", registrationTimeout,
|
||||
)
|
||||
|
||||
restart = true
|
||||
} else {
|
||||
log.V(1).Info(
|
||||
"Runner pod exists but we failed to check if runner is busy. Apparently it still needs more time.",
|
||||
"runnerName", runner.Name,
|
||||
)
|
||||
}
|
||||
} else if offline {
|
||||
if registrationOnly {
|
||||
log.Info(
|
||||
"Observed that registration-only runner for scaling-from-zero has successfully been registered.",
|
||||
"podCreationTimestamp", pod.CreationTimestamp,
|
||||
"currentTime", currentTime,
|
||||
"configuredRegistrationTimeout", registrationTimeout,
|
||||
)
|
||||
} else if registrationDidTimeout {
|
||||
if runnerBusy {
|
||||
log.Info(
"Timed out while waiting for the runner to be online, but observed that it's busy at the same time. "+
|
||||
"This is a known (unintuitive) behaviour of a runner that is already running a job. Please see https://github.com/actions-runner-controller/actions-runner-controller/issues/911",
|
||||
"podCreationTimestamp", pod.CreationTimestamp,
|
||||
"currentTime", currentTime,
|
||||
"configuredRegistrationTimeout", registrationTimeout,
|
||||
)
|
||||
} else {
|
||||
log.Info(
"Already existing GitHub runner still appears offline. "+
|
||||
"Recreating the pod to see if it resolves the issue. "+
|
||||
"CAUTION: If you see this a lot, you should investigate the root cause. ",
|
||||
"podCreationTimestamp", pod.CreationTimestamp,
|
||||
"currentTime", currentTime,
|
||||
"configuredRegistrationTimeout", registrationTimeout,
|
||||
)
|
||||
|
||||
restart = true
|
||||
}
|
||||
} else {
|
||||
log.V(1).Info(
|
||||
"Runner pod exists but the GitHub runner appears to be still offline. Waiting for runner to get online ...",
|
||||
"runnerName", runner.Name,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
if (notFound || (offline && !registrationOnly)) && !registrationDidTimeout {
|
||||
registrationRecheckJitter := 10 * time.Second
|
||||
if r.RegistrationRecheckJitter > 0 {
|
||||
registrationRecheckJitter = r.RegistrationRecheckJitter
|
||||
}
|
||||
|
||||
registrationRecheckDelay = registrationCheckInterval + wait.Jitter(registrationRecheckJitter, 0.1)
|
||||
}
|
||||
}
|
||||
|
||||
// Don't do anything if there's no need to restart the runner
|
||||
if !restart {
|
||||
// This guard enables us to update runner.Status.Phase to `Running` only after
|
||||
// the runner is registered to GitHub.
|
||||
if registrationRecheckDelay > 0 {
|
||||
log.V(1).Info(fmt.Sprintf("Rechecking the runner registration in %s", registrationRecheckDelay))
|
||||
|
||||
updated := runner.DeepCopy()
|
||||
updated.Status.LastRegistrationCheckTime = &metav1.Time{Time: time.Now()}
|
||||
|
||||
if err := r.Status().Patch(ctx, updated, client.MergeFrom(&runner)); err != nil {
|
||||
log.Error(err, "Failed to update runner status for LastRegistrationCheckTime")
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
|
||||
return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil
|
||||
}
|
||||
|
||||
if runner.Status.Phase != string(pod.Status.Phase) {
|
||||
if pod.Status.Phase == corev1.PodRunning {
|
||||
// Seeing this message, you can expect the runner to become `Running` soon.
|
||||
log.Info(
|
||||
"Runner appears to have registered and running.",
|
||||
"podCreationTimestamp", pod.CreationTimestamp,
|
||||
)
|
||||
}
|
||||
|
||||
updated := runner.DeepCopy()
|
||||
updated.Status.Phase = string(pod.Status.Phase)
|
||||
updated.Status.Reason = pod.Status.Reason
|
||||
updated.Status.Message = pod.Status.Message
|
||||
|
||||
if err := r.Status().Patch(ctx, updated, client.MergeFrom(&runner)); err != nil {
|
||||
log.Error(err, "Failed to update runner status for Phase/Reason/Message")
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
}
|
||||
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
// Delete current pod if recreation is needed
|
||||
if err := r.Delete(ctx, &pod); err != nil {
|
||||
log.Error(err, "Failed to delete pod resource")
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
|
||||
r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", newPod.Name))
|
||||
log.Info("Deleted runner pod", "repository", runner.Spec.Repository)
|
||||
|
||||
return ctrl.Result{}, nil
|
||||
return stopped
|
||||
}
|
||||
|
||||
func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx context.Context, log logr.Logger) (reconcile.Result, error) {
|
||||
func ephemeralRunnerContainerStatus(pod *corev1.Pod) *corev1.ContainerStatus {
|
||||
if getRunnerEnv(pod, "RUNNER_EPHEMERAL") != "true" {
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, status := range pod.Status.ContainerStatuses {
|
||||
if status.Name != containerName {
|
||||
continue
|
||||
}
|
||||
|
||||
status := status
|
||||
|
||||
return &status
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx context.Context, log logr.Logger, pod *corev1.Pod) (reconcile.Result, error) {
|
||||
finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers, finalizerName)
|
||||
|
||||
if removed {
|
||||
if len(runner.Status.Registration.Token) > 0 {
|
||||
ok, err := r.unregisterRunner(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
|
||||
if err != nil {
|
||||
if errors.Is(err, &gogithub.RateLimitError{}) {
|
||||
// We log the underlying error when we failed calling GitHub API to list or unregisters,
|
||||
// or the runner is still busy.
|
||||
log.Error(
|
||||
err,
|
||||
fmt.Sprintf(
|
||||
"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
|
||||
retryDelayOnGitHubAPIRateLimitError,
|
||||
),
|
||||
)
|
||||
|
||||
return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
|
||||
}
|
||||
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
|
||||
if !ok {
|
||||
log.V(1).Info("Runner no longer exists on GitHub")
|
||||
}
|
||||
} else {
|
||||
log.V(1).Info("Runner was never registered on GitHub")
|
||||
}
|
||||
|
||||
newRunner := runner.DeepCopy()
|
||||
newRunner.ObjectMeta.Finalizers = finalizers
|
||||
|
||||
if err := r.Patch(ctx, newRunner, client.MergeFrom(&runner)); err != nil {
|
||||
log.Error(err, "Failed to update runner for finalizer removal")
|
||||
log.Error(err, "Unable to remove finalizer")
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
|
||||
log.Info("Removed runner from GitHub", "repository", runner.Spec.Repository, "organization", runner.Spec.Organization)
|
||||
log.Info("Removed finalizer")
|
||||
}
|
||||
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
func (r *RunnerReconciler) processRunnerPodDeletion(ctx context.Context, runner v1alpha1.Runner, log logr.Logger, pod corev1.Pod) (reconcile.Result, error) {
|
||||
deletionTimeout := 1 * time.Minute
|
||||
currentTime := time.Now()
|
||||
deletionDidTimeout := currentTime.Sub(pod.DeletionTimestamp.Add(deletionTimeout)) > 0
|
||||
|
||||
if deletionDidTimeout {
|
||||
log.Info(
|
||||
fmt.Sprintf("Failed to delete pod within %s. ", deletionTimeout)+
|
||||
"This is typically the case when a Kubernetes node became unreachable "+
|
||||
"and the kube controller started evicting nodes. Forcefully deleting the pod to not get stuck.",
|
||||
"podDeletionTimestamp", pod.DeletionTimestamp,
|
||||
"currentTime", currentTime,
|
||||
"configuredDeletionTimeout", deletionTimeout,
|
||||
)
|
||||
|
||||
var force int64 = 0
|
||||
// forcefully delete runner as we would otherwise get stuck if the node stays unreachable
|
||||
if err := r.Delete(ctx, &pod, &client.DeleteOptions{GracePeriodSeconds: &force}); err != nil {
|
||||
// a NotFound error here most likely means the pod is already gone
|
||||
if !kerrors.IsNotFound(err) {
|
||||
log.Error(err, "Failed to forcefully delete pod resource ...")
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
// forceful deletion finally succeeded
|
||||
return ctrl.Result{Requeue: true}, nil
|
||||
}
|
||||
|
||||
r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Forcefully deleted pod '%s'", pod.Name))
|
||||
log.Info("Forcefully deleted runner pod", "repository", runner.Spec.Repository)
|
||||
// give kube manager a little time to forcefully delete the stuck pod
|
||||
return ctrl.Result{RequeueAfter: 3 * time.Second}, nil
|
||||
} else {
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
}
|
||||
|
||||
func (r *RunnerReconciler) processRunnerCreation(ctx context.Context, runner v1alpha1.Runner, log logr.Logger) (reconcile.Result, error) {
|
||||
if updated, err := r.updateRegistrationToken(ctx, runner); err != nil {
|
||||
return ctrl.Result{}, err
|
||||
return ctrl.Result{RequeueAfter: RetryDelayOnCreateRegistrationError}, nil
|
||||
} else if updated {
|
||||
return ctrl.Result{Requeue: true}, nil
|
||||
}
|
||||
@@ -528,37 +274,10 @@ func (r *RunnerReconciler) processRunnerCreation(ctx context.Context, runner v1a
|
||||
|
||||
r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodCreated", fmt.Sprintf("Created pod '%s'", newPod.Name))
|
||||
log.Info("Created runner pod", "repository", runner.Spec.Repository)
|
||||
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
func (r *RunnerReconciler) unregisterRunner(ctx context.Context, enterprise, org, repo, name string) (bool, error) {
|
||||
runners, err := r.GitHubClient.ListRunners(ctx, enterprise, org, repo)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
id := int64(0)
|
||||
for _, runner := range runners {
|
||||
if runner.GetName() == name {
|
||||
if runner.GetBusy() {
|
||||
return false, fmt.Errorf("runner is busy")
|
||||
}
|
||||
id = runner.GetID()
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if id == int64(0) {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
if err := r.GitHubClient.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func (r *RunnerReconciler) updateRegistrationToken(ctx context.Context, runner v1alpha1.Runner) (bool, error) {
|
||||
if runner.IsRegisterable() {
|
||||
return false, nil
|
||||
@@ -568,6 +287,10 @@ func (r *RunnerReconciler) updateRegistrationToken(ctx context.Context, runner v
|
||||
|
||||
rt, err := r.GitHubClient.GetRegistrationToken(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
|
||||
if err != nil {
|
||||
// An error can be a permanent, permission issue like the below:
|
||||
// POST https://api.github.com/enterprises/YOUR_ENTERPRISE/actions/runners/registration-token: 403 Resource not accessible by integration []
|
||||
// In such case retrying in seconds might not make much sense.
|
||||
|
||||
r.Recorder.Event(&runner, corev1.EventTypeWarning, "FailedUpdateRegistrationToken", "Updating registration token failed")
|
||||
log.Error(err, "Failed to get new registration token")
|
||||
return false, err
|
||||
@@ -626,6 +349,11 @@ func (r *RunnerReconciler) newPod(runner v1alpha1.Runner) (corev1.Pod, error) {
|
||||
runner.ObjectMeta.Annotations,
|
||||
runner.Spec,
|
||||
r.GitHubClient.GithubBaseURL,
|
||||
// Token change should trigger replacement.
|
||||
// We need to include this explicitly here because
|
||||
// runner.Spec does not contain the possibly updated token stored in the
|
||||
// runner status yet.
|
||||
runner.Status.Registration.Token,
|
||||
)
|
||||
|
||||
objectMeta := metav1.ObjectMeta{
|
||||
@@ -639,31 +367,66 @@ func (r *RunnerReconciler) newPod(runner v1alpha1.Runner) (corev1.Pod, error) {
|
||||
|
||||
if len(runner.Spec.Containers) == 0 {
|
||||
template.Spec.Containers = append(template.Spec.Containers, corev1.Container{
|
||||
Name: "runner",
|
||||
ImagePullPolicy: runner.Spec.ImagePullPolicy,
|
||||
EnvFrom: runner.Spec.EnvFrom,
|
||||
Env: runner.Spec.Env,
|
||||
Resources: runner.Spec.Resources,
|
||||
Name: "runner",
|
||||
})
|
||||
|
||||
if (runner.Spec.DockerEnabled == nil || *runner.Spec.DockerEnabled) && (runner.Spec.DockerdWithinRunnerContainer == nil || !*runner.Spec.DockerdWithinRunnerContainer) {
|
||||
template.Spec.Containers = append(template.Spec.Containers, corev1.Container{
|
||||
Name: "docker",
|
||||
VolumeMounts: runner.Spec.DockerVolumeMounts,
|
||||
Resources: runner.Spec.DockerdContainerResources,
|
||||
Env: runner.Spec.DockerEnv,
|
||||
Name: "docker",
|
||||
})
|
||||
}
|
||||
} else {
|
||||
template.Spec.Containers = runner.Spec.Containers
|
||||
}
|
||||
|
||||
for i, c := range template.Spec.Containers {
|
||||
switch c.Name {
|
||||
case "runner":
|
||||
if c.ImagePullPolicy == "" {
|
||||
template.Spec.Containers[i].ImagePullPolicy = runner.Spec.ImagePullPolicy
|
||||
}
|
||||
if len(c.EnvFrom) == 0 {
|
||||
template.Spec.Containers[i].EnvFrom = runner.Spec.EnvFrom
|
||||
}
|
||||
if len(c.Env) == 0 {
|
||||
template.Spec.Containers[i].Env = runner.Spec.Env
|
||||
}
|
||||
if len(c.Resources.Requests) == 0 {
|
||||
template.Spec.Containers[i].Resources.Requests = runner.Spec.Resources.Requests
|
||||
}
|
||||
if len(c.Resources.Limits) == 0 {
|
||||
template.Spec.Containers[i].Resources.Limits = runner.Spec.Resources.Limits
|
||||
}
|
||||
case "docker":
|
||||
if len(c.VolumeMounts) == 0 {
|
||||
template.Spec.Containers[i].VolumeMounts = runner.Spec.DockerVolumeMounts
|
||||
}
|
||||
if len(c.Resources.Limits) == 0 {
|
||||
template.Spec.Containers[i].Resources.Limits = runner.Spec.DockerdContainerResources.Limits
|
||||
}
|
||||
if len(c.Resources.Requests) == 0 {
|
||||
template.Spec.Containers[i].Resources.Requests = runner.Spec.DockerdContainerResources.Requests
|
||||
}
|
||||
if len(c.Env) == 0 {
|
||||
template.Spec.Containers[i].Env = runner.Spec.DockerEnv
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template.Spec.SecurityContext = runner.Spec.SecurityContext
|
||||
template.Spec.EnableServiceLinks = runner.Spec.EnableServiceLinks
|
||||
|
||||
registrationOnly := metav1.HasAnnotation(runner.ObjectMeta, annotationKeyRegistrationOnly)
|
||||
if runner.Spec.ContainerMode == "kubernetes" {
|
||||
workDir := runner.Spec.WorkDir
|
||||
if workDir == "" {
|
||||
workDir = "/runner/_work"
|
||||
}
|
||||
if err := applyWorkVolumeClaimTemplateToPod(&template, runner.Spec.WorkVolumeClaimTemplate, workDir); err != nil {
|
||||
return corev1.Pod{}, err
|
||||
}
|
||||
}
|
||||
|
||||
pod, err := newRunnerPod(template, runner.Spec.RunnerConfig, r.RunnerImage, r.RunnerImagePullSecrets, r.DockerImage, r.DockerRegistryMirror, r.GitHubClient.GithubBaseURL, registrationOnly)
|
||||
pod, err := newRunnerPodWithContainerMode(runner.Spec.ContainerMode, template, runner.Spec.RunnerConfig, r.RunnerImage, r.RunnerImagePullSecrets, r.DockerImage, r.DockerRegistryMirror, r.GitHubClient.GithubBaseURL)
|
||||
if err != nil {
|
||||
return pod, err
|
||||
}
|
||||
@@ -675,6 +438,9 @@ func (r *RunnerReconciler) newPod(runner v1alpha1.Runner) (corev1.Pod, error) {
|
||||
// if operator provides a work volume mount, use that
|
||||
isPresent, _ := workVolumeMountPresent(runnerSpec.VolumeMounts)
|
||||
if isPresent {
|
||||
if runnerSpec.ContainerMode == "kubernetes" {
|
||||
return pod, errors.New("volume mount \"work\" should be specified by workVolumeClaimTemplate in container mode kubernetes")
|
||||
}
|
||||
// remove work volume since it will be provided from runnerSpec.Volumes
|
||||
// if we don't remove it here we would get a duplicate key error, i.e. two volumes named work
|
||||
_, index := workVolumeMountPresent(pod.Spec.Containers[0].VolumeMounts)
|
||||
@@ -688,6 +454,9 @@ func (r *RunnerReconciler) newPod(runner v1alpha1.Runner) (corev1.Pod, error) {
|
||||
// if operator provides a work volume, use that
|
||||
isPresent, _ := workVolumePresent(runnerSpec.Volumes)
|
||||
if isPresent {
|
||||
if runnerSpec.ContainerMode == "kubernetes" {
|
||||
return pod, errors.New("volume \"work\" should be specified by workVolumeClaimTemplate in container mode kubernetes")
|
||||
}
|
||||
_, index := workVolumePresent(pod.Spec.Volumes)
|
||||
|
||||
// remove work volume since it will be provided from runnerSpec.Volumes
|
||||
@@ -697,6 +466,7 @@ func (r *RunnerReconciler) newPod(runner v1alpha1.Runner) (corev1.Pod, error) {
|
||||
|
||||
pod.Spec.Volumes = append(pod.Spec.Volumes, runnerSpec.Volumes...)
|
||||
}
|
||||
|
||||
if len(runnerSpec.InitContainers) != 0 {
|
||||
pod.Spec.InitContainers = append(pod.Spec.InitContainers, runnerSpec.InitContainers...)
|
||||
}
|
||||
@@ -727,6 +497,10 @@ func (r *RunnerReconciler) newPod(runner v1alpha1.Runner) (corev1.Pod, error) {
|
||||
pod.Spec.Tolerations = runnerSpec.Tolerations
|
||||
}
|
||||
|
||||
if runnerSpec.PriorityClassName != "" {
|
||||
pod.Spec.PriorityClassName = runnerSpec.PriorityClassName
|
||||
}
|
||||
|
||||
if len(runnerSpec.TopologySpreadConstraints) != 0 {
|
||||
pod.Spec.TopologySpreadConstraints = runnerSpec.TopologySpreadConstraints
|
||||
}
|
||||
@@ -743,6 +517,10 @@ func (r *RunnerReconciler) newPod(runner v1alpha1.Runner) (corev1.Pod, error) {
|
||||
pod.Spec.HostAliases = runnerSpec.HostAliases
|
||||
}
|
||||
|
||||
if runnerSpec.DnsConfig != nil {
|
||||
pod.Spec.DNSConfig = runnerSpec.DnsConfig
|
||||
}
|
||||
|
||||
if runnerSpec.RuntimeClassName != nil {
|
||||
pod.Spec.RuntimeClassName = runnerSpec.RuntimeClassName
|
||||
}
|
||||
@@ -762,25 +540,56 @@ func (r *RunnerReconciler) newPod(runner v1alpha1.Runner) (corev1.Pod, error) {
|
||||
func mutatePod(pod *corev1.Pod, token string) *corev1.Pod {
|
||||
updated := pod.DeepCopy()
|
||||
|
||||
for i := range pod.Spec.Containers {
|
||||
if pod.Spec.Containers[i].Name == "runner" {
|
||||
updated.Spec.Containers[i].Env = append(updated.Spec.Containers[i].Env,
|
||||
corev1.EnvVar{
|
||||
Name: "RUNNER_NAME",
|
||||
Value: pod.ObjectMeta.Name,
|
||||
},
|
||||
corev1.EnvVar{
|
||||
Name: "RUNNER_TOKEN",
|
||||
Value: token,
|
||||
},
|
||||
)
|
||||
}
|
||||
if getRunnerEnv(pod, EnvVarRunnerName) == "" {
|
||||
setRunnerEnv(updated, EnvVarRunnerName, pod.ObjectMeta.Name)
|
||||
}
|
||||
|
||||
if getRunnerEnv(pod, EnvVarRunnerToken) == "" {
|
||||
setRunnerEnv(updated, EnvVarRunnerToken, token)
|
||||
}
|
||||
|
||||
return updated
|
||||
}
|
||||
|
||||
func newRunnerPod(template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, defaultRunnerImage string, defaultRunnerImagePullSecrets []string, defaultDockerImage, defaultDockerRegistryMirror string, githubBaseURL string, registrationOnly bool) (corev1.Pod, error) {
|
||||
func runnerHookEnvs(pod *corev1.Pod) ([]corev1.EnvVar, error) {
|
||||
isRequireSameNode, err := isRequireSameNode(pod)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return []corev1.EnvVar{
|
||||
{
|
||||
Name: "ACTIONS_RUNNER_CONTAINER_HOOKS",
|
||||
Value: defaultRunnerHookPath,
|
||||
},
|
||||
{
|
||||
Name: "ACTIONS_RUNNER_REQUIRE_JOB_CONTAINER",
|
||||
Value: "true",
|
||||
},
|
||||
{
|
||||
Name: "ACTIONS_RUNNER_POD_NAME",
|
||||
ValueFrom: &corev1.EnvVarSource{
|
||||
FieldRef: &corev1.ObjectFieldSelector{
|
||||
FieldPath: "metadata.name",
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "ACTIONS_RUNNER_JOB_NAMESPACE",
|
||||
ValueFrom: &corev1.EnvVarSource{
|
||||
FieldRef: &corev1.ObjectFieldSelector{
|
||||
FieldPath: "metadata.namespace",
|
||||
},
|
||||
},
|
||||
},
|
||||
corev1.EnvVar{
|
||||
Name: "ACTIONS_RUNNER_REQUIRE_SAME_NODE",
|
||||
Value: strconv.FormatBool(isRequireSameNode),
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
func newRunnerPodWithContainerMode(containerMode string, template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, defaultRunnerImage string, defaultRunnerImagePullSecrets []string, defaultDockerImage, defaultDockerRegistryMirror string, githubBaseURL string) (corev1.Pod, error) {
|
||||
var (
|
||||
privileged bool = true
|
||||
dockerdInRunner bool = runnerSpec.DockerdWithinRunnerContainer != nil && *runnerSpec.DockerdWithinRunnerContainer
|
||||
@@ -789,6 +598,18 @@ func newRunnerPod(template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, default
|
||||
dockerdInRunnerPrivileged bool = dockerdInRunner
|
||||
)
|
||||
|
||||
if containerMode == "kubernetes" {
|
||||
dockerdInRunner = false
|
||||
dockerEnabled = false
|
||||
dockerdInRunnerPrivileged = false
|
||||
}
|
||||
|
||||
template = *template.DeepCopy()
|
||||
|
||||
// This label selector is used by default when rd.Spec.Selector is empty.
|
||||
template.ObjectMeta.Labels = CloneAndAddLabel(template.ObjectMeta.Labels, LabelKeyRunner, "")
|
||||
template.ObjectMeta.Labels = CloneAndAddLabel(template.ObjectMeta.Labels, LabelKeyPodMutation, LabelValuePodMutation)
|
||||
|
||||
workDir := runnerSpec.WorkDir
|
||||
if workDir == "" {
|
||||
workDir = "/runner/_work"
|
||||
@@ -841,19 +662,11 @@ func newRunnerPod(template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, default
|
||||
Value: workDir,
|
||||
},
|
||||
{
|
||||
Name: "RUNNER_EPHEMERAL",
|
||||
Name: EnvVarEphemeral,
|
||||
Value: fmt.Sprintf("%v", ephemeral),
|
||||
},
|
||||
}
|
||||
|
||||
if registrationOnly {
|
||||
env = append(env, corev1.EnvVar{
|
||||
Name: "RUNNER_REGISTRATION_ONLY",
|
||||
Value: "true",
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
var seLinuxOptions *corev1.SELinuxOptions
|
||||
if template.Spec.SecurityContext != nil {
|
||||
seLinuxOptions = template.Spec.SecurityContext.SELinuxOptions
|
||||
@@ -877,6 +690,17 @@ func newRunnerPod(template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, default
|
||||
}
|
||||
}
|
||||
|
||||
if containerMode == "kubernetes" {
|
||||
if dockerdContainer != nil {
|
||||
template.Spec.Containers = append(template.Spec.Containers[:dockerdContainerIndex], template.Spec.Containers[dockerdContainerIndex+1:]...)
|
||||
}
|
||||
// adjust for the removed dockerd container: the runner container shifts down by one if it came after it
if dockerdContainer != nil && runnerContainerIndex > dockerdContainerIndex {
|
||||
runnerContainerIndex--
|
||||
}
|
||||
dockerdContainer = nil
|
||||
dockerdContainerIndex = -1
|
||||
}
|
||||
|
||||
if runnerContainer == nil {
|
||||
runnerContainerIndex = -1
|
||||
runnerContainer = &corev1.Container{
|
||||
@@ -907,18 +731,26 @@ func newRunnerPod(template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, default
|
||||
}
|
||||
|
||||
runnerContainer.Env = append(runnerContainer.Env, env...)
|
||||
if containerMode == "kubernetes" {
|
||||
hookEnvs, err := runnerHookEnvs(&template)
|
||||
if err != nil {
|
||||
return corev1.Pod{}, err
|
||||
}
|
||||
runnerContainer.Env = append(runnerContainer.Env, hookEnvs...)
|
||||
}
|
||||
|
||||
if runnerContainer.SecurityContext == nil {
|
||||
runnerContainer.SecurityContext = &corev1.SecurityContext{}
|
||||
}
|
||||
// Runner need to run privileged if it contains DinD
|
||||
runnerContainer.SecurityContext.Privileged = &dockerdInRunnerPrivileged
|
||||
|
||||
if runnerContainer.SecurityContext.Privileged == nil {
|
||||
// Runner need to run privileged if it contains DinD
|
||||
runnerContainer.SecurityContext.Privileged = &dockerdInRunnerPrivileged
|
||||
}
|
||||
|
||||
pod := template.DeepCopy()
|
||||
|
||||
if pod.Spec.RestartPolicy == "" {
|
||||
pod.Spec.RestartPolicy = "OnFailure"
|
||||
}
|
||||
forceRunnerPodRestartPolicyNever(pod)
|
||||
|
||||
if mtu := runnerSpec.DockerMTU; mtu != nil && dockerdInRunner {
|
||||
runnerContainer.Env = append(runnerContainer.Env, []corev1.EnvVar{
|
||||
@@ -996,13 +828,18 @@ func newRunnerPod(template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, default
|
||||
)
|
||||
}
|
||||
|
||||
pod.Spec.Volumes = append(pod.Spec.Volumes,
|
||||
corev1.Volume{
|
||||
Name: "work",
|
||||
VolumeSource: corev1.VolumeSource{
|
||||
EmptyDir: &corev1.EmptyDirVolumeSource{},
|
||||
if ok, _ := workVolumePresent(pod.Spec.Volumes); !ok {
|
||||
pod.Spec.Volumes = append(pod.Spec.Volumes,
|
||||
corev1.Volume{
|
||||
Name: "work",
|
||||
VolumeSource: corev1.VolumeSource{
|
||||
EmptyDir: &corev1.EmptyDirVolumeSource{},
|
||||
},
|
||||
},
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
pod.Spec.Volumes = append(pod.Spec.Volumes,
|
||||
corev1.Volume{
|
||||
Name: "certs-client",
|
||||
VolumeSource: corev1.VolumeSource{
|
||||
@@ -1011,11 +848,16 @@ func newRunnerPod(template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, default
|
||||
},
|
||||
)
|
||||
|
||||
if ok, _ := workVolumeMountPresent(runnerContainer.VolumeMounts); !ok {
|
||||
runnerContainer.VolumeMounts = append(runnerContainer.VolumeMounts,
|
||||
corev1.VolumeMount{
|
||||
Name: "work",
|
||||
MountPath: workDir,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
runnerContainer.VolumeMounts = append(runnerContainer.VolumeMounts,
|
||||
corev1.VolumeMount{
|
||||
Name: "work",
|
||||
MountPath: workDir,
|
||||
},
|
||||
corev1.VolumeMount{
|
||||
Name: "certs-client",
|
||||
MountPath: "/certs/client",
|
||||
@@ -1120,6 +962,10 @@ func newRunnerPod(template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, default
|
||||
return *pod, nil
|
||||
}
|
||||
|
||||
func newRunnerPod(template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, defaultRunnerImage string, defaultRunnerImagePullSecrets []string, defaultDockerImage, defaultDockerRegistryMirror string, githubBaseURL string) (corev1.Pod, error) {
|
||||
return newRunnerPodWithContainerMode("", template, runnerSpec, defaultRunnerImage, defaultRunnerImagePullSecrets, defaultDockerImage, defaultDockerRegistryMirror, githubBaseURL)
|
||||
}
|
||||
|
||||
func (r *RunnerReconciler) SetupWithManager(mgr ctrl.Manager) error {
|
||||
name := "runner-controller"
|
||||
if r.Name != "" {
|
||||
@@ -1182,3 +1028,71 @@ func workVolumeMountPresent(items []corev1.VolumeMount) (bool, int) {
|
||||
}
|
||||
return false, 0
|
||||
}
|
||||
|
||||
func applyWorkVolumeClaimTemplateToPod(pod *corev1.Pod, workVolumeClaimTemplate *v1alpha1.WorkVolumeClaimTemplate, workDir string) error {
|
||||
if workVolumeClaimTemplate == nil {
|
||||
return errors.New("work volume claim template must be specified in container mode kubernetes")
|
||||
}
|
||||
for i := range pod.Spec.Volumes {
|
||||
if pod.Spec.Volumes[i].Name == "work" {
|
||||
return fmt.Errorf("Work volume should not be specified in container mode kubernetes. workVolumeClaimTemplate field should be used instead.")
|
||||
}
|
||||
}
|
||||
pod.Spec.Volumes = append(pod.Spec.Volumes, workVolumeClaimTemplate.V1Volume())
|
||||
|
||||
var runnerContainer *corev1.Container
|
||||
for i := range pod.Spec.Containers {
|
||||
if pod.Spec.Containers[i].Name == "runner" {
|
||||
runnerContainer = &pod.Spec.Containers[i]
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if runnerContainer == nil {
|
||||
return fmt.Errorf("runner container is not present when applying work volume claim template")
|
||||
}
|
||||
|
||||
if isPresent, _ := workVolumeMountPresent(runnerContainer.VolumeMounts); isPresent {
|
||||
return fmt.Errorf("volume mount \"work\" should not be present on the runner container in container mode kubernetes")
|
||||
}
|
||||
|
||||
runnerContainer.VolumeMounts = append(runnerContainer.VolumeMounts, workVolumeClaimTemplate.V1VolumeMount(workDir))
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// isRequireSameNode specifies, for a runner in kubernetes container mode, whether it should
// schedule jobs to the same node where the runner is
//
// This function should only be called in containerMode: kubernetes
|
||||
func isRequireSameNode(pod *corev1.Pod) (bool, error) {
|
||||
isPresent, index := workVolumePresent(pod.Spec.Volumes)
|
||||
if !isPresent {
|
||||
return true, errors.New("internal error: work volume mount must exist in containerMode: kubernetes")
|
||||
}
|
||||
|
||||
if pod.Spec.Volumes[index].Ephemeral == nil || pod.Spec.Volumes[index].Ephemeral.VolumeClaimTemplate == nil {
|
||||
return true, errors.New("containerMode: kubernetes should have pod.Spec.Volumes[].Ephemeral.VolumeClaimTemplate set")
|
||||
}
|
||||
|
||||
for _, accessMode := range pod.Spec.Volumes[index].Ephemeral.VolumeClaimTemplate.Spec.AccessModes {
|
||||
switch accessMode {
|
||||
case corev1.ReadWriteOnce:
|
||||
return true, nil
|
||||
case corev1.ReadWriteMany:
|
||||
default:
|
||||
return true, errors.New("actions-runner-controller supports ReadWriteOnce and ReadWriteMany modes only")
|
||||
}
|
||||
}
|
||||
return false, nil
|
||||
}
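isRequireSameNode returns false only when every access mode of the work volume's claim template is ReadWriteMany; any ReadWriteOnce mode (or a malformed volume) keeps jobs on the runner's node. Below is a minimal sketch of the kind of ephemeral "work" volume it inspects; the storage class name is a hypothetical assumption, not something taken from this diff.

// Sketch: an ephemeral "work" volume whose RWX access mode lets the job pod
// run on a different node, so isRequireSameNode would return false for it.
storageClassName := "nfs-csi" // illustrative assumption
workVolume := corev1.Volume{
    Name: "work",
    VolumeSource: corev1.VolumeSource{
        Ephemeral: &corev1.EphemeralVolumeSource{
            VolumeClaimTemplate: &corev1.PersistentVolumeClaimTemplate{
                Spec: corev1.PersistentVolumeClaimSpec{
                    AccessModes:      []corev1.PersistentVolumeAccessMode{corev1.ReadWriteMany},
                    StorageClassName: &storageClassName,
                },
            },
        },
    },
}
_ = workVolume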
|
||||
|
||||
func overwriteRunnerEnv(runner *v1alpha1.Runner, key string, value string) {
|
||||
for i := range runner.Spec.Env {
|
||||
if runner.Spec.Env[i].Name == key {
|
||||
runner.Spec.Env[i].Value = value
|
||||
return
|
||||
}
|
||||
}
|
||||
runner.Spec.Env = append(runner.Spec.Env, corev1.EnvVar{Name: key, Value: value})
|
||||
}
|
||||
|
||||
controllers/runner_graceful_stop.go (new file, 423 lines)
@@ -0,0 +1,423 @@
|
||||
package controllers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/actions-runner-controller/actions-runner-controller/github"
|
||||
"github.com/go-logr/logr"
|
||||
gogithub "github.com/google/go-github/v45/github"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
ctrl "sigs.k8s.io/controller-runtime"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
)
|
||||
|
||||
// tickRunnerGracefulStop reconciles the runner and the runner pod in a way so that
// we can delete the runner pod without disrupting a workflow job.
//
// This function returns a non-nil pointer to corev1.Pod as the first return value
// if the runner is considered to have gracefully stopped, hence its pod is safe for deletion.
//
// It's a "tick" operation, so a graceful stop can take multiple calls to complete.
// This function is designed to complete a lengthy graceful stop process in a non-blocking way.
// When it wants to be retried later, the function returns a non-nil *ctrl.Result as the second return value, with or without an error in the third return value.
// The caller is expected to return the returned ctrl.Result and error to postpone the current reconciliation loop and trigger a scheduled retry.
|
||||
func tickRunnerGracefulStop(ctx context.Context, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
|
||||
pod, err := annotatePodOnce(ctx, c, log, pod, AnnotationKeyUnregistrationStartTimestamp, time.Now().Format(time.RFC3339))
|
||||
if err != nil {
|
||||
return nil, &ctrl.Result{}, err
|
||||
}
|
||||
|
||||
if res, err := ensureRunnerUnregistration(ctx, retryDelay, log, ghClient, c, enterprise, organization, repository, runner, pod); res != nil {
|
||||
return nil, res, err
|
||||
}
|
||||
|
||||
pod, err = annotatePodOnce(ctx, c, log, pod, AnnotationKeyUnregistrationCompleteTimestamp, time.Now().Format(time.RFC3339))
|
||||
if err != nil {
|
||||
return nil, &ctrl.Result{}, err
|
||||
}
|
||||
|
||||
return pod, nil, nil
|
||||
}
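As the doc comment above states, callers are expected to propagate the returned ctrl.Result and error. A hypothetical caller inside a reconcile loop might look like the sketch below; the variables assumed to be in scope (c, ghClient, retryDelay, the identity arguments) and the deletion step afterwards are illustrative assumptions, not code from this diff.

// Sketch: consuming tickRunnerGracefulStop from a reconcile loop.
updatedPod, res, err := tickRunnerGracefulStop(ctx, retryDelay, log, ghClient, c, enterprise, organization, repository, runnerName, pod)
if res != nil {
    // Graceful stop hasn't completed yet; return to postpone this reconciliation and retry later.
    return *res, err
}
// The runner has gracefully stopped; its pod can now be deleted.
if err := c.Delete(ctx, updatedPod); err != nil {
    return ctrl.Result{}, client.IgnoreNotFound(err)
}
return ctrl.Result{}, nil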
|
||||
|
||||
// annotatePodOnce annotates the pod if it isn't annotated yet.
// Returns the provided pod as-is if it was already annotated.
// Returns the updated pod if the pod was missing the annotation and the update to add the annotation succeeded.
|
||||
func annotatePodOnce(ctx context.Context, c client.Client, log logr.Logger, pod *corev1.Pod, k, v string) (*corev1.Pod, error) {
|
||||
if pod == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
if _, ok := getAnnotation(pod, k); ok {
|
||||
return pod, nil
|
||||
}
|
||||
|
||||
updated := pod.DeepCopy()
|
||||
setAnnotation(&updated.ObjectMeta, k, v)
|
||||
if err := c.Patch(ctx, updated, client.MergeFrom(pod)); err != nil {
|
||||
log.Error(err, fmt.Sprintf("Failed to patch pod to have %s annotation", k))
|
||||
return nil, err
|
||||
}
|
||||
|
||||
log.V(2).Info("Annotated pod", "key", k, "value", v)
|
||||
|
||||
return updated, nil
|
||||
}
|
||||
|
||||
// If the first return value is nil, it's safe to delete the runner pod.
|
||||
func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
|
||||
var runnerID *int64
|
||||
|
||||
if id, ok := getAnnotation(pod, AnnotationKeyRunnerID); ok {
|
||||
v, err := strconv.ParseInt(id, 10, 64)
|
||||
if err != nil {
|
||||
return &ctrl.Result{}, err
|
||||
}
|
||||
|
||||
runnerID = &v
|
||||
}
|
||||
|
||||
if runnerID == nil {
|
||||
runner, err := getRunner(ctx, ghClient, enterprise, organization, repository, runner)
|
||||
if err != nil {
|
||||
return &ctrl.Result{}, err
|
||||
}
|
||||
|
||||
if runner != nil && runner.ID != nil {
|
||||
runnerID = runner.ID
|
||||
}
|
||||
}
|
||||
|
||||
code := runnerContainerExitCode(pod)
|
||||
|
||||
if pod != nil && pod.Annotations[AnnotationKeyUnregistrationCompleteTimestamp] != "" {
|
||||
// If it's already unregistered in the previous reconciliation loop,
// you can safely assume that it won't get registered again so it's safe to delete the runner pod.
|
||||
log.Info("Runner pod is marked as already unregistered.")
|
||||
} else if runnerID == nil {
|
||||
log.Info(
|
||||
"Unregistration started before runner ID is assigned. " +
|
||||
"Perhaps the runner pod was terminated by anyone other than ARC? Was it OOM killed? " +
|
||||
"Marking unregistration as completed anyway because there's nothing ARC can do.",
|
||||
)
|
||||
} else if pod != nil && runnerPodOrContainerIsStopped(pod) {
|
||||
// If it's an ephemeral runner with the actions/runner container exited with 0,
|
||||
// we can safely assume that it has unregistered itself from GitHub Actions
|
||||
// so it's natural that RemoveRunner fails due to 404.
|
||||
|
||||
// If pod has ended up succeeded we need to restart it
|
||||
// Happens e.g. when dind is in runner and run completes
|
||||
log.Info("Runner pod has been stopped with a successful status.")
|
||||
} else if pod != nil && pod.Annotations[AnnotationKeyRunnerCompletionWaitStartTimestamp] != "" {
|
||||
ct := ephemeralRunnerContainerStatus(pod)
|
||||
if ct == nil {
|
||||
log.Info("Runner pod is annotated to wait for completion, and the runner container is not ephemeral")
|
||||
|
||||
return &ctrl.Result{RequeueAfter: retryDelay}, nil
|
||||
}
|
||||
|
||||
lts := ct.LastTerminationState.Terminated
|
||||
if lts == nil {
|
||||
log.Info("Runner pod is annotated to wait for completion, and the runner container is not restarting")
|
||||
|
||||
return &ctrl.Result{RequeueAfter: retryDelay}, nil
|
||||
}
|
||||
|
||||
// Prevent the runner pod from getting stuck in Terminating.
|
||||
// See https://github.com/actions-runner-controller/actions-runner-controller/issues/1369
|
||||
log.Info("Deleting runner pod anyway because it has stopped prematurely. This may leave a dangling runner resource in GitHub Actions",
|
||||
"lastState.exitCode", lts.ExitCode,
|
||||
"lastState.message", lts.Message,
|
||||
"pod.phase", pod.Status.Phase,
|
||||
)
|
||||
} else if ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, *runnerID); err != nil {
|
||||
if errors.Is(err, &gogithub.RateLimitError{}) {
|
||||
// We log the underlying error when we fail calling the GitHub API to list or unregister runners,
|
||||
// or the runner is still busy.
|
||||
log.Error(
|
||||
err,
|
||||
fmt.Sprintf(
|
||||
"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
|
||||
retryDelayOnGitHubAPIRateLimitError,
|
||||
),
|
||||
)
|
||||
|
||||
return &ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
|
||||
}
|
||||
|
||||
log.V(1).Info("Failed to unregister runner before deleting the pod.", "error", err)
|
||||
|
||||
var (
|
||||
runnerBusy bool
|
||||
runnerUnregistrationFailureMessage string
|
||||
)
|
||||
|
||||
errRes := &gogithub.ErrorResponse{}
|
||||
if errors.As(err, &errRes) {
|
||||
if errRes.Response.StatusCode == 403 {
|
||||
log.Error(err, "Unable to unregister due to permission error. "+
|
||||
"Perhaps you've changed the permissions of PAT or GitHub App, or you updated authentication method of ARC in a wrong way? "+
|
||||
"ARC considers it as already unregistered and continue removing the pod. "+
|
||||
"You may need to remove the runner on GitHub UI.")
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
runner, _ := getRunner(ctx, ghClient, enterprise, organization, repository, runner)
|
||||
|
||||
var runnerID int64
|
||||
|
||||
if runner != nil && runner.ID != nil {
|
||||
runnerID = *runner.ID
|
||||
}
|
||||
|
||||
runnerBusy = errRes.Response.StatusCode == 422
|
||||
runnerUnregistrationFailureMessage = errRes.Message
|
||||
|
||||
if runnerBusy && code != nil {
|
||||
log.V(2).Info("Runner container has already stopped but the unregistration attempt failed. "+
|
||||
"This can happen when the runner container crashed due to an unhandled error, OOM, etc. "+
|
||||
"ARC terminates the pod anyway. You'd probably need to manually delete the runner later by calling the GitHub API",
|
||||
"runnerExitCode", *code,
|
||||
"runnerID", runnerID,
|
||||
)
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
}
|
||||
|
||||
if runnerBusy {
|
||||
_, err := annotatePodOnce(ctx, c, log, pod, AnnotationKeyUnregistrationFailureMessage, runnerUnregistrationFailureMessage)
|
||||
if err != nil {
|
||||
return &ctrl.Result{}, err
|
||||
}
|
||||
|
||||
// We want to prevent spamming the deletion attempts, but returning ctrl.Result with RequeueAfter doesn't
// work as the reconciliation can happen earlier due to a pod status update.
|
||||
// For ephemeral runners, we can expect it to stop and unregister itself on completion.
|
||||
// So we can just wait for the completion without actively retrying unregistration.
|
||||
ephemeral := getRunnerEnv(pod, EnvVarEphemeral)
|
||||
if ephemeral == "true" {
|
||||
_, err = annotatePodOnce(ctx, c, log, pod, AnnotationKeyRunnerCompletionWaitStartTimestamp, time.Now().Format(time.RFC3339))
|
||||
if err != nil {
|
||||
return &ctrl.Result{}, err
|
||||
}
|
||||
|
||||
return &ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
log.V(2).Info("Retrying runner unregistration because the static runner is still busy")
|
||||
// Otherwise we may end up spamming 422 errors,
|
||||
// each call consuming GitHub API rate limit
|
||||
// https://github.com/actions-runner-controller/actions-runner-controller/pull/1167#issuecomment-1064213271
|
||||
return &ctrl.Result{RequeueAfter: retryDelay}, nil
|
||||
}
|
||||
|
||||
return &ctrl.Result{}, err
|
||||
} else if ok {
|
||||
log.Info("Runner has just been unregistered.")
|
||||
} else if pod == nil {
|
||||
// `r.unregisterRunner()` returns `false, nil` if the runner is not found on GitHub.
|
||||
// However, that doesn't always mean the pod can be safely removed.
|
||||
//
|
||||
// If the pod does not exist for the runner,
|
||||
// it may be because the runner pod was never created.
|
||||
// In that case we can safely assume that the runner will never be registered.
|
||||
|
||||
log.Info("Runner was not found on GitHub and the runner pod was not found on Kuberntes.")
|
||||
} else if ts := pod.Annotations[AnnotationKeyUnregistrationStartTimestamp]; ts != "" {
|
||||
log.Info("Runner unregistration is in-progress. It can take forever to complete if if it's a static runner constantly running jobs."+
|
||||
" It can also take very long time if it's an ephemeral runner that is running a log-running job.", "error", err)
|
||||
|
||||
return &ctrl.Result{RequeueAfter: retryDelay}, nil
|
||||
} else {
|
||||
// A runner and a runner pod that is created by this version of ARC should match
|
||||
// any of the above branches.
|
||||
//
|
||||
// But we leave this catch-all branch for potential backward-compatibility.
// The caller is expected to take appropriate actions, like annotating the pod as having started the unregistration process,
|
||||
// and retry later.
|
||||
log.V(1).Info("Runner unregistration is being retried later.")
|
||||
|
||||
return &ctrl.Result{RequeueAfter: retryDelay}, nil
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func ensureRunnerPodRegistered(ctx context.Context, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
|
||||
_, hasRunnerID := getAnnotation(pod, AnnotationKeyRunnerID)
|
||||
if runnerPodOrContainerIsStopped(pod) || hasRunnerID {
|
||||
return pod, nil, nil
|
||||
}
|
||||
|
||||
r, err := getRunner(ctx, ghClient, enterprise, organization, repository, runner)
|
||||
if err != nil {
|
||||
return nil, &ctrl.Result{RequeueAfter: 10 * time.Second}, err
|
||||
}
|
||||
|
||||
if r == nil || r.ID == nil {
|
||||
return nil, &ctrl.Result{RequeueAfter: 10 * time.Second}, err
|
||||
}
|
||||
|
||||
id := *r.ID
|
||||
|
||||
updated, err := annotatePodOnce(ctx, c, log, pod, AnnotationKeyRunnerID, fmt.Sprintf("%d", id))
|
||||
if err != nil {
|
||||
return nil, &ctrl.Result{RequeueAfter: 10 * time.Second}, err
|
||||
}
|
||||
|
||||
return updated, nil, nil
|
||||
}
|
||||
|
||||
func getAnnotation(obj client.Object, key string) (string, bool) {
|
||||
if obj.GetAnnotations() == nil {
|
||||
return "", false
|
||||
}
|
||||
|
||||
v, ok := obj.GetAnnotations()[key]
|
||||
|
||||
return v, ok
|
||||
}
|
||||
|
||||
func setAnnotation(meta *metav1.ObjectMeta, key, value string) {
|
||||
if meta.Annotations == nil {
|
||||
meta.Annotations = map[string]string{}
|
||||
}
|
||||
|
||||
meta.Annotations[key] = value
|
||||
}
|
||||
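// The helper below is an editorial sketch (not part of this diff) showing how the
// two annotation helpers above combine into a write-once setter for object metadata;
// it returns false when the key was already present and the original value is kept.
func setAnnotationOnceSketch(meta *metav1.ObjectMeta, key, value string) bool {
	if meta.Annotations != nil {
		if _, ok := meta.Annotations[key]; ok {
			return false // already set; keep the existing value
		}
	}
	setAnnotation(meta, key, value)
	return true
}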
|
||||
func podConditionTransitionTime(pod *corev1.Pod, tpe corev1.PodConditionType, v corev1.ConditionStatus) *metav1.Time {
|
||||
for _, c := range pod.Status.Conditions {
|
||||
if c.Type == tpe && c.Status == v {
|
||||
return &c.LastTransitionTime
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func podConditionTransitionTimeAfter(pod *corev1.Pod, tpe corev1.PodConditionType, d time.Duration) bool {
|
||||
c := podConditionTransitionTime(pod, tpe, corev1.ConditionTrue)
|
||||
if c == nil {
|
||||
return false
|
||||
}
|
||||
|
||||
return c.Add(d).Before(time.Now())
|
||||
}
|
||||
|
||||
func podRunnerID(pod *corev1.Pod) string {
|
||||
id, _ := getAnnotation(pod, AnnotationKeyRunnerID)
|
||||
return id
|
||||
}
|
||||
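// An editorial sketch (not part of this diff) tying the helpers above together:
// a pod that has been Ready for longer than the given timeout but still carries no
// runner ID annotation is treated as having failed registration, mirroring how
// getPodsForOwner combines podRunnerID and podConditionTransitionTimeAfter further below.
func runnerRegistrationTimedOutSketch(pod *corev1.Pod, timeout time.Duration) bool {
	return podRunnerID(pod) == "" && podConditionTransitionTimeAfter(pod, corev1.PodReady, timeout)
}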
|
||||
func getRunnerEnv(pod *corev1.Pod, key string) string {
|
||||
for _, c := range pod.Spec.Containers {
|
||||
if c.Name == containerName {
|
||||
for _, e := range c.Env {
|
||||
if e.Name == key {
|
||||
return e.Value
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func setRunnerEnv(pod *corev1.Pod, key, value string) {
|
||||
for i := range pod.Spec.Containers {
|
||||
c := pod.Spec.Containers[i]
|
||||
if c.Name == containerName {
|
||||
for j, env := range c.Env {
|
||||
if env.Name == key {
|
||||
pod.Spec.Containers[i].Env[j].Value = value
|
||||
return
|
||||
}
|
||||
}
|
||||
pod.Spec.Containers[i].Env = append(c.Env, corev1.EnvVar{Name: key, Value: value})
|
||||
}
|
||||
}
|
||||
}
|
||||
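// A small sketch (editorial, not part of this diff) of how getRunnerEnv feeds the
// ephemeral-runner handling above: ARC decides whether to wait for the runner to
// unregister itself by reading the runner container's ephemeral env var.
func runnerIsEphemeralSketch(pod *corev1.Pod) bool {
	return getRunnerEnv(pod, EnvVarEphemeral) == "true"
}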
|
||||
// unregisterRunner unregisters the runner from GitHub Actions by name.
//
// This function returns:
//
// Case 1. (true, nil) when it has successfully unregistered the runner.
// Case 2. (false, nil) when (2-1.) the runner has already been unregistered, OR (2-2.) the runner will never be created, OR (2-3.) the runner is not created yet and is about to be registered (hence we couldn't see its existence via the GitHub Actions API yet).
// Case 3. (false, err) when it postponed unregistration due to the runner being busy, or it tried to unregister the runner but failed due to
// an error returned by the GitHub API.
//
// When the returned value is "Case 2. (false, nil)", the caller must handle the three possible sub-cases appropriately.
// In other words, those three sub-cases cannot be distinguished by this function alone.
//
// - Case "2-1." can happen when e.g. ARC has successfully unregistered in a previous reconciliation loop, or it was an ephemeral runner that finished its job run (an ephemeral runner is designed to stop after a job run).
// You'd need to maintain the runner state (i.e. whether it's already unregistered or not) somewhere,
// so that you can either not call this function at all if the runner state says it's already unregistered, or determine that it's case "2-1." when you got (false, nil).
//
// - Case "2-2." can happen when e.g. the runner registration token was somehow broken so that `config.sh` within the runner container was never going to succeed.
// Waiting and retrying forever on this case is not a solution, because `config.sh` won't succeed with a wrong token, hence the runner gets stuck in this state forever.
// There isn't a perfect solution to this, but a practical workaround would be to implement a "grace period" on the caller side.
//
// - Case "2-3." can happen when e.g. ARC recreated an ephemeral runner pod in a previous reconciliation loop and then it was requested to delete the runner before the runner came up.
// If handled inappropriately, this can cause a race condition between the deletion of the runner pod and GitHub scheduling a workflow job onto the runner.
//
// Once you have successfully detected case "2-1." or "2-2.", you can safely delete the runner pod because you know that the runner won't come back
// as long as you recreate the runner pod.
//
// If it was "2-3.", you need a workaround to avoid the race condition.
//
// You shall introduce a "grace period" mechanism, similar or equal to that required for "Case 2-2.", so that you only ever
// start the runner pod deletion after it's more and more likely that the runner pod is not coming up.
//
// Beware though, you need extra care to set an appropriate grace period depending on your environment.
// There isn't a single right grace period that works for everyone.
// The longer the grace period is, the earlier a cluster resource shortage can occur due to throttled runner pod deletions,
// while the shorter the grace period is, the more likely you may encounter the race issue.
|
||||
func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo string, id int64) (bool, error) {
|
||||
// For the record, historically ARC did not try to call RemoveRunner on a busy runner, but it's no longer true.
|
||||
// The reason ARC did so was to prevent a runner that is running a job from stopping prematurely.
|
||||
//
|
||||
// However, we learned that RemoveRunner already has an ability to prevent stopping a busy runner,
|
||||
// so ARC doesn't need to do anything special for a graceful runner stop.
|
||||
// It can just call RemoveRunner, and if it returned 200 you're guaranteed that the runner will not automatically come back and
|
||||
// the runner pod is safe for deletion.
|
||||
//
|
||||
// Trying to remove a busy runner can result in errors like the following:
|
||||
// failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" []
|
||||
//
|
||||
// # NOTES
|
||||
//
|
||||
// - It can be "status=offline" at the same time but that's another story.
|
||||
// - After https://github.com/actions-runner-controller/actions-runner-controller/pull/1127, ListRunners responses that are used to
|
||||
// determine if the runner is busy can be more outdated than before, as those responses are now cached for 60 seconds.
|
||||
// - Note that 60 seconds is controlled by the Cache-Control response header provided by GitHub so we don't have a strict control on it but we assume it won't
|
||||
// change from 60 seconds.
|
||||
//
|
||||
// TODO: Probably we can just remove the runner by ID without seeing if the runner is busy, by treating it as busy when a remove-runner call failed with 422?
|
||||
if err := client.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
return true, nil
|
||||
}
|
||||
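// The function below is an editorial sketch of the "grace period" workaround the
// comment above suggests for cases 2-2 and 2-3; it is an assumption, not ARC's
// actual implementation. The idea is to treat (false, nil) as safe-to-delete only
// once the pod has existed longer than a caller-chosen grace period.
func registrationGracePeriodElapsedSketch(pod *corev1.Pod, grace time.Duration) bool {
	if pod == nil {
		// No pod exists, so nothing can register anymore.
		return true
	}
	return time.Now().After(pod.CreationTimestamp.Add(grace))
}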
|
||||
func getRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (*gogithub.Runner, error) {
|
||||
runners, err := client.ListRunners(ctx, enterprise, org, repo)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, runner := range runners {
|
||||
if runner.GetName() == name {
|
||||
return runner, nil
|
||||
}
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
controllers/runner_pod.go (new file, 22 lines)
@@ -0,0 +1,22 @@
|
||||
package controllers
|
||||
|
||||
import corev1 "k8s.io/api/core/v1"
|
||||
|
||||
// Force the runner pod managed by either RunnerDeployment or RunnerSet to have restartPolicy=Never.
// See https://github.com/actions-runner-controller/actions-runner-controller/issues/1369 for more context.
//
// This is to prevent runner pods from getting stuck in Terminating when a K8s node disappeared along with the runner pod and the runner container within it.
//
// Previously we used a restartPolicy of OnFailure, but that turned out to be wrong, so we now set Never.
//
// When the restartPolicy is OnFailure and the node disappears, runner pods on the node seem to get stuck with state.terminated==nil, state.waiting!=nil, and state.lastTerminationState!=nil,
// and will never become Running.
// That's probably because the node onto which the pods were scheduled will never come back, hence the container restart attempts will never succeed,
// and the pods stay stuck waiting for successful restarts forever.
//
// By forcing runner pods to never restart, we hope there will be no chance of pods being stuck waiting.
|
||||
func forceRunnerPodRestartPolicyNever(pod *corev1.Pod) {
|
||||
if pod.Spec.RestartPolicy != corev1.RestartPolicyNever {
|
||||
pod.Spec.RestartPolicy = corev1.RestartPolicyNever
|
||||
}
|
||||
}
|
||||
@@ -20,11 +20,10 @@ import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/go-logr/logr"
|
||||
gogithub "github.com/google/go-github/v39/github"
|
||||
"k8s.io/apimachinery/pkg/util/wait"
|
||||
|
||||
kerrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
@@ -47,17 +46,12 @@ type RunnerPodReconciler struct {
|
||||
Name string
|
||||
RegistrationRecheckInterval time.Duration
|
||||
RegistrationRecheckJitter time.Duration
|
||||
|
||||
UnregistrationRetryDelay time.Duration
|
||||
}
|
||||
|
||||
const (
|
||||
// This name requires at least one slash to work.
|
||||
// See https://github.com/google/knative-gcp/issues/378
|
||||
runnerPodFinalizerName = "actions.summerwind.dev/runner-pod"
|
||||
|
||||
AnnotationKeyLastRegistrationCheckTime = "actions-runner-controller/last-registration-check-time"
|
||||
)
|
||||
|
||||
// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;update;patch;delete
|
||||
// +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch
|
||||
// +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch
|
||||
|
||||
func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
|
||||
@@ -68,14 +62,28 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
|
||||
return ctrl.Result{}, client.IgnoreNotFound(err)
|
||||
}
|
||||
|
||||
_, isRunnerPod := runnerPod.Labels[LabelKeyRunnerSetName]
|
||||
if !isRunnerPod {
|
||||
_, isRunnerPod := runnerPod.Labels[LabelKeyRunner]
|
||||
_, isRunnerSetPod := runnerPod.Labels[LabelKeyRunnerSetName]
|
||||
_, isRunnerDeploymentPod := runnerPod.Labels[LabelKeyRunnerDeploymentName]
|
||||
|
||||
if !isRunnerPod && !isRunnerSetPod && !isRunnerDeploymentPod {
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
var enterprise, org, repo string
|
||||
var envvars []corev1.EnvVar
|
||||
for _, container := range runnerPod.Spec.Containers {
|
||||
if container.Name == "runner" {
|
||||
envvars = container.Env
|
||||
}
|
||||
}
|
||||
|
||||
if len(envvars) == 0 {
|
||||
return ctrl.Result{}, errors.New("Could not determine env vars for runner Pod")
|
||||
}
|
||||
|
||||
var enterprise, org, repo string
|
||||
var isContainerMode bool
|
||||
|
||||
envvars := runnerPod.Spec.Containers[0].Env
|
||||
for _, e := range envvars {
|
||||
switch e.Name {
|
||||
case EnvVarEnterprise:
|
||||
@@ -84,13 +92,20 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
|
||||
org = e.Value
|
||||
case EnvVarRepo:
|
||||
repo = e.Value
|
||||
case "ACTIONS_RUNNER_CONTAINER_HOOKS":
|
||||
isContainerMode = true
|
||||
}
|
||||
}
|
||||
|
||||
if runnerPod.ObjectMeta.DeletionTimestamp.IsZero() {
|
||||
finalizers, added := addFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName)
|
||||
|
||||
if added {
|
||||
var cleanupFinalizersAdded bool
|
||||
if isContainerMode {
|
||||
finalizers, cleanupFinalizersAdded = addFinalizer(finalizers, runnerLinkedResourcesFinalizerName)
|
||||
}
|
||||
|
||||
if added || cleanupFinalizersAdded {
|
||||
newRunner := runnerPod.DeepCopy()
|
||||
newRunner.ObjectMeta.Finalizers = finalizers
|
||||
|
||||
@@ -99,44 +114,57 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
|
||||
log.V(2).Info("Added finalizer")
|
||||
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
} else {
|
||||
finalizers, removed := removeFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName)
|
||||
log.V(2).Info("Seen deletion-timestamp is already set")
|
||||
|
||||
if removed {
|
||||
ok, err := r.unregisterRunner(ctx, enterprise, org, repo, runnerPod.Name)
|
||||
if err != nil {
|
||||
if errors.Is(err, &gogithub.RateLimitError{}) {
|
||||
// We log the underlying error when we fail calling the GitHub API to list or unregister runners,
|
||||
// or the runner is still busy.
|
||||
log.Error(
|
||||
err,
|
||||
fmt.Sprintf(
|
||||
"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
|
||||
retryDelayOnGitHubAPIRateLimitError,
|
||||
),
|
||||
)
|
||||
|
||||
return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
|
||||
}
|
||||
if finalizers, removed := removeFinalizer(runnerPod.ObjectMeta.Finalizers, runnerLinkedResourcesFinalizerName); removed {
|
||||
if err := r.cleanupRunnerLinkedPods(ctx, &runnerPod, log); err != nil {
|
||||
log.Info("Runner-linked pods clean up that has failed due to an error. If this persists, please manually remove the runner-linked pods to unblock ARC", "err", err.Error())
|
||||
return ctrl.Result{Requeue: true, RequeueAfter: 30 * time.Second}, nil
|
||||
}
|
||||
if err := r.cleanupRunnerLinkedSecrets(ctx, &runnerPod, log); err != nil {
|
||||
log.Info("Runner-linked secrets clean up that has failed due to an error. If this persists, please manually remove the runner-linked secrets to unblock ARC", "err", err.Error())
|
||||
return ctrl.Result{Requeue: true, RequeueAfter: 30 * time.Second}, nil
|
||||
}
|
||||
patchedPod := runnerPod.DeepCopy()
|
||||
patchedPod.ObjectMeta.Finalizers = finalizers
|
||||
|
||||
if err := r.Patch(ctx, patchedPod, client.MergeFrom(&runnerPod)); err != nil {
|
||||
log.Error(err, "Failed to update runner for finalizer linked resources removal")
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
|
||||
if !ok {
|
||||
log.V(1).Info("Runner no longer exists on GitHub")
|
||||
// Otherwise the subsequent patch request can revive the removed finalizer and trigger an unnecessary reconciliation
|
||||
runnerPod = *patchedPod
|
||||
}
|
||||
|
||||
finalizers, removed := removeFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName)
|
||||
|
||||
if removed {
|
||||
// In a standard scenario, the upstream controller, like runnerset-controller, ensures this runner is gracefully stopped before the deletion timestamp is set.
// But in case the user manually deleted it for whatever reason,
// we have to ensure it gracefully stops now.
|
||||
updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
|
||||
if res != nil {
|
||||
return *res, err
|
||||
}
|
||||
|
||||
newRunner := runnerPod.DeepCopy()
|
||||
newRunner.ObjectMeta.Finalizers = finalizers
|
||||
patchedPod := updatedPod.DeepCopy()
|
||||
patchedPod.ObjectMeta.Finalizers = finalizers
|
||||
|
||||
if err := r.Patch(ctx, newRunner, client.MergeFrom(&runnerPod)); err != nil {
|
||||
// We commit the removal of the finalizer so that Kubernetes notices it and deletes the pod resource from the cluster.
|
||||
if err := r.Patch(ctx, patchedPod, client.MergeFrom(&runnerPod)); err != nil {
|
||||
log.Error(err, "Failed to update runner for finalizer removal")
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
|
||||
log.Info("Removed runner from GitHub", "repository", repo, "organization", org)
|
||||
log.V(2).Info("Removed finalizer")
|
||||
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
deletionTimeout := 1 * time.Minute
|
||||
@@ -174,246 +202,45 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
// If pod has ended up succeeded we need to restart it
|
||||
// Happens e.g. when dind is in runner and run completes
|
||||
stopped := runnerPod.Status.Phase == corev1.PodSucceeded
|
||||
|
||||
if !stopped {
|
||||
if runnerPod.Status.Phase == corev1.PodRunning {
|
||||
for _, status := range runnerPod.Status.ContainerStatuses {
|
||||
if status.Name != containerName {
|
||||
continue
|
||||
}
|
||||
|
||||
if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 {
|
||||
stopped = true
|
||||
}
|
||||
}
|
||||
}
|
||||
po, res, err := ensureRunnerPodRegistered(ctx, log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
|
||||
if res != nil {
|
||||
return *res, err
|
||||
}
|
||||
|
||||
restart := stopped
|
||||
runnerPod = *po
|
||||
|
||||
var registrationRecheckDelay time.Duration
|
||||
if _, unregistrationRequested := getAnnotation(&runnerPod, AnnotationKeyUnregistrationRequestTimestamp); unregistrationRequested {
|
||||
log.V(2).Info("Progressing unregistration because unregistration-request timestamp is set")
|
||||
|
||||
// all checks done below only decide whether a restart is needed
|
||||
// if a restart was already decided before, there is no need for the checks
|
||||
// saving API calls and scary log messages
|
||||
if !restart {
|
||||
registrationCheckInterval := time.Minute
|
||||
if r.RegistrationRecheckInterval > 0 {
|
||||
registrationCheckInterval = r.RegistrationRecheckInterval
|
||||
// At this point we're sure that DeletionTimestamp is not set yet, but the unregistration process is triggered by an upstream controller like runnerset-controller.
|
||||
//
|
||||
// In a standard scenario, ARC starts the unregistration process before marking the pod for deletion at all,
|
||||
// so that it isn't subject to terminationGracePeriod and can safely take hours to finish its work.
|
||||
_, res, err := tickRunnerGracefulStop(ctx, r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
|
||||
if res != nil {
|
||||
return *res, err
|
||||
}
|
||||
|
||||
lastCheckTimeStr := runnerPod.Annotations[AnnotationKeyLastRegistrationCheckTime]
|
||||
|
||||
var lastCheckTime *time.Time
|
||||
|
||||
if lastCheckTimeStr != "" {
|
||||
t, err := time.Parse(time.RFC3339, lastCheckTimeStr)
|
||||
if err != nil {
|
||||
log.Error(err, "failed to parase last check time %q", lastCheckTimeStr)
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
lastCheckTime = &t
|
||||
}
|
||||
|
||||
// We want to call ListRunners GitHub Actions API only once per runner per minute.
|
||||
// This if block, in conjunction with:
|
||||
// return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil
|
||||
// achieves that.
|
||||
if lastCheckTime != nil {
|
||||
nextCheckTime := lastCheckTime.Add(registrationCheckInterval)
|
||||
now := time.Now()
|
||||
|
||||
// Requeue scheduled by RequeueAfter can happen a bit earlier (like dozens of milliseconds)
|
||||
// so to avoid excessive, ineffective retries, we heuristically ignore the remaining delay in case it is
|
||||
// shorter than 1s
|
||||
requeueAfter := nextCheckTime.Sub(now) - time.Second
|
||||
if requeueAfter > 0 {
|
||||
log.Info(
|
||||
fmt.Sprintf("Skipped registration check because it's deferred until %s. Retrying in %s at latest", nextCheckTime, requeueAfter),
|
||||
"lastRegistrationCheckTime", lastCheckTime,
|
||||
"registrationCheckInterval", registrationCheckInterval,
|
||||
)
|
||||
|
||||
// Without RequeueAfter, the controller may not retry as scheduled. Instead, it must wait until the
// next sync period passes, which can be much later than nextCheckTime.
//
// We need to requeue on this reconciliation even though we have already scheduled the initial
// requeue previously with `return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil`.
// Apparently, the workqueue used by controller-runtime seems to deduplicate and reset the delay on
// other requeues, so the initial scheduled requeue may have been reset due to a requeue on
// a spec/status change.
|
||||
return ctrl.Result{RequeueAfter: requeueAfter}, nil
|
||||
}
|
||||
}
|
||||
|
||||
notFound := false
|
||||
offline := false
|
||||
|
||||
_, err := r.GitHubClient.IsRunnerBusy(ctx, enterprise, org, repo, runnerPod.Name)
|
||||
|
||||
currentTime := time.Now()
|
||||
|
||||
if err != nil {
|
||||
var notFoundException *github.RunnerNotFound
|
||||
var offlineException *github.RunnerOffline
|
||||
if errors.As(err, ¬FoundException) {
|
||||
notFound = true
|
||||
} else if errors.As(err, &offlineException) {
|
||||
offline = true
|
||||
} else {
|
||||
var e *gogithub.RateLimitError
|
||||
if errors.As(err, &e) {
|
||||
// We log the underlying error when we fail calling the GitHub API to list or unregister runners,
|
||||
// or the runner is still busy.
|
||||
log.Error(
|
||||
err,
|
||||
fmt.Sprintf(
|
||||
"Failed to check if runner is busy due to Github API rate limit. Retrying in %s to avoid excessive GitHub API calls",
|
||||
retryDelayOnGitHubAPIRateLimitError,
|
||||
),
|
||||
)
|
||||
|
||||
return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
|
||||
}
|
||||
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
}
|
||||
|
||||
registrationTimeout := 10 * time.Minute
|
||||
durationAfterRegistrationTimeout := currentTime.Sub(runnerPod.CreationTimestamp.Add(registrationTimeout))
|
||||
registrationDidTimeout := durationAfterRegistrationTimeout > 0
|
||||
|
||||
if notFound {
|
||||
if registrationDidTimeout {
|
||||
log.Info(
|
||||
"Runner failed to register itself to GitHub in timely manner. "+
|
||||
"Recreating the pod to see if it resolves the issue. "+
|
||||
"CAUTION: If you see this a lot, you should investigate the root cause. "+
|
||||
"See https://github.com/actions-runner-controller/actions-runner-controller/issues/288",
|
||||
"podCreationTimestamp", runnerPod.CreationTimestamp,
|
||||
"currentTime", currentTime,
|
||||
"configuredRegistrationTimeout", registrationTimeout,
|
||||
)
|
||||
|
||||
restart = true
|
||||
} else {
|
||||
log.V(1).Info(
|
||||
"Runner pod exists but we failed to check if runner is busy. Apparently it still needs more time.",
|
||||
"runnerName", runnerPod.Name,
|
||||
)
|
||||
}
|
||||
} else if offline {
|
||||
if registrationDidTimeout {
|
||||
log.Info(
|
||||
"Already existing GitHub runner still appears offline . "+
|
||||
"Recreating the pod to see if it resolves the issue. "+
|
||||
"CAUTION: If you see this a lot, you should investigate the root cause. ",
|
||||
"podCreationTimestamp", runnerPod.CreationTimestamp,
|
||||
"currentTime", currentTime,
|
||||
"configuredRegistrationTimeout", registrationTimeout,
|
||||
)
|
||||
|
||||
restart = true
|
||||
} else {
|
||||
log.V(1).Info(
|
||||
"Runner pod exists but the GitHub runner appears to be still offline. Waiting for runner to get online ...",
|
||||
"runnerName", runnerPod.Name,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
if (notFound || offline) && !registrationDidTimeout {
|
||||
registrationRecheckJitter := 10 * time.Second
|
||||
if r.RegistrationRecheckJitter > 0 {
|
||||
registrationRecheckJitter = r.RegistrationRecheckJitter
|
||||
}
|
||||
|
||||
registrationRecheckDelay = registrationCheckInterval + wait.Jitter(registrationRecheckJitter, 0.1)
|
||||
}
|
||||
}
|
||||
|
||||
// Don't do anything if there's no need to restart the runner
|
||||
if !restart {
|
||||
// This guard enables us to update runner.Status.Phase to `Running` only after
|
||||
// the runner is registered to GitHub.
|
||||
if registrationRecheckDelay > 0 {
|
||||
log.V(1).Info(fmt.Sprintf("Rechecking the runner registration in %s", registrationRecheckDelay))
|
||||
|
||||
updated := runnerPod.DeepCopy()
|
||||
t := time.Now().Format(time.RFC3339)
|
||||
updated.Annotations[AnnotationKeyLastRegistrationCheckTime] = t
|
||||
|
||||
if err := r.Patch(ctx, updated, client.MergeFrom(&runnerPod)); err != nil {
|
||||
log.Error(err, "Failed to update runner pod annotation for LastRegistrationCheckTime")
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
|
||||
return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil
|
||||
}
|
||||
|
||||
// Seeing this message, you can expect the runner to become `Running` soon.
|
||||
log.Info(
|
||||
"Runner appears to have registered and running.",
|
||||
"podCreationTimestamp", runnerPod.CreationTimestamp,
|
||||
)
|
||||
// At this point we are sure that the runner has successfully unregistered, hence is safe to be deleted.
|
||||
// But we don't delete the pod here. Instead, we let the upstream controller/parent object delete this pod as
|
||||
// a part of a cascade deletion.
|
||||
// This is to avoid a parent object, like statefulset, to recreate the deleted pod.
|
||||
// If the pod was recreated, it will start a registration process and that may race with the statefulset deleting the pod.
|
||||
log.V(2).Info("Unregistration seems complete")
|
||||
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
// Delete current pod if recreation is needed
|
||||
if err := r.Delete(ctx, &runnerPod); err != nil {
|
||||
log.Error(err, "Failed to delete pod resource")
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
|
||||
r.Recorder.Event(&runnerPod, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", runnerPod.Name))
|
||||
log.Info("Deleted runner pod", "name", runnerPod.Name)
|
||||
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
func (r *RunnerPodReconciler) unregisterRunner(ctx context.Context, enterprise, org, repo, name string) (bool, error) {
|
||||
runners, err := r.GitHubClient.ListRunners(ctx, enterprise, org, repo)
|
||||
if err != nil {
|
||||
return false, err
|
||||
func (r *RunnerPodReconciler) unregistrationRetryDelay() time.Duration {
|
||||
retryDelay := DefaultUnregistrationRetryDelay
|
||||
|
||||
if r.UnregistrationRetryDelay > 0 {
|
||||
retryDelay = r.UnregistrationRetryDelay
|
||||
}
|
||||
|
||||
var busy bool
|
||||
|
||||
id := int64(0)
|
||||
for _, runner := range runners {
|
||||
if runner.GetName() == name {
|
||||
// Sometimes a runner can be stuck "busy" even though it is already "offline".
|
||||
// Thus removing the condition on status can block the runner pod from being terminated forever.
|
||||
busy = runner.GetBusy()
|
||||
if runner.GetStatus() != "offline" && busy {
|
||||
r.Log.Info("This runner will delay the runner pod deletion and the runner deregistration until it becomes either offline or non-busy", "name", runner.GetName(), "status", runner.GetStatus(), "busy", runner.GetBusy())
|
||||
return false, fmt.Errorf("runner is busy")
|
||||
}
|
||||
id = runner.GetID()
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if id == int64(0) {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// Sometimes a runner can be stuck "busy" even though it is already "offline".
|
||||
// Trying to remove the offline but busy runner can result in errors like the following:
|
||||
// failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" []
|
||||
if !busy {
|
||||
if err := r.GitHubClient.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
|
||||
return false, err
|
||||
}
|
||||
}
|
||||
|
||||
return true, nil
|
||||
return retryDelay
|
||||
}
|
||||
|
||||
func (r *RunnerPodReconciler) SetupWithManager(mgr ctrl.Manager) error {
|
||||
@@ -429,3 +256,93 @@ func (r *RunnerPodReconciler) SetupWithManager(mgr ctrl.Manager) error {
|
||||
Named(name).
|
||||
Complete(r)
|
||||
}
|
||||
|
||||
func (r *RunnerPodReconciler) cleanupRunnerLinkedPods(ctx context.Context, pod *corev1.Pod, log logr.Logger) error {
|
||||
var runnerLinkedPodList corev1.PodList
|
||||
if err := r.List(ctx, &runnerLinkedPodList, client.InNamespace(pod.Namespace), client.MatchingLabels(
|
||||
map[string]string{
|
||||
"runner-pod": pod.ObjectMeta.Name,
|
||||
},
|
||||
)); err != nil {
|
||||
return fmt.Errorf("failed to list runner-linked pods: %w", err)
|
||||
}
|
||||
|
||||
var (
|
||||
wg sync.WaitGroup
|
||||
errs []error
|
||||
)
|
||||
for _, p := range runnerLinkedPodList.Items {
|
||||
if !p.ObjectMeta.DeletionTimestamp.IsZero() {
|
||||
continue
|
||||
}
|
||||
|
||||
p := p
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
if err := r.Delete(ctx, &p); err != nil {
|
||||
if kerrors.IsNotFound(err) || kerrors.IsGone(err) {
|
||||
return
|
||||
}
|
||||
errs = append(errs, fmt.Errorf("delete pod %q error: %v", p.ObjectMeta.Name, err))
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
if len(errs) > 0 {
|
||||
for _, err := range errs {
|
||||
log.Error(err, "failed to remove runner-linked pod")
|
||||
}
|
||||
return errors.New("failed to remove some runner linked pods")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *RunnerPodReconciler) cleanupRunnerLinkedSecrets(ctx context.Context, pod *corev1.Pod, log logr.Logger) error {
|
||||
log.V(2).Info("Listing runner-linked secrets to be deleted", "ns", pod.Namespace)
|
||||
|
||||
var runnerLinkedSecretList corev1.SecretList
|
||||
if err := r.List(ctx, &runnerLinkedSecretList, client.InNamespace(pod.Namespace), client.MatchingLabels(
|
||||
map[string]string{
|
||||
"runner-pod": pod.ObjectMeta.Name,
|
||||
},
|
||||
)); err != nil {
|
||||
return fmt.Errorf("failed to list runner-linked secrets: %w", err)
|
||||
}
|
||||
|
||||
var (
|
||||
wg sync.WaitGroup
|
||||
errs []error
|
||||
)
|
||||
for _, s := range runnerLinkedSecretList.Items {
|
||||
if !s.ObjectMeta.DeletionTimestamp.IsZero() {
|
||||
continue
|
||||
}
|
||||
|
||||
s := s
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
if err := r.Delete(ctx, &s); err != nil {
|
||||
if kerrors.IsNotFound(err) || kerrors.IsGone(err) {
|
||||
return
|
||||
}
|
||||
errs = append(errs, fmt.Errorf("delete secret %q error: %v", s.ObjectMeta.Name, err))
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
if len(errs) > 0 {
|
||||
for _, err := range errs {
|
||||
log.Error(err, "failed to remove runner-linked secret")
|
||||
}
|
||||
return errors.New("failed to remove some runner linked secrets")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
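// Editorial note with a hedged sketch (not part of this diff): the cleanup helpers
// above append to a shared errs slice from multiple goroutines without
// synchronization. A mutex-guarded collector, as sketched below for an arbitrary
// set of objects, avoids that data race while keeping the same fan-out pattern.
func deleteAllConcurrentlySketch(ctx context.Context, c client.Client, objs []client.Object) error {
	var (
		wg   sync.WaitGroup
		mu   sync.Mutex
		errs []error
	)
	for _, o := range objs {
		o := o
		wg.Add(1)
		go func() {
			defer wg.Done()
			if err := c.Delete(ctx, o); err != nil && !kerrors.IsNotFound(err) && !kerrors.IsGone(err) {
				mu.Lock()
				errs = append(errs, fmt.Errorf("delete %q: %w", o.GetName(), err))
				mu.Unlock()
			}
		}()
	}
	wg.Wait()

	if len(errs) > 0 {
		return fmt.Errorf("failed to delete %d object(s): %v", len(errs), errs)
	}
	return nil
}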
|
||||
controllers/runner_pod_owner.go (new file, 600 lines)
@@ -0,0 +1,600 @@
|
||||
package controllers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
"github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1"
|
||||
"github.com/go-logr/logr"
|
||||
appsv1 "k8s.io/api/apps/v1"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/api/errors"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
)
|
||||
|
||||
type podsForOwner struct {
|
||||
total int
|
||||
completed int
|
||||
running int
|
||||
terminating int
|
||||
regTimeout int
|
||||
pending int
|
||||
templateHash string
|
||||
runner *v1alpha1.Runner
|
||||
statefulSet *appsv1.StatefulSet
|
||||
owner owner
|
||||
object client.Object
|
||||
synced bool
|
||||
pods []corev1.Pod
|
||||
}
|
||||
|
||||
type owner interface {
|
||||
client.Object
|
||||
|
||||
pods(context.Context, client.Client) ([]corev1.Pod, error)
|
||||
templateHash() (string, bool)
|
||||
withAnnotation(k, v string) client.Object
|
||||
synced() bool
|
||||
}
|
||||
|
||||
type ownerRunner struct {
|
||||
client.Object
|
||||
|
||||
Log logr.Logger
|
||||
Runner *v1alpha1.Runner
|
||||
}
|
||||
|
||||
var _ owner = (*ownerRunner)(nil)
|
||||
|
||||
func (r *ownerRunner) pods(ctx context.Context, c client.Client) ([]corev1.Pod, error) {
|
||||
var pod corev1.Pod
|
||||
|
||||
if err := c.Get(ctx, types.NamespacedName{Namespace: r.Runner.Namespace, Name: r.Runner.Name}, &pod); err != nil {
|
||||
if errors.IsNotFound(err) {
|
||||
return nil, nil
|
||||
}
|
||||
r.Log.Error(err, "Failed to get pod managed by runner")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return []corev1.Pod{pod}, nil
|
||||
}
|
||||
|
||||
func (r *ownerRunner) templateHash() (string, bool) {
|
||||
return getRunnerTemplateHash(r.Runner)
|
||||
}
|
||||
|
||||
func (r *ownerRunner) withAnnotation(k, v string) client.Object {
|
||||
copy := r.Runner.DeepCopy()
|
||||
setAnnotation(©.ObjectMeta, k, v)
|
||||
return copy
|
||||
}
|
||||
|
||||
func (r *ownerRunner) synced() bool {
|
||||
return r.Runner.Status.Phase != ""
|
||||
}
|
||||
|
||||
type ownerStatefulSet struct {
|
||||
client.Object
|
||||
|
||||
Log logr.Logger
|
||||
StatefulSet *appsv1.StatefulSet
|
||||
}
|
||||
|
||||
var _ owner = (*ownerStatefulSet)(nil)
|
||||
|
||||
func (s *ownerStatefulSet) pods(ctx context.Context, c client.Client) ([]corev1.Pod, error) {
|
||||
var podList corev1.PodList
|
||||
|
||||
if err := c.List(ctx, &podList, client.MatchingLabels(s.StatefulSet.Spec.Template.ObjectMeta.Labels)); err != nil {
|
||||
s.Log.Error(err, "Failed to list pods managed by statefulset")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var pods []corev1.Pod
|
||||
|
||||
for _, pod := range podList.Items {
|
||||
if owner := metav1.GetControllerOf(&pod); owner == nil || owner.Kind != "StatefulSet" || owner.Name != s.StatefulSet.Name {
|
||||
continue
|
||||
}
|
||||
|
||||
pods = append(pods, pod)
|
||||
}
|
||||
|
||||
return pods, nil
|
||||
}
|
||||
|
||||
func (s *ownerStatefulSet) templateHash() (string, bool) {
|
||||
return getRunnerTemplateHash(s.StatefulSet)
|
||||
}
|
||||
|
||||
func (s *ownerStatefulSet) withAnnotation(k, v string) client.Object {
|
||||
copy := s.StatefulSet.DeepCopy()
|
||||
setAnnotation(©.ObjectMeta, k, v)
|
||||
return copy
|
||||
}
|
||||
|
||||
func (s *ownerStatefulSet) synced() bool {
|
||||
var replicas int32 = 1
|
||||
if s.StatefulSet.Spec.Replicas != nil {
|
||||
replicas = *s.StatefulSet.Spec.Replicas
|
||||
}
|
||||
|
||||
if s.StatefulSet.Status.Replicas != replicas {
|
||||
s.Log.V(2).Info("Waiting for statefulset to sync", "desiredReplicas", replicas, "currentReplicas", s.StatefulSet.Status.Replicas)
|
||||
return false
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
func getPodsForOwner(ctx context.Context, c client.Client, log logr.Logger, o client.Object) (*podsForOwner, error) {
|
||||
var (
|
||||
owner owner
|
||||
runner *v1alpha1.Runner
|
||||
statefulSet *appsv1.StatefulSet
|
||||
object client.Object
|
||||
)
|
||||
|
||||
switch v := o.(type) {
|
||||
case *v1alpha1.Runner:
|
||||
owner = &ownerRunner{
|
||||
Log: log,
|
||||
Runner: v,
|
||||
Object: v,
|
||||
}
|
||||
runner = v
|
||||
object = v
|
||||
case *appsv1.StatefulSet:
|
||||
owner = &ownerStatefulSet{
|
||||
Log: log,
|
||||
StatefulSet: v,
|
||||
Object: v,
|
||||
}
|
||||
statefulSet = v
|
||||
object = v
|
||||
default:
|
||||
return nil, fmt.Errorf("BUG: Unsupported runner pods owner %v(%T)", v, v)
|
||||
}
|
||||
|
||||
pods, err := owner.pods(ctx, c)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var completed, running, terminating, regTimeout, pending, total int
|
||||
|
||||
for _, pod := range pods {
|
||||
total++
|
||||
|
||||
if runnerPodOrContainerIsStopped(&pod) {
|
||||
completed++
|
||||
} else if pod.Status.Phase == corev1.PodRunning {
|
||||
if podRunnerID(&pod) == "" && podConditionTransitionTimeAfter(&pod, corev1.PodReady, registrationTimeout) {
|
||||
log.Info(
|
||||
"Runner failed to register itself to GitHub in timely manner. "+
|
||||
"Recreating the pod to see if it resolves the issue. "+
|
||||
"CAUTION: If you see this a lot, you should investigate the root cause. "+
|
||||
"See https://github.com/actions-runner-controller/actions-runner-controller/issues/288",
|
||||
"creationTimestamp", pod.CreationTimestamp,
|
||||
"readyTransitionTime", podConditionTransitionTime(&pod, corev1.PodReady, corev1.ConditionTrue),
|
||||
"configuredRegistrationTimeout", registrationTimeout,
|
||||
)
|
||||
|
||||
regTimeout++
|
||||
} else {
|
||||
running++
|
||||
}
|
||||
} else if !pod.DeletionTimestamp.IsZero() {
|
||||
terminating++
|
||||
} else {
|
||||
// pending includes running but timedout runner's pod too
|
||||
pending++
|
||||
}
|
||||
}
|
||||
|
||||
templateHash, ok := owner.templateHash()
|
||||
if !ok {
|
||||
log.Info("Failed to get template hash of statefulset. It must be in an invalid state. Please manually delete the statefulset so that it is recreated")
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
synced := owner.synced()
|
||||
|
||||
return &podsForOwner{
|
||||
total: total,
|
||||
completed: completed,
|
||||
running: running,
|
||||
terminating: terminating,
|
||||
regTimeout: regTimeout,
|
||||
pending: pending,
|
||||
templateHash: templateHash,
|
||||
runner: runner,
|
||||
statefulSet: statefulSet,
|
||||
owner: owner,
|
||||
object: object,
|
||||
synced: synced,
|
||||
pods: pods,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func getRunnerTemplateHash(r client.Object) (string, bool) {
|
||||
hash, ok := r.GetLabels()[LabelKeyRunnerTemplateHash]
|
||||
|
||||
return hash, ok
|
||||
}
|
||||
|
||||
type state struct {
|
||||
podsForOwners map[string][]*podsForOwner
|
||||
lastSyncTime *time.Time
|
||||
}
|
||||
|
||||
type result struct {
|
||||
currentObjects []*podsForOwner
|
||||
}
|
||||
|
||||
// Why must `create` be a function rather than a client.Object? That's because we use it to create one or more objects on scale up.
//
// We use client.Create to create the necessary number of client.Object. client.Create mutates the passed object on a successful creation.
// It seems to set .Revision at least, and the existence of .Revision makes client.Create fail due to the K8s restriction that an object being just created
// can't have .Revision.
// Now, imagine that you are to add 2 runner replicas on scale up.
// We create one resource object per replica, which ends up calling client.Create twice.
// If we were reusing one client.Object across the client.Create calls, only the first call would succeed.
// The second call would fail because the first call mutated the client.Object to have .Revision.
// Passing a factory function of client.Object and creating a brand-new client.Object per client.Create call resolves this issue,
// allowing us to create two or more replicas in one reconciliation loop without being rejected by K8s.
|
||||
func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger, effectiveTime *metav1.Time, newDesiredReplicas int, create func() client.Object, ephemeral bool, owners []client.Object) (*result, error) {
|
||||
state, err := collectPodsForOwners(ctx, c, log, owners)
|
||||
if err != nil || state == nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
podsForOwnersPerTemplateHash, lastSyncTime := state.podsForOwners, state.lastSyncTime
|
||||
|
||||
// # Why do we recreate statefulsets instead of updating their desired replicas?
|
||||
//
|
||||
// A statefulset cannot add more pods when not all the pods are running.
|
||||
// Our ephemeral runners' pods that have finished running become Completed(Phase=Succeeded).
|
||||
// So creating one statefulset per a batch of ephemeral runners is the only way for us to add more replicas.
|
||||
//
|
||||
// # Why do we recreate statefulsets instead of updating fields other than replicas?
|
||||
//
|
||||
// That's because Kubernetes doesn't allow updating anything other than replicas, template, and updateStrategy.
|
||||
// And the nature of ephemeral runner pods requires you to create a statefulset per a batch of new runner pods so
|
||||
// we have really no other choice.
|
||||
//
|
||||
// If you're curious, the below is the error message you will get when you tried to update forbidden StatefulSet field(s):
|
||||
//
|
||||
// 2021-06-13T07:19:52.760Z ERROR actions-runner-controller.runnerset Failed to patch statefulset
|
||||
// {"runnerset": "default/example-runnerset", "error": "StatefulSet.apps \"example-runnerset\" is invalid: s
|
||||
// pec: Forbidden: updates to statefulset spec for fields other than 'replicas', 'template', and 'updateStrategy'
|
||||
// are forbidden"}
|
||||
//
|
||||
// Even though the error message includes "Forbidden", this error's reason is "Invalid".
|
||||
// So we used to match these errors by using errors.IsInvalid. But that's another story...
|
||||
|
||||
desiredTemplateHash, ok := getRunnerTemplateHash(create())
|
||||
if !ok {
|
||||
log.Info("Failed to get template hash of desired owner resource. It must be in an invalid state. Please manually delete the owner so that it is recreated")
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
currentObjects := podsForOwnersPerTemplateHash[desiredTemplateHash]
|
||||
|
||||
sort.SliceStable(currentObjects, func(i, j int) bool {
|
||||
return currentObjects[i].owner.GetCreationTimestamp().Time.Before(currentObjects[j].owner.GetCreationTimestamp().Time)
|
||||
})
|
||||
|
||||
if len(currentObjects) > 0 {
|
||||
timestampFirst := currentObjects[0].owner.GetCreationTimestamp()
|
||||
timestampLast := currentObjects[len(currentObjects)-1].owner.GetCreationTimestamp()
|
||||
var names []string
|
||||
for _, ss := range currentObjects {
|
||||
names = append(names, ss.owner.GetName())
|
||||
}
|
||||
log.V(2).Info("Detected some current object(s)", "creationTimestampFirst", timestampFirst, "creationTimestampLast", timestampLast, "names", names)
|
||||
}
|
||||
|
||||
var total, terminating, pending, running, regTimeout int
|
||||
|
||||
for _, ss := range currentObjects {
|
||||
total += ss.total
|
||||
terminating += ss.terminating
|
||||
pending += ss.pending
|
||||
running += ss.running
|
||||
regTimeout += ss.regTimeout
|
||||
}
|
||||
|
||||
numOwners := len(owners)
|
||||
|
||||
var hashes []string
|
||||
for h := range state.podsForOwners {
|
||||
hashes = append(hashes, h)
|
||||
}
|
||||
|
||||
log.V(2).Info(
|
||||
"Found some pods across owner(s)",
|
||||
"total", total,
|
||||
"terminating", terminating,
|
||||
"pending", pending,
|
||||
"running", running,
|
||||
"regTimeout", regTimeout,
|
||||
"desired", newDesiredReplicas,
|
||||
"owners", numOwners,
|
||||
)
|
||||
|
||||
maybeRunning := pending + running
|
||||
|
||||
wantMoreRunners := newDesiredReplicas > maybeRunning
|
||||
alreadySyncedAfterEffectiveTime := ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time)
|
||||
runnerPodRecreationDelayAfterWebhookScale := lastSyncTime != nil && time.Now().Before(lastSyncTime.Add(DefaultRunnerPodRecreationDelayAfterWebhookScale))
|
||||
|
||||
log = log.WithValues(
|
||||
"lastSyncTime", lastSyncTime,
|
||||
"effectiveTime", effectiveTime,
|
||||
"templateHashDesired", desiredTemplateHash,
|
||||
"replicasDesired", newDesiredReplicas,
|
||||
"replicasPending", pending,
|
||||
"replicasRunning", running,
|
||||
"replicasMaybeRunning", maybeRunning,
|
||||
"templateHashObserved", hashes,
|
||||
)
|
||||
|
||||
if wantMoreRunners && alreadySyncedAfterEffectiveTime && runnerPodRecreationDelayAfterWebhookScale {
|
||||
// This is our special handling of the situation for ephemeral runners only.
|
||||
//
|
||||
// Handling static runners this way would result in scale-up not working at all,
// because then any scale-up attempt for static runners would fall within this condition, for two reasons.
// First, static (persistent) runners will never restart on their own.
// Second, we don't update EffectiveTime for static runners.
//
// We do need to skip this condition for static runners, and that's why we take the `ephemeral` flag into account when
// computing `alreadySyncedAfterEffectiveTime`.
|
||||
|
||||
log.V(2).Info(
|
||||
"Detected that some ephemeral runners have disappeared. " +
|
||||
"Usually this is due to that ephemeral runner completions " +
|
||||
"so ARC does not create new runners until EffectiveTime is updated, or DefaultRunnerPodRecreationDelayAfterWebhookScale is elapsed.")
|
||||
} else if wantMoreRunners {
|
||||
if alreadySyncedAfterEffectiveTime && !runnerPodRecreationDelayAfterWebhookScale {
|
||||
log.V(2).Info("Adding more replicas because DefaultRunnerPodRecreationDelayAfterWebhookScale has been passed")
|
||||
}
|
||||
|
||||
num := newDesiredReplicas - maybeRunning
|
||||
|
||||
for i := 0; i < num; i++ {
|
||||
// Add more replicas
|
||||
if err := c.Create(ctx, create()); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
log.V(1).Info("Created replica(s)",
|
||||
"created", num,
|
||||
)
|
||||
|
||||
return nil, nil
|
||||
} else if newDesiredReplicas <= running {
|
||||
// If you use ephemeral runners with webhook-based autoscaler and the runner controller is working normally,
|
||||
// you're unlikely to fall into this branch.
|
||||
//
|
||||
// That's because all the stakeholders work like this:
|
||||
//
|
||||
// 1. A runner pod completes with the runner container exiting with code 0
|
||||
// 2. ARC runner controller detects the pod completion, marks the owner(runner or statefulset) resource on k8s for deletion (=Runner.DeletionTimestamp becomes non-zero)
|
||||
// 3. GitHub triggers a corresponding workflow_job "complete" webhook event
|
||||
// 4. ARC github-webhook-server (webhook-based autoscaler) receives the webhook event updates HRA with removing the oldest capacity reservation
|
||||
// 5. ARC horizontalrunnerautoscaler updates RunnerDeployment's desired replicas based on capacity reservations
|
||||
// 6. ARC runnerdeployment controller updates RunnerReplicaSet's desired replicas
|
||||
// 7. (We're here) ARC runnerset or runnerreplicaset controller starts reconciling the owner resource (statefulset or runner)
|
||||
//
|
||||
// In a normally working ARC installation, the runner that was used to run the workflow job should already have been
|
||||
// marked for deletion by the runner controller.
|
||||
// This runnerreplicaset controller doesn't count marked runners into the `running` value, hence you're unlikely to
|
||||
// fall into this branch when you're using ephemeral runners with webhook-based-autoscaler.
|
||||
|
||||
var retained int
|
||||
|
||||
var delete []*podsForOwner
|
||||
for i := len(currentObjects) - 1; i >= 0; i-- {
|
||||
ss := currentObjects[i]
|
||||
|
||||
if ss.running == 0 || retained >= newDesiredReplicas {
|
||||
// In case the desired number of replicas is already satisfied by owners up to i-1, or this owner has no running pods,
// this owner can be considered safe for deletion.
// Note that we already waited on this owner to create pods by waiting for
// `.Status.Replicas` (= the total number of pods managed by the owner, regardless of whether the runner is Running or Completed) to match the desired replicas in a previous step.
// So `.running == 0` means "the owner has created the desired number of pods before, and all of them are completed now".
|
||||
delete = append(delete, ss)
|
||||
} else if retained < newDesiredReplicas {
|
||||
retained += ss.running
|
||||
}
|
||||
}
|
||||
|
||||
if retained == newDesiredReplicas {
|
||||
for _, ss := range delete {
|
||||
log := log.WithValues("owner", types.NamespacedName{Namespace: ss.owner.GetNamespace(), Name: ss.owner.GetName()})
|
||||
// Statefulset termination process 1/4: Set unregistrationRequestTimestamp only after all the pods managed by the statefulset have
// started the unregistration process.
//
// NOTE: We just mark it instead of immediately starting the deletion process.
// Otherwise, the runner pod may hit terminationGracePeriod before the unregistration completes (the max terminationGracePeriod is limited to 1h by K8s, and a job can run for longer than that),
|
||||
// or actions/runner may potentially misbehave on SIGTERM immediately sent by K8s.
|
||||
// We'd better unregister first and then start a pod deletion process.
|
||||
// The annotation works as a mark to start the pod unregistration and deletion process of ours.
|
||||
|
||||
if _, ok := getAnnotation(ss.owner, AnnotationKeyUnregistrationRequestTimestamp); ok {
|
||||
log.V(2).Info("Still waiting for runner pod(s) unregistration to complete")
|
||||
|
||||
continue
|
||||
}
|
||||
|
||||
for _, po := range ss.pods {
|
||||
if _, err := annotatePodOnce(ctx, c, log, &po, AnnotationKeyUnregistrationRequestTimestamp, time.Now().Format(time.RFC3339)); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
updated := ss.owner.withAnnotation(AnnotationKeyUnregistrationRequestTimestamp, time.Now().Format(time.RFC3339))
|
||||
if err := c.Patch(ctx, updated, client.MergeFrom(ss.owner)); err != nil {
|
||||
log.Error(err, fmt.Sprintf("Failed to patch owner to have %s annotation", AnnotationKeyUnregistrationRequestTimestamp))
|
||||
return nil, err
|
||||
}
|
||||
|
||||
log.V(2).Info("Redundant owner has been annotated to start the unregistration before deletion")
|
||||
}
|
||||
} else if retained > newDesiredReplicas {
|
||||
log.V(2).Info("Waiting sync before scale down", "retained", retained, "newDesiredReplicas", newDesiredReplicas)
|
||||
|
||||
return nil, nil
|
||||
} else {
|
||||
log.Info("Invalid state", "retained", retained, "newDesiredReplicas", newDesiredReplicas)
|
||||
panic("crashed due to invalid state")
|
||||
}
|
||||
}
|
||||
|
||||
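
    // Clean up owners created from an outdated pod template: delete the first one found (unless it
    // is already terminating) and stop this reconciliation so that the next sync observes the
    // updated state.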
    for _, sss := range podsForOwnersPerTemplateHash {
        for _, ss := range sss {
            if ss.templateHash != desiredTemplateHash {
                if ss.owner.GetDeletionTimestamp().IsZero() {
                    if err := c.Delete(ctx, ss.object); err != nil {
                        log.Error(err, "Unable to delete object")
                        return nil, err
                    }

                    log.V(2).Info("Deleted redundant and outdated object")
                }

                return nil, nil
            }
        }
    }

    return &result{
        currentObjects: currentObjects,
    }, nil
}
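
// Editorial sketch (not part of ARC): the scale-down selection above, restated in isolation so the
// invariant is easier to follow. ownerCounts and selectOwnersForDeletion are hypothetical names used
// only for this illustration; the real controller walks []*podsForOwner and annotates the selected
// owners rather than returning them.
type ownerCounts struct{ running int }

func selectOwnersForDeletion(owners []ownerCounts, desired int) (toDelete []ownerCounts, retained int) {
    // Walk from the end of the slice, retaining running pods until the desired count is covered.
    for i := len(owners) - 1; i >= 0; i-- {
        o := owners[i]
        if o.running == 0 || retained >= desired {
            // Only completed pods, or the desired count is already covered: safe to delete.
            toDelete = append(toDelete, o)
        } else {
            retained += o.running
        }
    }
    // retained == desired: proceed with deletion; retained > desired: wait for another sync.
    return toDelete, retained
}
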
func collectPodsForOwners(ctx context.Context, c client.Client, log logr.Logger, owners []client.Object) (*state, error) {
    podsForOwnerPerTemplateHash := map[string][]*podsForOwner{}

    // lastSyncTime becomes non-nil only when there are one or more owners, and hence the same number of runner pods.
    // It's used to prevent the runnerset-controller from recreating "completed ephemeral runners".
    // This is needed to prevent runners from being terminated prematurely.
    // See https://github.com/actions-runner-controller/actions-runner-controller/issues/911 for more context.
    //
    // This becomes nil when there are zero statefulsets. That's fine because then there should be zero statefulsets to be recreated either,
    // hence we don't need to guard with lastSyncTime.
    var lastSyncTime *time.Time
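
    // The statefulset termination steps appear in reverse order inside this loop: owners that are
    // already being deleted are skipped first (step 4/4), owners whose unregistration has completed
    // are deleted next (step 3/4), and owners whose pods have all finished unregistering are then
    // marked as unregistration-complete (step 2/4).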
    for _, ss := range owners {
        log := log.WithValues("owner", types.NamespacedName{Namespace: ss.GetNamespace(), Name: ss.GetName()})

        res, err := getPodsForOwner(ctx, c, log, ss)
        if err != nil {
            return nil, err
        }

        if res.templateHash == "" {
            log.Info("validation error: runner pod owner must have template hash", "object", res.object)

            return nil, nil
        }

        // Statefulset termination process 4/4: Let Kubernetes cascade-delete the statefulset and the pods.
        //
        // If the runner is already marked for deletion (=has a non-zero deletion timestamp) by the runner controller (which can be caused by an ephemeral runner completing)
        // or by this controller (in case it was deleted in the previous reconciliation loop),
        // we don't need to bother calling the GitHub API to re-mark the runner for deletion.
        // Just hold on, and the runners will disappear as long as the runner controller is up and running.
        if !res.owner.GetDeletionTimestamp().IsZero() {
            continue
        }

        // Statefulset termination process 3/4: Set the deletionTimestamp to let Kubernetes start a cascade deletion of the statefulset and the pods.
        if _, ok := getAnnotation(res.owner, AnnotationKeyUnregistrationCompleteTimestamp); ok {
            if err := c.Delete(ctx, res.object); err != nil {
                log.Error(err, "Failed to delete owner")
                return nil, err
            }

            log.V(2).Info("Started deletion of owner")

            continue
        }

        // Statefulset termination process 2/4: Set unregistrationCompleteTimestamp only if all the pods managed by the statefulset
        // have either unregistered or are being deleted.
        if _, ok := getAnnotation(res.owner, AnnotationKeyUnregistrationRequestTimestamp); ok {
            var deletionSafe int
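            // A pod counts as deletion-safe once it carries the unregistration-complete annotation
            // or already has a deletion timestamp.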
            for _, po := range res.pods {
                if _, ok := getAnnotation(&po, AnnotationKeyUnregistrationCompleteTimestamp); ok {
                    deletionSafe++
                } else if !po.DeletionTimestamp.IsZero() {
                    deletionSafe++
                }
            }

            if deletionSafe == res.total {
                log.V(2).Info("Marking owner for unregistration completion", "deletionSafe", deletionSafe, "total", res.total)

                if _, ok := getAnnotation(res.owner, AnnotationKeyUnregistrationCompleteTimestamp); !ok {
                    updated := res.owner.withAnnotation(AnnotationKeyUnregistrationCompleteTimestamp, time.Now().Format(time.RFC3339))

                    if err := c.Patch(ctx, updated, client.MergeFrom(res.owner)); err != nil {
                        log.Error(err, fmt.Sprintf("Failed to patch owner to have %s annotation", AnnotationKeyUnregistrationCompleteTimestamp))
                        return nil, err
                    }

                    log.V(2).Info("Redundant owner has been annotated to start the deletion")
                } else {
                    log.V(2).Info("BUG: Redundant owner was already annotated to start the deletion")
                }

                continue
            }
        }
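
        // Remember the most recent sync time recorded on any owner via SyncTimeAnnotationKey;
        // timestamps that fail to parse are simply ignored.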
        if annotations := res.owner.GetAnnotations(); annotations != nil {
            if a, ok := annotations[SyncTimeAnnotationKey]; ok {
                t, err := time.Parse(time.RFC3339, a)
                if err == nil {
                    if lastSyncTime == nil || lastSyncTime.Before(t) {
                        lastSyncTime = &t
                    }
                }
            }
        }

        // A completed owner and its completed runner pods can safely be deleted without
        // a race condition, so delete them here
        // to keep the later process a bit simpler.
        if res.total > 0 && res.total == res.completed {
            if err := c.Delete(ctx, ss); err != nil {
                log.Error(err, "Unable to delete owner")
                return nil, err
            }

            log.V(2).Info("Deleted completed owner")

            return nil, nil
        }

        if !res.synced {
            log.V(1).Info("Skipped reconciliation because owner is not synced yet", "pods", res.pods)

            return nil, nil
        }

        podsForOwnerPerTemplateHash[res.templateHash] = append(podsForOwnerPerTemplateHash[res.templateHash], res)
    }

    return &state{podsForOwnerPerTemplateHash, lastSyncTime}, nil
}
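
// Editorial sketch (not part of ARC): the "deletion-safe" gate used in termination step 2/4 above,
// restated over plain corev1.Pod values. countDeletionSafe and the unregistrationCompleteKey
// parameter are hypothetical names for this illustration only, and the usual
// corev1 "k8s.io/api/core/v1" import is assumed; the real controller reads the
// AnnotationKeyUnregistrationCompleteTimestamp annotation via its getAnnotation helper and compares
// the count against res.total before patching the owner.
func countDeletionSafe(pods []corev1.Pod, unregistrationCompleteKey string) int {
    var deletionSafe int
    for _, po := range pods {
        if _, ok := po.Annotations[unregistrationCompleteKey]; ok {
            // The pod has finished unregistering its runner.
            deletionSafe++
        } else if !po.DeletionTimestamp.IsZero() {
            // The pod is already terminating, so there is nothing left to wait for.
            deletionSafe++
        }
    }
    return deletionSafe
}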