Enhance RunnerSet to optionally retain PVs across restarts (#1340)

* Enhance RunnerSet to optionally retain PVs across restarts

This is our initial attempt to bring back the ability to retain PVs across runner pod restarts when using RunnerSet.
The implementation is composed of two new controllers, `runnerpersistentvolumeclaim-controller` and `runnerpersistentvolume-controller`.
It all starts with our existing `runnerset-controller`: it now tries to mark any PVCs created by the StatefulSets that were created for the RunnerSet.
Once the controller has terminated those StatefulSets, their corresponding PVCs are cleaned up by `runnerpersistentvolumeclaim-controller`, and the PVs are then unbound from those PVCs by `runnerpersistentvolume-controller` so that they can be reused by PVCs created for future StatefulSets that share the same StorageClass.

Ref #1286
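For context, the two new reconcilers have to be registered with the controller manager alongside the existing ones. Below is a minimal, hypothetical sketch of that wiring; the actual main.go changes are not shown in the diffs here, so apart from the reconciler types and their exported fields everything in it is illustrative:

```go
package main

import (
	"os"

	"github.com/actions-runner-controller/actions-runner-controller/controllers"
	ctrl "sigs.k8s.io/controller-runtime"
)

func main() {
	// Illustrative manager setup only; the real main.go also configures the scheme,
	// metrics, leader election, and the other ARC controllers.
	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{})
	if err != nil {
		os.Exit(1)
	}

	// Cleans up PVCs left behind by terminated RunnerSet StatefulSets.
	if err := (&controllers.RunnerPersistentVolumeClaimReconciler{
		Client: mgr.GetClient(),
		Log:    ctrl.Log.WithName("controllers").WithName("RunnerPersistentVolumeClaim"),
		Scheme: mgr.GetScheme(),
	}).SetupWithManager(mgr); err != nil {
		os.Exit(1)
	}

	// Unsets claimRef on Released PVs so they can be re-bound by future PVCs.
	if err := (&controllers.RunnerPersistentVolumeReconciler{
		Client: mgr.GetClient(),
		Log:    ctrl.Log.WithName("controllers").WithName("RunnerPersistentVolume"),
		Scheme: mgr.GetScheme(),
	}).SetupWithManager(mgr); err != nil {
		os.Exit(1)
	}

	if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
		os.Exit(1)
	}
}
```

The `Recorder` field is intentionally left unset in the sketch: each reconciler's `SetupWithManager` populates it via `mgr.GetEventRecorderFor`, as shown in the diffs below.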

* Update E2E test suite to cover runner, Docker, and Go caching with RunnerSet + PVs

Ref #1286
Yusuke Kuoka authored on 2022-05-16 09:26:48 +09:00, committed by GitHub
parent adf69bbea0, commit b5194fd75a
10 changed files with 610 additions and 4 deletions


@@ -0,0 +1,76 @@
/*
Copyright 2022 The actions-runner-controller authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
	"context"

	"github.com/go-logr/logr"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/client-go/tools/record"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	corev1 "k8s.io/api/core/v1"
)

// RunnerPersistentVolumeClaimReconciler reconciles a PersistentVolumeClaim object
type RunnerPersistentVolumeClaimReconciler struct {
client.Client
Log logr.Logger
Recorder record.EventRecorder
Scheme *runtime.Scheme
Name string
}

// +kubebuilder:rbac:groups=core,resources=persistentvolumeclaims,verbs=get;list;watch;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=persistentvolumes,verbs=get;list;watch;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch
func (r *RunnerPersistentVolumeClaimReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
log := r.Log.WithValues("pvc", req.NamespacedName)
var pvc corev1.PersistentVolumeClaim
if err := r.Get(ctx, req.NamespacedName, &pvc); err != nil {
return ctrl.Result{}, client.IgnoreNotFound(err)
}
log.Info("Reconciling runner pvc")
res, err := syncPVC(ctx, r.Client, log, req.Namespace, &pvc)
if res == nil {
res = &ctrl.Result{}
}
return *res, err
}

func (r *RunnerPersistentVolumeClaimReconciler) SetupWithManager(mgr ctrl.Manager) error {
name := "runnerpersistentvolumeclaim-controller"
if r.Name != "" {
name = r.Name
}
r.Recorder = mgr.GetEventRecorderFor(name)
return ctrl.NewControllerManagedBy(mgr).
For(&corev1.PersistentVolumeClaim{}).
Named(name).
Complete(r)
}


@@ -0,0 +1,72 @@
/*
Copyright 2022 The actions-runner-controller authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
	"context"

	"github.com/go-logr/logr"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/client-go/tools/record"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	corev1 "k8s.io/api/core/v1"
)

// RunnerPersistentVolumeReconciler reconciles a PersistentVolume object
type RunnerPersistentVolumeReconciler struct {
client.Client
Log logr.Logger
Recorder record.EventRecorder
Scheme *runtime.Scheme
Name string
}

// +kubebuilder:rbac:groups=core,resources=persistentvolumes,verbs=get;list;watch;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch
func (r *RunnerPersistentVolumeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
log := r.Log.WithValues("pv", req.NamespacedName)
var pv corev1.PersistentVolume
if err := r.Get(ctx, req.NamespacedName, &pv); err != nil {
return ctrl.Result{}, client.IgnoreNotFound(err)
}
res, err := syncPV(ctx, r.Client, log, req.Namespace, &pv)
if res == nil {
res = &ctrl.Result{}
}
return *res, err
}

func (r *RunnerPersistentVolumeReconciler) SetupWithManager(mgr ctrl.Manager) error {
name := "runnerpersistentvolume-controller"
if r.Name != "" {
name = r.Name
}
r.Recorder = mgr.GetEventRecorderFor(name)
return ctrl.NewControllerManagedBy(mgr).
For(&corev1.PersistentVolume{}).
Named(name).
Complete(r)
}


@@ -58,6 +58,7 @@ type RunnerSetReconciler struct {
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnersets/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=apps,resources=statefulsets,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=apps,resources=statefulsets/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=core,resources=persistentvolumeclaims,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch
// +kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;list;create;update
@@ -129,6 +130,12 @@ func (r *RunnerSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
owners = append(owners, &ss)
}
if res, err := syncVolumes(ctx, r.Client, log, req.Namespace, runnerSet, statefulsets); err != nil {
return ctrl.Result{}, err
} else if res != nil {
return *res, nil
}
res, err := syncRunnerPodsOwners(ctx, r.Client, log, effectiveTime, newDesiredReplicas, func() client.Object { return create.DeepCopy() }, ephemeral, owners)
if err != nil || res == nil {
return ctrl.Result{}, err

controllers/sync_volumes.go (new file)

@@ -0,0 +1,175 @@
package controllers

import (
	"context"
	"fmt"
	"time"

	"github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1"
	"github.com/go-logr/logr"
	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
	kerrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/types"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

const (
	labelKeyCleanup               = "pending-cleanup"
	labelKeyRunnerStatefulSetName = "runner-statefulset-name"
)

func syncVolumes(ctx context.Context, c client.Client, log logr.Logger, ns string, runnerSet *v1alpha1.RunnerSet, statefulsets []appsv1.StatefulSet) (*ctrl.Result, error) {
log = log.WithValues("ns", ns)
for _, t := range runnerSet.Spec.StatefulSetSpec.VolumeClaimTemplates {
for _, sts := range statefulsets {
pvcName := fmt.Sprintf("%s-%s-0", t.Name, sts.Name)
var pvc corev1.PersistentVolumeClaim
if err := c.Get(ctx, types.NamespacedName{Namespace: ns, Name: pvcName}, &pvc); err != nil {
if !kerrors.IsNotFound(err) {
return nil, err
}
continue
}
			// TODO: Move this to the statefulset reconciler so that we spam this less,
			// by starting the loop only after the statefulset gets its deletionTimestamp set.
			// Perhaps this can just be wrapped in a finalizer here.
			if pvc.Labels[labelKeyRunnerStatefulSetName] == "" {
				updated := pvc.DeepCopy()
				if updated.Labels == nil {
					updated.Labels = map[string]string{}
				}
				updated.Labels[labelKeyRunnerStatefulSetName] = sts.Name
				if err := c.Update(ctx, updated); err != nil {
					return nil, err
				}
				log.V(1).Info("Added runner-statefulset-name label to PVC", "sts", sts.Name, "pvc", pvcName)
			}
}
}
// PVs are not namespaced hence we don't need client.InNamespace(ns).
// If we added that, c.List will silently return zero items.
//
	// This `List` needs to be done in a dedicated reconciler that is registered with the manager via the `For` func.
	// Otherwise, the List call might return outdated contents (I saw status.phase remaining Bound even after K8s updated it to Released, and that lasted for minutes).
//
// cleanupLabels := map[string]string{
// labelKeyCleanup: runnerSet.Name,
// }
// pvList := &corev1.PersistentVolumeList{}
// if err := c.List(ctx, pvList, client.MatchingLabels(cleanupLabels)); err != nil {
// log.Info("retrying pv listing", "ns", ns, "err", err)
// return nil, err
// }
return nil, nil
}

func syncPVC(ctx context.Context, c client.Client, log logr.Logger, ns string, pvc *corev1.PersistentVolumeClaim) (*ctrl.Result, error) {
stsName := pvc.Labels[labelKeyRunnerStatefulSetName]
if stsName == "" {
return nil, nil
}
var sts appsv1.StatefulSet
if err := c.Get(ctx, types.NamespacedName{Namespace: ns, Name: stsName}, &sts); err != nil {
if !kerrors.IsNotFound(err) {
return nil, err
}
} else {
		// We assume that the statefulset will be terminated shortly, hence retry forever until it gets removed.
retry := 10 * time.Second
log.V(1).Info("Retrying sync until statefulset gets removed", "requeueAfter", retry)
return &ctrl.Result{RequeueAfter: retry}, nil
}
log = log.WithValues("pvc", pvc.Name, "sts", stsName)
pvName := pvc.Spec.VolumeName
if pvName != "" {
		// If we deleted the PVC before unsetting pv.spec.claimRef,
		// K8s seems to revive the claimRef.
		// So we need to mark the PV for claimRef unset first, then delete the PVC, and finally unset claimRef on the PV.
var pv corev1.PersistentVolume
if err := c.Get(ctx, types.NamespacedName{Namespace: ns, Name: pvName}, &pv); err != nil {
if !kerrors.IsNotFound(err) {
return nil, err
}
return nil, nil
}
pvCopy := pv.DeepCopy()
if pvCopy.Labels == nil {
pvCopy.Labels = map[string]string{}
}
pvCopy.Labels[labelKeyCleanup] = stsName
log.Info("Scheduling to unset PV's claimRef", "pv", pv.Name)
		// Apparently K8s doesn't reconcile the PV immediately after PVC deletion.
		// So we start a relatively busy loop of PV reconciliation slightly before the PVC deletion,
		// so that the PV can be unbound as soon as possible after the PVC is deleted.
if err := c.Update(ctx, pvCopy); err != nil {
return nil, err
}
// At this point, the PV is still Bound
log.Info("Deleting unused pvc")
if err := c.Delete(ctx, pvc); err != nil {
return nil, err
}
		// At this point, the PV is still "Bound", but we are ready to unset pv.spec.claimRef in the pv controller.
		// Once the pv controller unsets claimRef, the PV becomes "Released", hence available for reuse by another eligible PVC.
}
return nil, nil
}

func syncPV(ctx context.Context, c client.Client, log logr.Logger, ns string, pv *corev1.PersistentVolume) (*ctrl.Result, error) {
log.V(2).Info("checking pv claimRef")
if pv.Spec.ClaimRef == nil {
return nil, nil
}
log.V(2).Info("checking labels")
if pv.Labels[labelKeyCleanup] == "" {
		// We assume that the PVC will be terminated shortly, hence retry forever until it gets removed.
retry := 10 * time.Second
log.V(1).Info("Retrying sync until pvc gets removed", "requeueAfter", retry)
return &ctrl.Result{RequeueAfter: retry}, nil
}
log.V(2).Info("checking pv phase", "phase", pv.Status.Phase)
if pv.Status.Phase != corev1.VolumeReleased {
		// We assume that the PVC will be deleted shortly and the PV will then become Released,
		// hence retry forever until that happens.
		retry := 10 * time.Second
		log.V(1).Info("Retrying sync until pv gets released", "requeueAfter", retry)
return &ctrl.Result{RequeueAfter: retry}, nil
}
// At this point, the PV is still Released
pvCopy := pv.DeepCopy()
delete(pvCopy.Labels, labelKeyCleanup)
pvCopy.Spec.ClaimRef = nil
log.Info("Unsetting PV's claimRef", "pv", pv.Name)
if err := c.Update(ctx, pvCopy); err != nil {
return nil, err
}
	// At this point, the PV becomes Available, if its reclaim policy is "Retain".
	// I have not yet tested it with "Delete", but perhaps it's deleted automatically after the update?
	// https://kubernetes.io/docs/concepts/storage/persistent-volumes/#retain
return nil, nil
}