2020-02-21 03:01:52 +09:00
/ *
Copyright 2020 The actions - runner - controller authors .
Licensed under the Apache License , Version 2.0 ( the "License" ) ;
you may not use this file except in compliance with the License .
You may obtain a copy of the License at
http : //www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing , software
distributed under the License is distributed on an "AS IS" BASIS ,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND , either express or implied .
See the License for the specific language governing permissions and
limitations under the License .
* /
package controllers
import (
"context"
2021-02-09 10:17:52 +09:00
"errors"
2020-02-21 03:01:52 +09:00
"fmt"
2021-05-21 09:10:47 +09:00
"reflect"
2021-02-09 10:17:52 +09:00
"time"
2020-03-15 18:08:11 +09:00
2021-12-17 09:06:55 +09:00
"github.com/go-logr/logr"
2021-12-11 07:43:40 -05:00
gogithub "github.com/google/go-github/v39/github"
2021-02-25 00:38:55 +01:00
2021-02-09 10:17:52 +09:00
kerrors "k8s.io/apimachinery/pkg/api/errors"
2020-02-21 03:01:52 +09:00
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/tools/record"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2021-06-22 17:55:06 +09:00
"github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1"
"github.com/actions-runner-controller/actions-runner-controller/github"
2020-02-21 03:01:52 +09:00
)
2020-03-10 09:14:11 +09:00
// RunnerReplicaSetReconciler reconciles a Runner object
type RunnerReplicaSetReconciler struct {
2020-02-21 03:01:52 +09:00
client . Client
2020-10-05 01:06:37 +01:00
Log logr . Logger
Recorder record . EventRecorder
Scheme * runtime . Scheme
GitHubClient * github . Client
2021-02-19 10:33:04 +09:00
Name string
2020-02-21 03:01:52 +09:00
}
2020-03-15 18:08:11 +09:00
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets,verbs=get;list;watch;create;update;patch;delete
2020-10-06 09:23:03 +09:00
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets/finalizers,verbs=get;list;watch;create;update;patch;delete
2020-03-15 18:08:11 +09:00
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets/status,verbs=get;update;patch
2020-02-21 03:01:52 +09:00
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runners,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runners/status,verbs=get;update;patch
2020-03-27 23:25:37 +09:00
// +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch
2020-02-21 03:01:52 +09:00
2021-06-22 17:10:09 +09:00
func ( r * RunnerReplicaSetReconciler ) Reconcile ( ctx context . Context , req ctrl . Request ) ( ctrl . Result , error ) {
2021-01-24 10:58:35 +09:00
log := r . Log . WithValues ( "runnerreplicaset" , req . NamespacedName )
2020-02-21 03:01:52 +09:00
2020-03-10 09:14:11 +09:00
var rs v1alpha1 . RunnerReplicaSet
2020-02-21 03:01:52 +09:00
if err := r . Get ( ctx , req . NamespacedName , & rs ) ; err != nil {
return ctrl . Result { } , client . IgnoreNotFound ( err )
}
if ! rs . ObjectMeta . DeletionTimestamp . IsZero ( ) {
return ctrl . Result { } , nil
}
2021-05-05 12:13:51 +09:00
selector , err := metav1 . LabelSelectorAsSelector ( rs . Spec . Selector )
if err != nil {
return ctrl . Result { } , err
}
// Get the Runners managed by the target RunnerReplicaSet
var allRunners v1alpha1 . RunnerList
if err := r . List (
ctx ,
& allRunners ,
client . InNamespace ( req . Namespace ) ,
client . MatchingLabelsSelector { Selector : selector } ,
) ; err != nil {
if ! kerrors . IsNotFound ( err ) {
return ctrl . Result { } , err
}
}
var myRunners [ ] v1alpha1 . Runner
var (
2021-05-21 09:10:47 +09:00
current int
2021-05-05 12:13:51 +09:00
ready int
2021-05-21 09:10:47 +09:00
available int
2021-05-05 12:13:51 +09:00
)
for _ , r := range allRunners . Items {
// This guard is required to avoid the RunnerReplicaSet created by the controller v0.17.0 or before
// to not treat all the runners in the namespace as its children.
if metav1 . IsControlledBy ( & r , & rs ) && ! metav1 . HasAnnotation ( r . ObjectMeta , annotationKeyRegistrationOnly ) {
myRunners = append ( myRunners , r )
2021-05-21 09:10:47 +09:00
current += 1
2021-05-05 12:13:51 +09:00
if r . Status . Phase == string ( corev1 . PodRunning ) {
ready += 1
2021-05-21 09:10:47 +09:00
// available is currently the same as ready, as we don't yet have minReadySeconds for runners
available += 1
2021-05-05 12:13:51 +09:00
}
}
}
var desired int
if rs . Spec . Replicas != nil {
desired = * rs . Spec . Replicas
} else {
desired = 1
}
2022-01-07 09:56:21 +09:00
// TODO: remove this registration runner cleanup later (v0.23.0 or v0.24.0)
//
// We had to have a registration-only runner to support scale-from-zero before.
// But since Sep 2021 Actions update on GitHub Cloud and GHES 3.3, it is unneceesary.
// See the below issues for more contexts:
// https://github.com/actions-runner-controller/actions-runner-controller/issues/516
// https://github.com/actions-runner-controller/actions-runner-controller/issues/859
//
// In the below block, we have a logic to remove existing registration-only runners as unnecessary.
// This logic is introduced since actions-runner-controller 0.21.0 and probably last one or two minor releases
// so that actions-runner-controller instance in everyone's cluster won't leave dangling registration-only runners.
feat: Support for scaling from/to zero (#465)
This is an attempt to support scaling from/to zero.
The basic idea is that we create a one-off "registration-only" runner pod on RunnerReplicaSet being scaled to zero, so that there is one "offline" runner, which enables GitHub Actions to queue jobs instead of discarding those.
GitHub Actions seems to immediately throw away the new job when there are no runners at all. Generally, having runners of any status, `busy`, `idle`, or `offline` would prevent GitHub actions from failing jobs. But retaining `busy` or `idle` runners means that we need to keep runner pods running, which conflicts with our desired to scale to/from zero, hence we retain `offline` runners.
In this change, I enhanced the runnerreplicaset controller to create a registration-only runner on very beginning of its reconciliation logic, only when a runnerreplicaset is scaled to zero. The runner controller creates the registration-only runner pod, waits for it to become "offline", and then removes the runner pod. The runner on GitHub stays `offline`, until the runner resource on K8s is deleted. As we remove the registration-only runner pod as soon as it registers, this doesn't block cluster-autoscaler.
Related to #447
2021-05-02 16:11:36 +09:00
registrationOnlyRunnerNsName := req . NamespacedName
registrationOnlyRunnerNsName . Name = registrationOnlyRunnerNameFor ( rs . Name )
2021-05-05 12:13:51 +09:00
registrationOnlyRunner := v1alpha1 . Runner { }
feat: Support for scaling from/to zero (#465)
This is an attempt to support scaling from/to zero.
The basic idea is that we create a one-off "registration-only" runner pod on RunnerReplicaSet being scaled to zero, so that there is one "offline" runner, which enables GitHub Actions to queue jobs instead of discarding those.
GitHub Actions seems to immediately throw away the new job when there are no runners at all. Generally, having runners of any status, `busy`, `idle`, or `offline` would prevent GitHub actions from failing jobs. But retaining `busy` or `idle` runners means that we need to keep runner pods running, which conflicts with our desired to scale to/from zero, hence we retain `offline` runners.
In this change, I enhanced the runnerreplicaset controller to create a registration-only runner on very beginning of its reconciliation logic, only when a runnerreplicaset is scaled to zero. The runner controller creates the registration-only runner pod, waits for it to become "offline", and then removes the runner pod. The runner on GitHub stays `offline`, until the runner resource on K8s is deleted. As we remove the registration-only runner pod as soon as it registers, this doesn't block cluster-autoscaler.
Related to #447
2021-05-02 16:11:36 +09:00
registrationOnlyRunnerExists := false
if err := r . Get (
ctx ,
registrationOnlyRunnerNsName ,
& registrationOnlyRunner ,
) ; err != nil {
if ! kerrors . IsNotFound ( err ) {
return ctrl . Result { } , err
}
} else {
registrationOnlyRunnerExists = true
}
2022-01-07 09:56:21 +09:00
if registrationOnlyRunnerExists {
if err := r . Client . Delete ( ctx , & registrationOnlyRunner ) ; err != nil {
log . Error ( err , "Retrying soon because we failed to delete registration-only runner" )
2021-05-05 12:13:51 +09:00
2022-01-07 09:56:21 +09:00
return ctrl . Result { Requeue : true } , nil
feat: Support for scaling from/to zero (#465)
This is an attempt to support scaling from/to zero.
The basic idea is that we create a one-off "registration-only" runner pod on RunnerReplicaSet being scaled to zero, so that there is one "offline" runner, which enables GitHub Actions to queue jobs instead of discarding those.
GitHub Actions seems to immediately throw away the new job when there are no runners at all. Generally, having runners of any status, `busy`, `idle`, or `offline` would prevent GitHub actions from failing jobs. But retaining `busy` or `idle` runners means that we need to keep runner pods running, which conflicts with our desired to scale to/from zero, hence we retain `offline` runners.
In this change, I enhanced the runnerreplicaset controller to create a registration-only runner on very beginning of its reconciliation logic, only when a runnerreplicaset is scaled to zero. The runner controller creates the registration-only runner pod, waits for it to become "offline", and then removes the runner pod. The runner on GitHub stays `offline`, until the runner resource on K8s is deleted. As we remove the registration-only runner pod as soon as it registers, this doesn't block cluster-autoscaler.
Related to #447
2021-05-02 16:11:36 +09:00
}
}
2021-05-21 09:10:47 +09:00
if current > desired {
n := current - desired
2020-02-21 03:01:52 +09:00
2021-05-21 09:10:47 +09:00
log . V ( 0 ) . Info ( fmt . Sprintf ( "Deleting %d runners" , n ) , "desired" , desired , "current" , current , "ready" , ready )
2021-03-20 07:34:25 +09:00
// get runners that are currently offline/not busy/timed-out to register
var deletionCandidates [ ] v1alpha1 . Runner
2021-03-05 10:15:39 +09:00
for _ , runner := range allRunners . Items {
2021-02-09 10:17:52 +09:00
busy , err := r . GitHubClient . IsRunnerBusy ( ctx , runner . Spec . Enterprise , runner . Spec . Organization , runner . Spec . Repository , runner . Name )
2020-10-05 01:06:37 +01:00
if err != nil {
2021-02-15 01:36:49 +01:00
notRegistered := false
2021-02-22 02:08:04 +01:00
offline := false
2021-02-15 01:36:49 +01:00
2021-02-22 02:08:04 +01:00
var notFoundException * github . RunnerNotFound
var offlineException * github . RunnerOffline
if errors . As ( err , & notFoundException ) {
log . V ( 1 ) . Info ( "Failed to check if runner is busy. Either this runner has never been successfully registered to GitHub or it still needs more time." , "runnerName" , runner . Name )
2021-02-15 01:36:49 +01:00
notRegistered = true
2021-02-22 02:08:04 +01:00
} else if errors . As ( err , & offlineException ) {
offline = true
2021-02-15 01:36:49 +01:00
} else {
var e * gogithub . RateLimitError
if errors . As ( err , & e ) {
// We log the underlying error when we failed calling GitHub API to list or unregisters,
// or the runner is still busy.
log . Error (
err ,
fmt . Sprintf (
"Failed to check if runner is busy due to GitHub API rate limit. Retrying in %s to avoid excessive GitHub API calls" ,
retryDelayOnGitHubAPIRateLimitError ,
) ,
)
return ctrl . Result { RequeueAfter : retryDelayOnGitHubAPIRateLimitError } , err
2021-02-09 10:17:52 +09:00
}
2021-02-15 01:36:49 +01:00
return ctrl . Result { } , err
2021-02-09 10:17:52 +09:00
}
registrationTimeout := 15 * time . Minute
currentTime := time . Now ( )
2021-02-12 10:00:20 +09:00
registrationDidTimeout := currentTime . Sub ( runner . CreationTimestamp . Add ( registrationTimeout ) ) > 0
2021-02-09 10:17:52 +09:00
2021-02-15 01:36:49 +01:00
if notRegistered && registrationDidTimeout {
2021-02-09 10:17:52 +09:00
log . Info (
"Runner failed to register itself to GitHub in timely manner. " +
2021-02-22 02:08:04 +01:00
"Marking the runner for scale down. " +
2021-02-09 10:17:52 +09:00
"CAUTION: If you see this a lot, you should investigate the root cause. " +
2021-06-22 17:55:06 +09:00
"See https://github.com/actions-runner-controller/actions-runner-controller/issues/288" ,
2021-02-09 10:17:52 +09:00
"runnerCreationTimestamp" , runner . CreationTimestamp ,
"currentTime" , currentTime ,
"configuredRegistrationTimeout" , registrationTimeout ,
)
2021-03-20 07:34:25 +09:00
deletionCandidates = append ( deletionCandidates , runner )
2021-02-09 10:17:52 +09:00
}
2021-02-22 02:08:04 +01:00
// offline runners should always be a great target for scale down
if offline {
2021-03-20 07:34:25 +09:00
deletionCandidates = append ( deletionCandidates , runner )
2021-02-22 02:08:04 +01:00
}
2021-02-09 10:17:52 +09:00
} else if ! busy {
2021-03-20 07:34:25 +09:00
deletionCandidates = append ( deletionCandidates , runner )
2020-10-05 01:06:37 +01:00
}
}
2021-03-20 07:34:25 +09:00
if len ( deletionCandidates ) < n {
n = len ( deletionCandidates )
2020-10-05 01:06:37 +01:00
}
2021-05-21 09:10:47 +09:00
log . V ( 0 ) . Info ( fmt . Sprintf ( "Deleting %d runner(s)" , n ) , "desired" , desired , "current" , current , "ready" , ready )
2021-05-05 12:13:51 +09:00
2020-02-21 03:01:52 +09:00
for i := 0 ; i < n ; i ++ {
2021-03-20 07:34:25 +09:00
if err := r . Client . Delete ( ctx , & deletionCandidates [ i ] ) ; client . IgnoreNotFound ( err ) != nil {
2020-02-21 03:01:52 +09:00
log . Error ( err , "Failed to delete runner resource" )
return ctrl . Result { } , err
}
2021-03-20 07:34:25 +09:00
r . Recorder . Event ( & rs , corev1 . EventTypeNormal , "RunnerDeleted" , fmt . Sprintf ( "Deleted runner '%s'" , deletionCandidates [ i ] . Name ) )
log . Info ( "Deleted runner" )
2020-02-21 03:01:52 +09:00
}
2021-05-21 09:10:47 +09:00
} else if desired > current {
n := desired - current
2020-02-21 03:01:52 +09:00
2021-05-21 09:10:47 +09:00
log . V ( 0 ) . Info ( fmt . Sprintf ( "Creating %d runner(s)" , n ) , "desired" , desired , "available" , current , "ready" , ready )
2021-03-20 07:34:25 +09:00
2020-02-21 03:01:52 +09:00
for i := 0 ; i < n ; i ++ {
newRunner , err := r . newRunner ( rs )
if err != nil {
log . Error ( err , "Could not create runner" )
return ctrl . Result { } , err
}
if err := r . Client . Create ( ctx , & newRunner ) ; err != nil {
log . Error ( err , "Failed to create runner resource" )
return ctrl . Result { } , err
}
}
}
2021-05-21 09:10:47 +09:00
var status v1alpha1 . RunnerReplicaSetStatus
status . Replicas = & current
status . AvailableReplicas = & available
status . ReadyReplicas = & ready
if ! reflect . DeepEqual ( rs . Status , status ) {
2020-02-21 03:01:52 +09:00
updated := rs . DeepCopy ( )
2021-05-21 09:10:47 +09:00
updated . Status = status
2020-02-21 03:01:52 +09:00
2021-05-21 09:10:47 +09:00
if err := r . Status ( ) . Patch ( ctx , updated , client . MergeFrom ( & rs ) ) ; err != nil {
log . Info ( "Failed to update runnerreplicaset status. Retrying immediately" , "error" , err . Error ( ) )
2021-02-25 09:01:02 +09:00
return ctrl . Result {
Requeue : true ,
} , nil
2020-02-21 03:01:52 +09:00
}
}
return ctrl . Result { } , nil
}
2020-03-10 09:14:11 +09:00
func ( r * RunnerReplicaSetReconciler ) newRunner ( rs v1alpha1 . RunnerReplicaSet ) ( v1alpha1 . Runner , error ) {
2020-02-26 21:23:23 +09:00
objectMeta := rs . Spec . Template . ObjectMeta . DeepCopy ( )
2020-03-15 21:50:45 +09:00
objectMeta . GenerateName = rs . ObjectMeta . Name + "-"
2020-02-26 21:23:23 +09:00
objectMeta . Namespace = rs . ObjectMeta . Namespace
2020-02-21 03:01:52 +09:00
runner := v1alpha1 . Runner {
2020-02-26 21:23:23 +09:00
TypeMeta : metav1 . TypeMeta { } ,
ObjectMeta : * objectMeta ,
Spec : rs . Spec . Template . Spec ,
2020-02-21 03:01:52 +09:00
}
if err := ctrl . SetControllerReference ( & rs , & runner , r . Scheme ) ; err != nil {
return runner , err
}
return runner , nil
}
2020-03-10 09:14:11 +09:00
func ( r * RunnerReplicaSetReconciler ) SetupWithManager ( mgr ctrl . Manager ) error {
2021-02-16 18:51:33 +09:00
name := "runnerreplicaset-controller"
2021-02-19 10:33:04 +09:00
if r . Name != "" {
name = r . Name
}
2021-02-16 18:51:33 +09:00
r . Recorder = mgr . GetEventRecorderFor ( name )
2020-02-21 03:01:52 +09:00
return ctrl . NewControllerManagedBy ( mgr ) .
2020-03-10 09:14:11 +09:00
For ( & v1alpha1 . RunnerReplicaSet { } ) .
2020-02-21 03:01:52 +09:00
Owns ( & v1alpha1 . Runner { } ) .
2021-02-16 18:51:33 +09:00
Named ( name ) .
2020-02-21 03:01:52 +09:00
Complete ( r )
}
feat: Support for scaling from/to zero (#465)
This is an attempt to support scaling from/to zero.
The basic idea is that we create a one-off "registration-only" runner pod on RunnerReplicaSet being scaled to zero, so that there is one "offline" runner, which enables GitHub Actions to queue jobs instead of discarding those.
GitHub Actions seems to immediately throw away the new job when there are no runners at all. Generally, having runners of any status, `busy`, `idle`, or `offline` would prevent GitHub Actions from failing jobs. But retaining `busy` or `idle` runners means that we need to keep runner pods running, which conflicts with our desire to scale to/from zero, hence we retain `offline` runners.
In this change, I enhanced the runnerreplicaset controller to create a registration-only runner at the very beginning of its reconciliation logic, only when a runnerreplicaset is scaled to zero. The runner controller creates the registration-only runner pod, waits for it to become "offline", and then removes the runner pod. The runner on GitHub stays `offline`, until the runner resource on K8s is deleted. As we remove the registration-only runner pod as soon as it registers, this doesn't block cluster-autoscaler.
Related to #447
2021-05-02 16:11:36 +09:00
// registrationOnlyRunnerNameFor returns the name of the registration-only
// runner associated with the RunnerReplicaSet named rsName, which is rsName
// followed by "-registration-only".
func registrationOnlyRunnerNameFor(rsName string) string {
	const suffix = "-registration-only"

	return rsName + suffix
}