2021-06-24 20:39:37 +09:00
/ *
Copyright 2020 The actions - runner - controller authors .
Licensed under the Apache License , Version 2.0 ( the "License" ) ;
you may not use this file except in compliance with the License .
You may obtain a copy of the License at
http : //www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing , software
distributed under the License is distributed on an "AS IS" BASIS ,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND , either express or implied .
See the License for the specific language governing permissions and
limitations under the License .
* /
package controllers
import (
"context"
"errors"
"fmt"
"time"
2021-12-17 09:06:55 +09:00
"github.com/go-logr/logr"
2021-12-11 07:43:40 -05:00
gogithub "github.com/google/go-github/v39/github"
2021-06-24 20:39:37 +09:00
"k8s.io/apimachinery/pkg/util/wait"
kerrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/tools/record"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
corev1 "k8s.io/api/core/v1"
"github.com/actions-runner-controller/actions-runner-controller/github"
)
// RunnerPodReconciler reconciles a Runner object
type RunnerPodReconciler struct {
client . Client
Log logr . Logger
Recorder record . EventRecorder
Scheme * runtime . Scheme
GitHubClient * github . Client
Name string
RegistrationRecheckInterval time . Duration
RegistrationRecheckJitter time . Duration
}
const (
2021-09-26 07:34:55 +02:00
// This names requires at least one slash to work.
2021-06-24 20:39:37 +09:00
// See https://github.com/google/knative-gcp/issues/378
runnerPodFinalizerName = "actions.summerwind.dev/runner-pod"
AnnotationKeyLastRegistrationCheckTime = "actions-runner-controller/last-registration-check-time"
)
// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch
func ( r * RunnerPodReconciler ) Reconcile ( ctx context . Context , req ctrl . Request ) ( ctrl . Result , error ) {
log := r . Log . WithValues ( "runnerpod" , req . NamespacedName )
var runnerPod corev1 . Pod
if err := r . Get ( ctx , req . NamespacedName , & runnerPod ) ; err != nil {
return ctrl . Result { } , client . IgnoreNotFound ( err )
}
_ , isRunnerPod := runnerPod . Labels [ LabelKeyRunnerSetName ]
if ! isRunnerPod {
return ctrl . Result { } , nil
}
var enterprise , org , repo string
envvars := runnerPod . Spec . Containers [ 0 ] . Env
for _ , e := range envvars {
switch e . Name {
case EnvVarEnterprise :
enterprise = e . Value
case EnvVarOrg :
org = e . Value
case EnvVarRepo :
repo = e . Value
}
}
if runnerPod . ObjectMeta . DeletionTimestamp . IsZero ( ) {
finalizers , added := addFinalizer ( runnerPod . ObjectMeta . Finalizers , runnerPodFinalizerName )
if added {
newRunner := runnerPod . DeepCopy ( )
newRunner . ObjectMeta . Finalizers = finalizers
if err := r . Patch ( ctx , newRunner , client . MergeFrom ( & runnerPod ) ) ; err != nil {
log . Error ( err , "Failed to update runner" )
return ctrl . Result { } , err
}
return ctrl . Result { } , nil
}
} else {
finalizers , removed := removeFinalizer ( runnerPod . ObjectMeta . Finalizers , runnerPodFinalizerName )
if removed {
ok , err := r . unregisterRunner ( ctx , enterprise , org , repo , runnerPod . Name )
if err != nil {
if errors . Is ( err , & gogithub . RateLimitError { } ) {
// We log the underlying error when we failed calling GitHub API to list or unregisters,
// or the runner is still busy.
log . Error (
err ,
fmt . Sprintf (
"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls" ,
retryDelayOnGitHubAPIRateLimitError ,
) ,
)
return ctrl . Result { RequeueAfter : retryDelayOnGitHubAPIRateLimitError } , err
}
return ctrl . Result { } , err
}
if ! ok {
log . V ( 1 ) . Info ( "Runner no longer exists on GitHub" )
}
newRunner := runnerPod . DeepCopy ( )
newRunner . ObjectMeta . Finalizers = finalizers
if err := r . Patch ( ctx , newRunner , client . MergeFrom ( & runnerPod ) ) ; err != nil {
log . Error ( err , "Failed to update runner for finalizer removal" )
return ctrl . Result { } , err
}
log . Info ( "Removed runner from GitHub" , "repository" , repo , "organization" , org )
}
deletionTimeout := 1 * time . Minute
currentTime := time . Now ( )
deletionDidTimeout := currentTime . Sub ( runnerPod . DeletionTimestamp . Add ( deletionTimeout ) ) > 0
if deletionDidTimeout {
log . Info (
fmt . Sprintf ( "Failed to delete pod within %s. " , deletionTimeout ) +
"This is typically the case when a Kubernetes node became unreachable " +
"and the kube controller started evicting nodes. Forcefully deleting the pod to not get stuck." ,
"podDeletionTimestamp" , runnerPod . DeletionTimestamp ,
"currentTime" , currentTime ,
"configuredDeletionTimeout" , deletionTimeout ,
)
var force int64 = 0
// forcefully delete runner as we would otherwise get stuck if the node stays unreachable
if err := r . Delete ( ctx , & runnerPod , & client . DeleteOptions { GracePeriodSeconds : & force } ) ; err != nil {
// probably
if ! kerrors . IsNotFound ( err ) {
log . Error ( err , "Failed to forcefully delete pod resource ..." )
return ctrl . Result { } , err
}
// forceful deletion finally succeeded
return ctrl . Result { Requeue : true } , nil
}
r . Recorder . Event ( & runnerPod , corev1 . EventTypeNormal , "PodDeleted" , fmt . Sprintf ( "Forcefully deleted pod '%s'" , runnerPod . Name ) )
log . Info ( "Forcefully deleted runner pod" , "repository" , repo )
// give kube manager a little time to forcefully delete the stuck pod
return ctrl . Result { RequeueAfter : 3 * time . Second } , nil
}
return ctrl . Result { } , nil
}
// If pod has ended up succeeded we need to restart it
// Happens e.g. when dind is in runner and run completes
stopped := runnerPod . Status . Phase == corev1 . PodSucceeded
if ! stopped {
if runnerPod . Status . Phase == corev1 . PodRunning {
for _ , status := range runnerPod . Status . ContainerStatuses {
if status . Name != containerName {
continue
}
if status . State . Terminated != nil && status . State . Terminated . ExitCode == 0 {
stopped = true
}
}
}
}
restart := stopped
var registrationRecheckDelay time . Duration
// all checks done below only decide whether a restart is needed
// if a restart was already decided before, there is no need for the checks
// saving API calls and scary log messages
if ! restart {
registrationCheckInterval := time . Minute
if r . RegistrationRecheckInterval > 0 {
registrationCheckInterval = r . RegistrationRecheckInterval
}
lastCheckTimeStr := runnerPod . Annotations [ AnnotationKeyLastRegistrationCheckTime ]
var lastCheckTime * time . Time
if lastCheckTimeStr != "" {
t , err := time . Parse ( time . RFC3339 , lastCheckTimeStr )
if err != nil {
log . Error ( err , "failed to parase last check time %q" , lastCheckTimeStr )
return ctrl . Result { } , nil
}
lastCheckTime = & t
}
// We want to call ListRunners GitHub Actions API only once per runner per minute.
// This if block, in conjunction with:
// return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil
// achieves that.
if lastCheckTime != nil {
nextCheckTime := lastCheckTime . Add ( registrationCheckInterval )
now := time . Now ( )
// Requeue scheduled by RequeueAfter can happen a bit earlier (like dozens of milliseconds)
// so to avoid excessive, in-effective retry, we heuristically ignore the remaining delay in case it is
// shorter than 1s
requeueAfter := nextCheckTime . Sub ( now ) - time . Second
if requeueAfter > 0 {
log . Info (
fmt . Sprintf ( "Skipped registration check because it's deferred until %s. Retrying in %s at latest" , nextCheckTime , requeueAfter ) ,
"lastRegistrationCheckTime" , lastCheckTime ,
"registrationCheckInterval" , registrationCheckInterval ,
)
// Without RequeueAfter, the controller may not retry on scheduled. Instead, it must wait until the
// next sync period passes, which can be too much later than nextCheckTime.
//
// We need to requeue on this reconcilation even though we have already scheduled the initial
// requeue previously with `return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil`.
// Apparently, the workqueue used by controller-runtime seems to deduplicate and resets the delay on
// other requeues- so the initial scheduled requeue may have been reset due to requeue on
// spec/status change.
return ctrl . Result { RequeueAfter : requeueAfter } , nil
}
}
notFound := false
offline := false
_ , err := r . GitHubClient . IsRunnerBusy ( ctx , enterprise , org , repo , runnerPod . Name )
currentTime := time . Now ( )
if err != nil {
var notFoundException * github . RunnerNotFound
var offlineException * github . RunnerOffline
if errors . As ( err , & notFoundException ) {
notFound = true
} else if errors . As ( err , & offlineException ) {
offline = true
} else {
var e * gogithub . RateLimitError
if errors . As ( err , & e ) {
// We log the underlying error when we failed calling GitHub API to list or unregisters,
// or the runner is still busy.
log . Error (
err ,
fmt . Sprintf (
"Failed to check if runner is busy due to Github API rate limit. Retrying in %s to avoid excessive GitHub API calls" ,
retryDelayOnGitHubAPIRateLimitError ,
) ,
)
return ctrl . Result { RequeueAfter : retryDelayOnGitHubAPIRateLimitError } , err
}
return ctrl . Result { } , err
}
}
registrationTimeout := 10 * time . Minute
durationAfterRegistrationTimeout := currentTime . Sub ( runnerPod . CreationTimestamp . Add ( registrationTimeout ) )
registrationDidTimeout := durationAfterRegistrationTimeout > 0
if notFound {
if registrationDidTimeout {
log . Info (
"Runner failed to register itself to GitHub in timely manner. " +
"Recreating the pod to see if it resolves the issue. " +
"CAUTION: If you see this a lot, you should investigate the root cause. " +
"See https://github.com/actions-runner-controller/actions-runner-controller/issues/288" ,
"podCreationTimestamp" , runnerPod . CreationTimestamp ,
"currentTime" , currentTime ,
"configuredRegistrationTimeout" , registrationTimeout ,
)
restart = true
} else {
log . V ( 1 ) . Info (
"Runner pod exists but we failed to check if runner is busy. Apparently it still needs more time." ,
"runnerName" , runnerPod . Name ,
)
}
} else if offline {
if registrationDidTimeout {
log . Info (
"Already existing GitHub runner still appears offline . " +
"Recreating the pod to see if it resolves the issue. " +
"CAUTION: If you see this a lot, you should investigate the root cause. " ,
"podCreationTimestamp" , runnerPod . CreationTimestamp ,
"currentTime" , currentTime ,
"configuredRegistrationTimeout" , registrationTimeout ,
)
restart = true
} else {
log . V ( 1 ) . Info (
"Runner pod exists but the GitHub runner appears to be still offline. Waiting for runner to get online ..." ,
"runnerName" , runnerPod . Name ,
)
}
}
if ( notFound || offline ) && ! registrationDidTimeout {
registrationRecheckJitter := 10 * time . Second
if r . RegistrationRecheckJitter > 0 {
registrationRecheckJitter = r . RegistrationRecheckJitter
}
registrationRecheckDelay = registrationCheckInterval + wait . Jitter ( registrationRecheckJitter , 0.1 )
}
}
// Don't do anything if there's no need to restart the runner
if ! restart {
// This guard enables us to update runner.Status.Phase to `Running` only after
// the runner is registered to GitHub.
if registrationRecheckDelay > 0 {
log . V ( 1 ) . Info ( fmt . Sprintf ( "Rechecking the runner registration in %s" , registrationRecheckDelay ) )
updated := runnerPod . DeepCopy ( )
t := time . Now ( ) . Format ( time . RFC3339 )
updated . Annotations [ AnnotationKeyLastRegistrationCheckTime ] = t
if err := r . Patch ( ctx , updated , client . MergeFrom ( & runnerPod ) ) ; err != nil {
log . Error ( err , "Failed to update runner pod annotation for LastRegistrationCheckTime" )
return ctrl . Result { } , err
}
return ctrl . Result { RequeueAfter : registrationRecheckDelay } , nil
}
// Seeing this message, you can expect the runner to become `Running` soon.
log . Info (
"Runner appears to have registered and running." ,
"podCreationTimestamp" , runnerPod . CreationTimestamp ,
)
return ctrl . Result { } , nil
}
// Delete current pod if recreation is needed
if err := r . Delete ( ctx , & runnerPod ) ; err != nil {
log . Error ( err , "Failed to delete pod resource" )
return ctrl . Result { } , err
}
r . Recorder . Event ( & runnerPod , corev1 . EventTypeNormal , "PodDeleted" , fmt . Sprintf ( "Deleted pod '%s'" , runnerPod . Name ) )
log . Info ( "Deleted runner pod" , "name" , runnerPod . Name )
return ctrl . Result { } , nil
}
func ( r * RunnerPodReconciler ) unregisterRunner ( ctx context . Context , enterprise , org , repo , name string ) ( bool , error ) {
runners , err := r . GitHubClient . ListRunners ( ctx , enterprise , org , repo )
if err != nil {
return false , err
}
var busy bool
id := int64 ( 0 )
for _ , runner := range runners {
if runner . GetName ( ) == name {
// Sometimes a runner can stuck "busy" even though it is already "offline".
// Thus removing the condition on status can block the runner pod from being terminated forever.
busy = runner . GetBusy ( )
if runner . GetStatus ( ) != "offline" && busy {
r . Log . Info ( "This runner will delay the runner pod deletion and the runner deregistration until it becomes either offline or non-busy" , "name" , runner . GetName ( ) , "status" , runner . GetStatus ( ) , "busy" , runner . GetBusy ( ) )
return false , fmt . Errorf ( "runner is busy" )
}
id = runner . GetID ( )
break
}
}
if id == int64 ( 0 ) {
return false , nil
}
// Sometimes a runner can stuck "busy" even though it is already "offline".
// Trying to remove the offline but busy runner can result in errors like the following:
// failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" []
if ! busy {
if err := r . GitHubClient . RemoveRunner ( ctx , enterprise , org , repo , id ) ; err != nil {
return false , err
}
}
return true , nil
}
func ( r * RunnerPodReconciler ) SetupWithManager ( mgr ctrl . Manager ) error {
name := "runnerpod-controller"
if r . Name != "" {
name = r . Name
}
r . Recorder = mgr . GetEventRecorderFor ( name )
return ctrl . NewControllerManagedBy ( mgr ) .
For ( & corev1 . Pod { } ) .
Named ( name ) .
Complete ( r )
}