2021-06-24 20:39:37 +09:00
/*
Copyright 2020 The actions-runner-controller authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package controllers
import (
"context"
2022-03-23 12:39:58 +01:00
"errors"
2021-06-24 20:39:37 +09:00
"fmt"
2022-06-28 01:12:40 -04:00
"sync"
2021-06-24 20:39:37 +09:00
"time"
2021-12-17 09:06:55 +09:00
"github.com/go-logr/logr"
2021-06-24 20:39:37 +09:00
kerrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/tools/record"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
corev1 "k8s.io/api/core/v1"
"github.com/actions-runner-controller/actions-runner-controller/github"
)
// RunnerPodReconciler reconciles a Runner object
type RunnerPodReconciler struct {
client . Client
Log logr . Logger
Recorder record . EventRecorder
Scheme * runtime . Scheme
GitHubClient * github . Client
Name string
RegistrationRecheckInterval time . Duration
RegistrationRecheckJitter time . Duration
2022-02-20 07:45:49 +00:00
UnregistrationRetryDelay time . Duration
2021-06-24 20:39:37 +09:00
}
// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;update;patch;delete
2022-06-28 01:12:40 -04:00
// +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch
2021-06-24 20:39:37 +09:00
// +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch
func ( r * RunnerPodReconciler ) Reconcile ( ctx context . Context , req ctrl . Request ) ( ctrl . Result , error ) {
log := r . Log . WithValues ( "runnerpod" , req . NamespacedName )
var runnerPod corev1 . Pod
if err := r . Get ( ctx , req . NamespacedName , & runnerPod ) ; err != nil {
return ctrl . Result { } , client . IgnoreNotFound ( err )
}
_ , isRunnerPod := runnerPod . Labels [ LabelKeyRunnerSetName ]
if ! isRunnerPod {
return ctrl . Result { } , nil
}
2022-03-23 12:39:58 +01:00
var envvars [ ] corev1 . EnvVar
for _ , container := range runnerPod . Spec . Containers {
if container . Name == "runner" {
envvars = container . Env
}
}
if len ( envvars ) == 0 {
return ctrl . Result { } , errors . New ( "Could not determine env vars for runner Pod" )
}
2021-06-24 20:39:37 +09:00
var enterprise , org , repo string
2022-06-28 01:12:40 -04:00
var isContainerMode bool
2021-06-24 20:39:37 +09:00
for _ , e := range envvars {
switch e . Name {
case EnvVarEnterprise :
enterprise = e . Value
case EnvVarOrg :
org = e . Value
case EnvVarRepo :
repo = e . Value
2022-06-28 01:12:40 -04:00
case "ACTIONS_RUNNER_CONTAINER_HOOKS" :
isContainerMode = true
2021-06-24 20:39:37 +09:00
}
}
if runnerPod . ObjectMeta . DeletionTimestamp . IsZero ( ) {
finalizers , added := addFinalizer ( runnerPod . ObjectMeta . Finalizers , runnerPodFinalizerName )
2022-06-28 01:12:40 -04:00
var cleanupFinalizersAdded bool
if isContainerMode {
finalizers , cleanupFinalizersAdded = addFinalizer ( finalizers , runnerLinkedResourcesFinalizerName )
}
if added || cleanupFinalizersAdded {
2021-06-24 20:39:37 +09:00
newRunner := runnerPod . DeepCopy ( )
newRunner . ObjectMeta . Finalizers = finalizers
if err := r . Patch ( ctx , newRunner , client . MergeFrom ( & runnerPod ) ) ; err != nil {
log . Error ( err , "Failed to update runner" )
return ctrl . Result { } , err
}
2022-03-01 02:28:15 +00:00
log . V ( 2 ) . Info ( "Added finalizer" )
2021-06-24 20:39:37 +09:00
return ctrl . Result { } , nil
}
} else {
2022-03-01 02:28:15 +00:00
log . V ( 2 ) . Info ( "Seen deletion-timestamp is already set" )
2022-06-28 01:12:40 -04:00
if finalizers , removed := removeFinalizer ( runnerPod . ObjectMeta . Finalizers , runnerLinkedResourcesFinalizerName ) ; removed {
if err := r . cleanupRunnerLinkedPods ( ctx , & runnerPod , log ) ; err != nil {
log . Info ( "Runner-linked pods clean up that has failed due to an error. If this persists, please manually remove the runner-linked pods to unblock ARC" , "err" , err . Error ( ) )
return ctrl . Result { Requeue : true , RequeueAfter : 30 * time . Second } , nil
}
if err := r . cleanupRunnerLinkedSecrets ( ctx , & runnerPod , log ) ; err != nil {
log . Info ( "Runner-linked secrets clean up that has failed due to an error. If this persists, please manually remove the runner-linked secrets to unblock ARC" , "err" , err . Error ( ) )
return ctrl . Result { Requeue : true , RequeueAfter : 30 * time . Second } , nil
}
patchedPod := runnerPod . DeepCopy ( )
patchedPod . ObjectMeta . Finalizers = finalizers
if err := r . Patch ( ctx , patchedPod , client . MergeFrom ( & runnerPod ) ) ; err != nil {
log . Error ( err , "Failed to update runner for finalizer linked resources removal" )
return ctrl . Result { } , err
}
// Otherwise the subsequent patch request can revive the removed finalizer and it will trigger a unnecessary reconcilation
runnerPod = * patchedPod
}
2021-06-24 20:39:37 +09:00
finalizers , removed := removeFinalizer ( runnerPod . ObjectMeta . Finalizers , runnerPodFinalizerName )
if removed {
2022-03-05 05:40:09 +00:00
// In a standard scenario, the upstream controller, like runnerset-controller, ensures this runner to be gracefully stopped before the deletion timestamp is set.
// But for the case that the user manually deleted it for whatever reason,
// we have to ensure it to gracefully stop now.
2022-03-13 07:22:04 +00:00
updatedPod , res , err := tickRunnerGracefulStop ( ctx , r . unregistrationRetryDelay ( ) , log , r . GitHubClient , r . Client , enterprise , org , repo , runnerPod . Name , & runnerPod )
2022-02-19 16:12:39 +00:00
if res != nil {
return * res , err
2021-06-24 20:39:37 +09:00
}
2022-02-19 16:12:39 +00:00
patchedPod := updatedPod . DeepCopy ( )
patchedPod . ObjectMeta . Finalizers = finalizers
2021-06-24 20:39:37 +09:00
2022-02-19 16:12:39 +00:00
// We commit the removal of the finalizer so that Kuberenetes notices it and delete the pod resource from the cluster.
if err := r . Patch ( ctx , patchedPod , client . MergeFrom ( & runnerPod ) ) ; err != nil {
2021-06-24 20:39:37 +09:00
log . Error ( err , "Failed to update runner for finalizer removal" )
return ctrl . Result { } , err
}
2022-03-01 02:28:15 +00:00
log . V ( 2 ) . Info ( "Removed finalizer" )
return ctrl . Result { } , nil
2021-06-24 20:39:37 +09:00
}
deletionTimeout := 1 * time . Minute
currentTime := time . Now ( )
deletionDidTimeout := currentTime . Sub ( runnerPod . DeletionTimestamp . Add ( deletionTimeout ) ) > 0
if deletionDidTimeout {
log . Info (
fmt . Sprintf ( "Failed to delete pod within %s. " , deletionTimeout ) +
"This is typically the case when a Kubernetes node became unreachable " +
"and the kube controller started evicting nodes. Forcefully deleting the pod to not get stuck." ,
"podDeletionTimestamp" , runnerPod . DeletionTimestamp ,
"currentTime" , currentTime ,
"configuredDeletionTimeout" , deletionTimeout ,
)
var force int64 = 0
// forcefully delete runner as we would otherwise get stuck if the node stays unreachable
if err := r . Delete ( ctx , & runnerPod , & client . DeleteOptions { GracePeriodSeconds : & force } ) ; err != nil {
// probably
if ! kerrors . IsNotFound ( err ) {
log . Error ( err , "Failed to forcefully delete pod resource ..." )
return ctrl . Result { } , err
}
// forceful deletion finally succeeded
return ctrl . Result { Requeue : true } , nil
}
r . Recorder . Event ( & runnerPod , corev1 . EventTypeNormal , "PodDeleted" , fmt . Sprintf ( "Forcefully deleted pod '%s'" , runnerPod . Name ) )
log . Info ( "Forcefully deleted runner pod" , "repository" , repo )
// give kube manager a little time to forcefully delete the stuck pod
return ctrl . Result { RequeueAfter : 3 * time . Second } , nil
}
return ctrl . Result { } , nil
}
2022-03-01 02:28:15 +00:00
po , res , err := ensureRunnerPodRegistered ( ctx , log , r . GitHubClient , r . Client , enterprise , org , repo , runnerPod . Name , & runnerPod )
if res != nil {
return * res , err
}
runnerPod = * po
2022-03-05 10:41:52 +00:00
if _ , unregistrationRequested := getAnnotation ( & runnerPod , AnnotationKeyUnregistrationRequestTimestamp ) ; unregistrationRequested {
2022-03-01 02:28:15 +00:00
log . V ( 2 ) . Info ( "Progressing unregistration because unregistration-request timestamp is set" )
// At this point we're sure that DeletionTimestamp is not set yet, but the unregistration process is triggered by an upstream controller like runnerset-controller.
//
// In a standard scenario, ARC starts the unregistration process before marking the pod for deletion at all,
// so that it isn't subject to terminationGracePeriod and can safely take hours to finish it's work.
2022-03-13 07:22:04 +00:00
_ , res , err := tickRunnerGracefulStop ( ctx , r . unregistrationRetryDelay ( ) , log , r . GitHubClient , r . Client , enterprise , org , repo , runnerPod . Name , & runnerPod )
2022-03-01 02:28:15 +00:00
if res != nil {
return * res , err
}
// At this point we are sure that the runner has successfully unregistered, hence is safe to be deleted.
// But we don't delete the pod here. Instead, let the upstream controller/parent object to delete this pod as
// a part of a cascade deletion.
// This is to avoid a parent object, like statefulset, to recreate the deleted pod.
// If the pod was recreated, it will start a registration process and that may race with the statefulset deleting the pod.
log . V ( 2 ) . Info ( "Unregistration seems complete" )
return ctrl . Result { } , nil
}
2021-06-24 20:39:37 +09:00
return ctrl . Result { } , nil
}
2022-02-20 07:45:49 +00:00
// unregistrationRetryDelay returns the delay handed to tickRunnerGracefulStop:
// the reconciler's UnregistrationRetryDelay when it is positive, and
// DefaultUnregistrationRetryDelay otherwise.
func (r *RunnerPodReconciler) unregistrationRetryDelay() time.Duration {
	if configured := r.UnregistrationRetryDelay; configured > 0 {
		return configured
	}

	return DefaultUnregistrationRetryDelay
}
2021-06-24 20:39:37 +09:00
// SetupWithManager registers the reconciler with mgr as a controller for
// core/v1 Pods. The controller name is r.Name, or "runnerpod-controller" when
// r.Name is empty; the same name is used to obtain the event recorder that is
// stored in r.Recorder.
func (r *RunnerPodReconciler) SetupWithManager(mgr ctrl.Manager) error {
	controllerName := r.Name
	if controllerName == "" {
		controllerName = "runnerpod-controller"
	}

	r.Recorder = mgr.GetEventRecorderFor(controllerName)

	podController := ctrl.NewControllerManagedBy(mgr).
		For(&corev1.Pod{}).
		Named(controllerName)

	return podController.Complete(r)
}
2022-06-28 01:12:40 -04:00
// cleanupRunnerLinkedPods deletes every pod in pod's namespace whose
// "runner-pod" label equals pod's name. Pods that already have a deletion
// timestamp are skipped. Deletions run concurrently, one goroutine per pod,
// and the function waits for all of them before returning.
//
// It returns an error when listing fails or when at least one deletion fails
// with something other than NotFound/Gone; the individual deletion errors are
// written to log.
func (r *RunnerPodReconciler) cleanupRunnerLinkedPods(ctx context.Context, pod *corev1.Pod, log logr.Logger) error {
	var runnerLinkedPodList corev1.PodList
	if err := r.List(ctx, &runnerLinkedPodList, client.InNamespace(pod.Namespace), client.MatchingLabels(
		map[string]string{
			"runner-pod": pod.ObjectMeta.Name,
		},
	)); err != nil {
		return fmt.Errorf("failed to list runner-linked pods: %w", err)
	}

	var (
		wg sync.WaitGroup
		// mu guards errs, which is appended to from the deletion goroutines.
		mu   sync.Mutex
		errs []error
	)
	for _, p := range runnerLinkedPodList.Items {
		if !p.ObjectMeta.DeletionTimestamp.IsZero() {
			continue
		}

		p := p // per-iteration copy captured by the goroutine below
		wg.Add(1)
		go func() {
			defer wg.Done()

			err := r.Delete(ctx, &p)
			if err == nil || kerrors.IsNotFound(err) || kerrors.IsGone(err) {
				// Already-removed pods count as successfully cleaned up.
				return
			}

			mu.Lock()
			errs = append(errs, fmt.Errorf("delete pod %q error: %w", p.ObjectMeta.Name, err))
			mu.Unlock()
		}()
	}

	wg.Wait()

	if len(errs) > 0 {
		for _, err := range errs {
			log.Error(err, "failed to remove runner-linked pod")
		}
		return errors.New("failed to remove some runner linked pods")
	}

	return nil
}
// cleanupRunnerLinkedSecrets deletes every secret in pod's namespace whose
// "runner-pod" label equals pod's name. Secrets that already have a deletion
// timestamp are skipped. Deletions run concurrently, one goroutine per secret,
// and the function waits for all of them before returning.
//
// It returns an error when listing fails or when at least one deletion fails
// with something other than NotFound/Gone; the individual deletion errors are
// written to log.
func (r *RunnerPodReconciler) cleanupRunnerLinkedSecrets(ctx context.Context, pod *corev1.Pod, log logr.Logger) error {
	log.V(2).Info("Listing runner-linked secrets to be deleted", "ns", pod.Namespace)

	var runnerLinkedSecretList corev1.SecretList
	if err := r.List(ctx, &runnerLinkedSecretList, client.InNamespace(pod.Namespace), client.MatchingLabels(
		map[string]string{
			"runner-pod": pod.ObjectMeta.Name,
		},
	)); err != nil {
		return fmt.Errorf("failed to list runner-linked secrets: %w", err)
	}

	var (
		wg sync.WaitGroup
		// mu guards errs, which is appended to from the deletion goroutines.
		mu   sync.Mutex
		errs []error
	)
	for _, s := range runnerLinkedSecretList.Items {
		if !s.ObjectMeta.DeletionTimestamp.IsZero() {
			continue
		}

		s := s // per-iteration copy captured by the goroutine below
		wg.Add(1)
		go func() {
			defer wg.Done()

			err := r.Delete(ctx, &s)
			if err == nil || kerrors.IsNotFound(err) || kerrors.IsGone(err) {
				// Already-removed secrets count as successfully cleaned up.
				return
			}

			mu.Lock()
			errs = append(errs, fmt.Errorf("delete secret %q error: %w", s.ObjectMeta.Name, err))
			mu.Unlock()
		}()
	}

	wg.Wait()

	if len(errs) > 0 {
		for _, err := range errs {
			log.Error(err, "failed to remove runner-linked secret")
		}
		return errors.New("failed to remove some runner linked secrets")
	}

	return nil
}