5
5
"fmt"
6
6
"os"
7
7
"reflect"
8
+ "runtime"
8
9
"strconv"
9
10
"strings"
10
11
"time"
@@ -32,7 +33,7 @@ import (
32
33
networkingv1 "k8s.io/api/networking/v1"
33
34
"k8s.io/apimachinery/pkg/api/errors"
34
35
"k8s.io/apimachinery/pkg/api/resource"
35
- "k8s.io/apimachinery/pkg/runtime"
36
+ k8sruntime "k8s.io/apimachinery/pkg/runtime"
36
37
ctrl "sigs.k8s.io/controller-runtime"
37
38
"sigs.k8s.io/controller-runtime/pkg/builder"
38
39
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -43,6 +44,8 @@ import (
43
44
"sigs.k8s.io/controller-runtime/pkg/reconcile"
44
45
)
45
46
47
// reconcileFunc is the common signature of the per-resource reconcile steps
// (reconcileAutoscalerServiceAccount, reconcileIngress, reconcilePods, ...).
// rayClusterReconcile collects these methods in a []reconcileFunc and calls
// them in order with the request context and the RayCluster, stopping at the
// first step that returns a non-nil error.
+ type reconcileFunc func (context.Context , * rayv1.RayCluster ) error
48
+
46
49
var (
47
50
DefaultRequeueDuration = 2 * time .Second
48
51
EnableBatchScheduler bool
@@ -117,7 +120,7 @@ var _ reconcile.Reconciler = &RayClusterReconciler{}
117
120
// RayClusterReconciler reconciles a RayCluster object
118
121
type RayClusterReconciler struct {
119
122
client.Client
120
- Scheme * runtime .Scheme
123
+ Scheme * k8sruntime .Scheme
121
124
Recorder record.EventRecorder
122
125
BatchSchedulerMgr * batchscheduler.SchedulerManager
123
126
@@ -194,6 +197,7 @@ func (r *RayClusterReconciler) deleteAllPods(ctx context.Context, filters common
194
197
}
195
198
196
199
func (r * RayClusterReconciler ) rayClusterReconcile (ctx context.Context , request ctrl.Request , instance * rayv1.RayCluster ) (ctrl.Result , error ) {
200
+ var reconcileErr error
197
201
logger := ctrl .LoggerFrom (ctx )
198
202
199
203
// Please do NOT modify `originalRayClusterInstance` in the following code.
@@ -296,77 +300,45 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, request
296
300
return ctrl.Result {}, nil
297
301
}
298
302
299
- if err := r .reconcileAutoscalerServiceAccount (ctx , instance ); err != nil {
300
- if updateErr := r .updateClusterState (ctx , instance , rayv1 .Failed ); updateErr != nil {
301
- logger .Error (updateErr , "RayCluster update state error" , "cluster name" , request .Name )
302
- }
303
- return ctrl.Result {RequeueAfter : DefaultRequeueDuration }, err
303
+ reconcileFuncs := []reconcileFunc {
304
+ r .reconcileAutoscalerServiceAccount ,
305
+ r .reconcileAutoscalerRole ,
306
+ r .reconcileAutoscalerRoleBinding ,
307
+ r .reconcileIngress ,
308
+ r .reconcileHeadService ,
309
+ r .reconcileHeadlessService ,
310
+ r .reconcileServeService ,
311
+ r .reconcilePods ,
304
312
}
305
313
306
- if err := r .reconcileAutoscalerRole (ctx , instance ); err != nil {
307
- if updateErr := r .updateClusterState (ctx , instance , rayv1 .Failed ); updateErr != nil {
308
- logger .Error (updateErr , "RayCluster update state error" , "cluster name" , request .Name )
309
- }
310
- return ctrl.Result {RequeueAfter : DefaultRequeueDuration }, err
311
- }
312
- if err := r .reconcileAutoscalerRoleBinding (ctx , instance ); err != nil {
313
- if updateErr := r .updateClusterState (ctx , instance , rayv1 .Failed ); updateErr != nil {
314
- logger .Error (updateErr , "RayCluster update state error" , "cluster name" , request .Name )
315
- }
316
- return ctrl.Result {RequeueAfter : DefaultRequeueDuration }, err
317
- }
318
- if err := r .reconcileIngress (ctx , instance ); err != nil {
319
- if updateErr := r .updateClusterState (ctx , instance , rayv1 .Failed ); updateErr != nil {
320
- logger .Error (updateErr , "RayCluster update state error" , "cluster name" , request .Name )
321
- }
322
- return ctrl.Result {RequeueAfter : DefaultRequeueDuration }, err
323
- }
324
- if err := r .reconcileHeadService (ctx , instance ); err != nil {
325
- if updateErr := r .updateClusterState (ctx , instance , rayv1 .Failed ); updateErr != nil {
326
- logger .Error (updateErr , "RayCluster update state error" , "cluster name" , request .Name )
327
- }
328
- return ctrl.Result {RequeueAfter : DefaultRequeueDuration }, err
329
- }
330
- if err := r .reconcileHeadlessService (ctx , instance ); err != nil {
331
- if updateErr := r .updateClusterState (ctx , instance , rayv1 .Failed ); updateErr != nil {
332
- logger .Error (updateErr , "RayCluster update state error" , "cluster name" , request .Name )
333
- }
334
- return ctrl.Result {RequeueAfter : DefaultRequeueDuration }, err
335
- }
336
- // Only reconcile the K8s service for Ray Serve when the "ray.io/enable-serve-service" annotation is set to true.
337
- if enableServeServiceValue , exist := instance .Annotations [utils .EnableServeServiceKey ]; exist && enableServeServiceValue == utils .EnableServeServiceTrue {
338
- if err := r .reconcileServeService (ctx , instance ); err != nil {
339
- if updateErr := r .updateClusterState (ctx , instance , rayv1 .Failed ); updateErr != nil {
340
- logger .Error (updateErr , "RayCluster update state error" , "cluster name" , request .Name )
341
- }
342
- return ctrl.Result {RequeueAfter : DefaultRequeueDuration }, err
343
- }
344
- }
345
- if err := r .reconcilePods (ctx , instance ); err != nil {
346
- if updateErr := r .updateClusterState (ctx , instance , rayv1 .Failed ); updateErr != nil {
347
- logger .Error (updateErr , "RayCluster update state error" , "cluster name" , request .Name )
348
- }
349
- if updateErr := r .updateClusterReason (ctx , instance , err .Error ()); updateErr != nil {
350
- logger .Error (updateErr , "RayCluster update reason error" , "cluster name" , request .Name )
314
+ for _ , fn := range reconcileFuncs {
315
+ if reconcileErr = fn (ctx , instance ); reconcileErr != nil {
316
+ funcName := runtime .FuncForPC (reflect .ValueOf (fn ).Pointer ()).Name ()
317
+ logger .Error (reconcileErr , "Error reconcile resources" , "function name" , funcName )
318
+ break
351
319
}
352
- r .Recorder .Event (instance , corev1 .EventTypeWarning , string (rayv1 .PodReconciliationError ), err .Error ())
353
- return ctrl.Result {RequeueAfter : DefaultRequeueDuration }, err
354
320
}
355
321
356
322
// Calculate the new status for the RayCluster. Note that the function will deep copy `instance` instead of mutating it.
357
- newInstance , err := r .calculateStatus (ctx , instance )
358
- if err != nil {
359
- logger .Info ("Got error when calculating new status" , "cluster name" , request .Name , "error" , err )
360
- return ctrl.Result {RequeueAfter : DefaultRequeueDuration }, err
323
+ newInstance , calculateErr := r .calculateStatus (ctx , instance , reconcileErr )
324
+ var updateErr error
325
+ if calculateErr != nil {
326
+ logger .Info ("Got error when calculating new status" , "cluster name" , request .Name , "error" , calculateErr )
327
+ } else {
328
+ updateErr = r .updateRayClusterStatus (ctx , originalRayClusterInstance , newInstance )
361
329
}
362
330
363
- // Check if need to update the status.
364
- if r .inconsistentRayClusterStatus (ctx , originalRayClusterInstance .Status , newInstance .Status ) {
365
- logger .Info ("rayClusterReconcile" , "Update CR status" , request .Name , "status" , newInstance .Status )
366
- if err := r .Status ().Update (ctx , newInstance ); err != nil {
367
- logger .Info ("Got error when updating status" , "cluster name" , request .Name , "error" , err , "RayCluster" , newInstance )
368
- return ctrl.Result {RequeueAfter : DefaultRequeueDuration }, err
369
- }
331
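+ // Return the first non-nil error in priority order: the reconcile error, then the status-calculation error, then the status-update error.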
332
+ var err error
333
+ if reconcileErr != nil {
334
+ err = reconcileErr
335
+ } else if calculateErr != nil {
336
+ err = calculateErr
337
+ } else {
338
+ err = updateErr
339
+ }
340
+ if err != nil {
341
+ return ctrl.Result {RequeueAfter : DefaultRequeueDuration }, err
370
342
}
371
343
372
344
// Unconditionally requeue after the number of seconds specified in the
@@ -555,6 +527,11 @@ func (r *RayClusterReconciler) reconcileHeadService(ctx context.Context, instanc
555
527
556
528
// Return nil when the serve service is not enabled via annotation, or when it was successfully created or already exists.
557
529
func (r * RayClusterReconciler ) reconcileServeService (ctx context.Context , instance * rayv1.RayCluster ) error {
530
+ // Only reconcile the K8s service for Ray Serve when the "ray.io/enable-serve-service" annotation is set to true.
531
+ if enableServeServiceValue , exist := instance .Annotations [utils .EnableServeServiceKey ]; ! exist || enableServeServiceValue != utils .EnableServeServiceTrue {
532
+ return nil
533
+ }
534
+
558
535
// Retrieve the Service from the Kubernetes cluster with the name and namespace.
559
536
svc := & corev1.Service {}
560
537
err := r .Get (ctx , common .RayClusterServeServiceNamespacedName (instance ), svc )
@@ -1168,45 +1145,50 @@ func (r *RayClusterReconciler) SetupWithManager(mgr ctrl.Manager, reconcileConcu
1168
1145
Complete (r )
1169
1146
}
1170
1147
1171
- func (r * RayClusterReconciler ) calculateStatus (ctx context.Context , instance * rayv1.RayCluster ) (* rayv1.RayCluster , error ) {
1148
+ func (r * RayClusterReconciler ) calculateStatus (ctx context.Context , instance * rayv1.RayCluster , reconcileErr error ) (* rayv1.RayCluster , error ) {
1172
1149
// Deep copy the instance, so we don't mutate the original object.
1173
1150
newInstance := instance .DeepCopy ()
1174
1151
1175
- // TODO (kevin85421): ObservedGeneration should be used to determine whether to update this CR or not.
1176
- newInstance .Status .ObservedGeneration = newInstance .ObjectMeta .Generation
1152
+ if reconcileErr != nil {
1153
+ newInstance .Status .State = rayv1 .Failed
1154
+ newInstance .Status .Reason = reconcileErr .Error ()
1155
+ } else {
1156
+ // TODO (kevin85421): ObservedGeneration should be used to determine whether to update this CR or not.
1157
+ newInstance .Status .ObservedGeneration = newInstance .ObjectMeta .Generation
1177
1158
1178
- runtimePods := corev1.PodList {}
1179
- filterLabels := client.MatchingLabels {utils .RayClusterLabelKey : newInstance .Name }
1180
- if err := r .List (ctx , & runtimePods , client .InNamespace (newInstance .Namespace ), filterLabels ); err != nil {
1181
- return nil , err
1182
- }
1159
+ runtimePods := corev1.PodList {}
1160
+ filterLabels := client.MatchingLabels {utils .RayClusterLabelKey : newInstance .Name }
1161
+ if err := r .List (ctx , & runtimePods , client .InNamespace (newInstance .Namespace ), filterLabels ); err != nil {
1162
+ return nil , err
1163
+ }
1183
1164
1184
- newInstance .Status .ReadyWorkerReplicas = utils .CalculateReadyReplicas (runtimePods )
1185
- newInstance .Status .AvailableWorkerReplicas = utils .CalculateAvailableReplicas (runtimePods )
1186
- newInstance .Status .DesiredWorkerReplicas = utils .CalculateDesiredReplicas (ctx , newInstance )
1187
- newInstance .Status .MinWorkerReplicas = utils .CalculateMinReplicas (newInstance )
1188
- newInstance .Status .MaxWorkerReplicas = utils .CalculateMaxReplicas (newInstance )
1165
+ newInstance .Status .ReadyWorkerReplicas = utils .CalculateReadyReplicas (runtimePods )
1166
+ newInstance .Status .AvailableWorkerReplicas = utils .CalculateAvailableReplicas (runtimePods )
1167
+ newInstance .Status .DesiredWorkerReplicas = utils .CalculateDesiredReplicas (ctx , newInstance )
1168
+ newInstance .Status .MinWorkerReplicas = utils .CalculateMinReplicas (newInstance )
1169
+ newInstance .Status .MaxWorkerReplicas = utils .CalculateMaxReplicas (newInstance )
1189
1170
1190
- totalResources := utils .CalculateDesiredResources (newInstance )
1191
- newInstance .Status .DesiredCPU = totalResources [corev1 .ResourceCPU ]
1192
- newInstance .Status .DesiredMemory = totalResources [corev1 .ResourceMemory ]
1193
- newInstance .Status .DesiredGPU = sumGPUs (totalResources )
1194
- newInstance .Status .DesiredTPU = totalResources [corev1 .ResourceName ("google.com/tpu" )]
1171
+ totalResources := utils .CalculateDesiredResources (newInstance )
1172
+ newInstance .Status .DesiredCPU = totalResources [corev1 .ResourceCPU ]
1173
+ newInstance .Status .DesiredMemory = totalResources [corev1 .ResourceMemory ]
1174
+ newInstance .Status .DesiredGPU = sumGPUs (totalResources )
1175
+ newInstance .Status .DesiredTPU = totalResources [corev1 .ResourceName ("google.com/tpu" )]
1195
1176
1196
- if utils .CheckAllPodsRunning (ctx , runtimePods ) {
1197
- newInstance .Status .State = rayv1 .Ready
1198
- }
1177
+ if utils .CheckAllPodsRunning (ctx , runtimePods ) {
1178
+ newInstance .Status .State = rayv1 .Ready
1179
+ }
1199
1180
1200
- if newInstance .Spec .Suspend != nil && * newInstance .Spec .Suspend && len (runtimePods .Items ) == 0 {
1201
- newInstance .Status .State = rayv1 .Suspended
1202
- }
1181
+ if newInstance .Spec .Suspend != nil && * newInstance .Spec .Suspend && len (runtimePods .Items ) == 0 {
1182
+ newInstance .Status .State = rayv1 .Suspended
1183
+ }
1203
1184
1204
- if err := r .updateEndpoints (ctx , newInstance ); err != nil {
1205
- return nil , err
1206
- }
1185
+ if err := r .updateEndpoints (ctx , newInstance ); err != nil {
1186
+ return nil , err
1187
+ }
1207
1188
1208
- if err := r .updateHeadInfo (ctx , newInstance ); err != nil {
1209
- return nil , err
1189
+ if err := r .updateHeadInfo (ctx , newInstance ); err != nil {
1190
+ return nil , err
1191
+ }
1210
1192
}
1211
1193
1212
1194
timeNow := metav1 .Now ()
@@ -1456,24 +1438,17 @@ func (r *RayClusterReconciler) reconcileAutoscalerRoleBinding(ctx context.Contex
1456
1438
return nil
1457
1439
}
1458
1440
1459
- func (r * RayClusterReconciler ) updateClusterState (ctx context.Context , instance * rayv1. RayCluster , clusterState rayv1.ClusterState ) error {
1441
// updateRayClusterStatus persists newInstance's status through r.Status().Update,
// but only when inconsistentRayClusterStatus reports that the original and new
// statuses differ. It returns nil when no update is needed; otherwise it returns
// whatever error the status update produced (nil on success).
// It replaces the removed updateClusterState / updateClusterReason helpers shown
// as deleted lines below.
+ func (r * RayClusterReconciler ) updateRayClusterStatus (ctx context.Context , originalRayClusterInstance , newInstance * rayv1.RayCluster ) error {
1460
1442
logger := ctrl .LoggerFrom (ctx )
1461
- if instance . Status . State == clusterState {
1443
// Skip the write entirely when the status comparison reports no difference.
+ if ! r . inconsistentRayClusterStatus ( ctx , originalRayClusterInstance . Status , newInstance . Status ) {
1462
1444
return nil
1463
1445
}
1464
- instance .Status .State = clusterState
1465
- logger .Info ("updateClusterState" , "Update CR Status.State" , clusterState )
1466
- return r .Status ().Update (ctx , instance )
1467
- }
1468
-
1469
- func (r * RayClusterReconciler ) updateClusterReason (ctx context.Context , instance * rayv1.RayCluster , clusterReason string ) error {
1470
- logger := ctrl .LoggerFrom (ctx )
1471
- if instance .Status .Reason == clusterReason {
1472
- return nil
1446
// Log both the old and the new status before attempting the update.
+ logger .Info ("updateRayClusterStatus" , "name" , originalRayClusterInstance .Name , "old status" , originalRayClusterInstance .Status , "new status" , newInstance .Status )
1447
+ err := r .Status ().Update (ctx , newInstance )
1448
+ if err != nil {
1449
// NOTE(review): the failure is logged here at Info level and also returned,
// so a caller that logs the returned error will report it twice.
+ logger .Info ("Error updating status" , "name" , originalRayClusterInstance .Name , "error" , err , "RayCluster" , newInstance )
1473
1450
}
1474
- instance .Status .Reason = clusterReason
1475
- logger .Info ("updateClusterReason" , "Update CR Status.Reason" , clusterReason )
1476
- return r .Status ().Update (ctx , instance )
1451
+ return err
1477
1452
}
1478
1453
1479
1454
// sumGPUs sums the GPUs in the given resource list.
0 commit comments