@@ -10,12 +10,14 @@ import (
10
10
"strings"
11
11
"time"
12
12
13
+ "k8s.io/apimachinery/pkg/api/meta"
13
14
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
14
15
"k8s.io/utils/ptr"
15
16
16
17
"github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler"
17
18
"github.com/ray-project/kuberay/ray-operator/controllers/ray/common"
18
19
"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
20
+ "github.com/ray-project/kuberay/ray-operator/pkg/features"
19
21
20
22
batchv1 "k8s.io/api/batch/v1"
21
23
rbacv1 "k8s.io/api/rbac/v1"
@@ -44,7 +46,8 @@ import (
44
46
"sigs.k8s.io/controller-runtime/pkg/reconcile"
45
47
)
46
48
47
- type reconcileFunc func (context.Context , * rayv1.RayCluster ) error
49
+ type rayClusterConditions map [rayv1.RayClusterConditionType ]metav1.Condition
50
+ type reconcileFunc func (context.Context , * rayv1.RayCluster , rayClusterConditions ) error
48
51
49
52
var (
50
53
DefaultRequeueDuration = 2 * time .Second
@@ -300,6 +303,9 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, request
300
303
return ctrl.Result {}, nil
301
304
}
302
305
306
+ // conditions should be mutated by the following reconcileXXX functions.
307
+ conditions := defaultRayClusterConditions ()
308
+
303
309
reconcileFuncs := []reconcileFunc {
304
310
r .reconcileAutoscalerServiceAccount ,
305
311
r .reconcileAutoscalerRole ,
@@ -312,7 +318,7 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, request
312
318
}
313
319
314
320
for _ , fn := range reconcileFuncs {
315
- if reconcileErr = fn (ctx , instance ); reconcileErr != nil {
321
+ if reconcileErr = fn (ctx , instance , conditions ); reconcileErr != nil {
316
322
funcName := runtime .FuncForPC (reflect .ValueOf (fn ).Pointer ()).Name ()
317
323
logger .Error (reconcileErr , "Error reconcile resources" , "function name" , funcName )
318
324
break
@@ -325,7 +331,7 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, request
325
331
if calculateErr != nil {
326
332
logger .Info ("Got error when calculating new status" , "cluster name" , request .Name , "error" , calculateErr )
327
333
} else {
328
- updateErr = r .updateRayClusterStatus (ctx , originalRayClusterInstance , newInstance )
334
+ updateErr = r .updateRayClusterStatus (ctx , originalRayClusterInstance , newInstance , conditions )
329
335
}
330
336
331
337
// Return error based on order.
@@ -394,7 +400,7 @@ func (r *RayClusterReconciler) inconsistentRayClusterStatus(ctx context.Context,
394
400
return false
395
401
}
396
402
397
- func (r * RayClusterReconciler ) reconcileIngress (ctx context.Context , instance * rayv1.RayCluster ) error {
403
+ func (r * RayClusterReconciler ) reconcileIngress (ctx context.Context , instance * rayv1.RayCluster , _ rayClusterConditions ) error {
398
404
logger := ctrl .LoggerFrom (ctx )
399
405
logger .Info ("Reconciling Ingress" )
400
406
if instance .Spec .HeadGroupSpec .EnableIngress == nil || ! * instance .Spec .HeadGroupSpec .EnableIngress {
@@ -474,7 +480,7 @@ func (r *RayClusterReconciler) reconcileIngressKubernetes(ctx context.Context, i
474
480
}
475
481
476
482
// Return nil only when the head service successfully created or already exists.
477
- func (r * RayClusterReconciler ) reconcileHeadService (ctx context.Context , instance * rayv1.RayCluster ) error {
483
+ func (r * RayClusterReconciler ) reconcileHeadService (ctx context.Context , instance * rayv1.RayCluster , _ rayClusterConditions ) error {
478
484
logger := ctrl .LoggerFrom (ctx )
479
485
services := corev1.ServiceList {}
480
486
filterLabels := client.MatchingLabels {utils .RayClusterLabelKey : instance .Name , utils .RayNodeTypeLabelKey : string (rayv1 .HeadNode )}
@@ -526,7 +532,7 @@ func (r *RayClusterReconciler) reconcileHeadService(ctx context.Context, instanc
526
532
}
527
533
528
534
// Return nil only when the serve service successfully created or already exists.
529
- func (r * RayClusterReconciler ) reconcileServeService (ctx context.Context , instance * rayv1.RayCluster ) error {
535
+ func (r * RayClusterReconciler ) reconcileServeService (ctx context.Context , instance * rayv1.RayCluster , _ rayClusterConditions ) error {
530
536
// Only reconcile the K8s service for Ray Serve when the "ray.io/enable-serve-service" annotation is set to true.
531
537
if enableServeServiceValue , exist := instance .Annotations [utils .EnableServeServiceKey ]; ! exist || enableServeServiceValue != utils .EnableServeServiceTrue {
532
538
return nil
@@ -555,7 +561,7 @@ func (r *RayClusterReconciler) reconcileServeService(ctx context.Context, instan
555
561
}
556
562
557
563
// Return nil only when the headless service for multi-host worker groups is successfully created or already exists.
558
- func (r * RayClusterReconciler ) reconcileHeadlessService (ctx context.Context , instance * rayv1.RayCluster ) error {
564
+ func (r * RayClusterReconciler ) reconcileHeadlessService (ctx context.Context , instance * rayv1.RayCluster , _ rayClusterConditions ) error {
559
565
// Check if there are worker groups with NumOfHosts > 1 in the cluster
560
566
isMultiHost := false
561
567
for _ , workerGroup := range instance .Spec .WorkerGroupSpecs {
@@ -591,12 +597,17 @@ func (r *RayClusterReconciler) reconcileHeadlessService(ctx context.Context, ins
591
597
return nil
592
598
}
593
599
594
- func (r * RayClusterReconciler ) reconcilePods (ctx context.Context , instance * rayv1.RayCluster ) error {
600
+ func (r * RayClusterReconciler ) reconcilePods (ctx context.Context , instance * rayv1.RayCluster , conditions rayClusterConditions ) error {
595
601
logger := ctrl .LoggerFrom (ctx )
596
602
597
603
// if RayCluster is suspended, delete all pods and skip reconcile
598
604
if instance .Spec .Suspend != nil && * instance .Spec .Suspend {
599
605
if _ , err := r .deleteAllPods (ctx , common .RayClusterAllPodsAssociationOptions (instance )); err != nil {
606
+ conditions [rayv1 .RayClusterReplicaFailure ] = metav1.Condition {
607
+ Status : metav1 .ConditionTrue ,
608
+ Reason : "FailedDeleteAllPods" ,
609
+ Message : err .Error (),
610
+ }
600
611
return err
601
612
}
602
613
@@ -632,6 +643,11 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
632
643
logger .Info ("reconcilePods" , "head Pod" , headPod .Name , "shouldDelete" , shouldDelete , "reason" , reason )
633
644
if shouldDelete {
634
645
if err := r .Delete (ctx , & headPod ); err != nil {
646
+ conditions [rayv1 .RayClusterReplicaFailure ] = metav1.Condition {
647
+ Status : metav1 .ConditionTrue ,
648
+ Reason : "FailedDeleteHeadPod" ,
649
+ Message : err .Error (),
650
+ }
635
651
return err
636
652
}
637
653
r .Recorder .Eventf (instance , corev1 .EventTypeNormal , "Deleted" ,
@@ -644,6 +660,11 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
644
660
logger .Info ("reconcilePods" , "Found 0 head Pods; creating a head Pod for the RayCluster." , instance .Name )
645
661
common .CreatedClustersCounterInc (instance .Namespace )
646
662
if err := r .createHeadPod (ctx , * instance ); err != nil {
663
+ conditions [rayv1 .RayClusterReplicaFailure ] = metav1.Condition {
664
+ Status : metav1 .ConditionTrue ,
665
+ Reason : "FailedCreateHeadPod" ,
666
+ Message : err .Error (),
667
+ }
647
668
common .FailedClustersCounterInc (instance .Namespace )
648
669
return err
649
670
}
@@ -663,6 +684,11 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
663
684
// delete all the extra head pod pods
664
685
for _ , extraHeadPodToDelete := range headPods .Items {
665
686
if err := r .Delete (ctx , & extraHeadPodToDelete ); err != nil {
687
+ conditions [rayv1 .RayClusterReplicaFailure ] = metav1.Condition {
688
+ Status : metav1 .ConditionTrue ,
689
+ Reason : "FailedDeleteHeadPod" ,
690
+ Message : err .Error (),
691
+ }
666
692
return err
667
693
}
668
694
}
@@ -690,6 +716,11 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
690
716
numDeletedUnhealthyWorkerPods ++
691
717
deletedWorkers [workerPod .Name ] = deleted
692
718
if err := r .Delete (ctx , & workerPod ); err != nil {
719
+ conditions [rayv1 .RayClusterReplicaFailure ] = metav1.Condition {
720
+ Status : metav1 .ConditionTrue ,
721
+ Reason : "FailedDeleteWorkerPod" ,
722
+ Message : err .Error (),
723
+ }
693
724
return err
694
725
}
695
726
r .Recorder .Eventf (instance , corev1 .EventTypeNormal , "Deleted" ,
@@ -713,6 +744,11 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
713
744
logger .Info ("Deleting pod" , "namespace" , pod .Namespace , "name" , pod .Name )
714
745
if err := r .Delete (ctx , & pod ); err != nil {
715
746
if ! errors .IsNotFound (err ) {
747
+ conditions [rayv1 .RayClusterReplicaFailure ] = metav1.Condition {
748
+ Status : metav1 .ConditionTrue ,
749
+ Reason : "FailedDeleteWorkerPod" ,
750
+ Message : err .Error (),
751
+ }
716
752
logger .Info ("reconcilePods" , "Fail to delete Pod" , pod .Name , "error" , err )
717
753
return err
718
754
}
@@ -749,6 +785,11 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
749
785
for i = 0 ; i < diff ; i ++ {
750
786
logger .Info ("reconcilePods" , "creating worker for group" , worker .GroupName , fmt .Sprintf ("index %d" , i ), fmt .Sprintf ("in total %d" , diff ))
751
787
if err := r .createWorkerPod (ctx , * instance , * worker .DeepCopy ()); err != nil {
788
+ conditions [rayv1 .RayClusterReplicaFailure ] = metav1.Condition {
789
+ Status : metav1 .ConditionTrue ,
790
+ Reason : "FailedCreateWorkerPod" ,
791
+ Message : err .Error (),
792
+ }
752
793
return err
753
794
}
754
795
}
@@ -782,6 +823,11 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
782
823
logger .Info ("Randomly deleting Pod" , "progress" , fmt .Sprintf ("%d / %d" , i + 1 , randomlyRemovedWorkers ), "with name" , randomPodToDelete .Name )
783
824
if err := r .Delete (ctx , & randomPodToDelete ); err != nil {
784
825
if ! errors .IsNotFound (err ) {
826
+ conditions [rayv1 .RayClusterReplicaFailure ] = metav1.Condition {
827
+ Status : metav1 .ConditionTrue ,
828
+ Reason : "FailedDeleteWorkerPod" ,
829
+ Message : err .Error (),
830
+ }
785
831
return err
786
832
}
787
833
logger .Info ("reconcilePods" , "The worker Pod has already been deleted" , randomPodToDelete .Name )
@@ -796,6 +842,12 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
796
842
return nil
797
843
}
798
844
845
+ func defaultRayClusterConditions () rayClusterConditions {
846
+ return map [rayv1.RayClusterConditionType ]metav1.Condition {
847
+ rayv1 .RayClusterReplicaFailure : {Status : metav1 .ConditionFalse }, // omit the Condition.Type here for simplicity. we will set it later in the updateRayClusterStatus().
848
+ }
849
+ }
850
+
799
851
// shouldDeletePod returns whether the Pod should be deleted and the reason
800
852
//
801
853
// @param pod: The Pod to be checked.
@@ -1301,7 +1353,7 @@ func (r *RayClusterReconciler) updateHeadInfo(ctx context.Context, instance *ray
1301
1353
return nil
1302
1354
}
1303
1355
1304
- func (r * RayClusterReconciler ) reconcileAutoscalerServiceAccount (ctx context.Context , instance * rayv1.RayCluster ) error {
1356
+ func (r * RayClusterReconciler ) reconcileAutoscalerServiceAccount (ctx context.Context , instance * rayv1.RayCluster , _ rayClusterConditions ) error {
1305
1357
logger := ctrl .LoggerFrom (ctx )
1306
1358
if instance .Spec .EnableInTreeAutoscaling == nil || ! * instance .Spec .EnableInTreeAutoscaling {
1307
1359
return nil
@@ -1356,7 +1408,7 @@ func (r *RayClusterReconciler) reconcileAutoscalerServiceAccount(ctx context.Con
1356
1408
return nil
1357
1409
}
1358
1410
1359
- func (r * RayClusterReconciler ) reconcileAutoscalerRole (ctx context.Context , instance * rayv1.RayCluster ) error {
1411
+ func (r * RayClusterReconciler ) reconcileAutoscalerRole (ctx context.Context , instance * rayv1.RayCluster , _ rayClusterConditions ) error {
1360
1412
logger := ctrl .LoggerFrom (ctx )
1361
1413
if instance .Spec .EnableInTreeAutoscaling == nil || ! * instance .Spec .EnableInTreeAutoscaling {
1362
1414
return nil
@@ -1397,7 +1449,7 @@ func (r *RayClusterReconciler) reconcileAutoscalerRole(ctx context.Context, inst
1397
1449
return nil
1398
1450
}
1399
1451
1400
- func (r * RayClusterReconciler ) reconcileAutoscalerRoleBinding (ctx context.Context , instance * rayv1.RayCluster ) error {
1452
+ func (r * RayClusterReconciler ) reconcileAutoscalerRoleBinding (ctx context.Context , instance * rayv1.RayCluster , _ rayClusterConditions ) error {
1401
1453
logger := ctrl .LoggerFrom (ctx )
1402
1454
if instance .Spec .EnableInTreeAutoscaling == nil || ! * instance .Spec .EnableInTreeAutoscaling {
1403
1455
return nil
@@ -1438,11 +1490,21 @@ func (r *RayClusterReconciler) reconcileAutoscalerRoleBinding(ctx context.Contex
1438
1490
return nil
1439
1491
}
1440
1492
1441
- func (r * RayClusterReconciler ) updateRayClusterStatus (ctx context.Context , originalRayClusterInstance , newInstance * rayv1.RayCluster ) error {
1493
+ func (r * RayClusterReconciler ) updateRayClusterStatus (ctx context.Context , originalRayClusterInstance , newInstance * rayv1.RayCluster , conditions rayClusterConditions ) error {
1442
1494
logger := ctrl .LoggerFrom (ctx )
1443
- if ! r .inconsistentRayClusterStatus (ctx , originalRayClusterInstance .Status , newInstance .Status ) {
1495
+
1496
+ inconsistent := false
1497
+ if features .Enabled (features .RayClusterStatusConditions ) {
1498
+ for typ , condition := range conditions {
1499
+ condition .Type = string (typ ) // make sure the condition.Type is set correctly.
1500
+ inconsistent = meta .SetStatusCondition (& newInstance .Status .Conditions , condition ) || inconsistent
1501
+ }
1502
+ }
1503
+ inconsistent = r .inconsistentRayClusterStatus (ctx , originalRayClusterInstance .Status , newInstance .Status ) || inconsistent
1504
+ if ! inconsistent {
1444
1505
return nil
1445
1506
}
1507
+
1446
1508
logger .Info ("updateRayClusterStatus" , "name" , originalRayClusterInstance .Name , "old status" , originalRayClusterInstance .Status , "new status" , newInstance .Status )
1447
1509
err := r .Status ().Update (ctx , newInstance )
1448
1510
if err != nil {
0 commit comments