@@ -10,12 +10,14 @@ import (
10
10
"strings"
11
11
"time"
12
12
13
+ "k8s.io/apimachinery/pkg/api/meta"
13
14
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
14
15
"k8s.io/utils/ptr"
15
16
16
17
"github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler"
17
18
"github.com/ray-project/kuberay/ray-operator/controllers/ray/common"
18
19
"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
20
+ "github.com/ray-project/kuberay/ray-operator/pkg/features"
19
21
20
22
batchv1 "k8s.io/api/batch/v1"
21
23
rbacv1 "k8s.io/api/rbac/v1"
@@ -44,7 +46,10 @@ import (
44
46
"sigs.k8s.io/controller-runtime/pkg/reconcile"
45
47
)
46
48
47
- type reconcileFunc func (context.Context , * rayv1.RayCluster ) error
49
// Helper types shared by the reconcile steps of the RayCluster controller.
+ type (
50
// rayClusterConditions maps a RayCluster condition type to the
// metav1.Condition value to record for that type. One map is created per
// reconcile pass and handed to every reconcileFunc.
+ rayClusterConditions map [rayv1.RayClusterConditionType ]metav1.Condition
51
// reconcileFunc is the signature of a single reconcile step. It receives
// the request context, the RayCluster being reconciled, and the
// conditions map for this pass; it returns a non-nil error when the step
// fails.
+ reconcileFunc func (context.Context , * rayv1.RayCluster , rayClusterConditions ) error
52
+ )
48
53
49
54
var (
50
55
DefaultRequeueDuration = 2 * time .Second
@@ -300,6 +305,9 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, request
300
305
return ctrl.Result {}, nil
301
306
}
302
307
308
+ // conditions should be mutated by the following reconcileXXX functions.
309
+ conditions := defaultRayClusterConditions ()
310
+
303
311
reconcileFuncs := []reconcileFunc {
304
312
r .reconcileAutoscalerServiceAccount ,
305
313
r .reconcileAutoscalerRole ,
@@ -312,7 +320,7 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, request
312
320
}
313
321
314
322
for _ , fn := range reconcileFuncs {
315
- if reconcileErr = fn (ctx , instance ); reconcileErr != nil {
323
+ if reconcileErr = fn (ctx , instance , conditions ); reconcileErr != nil {
316
324
funcName := runtime .FuncForPC (reflect .ValueOf (fn ).Pointer ()).Name ()
317
325
logger .Error (reconcileErr , "Error reconcile resources" , "function name" , funcName )
318
326
break
@@ -325,7 +333,7 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, request
325
333
if calculateErr != nil {
326
334
logger .Info ("Got error when calculating new status" , "cluster name" , request .Name , "error" , calculateErr )
327
335
} else {
328
- updateErr = r .updateRayClusterStatus (ctx , originalRayClusterInstance , newInstance )
336
+ updateErr = r .updateRayClusterStatus (ctx , originalRayClusterInstance , newInstance , conditions )
329
337
}
330
338
331
339
// Return error based on order.
@@ -394,7 +402,7 @@ func (r *RayClusterReconciler) inconsistentRayClusterStatus(ctx context.Context,
394
402
return false
395
403
}
396
404
397
- func (r * RayClusterReconciler ) reconcileIngress (ctx context.Context , instance * rayv1.RayCluster ) error {
405
+ func (r * RayClusterReconciler ) reconcileIngress (ctx context.Context , instance * rayv1.RayCluster , _ rayClusterConditions ) error {
398
406
logger := ctrl .LoggerFrom (ctx )
399
407
logger .Info ("Reconciling Ingress" )
400
408
if instance .Spec .HeadGroupSpec .EnableIngress == nil || ! * instance .Spec .HeadGroupSpec .EnableIngress {
@@ -474,7 +482,7 @@ func (r *RayClusterReconciler) reconcileIngressKubernetes(ctx context.Context, i
474
482
}
475
483
476
484
// Return nil only when the head service successfully created or already exists.
477
- func (r * RayClusterReconciler ) reconcileHeadService (ctx context.Context , instance * rayv1.RayCluster ) error {
485
+ func (r * RayClusterReconciler ) reconcileHeadService (ctx context.Context , instance * rayv1.RayCluster , _ rayClusterConditions ) error {
478
486
logger := ctrl .LoggerFrom (ctx )
479
487
services := corev1.ServiceList {}
480
488
filterLabels := client.MatchingLabels {utils .RayClusterLabelKey : instance .Name , utils .RayNodeTypeLabelKey : string (rayv1 .HeadNode )}
@@ -526,7 +534,7 @@ func (r *RayClusterReconciler) reconcileHeadService(ctx context.Context, instanc
526
534
}
527
535
528
536
// Return nil only when the serve service successfully created or already exists.
529
- func (r * RayClusterReconciler ) reconcileServeService (ctx context.Context , instance * rayv1.RayCluster ) error {
537
+ func (r * RayClusterReconciler ) reconcileServeService (ctx context.Context , instance * rayv1.RayCluster , _ rayClusterConditions ) error {
530
538
// Only reconcile the K8s service for Ray Serve when the "ray.io/enable-serve-service" annotation is set to true.
531
539
if enableServeServiceValue , exist := instance .Annotations [utils .EnableServeServiceKey ]; ! exist || enableServeServiceValue != utils .EnableServeServiceTrue {
532
540
return nil
@@ -555,7 +563,7 @@ func (r *RayClusterReconciler) reconcileServeService(ctx context.Context, instan
555
563
}
556
564
557
565
// Return nil only when the headless service for multi-host worker groups is successfully created or already exists.
558
- func (r * RayClusterReconciler ) reconcileHeadlessService (ctx context.Context , instance * rayv1.RayCluster ) error {
566
+ func (r * RayClusterReconciler ) reconcileHeadlessService (ctx context.Context , instance * rayv1.RayCluster , _ rayClusterConditions ) error {
559
567
// Check if there are worker groups with NumOfHosts > 1 in the cluster
560
568
isMultiHost := false
561
569
for _ , workerGroup := range instance .Spec .WorkerGroupSpecs {
@@ -591,12 +599,17 @@ func (r *RayClusterReconciler) reconcileHeadlessService(ctx context.Context, ins
591
599
return nil
592
600
}
593
601
594
- func (r * RayClusterReconciler ) reconcilePods (ctx context.Context , instance * rayv1.RayCluster ) error {
602
+ func (r * RayClusterReconciler ) reconcilePods (ctx context.Context , instance * rayv1.RayCluster , conditions rayClusterConditions ) error {
595
603
logger := ctrl .LoggerFrom (ctx )
596
604
597
605
// if RayCluster is suspended, delete all pods and skip reconcile
598
606
if instance .Spec .Suspend != nil && * instance .Spec .Suspend {
599
607
if _ , err := r .deleteAllPods (ctx , common .RayClusterAllPodsAssociationOptions (instance )); err != nil {
608
+ conditions [rayv1 .RayClusterReplicaFailure ] = metav1.Condition {
609
+ Status : metav1 .ConditionTrue ,
610
+ Reason : "FailedDeleteAllPods" ,
611
+ Message : err .Error (),
612
+ }
600
613
return err
601
614
}
602
615
@@ -632,6 +645,11 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
632
645
logger .Info ("reconcilePods" , "head Pod" , headPod .Name , "shouldDelete" , shouldDelete , "reason" , reason )
633
646
if shouldDelete {
634
647
if err := r .Delete (ctx , & headPod ); err != nil {
648
+ conditions [rayv1 .RayClusterReplicaFailure ] = metav1.Condition {
649
+ Status : metav1 .ConditionTrue ,
650
+ Reason : "FailedDeleteHeadPod" ,
651
+ Message : err .Error (),
652
+ }
635
653
return err
636
654
}
637
655
r .Recorder .Eventf (instance , corev1 .EventTypeNormal , "Deleted" ,
@@ -644,6 +662,11 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
644
662
logger .Info ("reconcilePods" , "Found 0 head Pods; creating a head Pod for the RayCluster." , instance .Name )
645
663
common .CreatedClustersCounterInc (instance .Namespace )
646
664
if err := r .createHeadPod (ctx , * instance ); err != nil {
665
+ conditions [rayv1 .RayClusterReplicaFailure ] = metav1.Condition {
666
+ Status : metav1 .ConditionTrue ,
667
+ Reason : "FailedCreateHeadPod" ,
668
+ Message : err .Error (),
669
+ }
647
670
common .FailedClustersCounterInc (instance .Namespace )
648
671
return err
649
672
}
@@ -663,6 +686,11 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
663
686
// delete all the extra head pod pods
664
687
for _ , extraHeadPodToDelete := range headPods .Items {
665
688
if err := r .Delete (ctx , & extraHeadPodToDelete ); err != nil {
689
+ conditions [rayv1 .RayClusterReplicaFailure ] = metav1.Condition {
690
+ Status : metav1 .ConditionTrue ,
691
+ Reason : "FailedDeleteHeadPod" ,
692
+ Message : err .Error (),
693
+ }
666
694
return err
667
695
}
668
696
}
@@ -690,6 +718,11 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
690
718
numDeletedUnhealthyWorkerPods ++
691
719
deletedWorkers [workerPod .Name ] = deleted
692
720
if err := r .Delete (ctx , & workerPod ); err != nil {
721
+ conditions [rayv1 .RayClusterReplicaFailure ] = metav1.Condition {
722
+ Status : metav1 .ConditionTrue ,
723
+ Reason : "FailedDeleteWorkerPod" ,
724
+ Message : err .Error (),
725
+ }
693
726
return err
694
727
}
695
728
r .Recorder .Eventf (instance , corev1 .EventTypeNormal , "Deleted" ,
@@ -713,6 +746,11 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
713
746
logger .Info ("Deleting pod" , "namespace" , pod .Namespace , "name" , pod .Name )
714
747
if err := r .Delete (ctx , & pod ); err != nil {
715
748
if ! errors .IsNotFound (err ) {
749
+ conditions [rayv1 .RayClusterReplicaFailure ] = metav1.Condition {
750
+ Status : metav1 .ConditionTrue ,
751
+ Reason : "FailedDeleteWorkerPod" ,
752
+ Message : err .Error (),
753
+ }
716
754
logger .Info ("reconcilePods" , "Fail to delete Pod" , pod .Name , "error" , err )
717
755
return err
718
756
}
@@ -749,6 +787,11 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
749
787
for i = 0 ; i < diff ; i ++ {
750
788
logger .Info ("reconcilePods" , "creating worker for group" , worker .GroupName , fmt .Sprintf ("index %d" , i ), fmt .Sprintf ("in total %d" , diff ))
751
789
if err := r .createWorkerPod (ctx , * instance , * worker .DeepCopy ()); err != nil {
790
+ conditions [rayv1 .RayClusterReplicaFailure ] = metav1.Condition {
791
+ Status : metav1 .ConditionTrue ,
792
+ Reason : "FailedCreateWorkerPod" ,
793
+ Message : err .Error (),
794
+ }
752
795
return err
753
796
}
754
797
}
@@ -782,6 +825,11 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
782
825
logger .Info ("Randomly deleting Pod" , "progress" , fmt .Sprintf ("%d / %d" , i + 1 , randomlyRemovedWorkers ), "with name" , randomPodToDelete .Name )
783
826
if err := r .Delete (ctx , & randomPodToDelete ); err != nil {
784
827
if ! errors .IsNotFound (err ) {
828
+ conditions [rayv1 .RayClusterReplicaFailure ] = metav1.Condition {
829
+ Status : metav1 .ConditionTrue ,
830
+ Reason : "FailedDeleteWorkerPod" ,
831
+ Message : err .Error (),
832
+ }
785
833
return err
786
834
}
787
835
logger .Info ("reconcilePods" , "The worker Pod has already been deleted" , randomPodToDelete .Name )
@@ -796,6 +844,12 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
796
844
return nil
797
845
}
798
846
847
// defaultRayClusterConditions returns a new conditions map holding the
// starting value for a reconcile pass: RayClusterReplicaFailure with status
// ConditionFalse. reconcilePods overwrites that entry with ConditionTrue plus
// a Reason and Message when creating or deleting a Pod fails.
//
// NOTE(review): the default entry leaves Reason empty. metav1.Condition
// documents Reason as a required, non-empty field, so confirm that writing
// this default through the status update passes API validation.
+ func defaultRayClusterConditions () rayClusterConditions {
848
+ return map [rayv1.RayClusterConditionType ]metav1.Condition {
849
+ rayv1 .RayClusterReplicaFailure : {Status : metav1 .ConditionFalse }, // Condition.Type is omitted here for simplicity; updateRayClusterStatus() sets it from the map key.
850
+ }
851
+ }
852
+
799
853
// shouldDeletePod returns whether the Pod should be deleted and the reason
800
854
//
801
855
// @param pod: The Pod to be checked.
@@ -1301,7 +1355,7 @@ func (r *RayClusterReconciler) updateHeadInfo(ctx context.Context, instance *ray
1301
1355
return nil
1302
1356
}
1303
1357
1304
- func (r * RayClusterReconciler ) reconcileAutoscalerServiceAccount (ctx context.Context , instance * rayv1.RayCluster ) error {
1358
+ func (r * RayClusterReconciler ) reconcileAutoscalerServiceAccount (ctx context.Context , instance * rayv1.RayCluster , _ rayClusterConditions ) error {
1305
1359
logger := ctrl .LoggerFrom (ctx )
1306
1360
if instance .Spec .EnableInTreeAutoscaling == nil || ! * instance .Spec .EnableInTreeAutoscaling {
1307
1361
return nil
@@ -1356,7 +1410,7 @@ func (r *RayClusterReconciler) reconcileAutoscalerServiceAccount(ctx context.Con
1356
1410
return nil
1357
1411
}
1358
1412
1359
- func (r * RayClusterReconciler ) reconcileAutoscalerRole (ctx context.Context , instance * rayv1.RayCluster ) error {
1413
+ func (r * RayClusterReconciler ) reconcileAutoscalerRole (ctx context.Context , instance * rayv1.RayCluster , _ rayClusterConditions ) error {
1360
1414
logger := ctrl .LoggerFrom (ctx )
1361
1415
if instance .Spec .EnableInTreeAutoscaling == nil || ! * instance .Spec .EnableInTreeAutoscaling {
1362
1416
return nil
@@ -1397,7 +1451,7 @@ func (r *RayClusterReconciler) reconcileAutoscalerRole(ctx context.Context, inst
1397
1451
return nil
1398
1452
}
1399
1453
1400
- func (r * RayClusterReconciler ) reconcileAutoscalerRoleBinding (ctx context.Context , instance * rayv1.RayCluster ) error {
1454
+ func (r * RayClusterReconciler ) reconcileAutoscalerRoleBinding (ctx context.Context , instance * rayv1.RayCluster , _ rayClusterConditions ) error {
1401
1455
logger := ctrl .LoggerFrom (ctx )
1402
1456
if instance .Spec .EnableInTreeAutoscaling == nil || ! * instance .Spec .EnableInTreeAutoscaling {
1403
1457
return nil
@@ -1438,11 +1492,21 @@ func (r *RayClusterReconciler) reconcileAutoscalerRoleBinding(ctx context.Contex
1438
1492
return nil
1439
1493
}
1440
1494
1441
- func (r * RayClusterReconciler ) updateRayClusterStatus (ctx context.Context , originalRayClusterInstance , newInstance * rayv1.RayCluster ) error {
1495
+ func (r * RayClusterReconciler ) updateRayClusterStatus (ctx context.Context , originalRayClusterInstance , newInstance * rayv1.RayCluster , conditions rayClusterConditions ) error {
1442
1496
logger := ctrl .LoggerFrom (ctx )
1443
- if ! r .inconsistentRayClusterStatus (ctx , originalRayClusterInstance .Status , newInstance .Status ) {
1497
+
1498
+ inconsistent := false
1499
+ if features .Enabled (features .RayClusterStatusConditions ) {
1500
+ for typ , condition := range conditions {
1501
+ condition .Type = string (typ ) // make sure the condition.Type is set correctly.
1502
+ inconsistent = meta .SetStatusCondition (& newInstance .Status .Conditions , condition ) || inconsistent
1503
+ }
1504
+ }
1505
+ inconsistent = r .inconsistentRayClusterStatus (ctx , originalRayClusterInstance .Status , newInstance .Status ) || inconsistent
1506
+ if ! inconsistent {
1444
1507
return nil
1445
1508
}
1509
+
1446
1510
logger .Info ("updateRayClusterStatus" , "name" , originalRayClusterInstance .Name , "old status" , originalRayClusterInstance .Status , "new status" , newInstance .Status )
1447
1511
err := r .Status ().Update (ctx , newInstance )
1448
1512
if err != nil {
0 commit comments