@@ -2,6 +2,7 @@ package ray
2
2
3
3
import (
4
4
"context"
5
+ errstd "errors"
5
6
"fmt"
6
7
"os"
7
8
"reflect"
@@ -10,12 +11,14 @@ import (
10
11
"strings"
11
12
"time"
12
13
14
+ "k8s.io/apimachinery/pkg/api/meta"
13
15
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
14
16
"k8s.io/utils/ptr"
15
17
16
18
"github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler"
17
19
"github.com/ray-project/kuberay/ray-operator/controllers/ray/common"
18
20
"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
21
+ "github.com/ray-project/kuberay/ray-operator/pkg/features"
19
22
20
23
batchv1 "k8s.io/api/batch/v1"
21
24
rbacv1 "k8s.io/api/rbac/v1"
@@ -391,6 +394,10 @@ func (r *RayClusterReconciler) inconsistentRayClusterStatus(ctx context.Context,
391
394
oldStatus .Endpoints , newStatus .Endpoints , oldStatus .Head , newStatus .Head ))
392
395
return true
393
396
}
397
+ if ! reflect .DeepEqual (oldStatus .Conditions , newStatus .Conditions ) {
398
+ logger .Info ("inconsistentRayClusterStatus" , "old conditions" , oldStatus .Conditions , "new conditions" , newStatus .Conditions )
399
+ return true
400
+ }
394
401
return false
395
402
}
396
403
@@ -597,7 +604,7 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
597
604
// if RayCluster is suspended, delete all pods and skip reconcile
598
605
if instance .Spec .Suspend != nil && * instance .Spec .Suspend {
599
606
if _ , err := r .deleteAllPods (ctx , common .RayClusterAllPodsAssociationOptions (instance )); err != nil {
600
- return err
607
+ return errstd . Join ( utils . ErrFailedDeleteAllPods , err )
601
608
}
602
609
603
610
r .Recorder .Eventf (instance , corev1 .EventTypeNormal , "Deleted" ,
@@ -632,7 +639,7 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
632
639
logger .Info ("reconcilePods" , "head Pod" , headPod .Name , "shouldDelete" , shouldDelete , "reason" , reason )
633
640
if shouldDelete {
634
641
if err := r .Delete (ctx , & headPod ); err != nil {
635
- return err
642
+ return errstd . Join ( utils . ErrFailedDeleteHeadPod , err )
636
643
}
637
644
r .Recorder .Eventf (instance , corev1 .EventTypeNormal , "Deleted" ,
638
645
"Deleted head Pod %s; Pod status: %s; Pod restart policy: %s; Ray container terminated status: %v" ,
@@ -645,7 +652,7 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
645
652
common .CreatedClustersCounterInc (instance .Namespace )
646
653
if err := r .createHeadPod (ctx , * instance ); err != nil {
647
654
common .FailedClustersCounterInc (instance .Namespace )
648
- return err
655
+ return errstd . Join ( utils . ErrFailedCreateHeadPod , err )
649
656
}
650
657
common .SuccessfulClustersCounterInc (instance .Namespace )
651
658
} else if len (headPods .Items ) > 1 {
@@ -663,7 +670,7 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
663
670
// delete all the extra head pods
664
671
for _ , extraHeadPodToDelete := range headPods .Items {
665
672
if err := r .Delete (ctx , & extraHeadPodToDelete ); err != nil {
666
- return err
673
+ return errstd . Join ( utils . ErrFailedDeleteHeadPod , err )
667
674
}
668
675
}
669
676
}
@@ -690,7 +697,7 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
690
697
numDeletedUnhealthyWorkerPods ++
691
698
deletedWorkers [workerPod .Name ] = deleted
692
699
if err := r .Delete (ctx , & workerPod ); err != nil {
693
- return err
700
+ return errstd . Join ( utils . ErrFailedDeleteWorkerPod , err )
694
701
}
695
702
r .Recorder .Eventf (instance , corev1 .EventTypeNormal , "Deleted" ,
696
703
"Deleted worker Pod %s; Pod status: %s; Pod restart policy: %s; Ray container terminated status: %v" ,
@@ -714,7 +721,7 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
714
721
if err := r .Delete (ctx , & pod ); err != nil {
715
722
if ! errors .IsNotFound (err ) {
716
723
logger .Info ("reconcilePods" , "Fail to delete Pod" , pod .Name , "error" , err )
717
- return err
724
+ return errstd . Join ( utils . ErrFailedDeleteWorkerPod , err )
718
725
}
719
726
logger .Info ("reconcilePods" , "The worker Pod has already been deleted" , pod .Name )
720
727
} else {
@@ -749,7 +756,7 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
749
756
for i = 0 ; i < diff ; i ++ {
750
757
logger .Info ("reconcilePods" , "creating worker for group" , worker .GroupName , fmt .Sprintf ("index %d" , i ), fmt .Sprintf ("in total %d" , diff ))
751
758
if err := r .createWorkerPod (ctx , * instance , * worker .DeepCopy ()); err != nil {
752
- return err
759
+ return errstd . Join ( utils . ErrFailedCreateWorkerPod , err )
753
760
}
754
761
}
755
762
} else if diff == 0 {
@@ -782,7 +789,7 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
782
789
logger .Info ("Randomly deleting Pod" , "progress" , fmt .Sprintf ("%d / %d" , i + 1 , randomlyRemovedWorkers ), "with name" , randomPodToDelete .Name )
783
790
if err := r .Delete (ctx , & randomPodToDelete ); err != nil {
784
791
if ! errors .IsNotFound (err ) {
785
- return err
792
+ return errstd . Join ( utils . ErrFailedDeleteWorkerPod , err )
786
793
}
787
794
logger .Info ("reconcilePods" , "The worker Pod has already been deleted" , randomPodToDelete .Name )
788
795
}
@@ -1155,6 +1162,22 @@ func (r *RayClusterReconciler) calculateStatus(ctx context.Context, instance *ra
1155
1162
// Deep copy the instance, so we don't mutate the original object.
1156
1163
newInstance := instance .DeepCopy ()
1157
1164
1165
+ if features .Enabled (features .RayClusterStatusConditions ) {
1166
+ if reconcileErr != nil {
1167
+ if reason := utils .RayClusterReplicaFailureReason (reconcileErr ); reason != "" {
1168
+ meta .SetStatusCondition (& newInstance .Status .Conditions , metav1.Condition {
1169
+ Type : string (rayv1 .RayClusterReplicaFailure ),
1170
+ Status : metav1 .ConditionTrue ,
1171
+ Reason : reason ,
1172
+ Message : reconcileErr .Error (),
1173
+ })
1174
+ }
1175
+ } else {
1176
+ // if reconcileErr == nil, we can safely remove the RayClusterReplicaFailure condition.
1177
+ meta .RemoveStatusCondition (& newInstance .Status .Conditions , string (rayv1 .RayClusterReplicaFailure ))
1178
+ }
1179
+ }
1180
+
1158
1181
// TODO (kevin85421): ObservedGeneration should be used to determine whether to update this CR or not.
1159
1182
newInstance .Status .ObservedGeneration = newInstance .ObjectMeta .Generation
1160
1183
0 commit comments