@@ -2,6 +2,7 @@ package ray
22
33import (
44 "context"
5+ errstd "errors"
56 "fmt"
67 "os"
78 "reflect"
@@ -10,12 +11,14 @@ import (
1011 "strings"
1112 "time"
1213
14+ "k8s.io/apimachinery/pkg/api/meta"
1315 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1416 "k8s.io/utils/ptr"
1517
1618 "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler"
1719 "github.com/ray-project/kuberay/ray-operator/controllers/ray/common"
1820 "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
21+ "github.com/ray-project/kuberay/ray-operator/pkg/features"
1922
2023 batchv1 "k8s.io/api/batch/v1"
2124 rbacv1 "k8s.io/api/rbac/v1"
@@ -391,6 +394,12 @@ func (r *RayClusterReconciler) inconsistentRayClusterStatus(ctx context.Context,
391394 oldStatus .Endpoints , newStatus .Endpoints , oldStatus .Head , newStatus .Head ))
392395 return true
393396 }
397+ if ! reflect .DeepEqual (oldStatus .Conditions , newStatus .Conditions ) {
398+ logger .Info ("inconsistentRayClusterStatus" , "detect inconsistency" , fmt .Sprintf (
399+ "old Conditions: %v, new Conditions: %v" ,
400+ oldStatus .Conditions , newStatus .Conditions ))
401+ return true
402+ }
394403 return false
395404}
396405
@@ -591,7 +600,17 @@ func (r *RayClusterReconciler) reconcileHeadlessService(ctx context.Context, ins
591600 return nil
592601}
593602
603+ // reconcilePodsErr is a sentinel error that reconcilePods wraps around every failure it returns; calculateStatus() matches it with errors.Is to decide whether to set the RayClusterReplicaFailure condition.
604+ var reconcilePodsErr = errstd .New ("reconcile pods error" )
605+
594606func (r * RayClusterReconciler ) reconcilePods (ctx context.Context , instance * rayv1.RayCluster ) error {
607+ if err := r .doReconcilePods (ctx , instance ); err != nil {
608+ return fmt .Errorf ("%w: %w" , reconcilePodsErr , err )
609+ }
610+ return nil
611+ }
612+
613+ func (r * RayClusterReconciler ) doReconcilePods (ctx context.Context , instance * rayv1.RayCluster ) error {
595614 logger := ctrl .LoggerFrom (ctx )
596615
597616 // if RayCluster is suspended, delete all pods and skip reconcile
@@ -1152,7 +1171,22 @@ func (r *RayClusterReconciler) calculateStatus(ctx context.Context, instance *ra
11521171 if reconcileErr != nil {
11531172 newInstance .Status .State = rayv1 .Failed
11541173 newInstance .Status .Reason = reconcileErr .Error ()
1174+ if features .Enabled (features .RayClusterStatusConditions ) {
1175+ if errstd .Is (reconcileErr , reconcilePodsErr ) {
1176+ meta .SetStatusCondition (& newInstance .Status .Conditions , metav1.Condition {
1177+ Type : string (rayv1 .RayClusterReplicaFailure ),
1178+ Status : metav1 .ConditionTrue ,
1179+ Reason : "FailedReconcilePods" ,
1180+ Message : reconcileErr .Error (),
1181+ })
1182+ }
1183+ }
11551184 } else {
1185+ // if reconcileErr == nil, we can safely remove the RayClusterReplicaFailure condition.
1186+ if features .Enabled (features .RayClusterStatusConditions ) {
1187+ meta .RemoveStatusCondition (& newInstance .Status .Conditions , string (rayv1 .RayClusterReplicaFailure ))
1188+ }
1189+
11561190 // TODO (kevin85421): ObservedGeneration should be used to determine whether to update this CR or not.
11571191 newInstance .Status .ObservedGeneration = newInstance .ObjectMeta .Generation
11581192
0 commit comments