@@ -2,6 +2,7 @@ package ray
2
2
3
3
import (
4
4
"context"
5
+ errstd "errors"
5
6
"fmt"
6
7
"os"
7
8
"reflect"
@@ -10,12 +11,14 @@ import (
10
11
"strings"
11
12
"time"
12
13
14
+ "k8s.io/apimachinery/pkg/api/meta"
13
15
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
14
16
"k8s.io/utils/ptr"
15
17
16
18
"github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler"
17
19
"github.com/ray-project/kuberay/ray-operator/controllers/ray/common"
18
20
"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
21
+ "github.com/ray-project/kuberay/ray-operator/pkg/features"
19
22
20
23
batchv1 "k8s.io/api/batch/v1"
21
24
rbacv1 "k8s.io/api/rbac/v1"
@@ -391,6 +394,12 @@ func (r *RayClusterReconciler) inconsistentRayClusterStatus(ctx context.Context,
391
394
oldStatus .Endpoints , newStatus .Endpoints , oldStatus .Head , newStatus .Head ))
392
395
return true
393
396
}
397
+ if ! reflect .DeepEqual (oldStatus .Conditions , newStatus .Conditions ) {
398
+ logger .Info ("inconsistentRayClusterStatus" , "detect inconsistency" , fmt .Sprintf (
399
+ "old Conditions: %v, new Conditions: %v" ,
400
+ oldStatus .Conditions , newStatus .Conditions ))
401
+ return true
402
+ }
394
403
return false
395
404
}
396
405
@@ -591,7 +600,17 @@ func (r *RayClusterReconciler) reconcileHeadlessService(ctx context.Context, ins
591
600
return nil
592
601
}
593
602
603
+ // reconcilePodsErr is a marker error that calculateStatus() uses to set the RayClusterReplicaFailure condition.
604
+ var reconcilePodsErr = errstd .New ("reconcile pods error" )
605
+
594
606
func (r * RayClusterReconciler ) reconcilePods (ctx context.Context , instance * rayv1.RayCluster ) error {
607
+ if err := r .doReconcilePods (ctx , instance ); err != nil {
608
+ return fmt .Errorf ("%w: %w" , reconcilePodsErr , err )
609
+ }
610
+ return nil
611
+ }
612
+
613
+ func (r * RayClusterReconciler ) doReconcilePods (ctx context.Context , instance * rayv1.RayCluster ) error {
595
614
logger := ctrl .LoggerFrom (ctx )
596
615
597
616
// if RayCluster is suspended, delete all pods and skip reconcile
@@ -1149,6 +1168,22 @@ func (r *RayClusterReconciler) calculateStatus(ctx context.Context, instance *ra
1149
1168
// Deep copy the instance, so we don't mutate the original object.
1150
1169
newInstance := instance .DeepCopy ()
1151
1170
1171
+ if features .Enabled (features .RayClusterStatusConditions ) {
1172
+ if reconcileErr != nil {
1173
+ if errstd .Is (reconcileErr , reconcilePodsErr ) {
1174
+ meta .SetStatusCondition (& newInstance .Status .Conditions , metav1.Condition {
1175
+ Type : string (rayv1 .RayClusterReplicaFailure ),
1176
+ Status : metav1 .ConditionTrue ,
1177
+ Reason : "FailedReconcilePods" ,
1178
+ Message : reconcileErr .Error (),
1179
+ })
1180
+ }
1181
+ } else {
1182
+ // if reconcileErr == nil, we can safely remove the RayClusterReplicaFailure condition.
1183
+ meta .RemoveStatusCondition (& newInstance .Status .Conditions , string (rayv1 .RayClusterReplicaFailure ))
1184
+ }
1185
+ }
1186
+
1152
1187
if reconcileErr != nil {
1153
1188
newInstance .Status .State = rayv1 .Failed
1154
1189
newInstance .Status .Reason = reconcileErr .Error ()
0 commit comments