Skip to content

Commit 8482a25

Browse files
committed
[Feat][RayCluster] Use a new RayClusterReplicaFailure condition to reflect the result of reconcilePods
Signed-off-by: Rueian <[email protected]>
1 parent 8c64e60 commit 8482a25

File tree

1 file changed

+35
-0
lines changed

1 file changed

+35
-0
lines changed

ray-operator/controllers/ray/raycluster_controller.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package ray
22

33
import (
44
"context"
5+
errstd "errors"
56
"fmt"
67
"os"
78
"reflect"
@@ -10,12 +11,14 @@ import (
1011
"strings"
1112
"time"
1213

14+
"k8s.io/apimachinery/pkg/api/meta"
1315
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1416
"k8s.io/utils/ptr"
1517

1618
"github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler"
1719
"github.com/ray-project/kuberay/ray-operator/controllers/ray/common"
1820
"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
21+
"github.com/ray-project/kuberay/ray-operator/pkg/features"
1922

2023
batchv1 "k8s.io/api/batch/v1"
2124
rbacv1 "k8s.io/api/rbac/v1"
@@ -391,6 +394,12 @@ func (r *RayClusterReconciler) inconsistentRayClusterStatus(ctx context.Context,
391394
oldStatus.Endpoints, newStatus.Endpoints, oldStatus.Head, newStatus.Head))
392395
return true
393396
}
397+
if !reflect.DeepEqual(oldStatus.Conditions, newStatus.Conditions) {
398+
logger.Info("inconsistentRayClusterStatus", "detect inconsistency", fmt.Sprintf(
399+
"old Conditions: %v, new Conditions: %v",
400+
oldStatus.Conditions, newStatus.Conditions))
401+
return true
402+
}
394403
return false
395404
}
396405

@@ -591,7 +600,17 @@ func (r *RayClusterReconciler) reconcileHeadlessService(ctx context.Context, ins
591600
return nil
592601
}
593602

603+
// reconcilePodsErr is a marker error used by calculateStatus() to set the RayClusterReplicaFailure condition.
604+
var reconcilePodsErr = errstd.New("reconcile pods error")
605+
594606
func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv1.RayCluster) error {
607+
if err := r.doReconcilePods(ctx, instance); err != nil {
608+
return fmt.Errorf("%w: %w", reconcilePodsErr, err)
609+
}
610+
return nil
611+
}
612+
613+
func (r *RayClusterReconciler) doReconcilePods(ctx context.Context, instance *rayv1.RayCluster) error {
595614
logger := ctrl.LoggerFrom(ctx)
596615

597616
// if RayCluster is suspended, delete all pods and skip reconcile
@@ -1149,6 +1168,22 @@ func (r *RayClusterReconciler) calculateStatus(ctx context.Context, instance *ra
11491168
// Deep copy the instance, so we don't mutate the original object.
11501169
newInstance := instance.DeepCopy()
11511170

1171+
if features.Enabled(features.RayClusterStatusConditions) {
1172+
if reconcileErr != nil {
1173+
if errstd.Is(reconcileErr, reconcilePodsErr) {
1174+
meta.SetStatusCondition(&newInstance.Status.Conditions, metav1.Condition{
1175+
Type: string(rayv1.RayClusterReplicaFailure),
1176+
Status: metav1.ConditionTrue,
1177+
Reason: "FailedReconcilePods",
1178+
Message: reconcileErr.Error(),
1179+
})
1180+
}
1181+
} else {
1182+
// if reconcileErr == nil, we can safely remove the RayClusterReplicaFailure condition.
1183+
meta.RemoveStatusCondition(&newInstance.Status.Conditions, string(rayv1.RayClusterReplicaFailure))
1184+
}
1185+
}
1186+
11521187
if reconcileErr != nil {
11531188
newInstance.Status.State = rayv1.Failed
11541189
newInstance.Status.Reason = reconcileErr.Error()

0 commit comments

Comments
 (0)