From 8482a2592a44ebbd6d9f2ba3ed81d7016edb5f0a Mon Sep 17 00:00:00 2001 From: Rueian Date: Fri, 19 Jul 2024 21:58:48 +0800 Subject: [PATCH 1/7] [Feat][RayCluster] Use a new RayClusterReplicaFailure condition to reflect the result of reconcilePods Signed-off-by: Rueian --- .../controllers/ray/raycluster_controller.go | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index 957b8851f58..d318d5d5a30 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -2,6 +2,7 @@ package ray import ( "context" + errstd "errors" "fmt" "os" "reflect" @@ -10,12 +11,14 @@ import ( "strings" "time" + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler" "github.com/ray-project/kuberay/ray-operator/controllers/ray/common" "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" + "github.com/ray-project/kuberay/ray-operator/pkg/features" batchv1 "k8s.io/api/batch/v1" rbacv1 "k8s.io/api/rbac/v1" @@ -391,6 +394,12 @@ func (r *RayClusterReconciler) inconsistentRayClusterStatus(ctx context.Context, oldStatus.Endpoints, newStatus.Endpoints, oldStatus.Head, newStatus.Head)) return true } + if !reflect.DeepEqual(oldStatus.Conditions, newStatus.Conditions) { + logger.Info("inconsistentRayClusterStatus", "detect inconsistency", fmt.Sprintf( + "old Conditions: %v, new Conditions: %v", + oldStatus.Conditions, newStatus.Conditions)) + return true + } return false } @@ -591,7 +600,17 @@ func (r *RayClusterReconciler) reconcileHeadlessService(ctx context.Context, ins return nil } +// reconcilePodsErr is a marker used by the calculateStatus() for setting the RayClusterReplicaFailure condition. +var reconcilePodsErr = errstd.New("reconcile pods error") + func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv1.RayCluster) error { + if err := r.doReconcilePods(ctx, instance); err != nil { + return fmt.Errorf("%w: %w", reconcilePodsErr, err) + } + return nil +} + +func (r *RayClusterReconciler) doReconcilePods(ctx context.Context, instance *rayv1.RayCluster) error { logger := ctrl.LoggerFrom(ctx) // if RayCluster is suspended, delete all pods and skip reconcile @@ -1149,6 +1168,22 @@ func (r *RayClusterReconciler) calculateStatus(ctx context.Context, instance *ra // Deep copy the instance, so we don't mutate the original object. newInstance := instance.DeepCopy() + if features.Enabled(features.RayClusterStatusConditions) { + if reconcileErr != nil { + if errstd.Is(reconcileErr, reconcilePodsErr) { + meta.SetStatusCondition(&newInstance.Status.Conditions, metav1.Condition{ + Type: string(rayv1.RayClusterReplicaFailure), + Status: metav1.ConditionTrue, + Reason: "FailedReconcilePods", + Message: reconcileErr.Error(), + }) + } + } else { + // if reconcileErr == nil, we can safely remove the RayClusterReplicaFailure condition. + meta.RemoveStatusCondition(&newInstance.Status.Conditions, string(rayv1.RayClusterReplicaFailure)) + } + } + if reconcileErr != nil { newInstance.Status.State = rayv1.Failed newInstance.Status.Reason = reconcileErr.Error() From 41462a92d97c466478683bb5c9c61c8fb1111769 Mon Sep 17 00:00:00 2001 From: Rueian Date: Fri, 19 Jul 2024 23:40:42 +0800 Subject: [PATCH 2/7] [Test][RayCluster] Use a new RayClusterReplicaFailure condition to reflect the result of reconcilePods Signed-off-by: Rueian --- .../ray/raycluster_controller_unit_test.go | 28 +++++++++++++++---- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/ray-operator/controllers/ray/raycluster_controller_unit_test.go b/ray-operator/controllers/ray/raycluster_controller_unit_test.go index db5df7240ae..10bb4b36889 100644 --- a/ray-operator/controllers/ray/raycluster_controller_unit_test.go +++ b/ray-operator/controllers/ray/raycluster_controller_unit_test.go @@ -25,6 +25,7 @@ import ( "github.com/ray-project/kuberay/ray-operator/controllers/ray/common" "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" "github.com/ray-project/kuberay/ray-operator/pkg/client/clientset/versioned/scheme" + "github.com/ray-project/kuberay/ray-operator/pkg/features" . "github.com/onsi/ginkgo/v2" "github.com/stretchr/testify/assert" @@ -33,6 +34,7 @@ import ( corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" @@ -946,7 +948,7 @@ func TestReconcile_PodEvicted_DiffLess0_OK(t *testing.T) { // The head Pod with the status `Failed` will be deleted, and the function will return an // error to requeue the request with a short delay. If the function returns nil, the controller // will requeue the request after RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV (default: 300) seconds. - assert.NotNil(t, err) + assert.ErrorIs(t, err, reconcilePodsErr) // Filter head pod err = fakeClient.List(ctx, &podList, &client.ListOptions{ @@ -1637,6 +1639,11 @@ func TestInconsistentRayClusterStatus(t *testing.T) { newStatus = oldStatus.DeepCopy() newStatus.ObservedGeneration = oldStatus.ObservedGeneration + 1 assert.False(t, r.inconsistentRayClusterStatus(ctx, oldStatus, *newStatus)) + + // Case 12: `Conditions` is different => return true + newStatus = oldStatus.DeepCopy() + meta.SetStatusCondition(&newStatus.Conditions, metav1.Condition{Type: string(rayv1.RayClusterReplicaFailure), Status: metav1.ConditionTrue}) + assert.True(t, r.inconsistentRayClusterStatus(ctx, oldStatus, *newStatus)) } func TestCalculateStatus(t *testing.T) { @@ -1687,6 +1694,17 @@ func TestCalculateStatus(t *testing.T) { assert.Equal(t, headService.Name, newInstance.Status.Head.ServiceName) assert.NotNil(t, newInstance.Status.StateTransitionTimes, "Cluster state transition timestamp should be created") assert.Equal(t, newInstance.Status.LastUpdateTime, newInstance.Status.StateTransitionTimes[rayv1.Ready]) + + // Test reconcilePodsErr with the feature gate disabled + newInstance, err = r.calculateStatus(ctx, testRayCluster, reconcilePodsErr) + assert.Nil(t, err) + assert.Empty(t, newInstance.Status.Conditions) + + // Test reconcilePodsErr with the feature gate enabled + defer features.SetFeatureGateDuringTest(t, features.RayClusterStatusConditions, true)() + newInstance, err = r.calculateStatus(ctx, testRayCluster, reconcilePodsErr) + assert.Nil(t, err) + assert.True(t, meta.IsStatusConditionPresentAndEqual(newInstance.Status.Conditions, string(rayv1.RayClusterReplicaFailure), metav1.ConditionTrue)) } func TestStateTransitionTimes_NoStateChange(t *testing.T) { @@ -1808,7 +1826,7 @@ func Test_TerminatedWorkers_NoAutoscaler(t *testing.T) { // Pods to be deleted, the controller won't create new worker Pods during the same reconcile loop. As a result, the number of worker // Pods will be (expectedNumWorkerPods - 1) after the reconcile loop. err = testRayClusterReconciler.reconcilePods(ctx, testRayCluster) - assert.NotNil(t, err) + assert.ErrorIs(t, err, reconcilePodsErr) err = fakeClient.List(ctx, &podList, &client.ListOptions{ LabelSelector: workerSelector, Namespace: namespaceStr, @@ -1848,7 +1866,7 @@ func Test_TerminatedWorkers_NoAutoscaler(t *testing.T) { // Pods to be deleted, the controller won't create new worker Pods during the same reconcile loop. As a result, the number of worker // Pods will be (expectedNumWorkerPods - 1) after the reconcile loop. err = testRayClusterReconciler.reconcilePods(ctx, testRayCluster) - assert.NotNil(t, err) + assert.ErrorIs(t, err, reconcilePodsErr) err = fakeClient.List(ctx, &podList, &client.ListOptions{ LabelSelector: workerSelector, Namespace: namespaceStr, @@ -1927,7 +1945,7 @@ func Test_TerminatedHead_RestartPolicy(t *testing.T) { // The head Pod will be deleted and the controller will return an error // instead of creating a new head Pod in the same reconcile loop. err = testRayClusterReconciler.reconcilePods(ctx, cluster) - assert.NotNil(t, err) + assert.ErrorIs(t, err, reconcilePodsErr) err = fakeClient.List(ctx, &podList, client.InNamespace(namespaceStr)) assert.Nil(t, err, "Fail to get pod list") assert.Equal(t, 0, len(podList.Items)) @@ -1995,7 +2013,7 @@ func Test_RunningPods_RayContainerTerminated(t *testing.T) { // The head Pod will be deleted and the controller will return an error // instead of creating a new head Pod in the same reconcile loop. err = testRayClusterReconciler.reconcilePods(ctx, cluster) - assert.NotNil(t, err) + assert.ErrorIs(t, err, reconcilePodsErr) err = fakeClient.List(ctx, &podList, client.InNamespace(namespaceStr)) assert.Nil(t, err, "Fail to get pod list") assert.Equal(t, 0, len(podList.Items)) From 07811b3a984b8f0984a77fc6e225a373c2016647 Mon Sep 17 00:00:00 2001 From: Rueian Date: Sat, 20 Jul 2024 10:44:39 +0800 Subject: [PATCH 3/7] [Feat][RayCluster] Use errors.Join instead of fmt.Errorf for wrapping errors Signed-off-by: Rueian --- ray-operator/controllers/ray/raycluster_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index 42c414a2570..3b6ddda544d 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -605,7 +605,7 @@ var reconcilePodsErr = errstd.New("reconcile pods error") func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv1.RayCluster) error { if err := r.doReconcilePods(ctx, instance); err != nil { - return fmt.Errorf("%w: %w", reconcilePodsErr, err) + return errstd.Join(reconcilePodsErr, err) } return nil } From f08fa29f9c2617b89771f72dd2ac363a9521d1e6 Mon Sep 17 00:00:00 2001 From: Rueian Date: Mon, 22 Jul 2024 23:18:04 +0800 Subject: [PATCH 4/7] [Feat][RayCluster] avoid using fmt.Sprintf in logging Signed-off-by: Rueian --- ray-operator/controllers/ray/raycluster_controller.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index 3b6ddda544d..a7b65ea0abf 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -395,9 +395,7 @@ func (r *RayClusterReconciler) inconsistentRayClusterStatus(ctx context.Context, return true } if !reflect.DeepEqual(oldStatus.Conditions, newStatus.Conditions) { - logger.Info("inconsistentRayClusterStatus", "detect inconsistency", fmt.Sprintf( - "old Conditions: %v, new Conditions: %v", - oldStatus.Conditions, newStatus.Conditions)) + logger.Info("inconsistentRayClusterStatus", "old conditions", oldStatus.Conditions, "new conditions", newStatus.Conditions) return true } return false From 5000630038d85ded98551e55e4f02e4ce60b9df0 Mon Sep 17 00:00:00 2001 From: Rueian Date: Mon, 22 Jul 2024 23:56:20 +0800 Subject: [PATCH 5/7] [Feat][RayCluster] Use a new RayClusterReplicaFailure condition to reflect the result of reconcilePods Signed-off-by: Rueian --- .../controllers/ray/raycluster_controller.go | 30 +++++++------------ .../ray/raycluster_controller_unit_test.go | 14 ++++----- .../controllers/ray/utils/constant.go | 26 ++++++++++++++++ .../controllers/ray/utils/util_test.go | 8 +++++ 4 files changed, 51 insertions(+), 27 deletions(-) diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index a7b65ea0abf..5bf269d4330 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -598,23 +598,13 @@ func (r *RayClusterReconciler) reconcileHeadlessService(ctx context.Context, ins return nil } -// reconcilePodsErr is a marker used by the calculateStatus() for setting the RayClusterReplicaFailure condition. -var reconcilePodsErr = errstd.New("reconcile pods error") - func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv1.RayCluster) error { - if err := r.doReconcilePods(ctx, instance); err != nil { - return errstd.Join(reconcilePodsErr, err) - } - return nil -} - -func (r *RayClusterReconciler) doReconcilePods(ctx context.Context, instance *rayv1.RayCluster) error { logger := ctrl.LoggerFrom(ctx) // if RayCluster is suspended, delete all pods and skip reconcile if instance.Spec.Suspend != nil && *instance.Spec.Suspend { if _, err := r.deleteAllPods(ctx, common.RayClusterAllPodsAssociationOptions(instance)); err != nil { - return err + return errstd.Join(utils.ErrFailedDeleteAllPods, err) } r.Recorder.Eventf(instance, corev1.EventTypeNormal, "Deleted", @@ -649,7 +639,7 @@ func (r *RayClusterReconciler) doReconcilePods(ctx context.Context, instance *ra logger.Info("reconcilePods", "head Pod", headPod.Name, "shouldDelete", shouldDelete, "reason", reason) if shouldDelete { if err := r.Delete(ctx, &headPod); err != nil { - return err + return errstd.Join(utils.ErrFailedDeleteHeadPod, err) } r.Recorder.Eventf(instance, corev1.EventTypeNormal, "Deleted", "Deleted head Pod %s; Pod status: %s; Pod restart policy: %s; Ray container terminated status: %v", @@ -662,7 +652,7 @@ func (r *RayClusterReconciler) doReconcilePods(ctx context.Context, instance *ra common.CreatedClustersCounterInc(instance.Namespace) if err := r.createHeadPod(ctx, *instance); err != nil { common.FailedClustersCounterInc(instance.Namespace) - return err + return errstd.Join(utils.ErrFailedCreateHeadPod, err) } common.SuccessfulClustersCounterInc(instance.Namespace) } else if len(headPods.Items) > 1 { @@ -680,7 +670,7 @@ func (r *RayClusterReconciler) doReconcilePods(ctx context.Context, instance *ra // delete all the extra head pod pods for _, extraHeadPodToDelete := range headPods.Items { if err := r.Delete(ctx, &extraHeadPodToDelete); err != nil { - return err + return errstd.Join(utils.ErrFailedDeleteHeadPod, err) } } } @@ -707,7 +697,7 @@ func (r *RayClusterReconciler) doReconcilePods(ctx context.Context, instance *ra numDeletedUnhealthyWorkerPods++ deletedWorkers[workerPod.Name] = deleted if err := r.Delete(ctx, &workerPod); err != nil { - return err + return errstd.Join(utils.ErrFailedDeleteWorkerPod, err) } r.Recorder.Eventf(instance, corev1.EventTypeNormal, "Deleted", "Deleted worker Pod %s; Pod status: %s; Pod restart policy: %s; Ray container terminated status: %v", @@ -731,7 +721,7 @@ func (r *RayClusterReconciler) doReconcilePods(ctx context.Context, instance *ra if err := r.Delete(ctx, &pod); err != nil { if !errors.IsNotFound(err) { logger.Info("reconcilePods", "Fail to delete Pod", pod.Name, "error", err) - return err + return errstd.Join(utils.ErrFailedDeleteWorkerPod, err) } logger.Info("reconcilePods", "The worker Pod has already been deleted", pod.Name) } else { @@ -766,7 +756,7 @@ func (r *RayClusterReconciler) doReconcilePods(ctx context.Context, instance *ra for i = 0; i < diff; i++ { logger.Info("reconcilePods", "creating worker for group", worker.GroupName, fmt.Sprintf("index %d", i), fmt.Sprintf("in total %d", diff)) if err := r.createWorkerPod(ctx, *instance, *worker.DeepCopy()); err != nil { - return err + return errstd.Join(utils.ErrFailedCreateWorkerPod, err) } } } else if diff == 0 { @@ -799,7 +789,7 @@ func (r *RayClusterReconciler) doReconcilePods(ctx context.Context, instance *ra logger.Info("Randomly deleting Pod", "progress", fmt.Sprintf("%d / %d", i+1, randomlyRemovedWorkers), "with name", randomPodToDelete.Name) if err := r.Delete(ctx, &randomPodToDelete); err != nil { if !errors.IsNotFound(err) { - return err + return errstd.Join(utils.ErrFailedDeleteWorkerPod, err) } logger.Info("reconcilePods", "The worker Pod has already been deleted", randomPodToDelete.Name) } @@ -1174,11 +1164,11 @@ func (r *RayClusterReconciler) calculateStatus(ctx context.Context, instance *ra if features.Enabled(features.RayClusterStatusConditions) { if reconcileErr != nil { - if errstd.Is(reconcileErr, reconcilePodsErr) { + if reason := utils.RayClusterReplicaFailureReason(reconcileErr); reason != "" { meta.SetStatusCondition(&newInstance.Status.Conditions, metav1.Condition{ Type: string(rayv1.RayClusterReplicaFailure), Status: metav1.ConditionTrue, - Reason: "FailedReconcilePods", + Reason: reason, Message: reconcileErr.Error(), }) } diff --git a/ray-operator/controllers/ray/raycluster_controller_unit_test.go b/ray-operator/controllers/ray/raycluster_controller_unit_test.go index 7236d446321..ccd578be18f 100644 --- a/ray-operator/controllers/ray/raycluster_controller_unit_test.go +++ b/ray-operator/controllers/ray/raycluster_controller_unit_test.go @@ -948,7 +948,7 @@ func TestReconcile_PodEvicted_DiffLess0_OK(t *testing.T) { // The head Pod with the status `Failed` will be deleted, and the function will return an // error to requeue the request with a short delay. If the function returns nil, the controller // will requeue the request after RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV (default: 300) seconds. - assert.ErrorIs(t, err, reconcilePodsErr) + assert.NotNil(t, err) // Filter head pod err = fakeClient.List(ctx, &podList, &client.ListOptions{ @@ -1696,13 +1696,13 @@ func TestCalculateStatus(t *testing.T) { assert.Equal(t, newInstance.Status.LastUpdateTime, newInstance.Status.StateTransitionTimes[rayv1.Ready]) // Test reconcilePodsErr with the feature gate disabled - newInstance, err = r.calculateStatus(ctx, testRayCluster, reconcilePodsErr) + newInstance, err = r.calculateStatus(ctx, testRayCluster, utils.ErrFailedCreateHeadPod) assert.Nil(t, err) assert.Empty(t, newInstance.Status.Conditions) // Test reconcilePodsErr with the feature gate enabled defer features.SetFeatureGateDuringTest(t, features.RayClusterStatusConditions, true)() - newInstance, err = r.calculateStatus(ctx, testRayCluster, reconcilePodsErr) + newInstance, err = r.calculateStatus(ctx, testRayCluster, utils.ErrFailedCreateHeadPod) assert.Nil(t, err) assert.True(t, meta.IsStatusConditionPresentAndEqual(newInstance.Status.Conditions, string(rayv1.RayClusterReplicaFailure), metav1.ConditionTrue)) } @@ -1826,7 +1826,7 @@ func Test_TerminatedWorkers_NoAutoscaler(t *testing.T) { // Pods to be deleted, the controller won't create new worker Pods during the same reconcile loop. As a result, the number of worker // Pods will be (expectedNumWorkerPods - 1) after the reconcile loop. err = testRayClusterReconciler.reconcilePods(ctx, testRayCluster) - assert.ErrorIs(t, err, reconcilePodsErr) + assert.NotNil(t, err) err = fakeClient.List(ctx, &podList, &client.ListOptions{ LabelSelector: workerSelector, Namespace: namespaceStr, @@ -1866,7 +1866,7 @@ func Test_TerminatedWorkers_NoAutoscaler(t *testing.T) { // Pods to be deleted, the controller won't create new worker Pods during the same reconcile loop. As a result, the number of worker // Pods will be (expectedNumWorkerPods - 1) after the reconcile loop. err = testRayClusterReconciler.reconcilePods(ctx, testRayCluster) - assert.ErrorIs(t, err, reconcilePodsErr) + assert.NotNil(t, err) err = fakeClient.List(ctx, &podList, &client.ListOptions{ LabelSelector: workerSelector, Namespace: namespaceStr, @@ -1945,7 +1945,7 @@ func Test_TerminatedHead_RestartPolicy(t *testing.T) { // The head Pod will be deleted and the controller will return an error // instead of creating a new head Pod in the same reconcile loop. err = testRayClusterReconciler.reconcilePods(ctx, cluster) - assert.ErrorIs(t, err, reconcilePodsErr) + assert.NotNil(t, err) err = fakeClient.List(ctx, &podList, client.InNamespace(namespaceStr)) assert.Nil(t, err, "Fail to get pod list") assert.Equal(t, 0, len(podList.Items)) @@ -2013,7 +2013,7 @@ func Test_RunningPods_RayContainerTerminated(t *testing.T) { // The head Pod will be deleted and the controller will return an error // instead of creating a new head Pod in the same reconcile loop. err = testRayClusterReconciler.reconcilePods(ctx, cluster) - assert.ErrorIs(t, err, reconcilePodsErr) + assert.NotNil(t, err) err = fakeClient.List(ctx, &podList, client.InNamespace(namespaceStr)) assert.Nil(t, err, "Fail to get pod list") assert.Equal(t, 0, len(podList.Items)) diff --git a/ray-operator/controllers/ray/utils/constant.go b/ray-operator/controllers/ray/utils/constant.go index 4b825dc2988..b4f23389ff7 100644 --- a/ray-operator/controllers/ray/utils/constant.go +++ b/ray-operator/controllers/ray/utils/constant.go @@ -1,5 +1,10 @@ package utils +import ( + "errors" + "go.uber.org/multierr" +) + const ( // Default application name @@ -194,3 +199,24 @@ const ( func RayOriginatedFromCRDLabelValue(crdType CRDType) string { return string(crdType) } + +// These are markers used by the calculateStatus() for setting the RayClusterReplicaFailure condition. +var ( + ErrRayClusterReplicaFailure = errors.New("RayClusterReplicaFailure") + ErrFailedDeleteAllPods = errors.Join(ErrRayClusterReplicaFailure, errors.New("FailedDeleteAllPods")) + ErrFailedDeleteHeadPod = errors.Join(ErrRayClusterReplicaFailure, errors.New("FailedDeleteHeadPod")) + ErrFailedCreateHeadPod = errors.Join(ErrRayClusterReplicaFailure, errors.New("FailedCreateHeadPod")) + ErrFailedDeleteWorkerPod = errors.Join(ErrRayClusterReplicaFailure, errors.New("FailedDeleteWorkerPod")) + ErrFailedCreateWorkerPod = errors.Join(ErrRayClusterReplicaFailure, errors.New("FailedCreateWorkerPod")) +) + +func RayClusterReplicaFailureReason(err error) string { + errs := multierr.Errors(err) + if len(errs) < 2 { + return "" + } + if !errors.Is(errs[0], ErrRayClusterReplicaFailure) { + return "" + } + return errs[1].Error() +} diff --git a/ray-operator/controllers/ray/utils/util_test.go b/ray-operator/controllers/ray/utils/util_test.go index 30075c43abc..8b91745b187 100644 --- a/ray-operator/controllers/ray/utils/util_test.go +++ b/ray-operator/controllers/ray/utils/util_test.go @@ -523,3 +523,11 @@ env_vars: }) } } + +func TestErrRayClusterReplicaFailureReason(t *testing.T) { + assert.Equal(t, RayClusterReplicaFailureReason(ErrFailedDeleteAllPods), "FailedDeleteAllPods") + assert.Equal(t, RayClusterReplicaFailureReason(ErrFailedDeleteHeadPod), "FailedDeleteHeadPod") + assert.Equal(t, RayClusterReplicaFailureReason(ErrFailedCreateHeadPod), "FailedCreateHeadPod") + assert.Equal(t, RayClusterReplicaFailureReason(ErrFailedDeleteWorkerPod), "FailedDeleteWorkerPod") + assert.Equal(t, RayClusterReplicaFailureReason(ErrFailedCreateWorkerPod), "FailedCreateWorkerPod") +} From 416d0bf0ccd297c2f9ccdf3f132f646917b03746 Mon Sep 17 00:00:00 2001 From: Rueian Date: Tue, 23 Jul 2024 00:05:36 +0800 Subject: [PATCH 6/7] [Feat][RayCluster] Use a new RayClusterReplicaFailure condition to reflect the result of reconcilePods Signed-off-by: Rueian --- ray-operator/controllers/ray/utils/constant.go | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/ray-operator/controllers/ray/utils/constant.go b/ray-operator/controllers/ray/utils/constant.go index b4f23389ff7..6d2e2814195 100644 --- a/ray-operator/controllers/ray/utils/constant.go +++ b/ray-operator/controllers/ray/utils/constant.go @@ -1,9 +1,6 @@ package utils -import ( - "errors" - "go.uber.org/multierr" -) +import "errors" const ( @@ -211,12 +208,11 @@ var ( ) func RayClusterReplicaFailureReason(err error) string { - errs := multierr.Errors(err) - if len(errs) < 2 { - return "" - } - if !errors.Is(errs[0], ErrRayClusterReplicaFailure) { - return "" + if e, ok := err.(interface{ Unwrap() []error }); ok { + errs := e.Unwrap() + if len(errs) >= 2 && errors.Is(errs[0], ErrRayClusterReplicaFailure) { + return errs[1].Error() + } } - return errs[1].Error() + return "" } From 000cda4d41d7d55698a14a1ba0f514c6ae5717ee Mon Sep 17 00:00:00 2001 From: Rueian Date: Tue, 23 Jul 2024 00:37:24 +0800 Subject: [PATCH 7/7] [Feat][RayCluster] make the errRayClusterReplicaFailure marker private Signed-off-by: Rueian --- ray-operator/controllers/ray/utils/constant.go | 14 +++++++------- ray-operator/controllers/ray/utils/util_test.go | 2 ++ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/ray-operator/controllers/ray/utils/constant.go b/ray-operator/controllers/ray/utils/constant.go index 6d2e2814195..fe7ab4c9522 100644 --- a/ray-operator/controllers/ray/utils/constant.go +++ b/ray-operator/controllers/ray/utils/constant.go @@ -199,18 +199,18 @@ func RayOriginatedFromCRDLabelValue(crdType CRDType) string { // These are markers used by the calculateStatus() for setting the RayClusterReplicaFailure condition. var ( - ErrRayClusterReplicaFailure = errors.New("RayClusterReplicaFailure") - ErrFailedDeleteAllPods = errors.Join(ErrRayClusterReplicaFailure, errors.New("FailedDeleteAllPods")) - ErrFailedDeleteHeadPod = errors.Join(ErrRayClusterReplicaFailure, errors.New("FailedDeleteHeadPod")) - ErrFailedCreateHeadPod = errors.Join(ErrRayClusterReplicaFailure, errors.New("FailedCreateHeadPod")) - ErrFailedDeleteWorkerPod = errors.Join(ErrRayClusterReplicaFailure, errors.New("FailedDeleteWorkerPod")) - ErrFailedCreateWorkerPod = errors.Join(ErrRayClusterReplicaFailure, errors.New("FailedCreateWorkerPod")) + errRayClusterReplicaFailure = errors.New("RayClusterReplicaFailure") + ErrFailedDeleteAllPods = errors.Join(errRayClusterReplicaFailure, errors.New("FailedDeleteAllPods")) + ErrFailedDeleteHeadPod = errors.Join(errRayClusterReplicaFailure, errors.New("FailedDeleteHeadPod")) + ErrFailedCreateHeadPod = errors.Join(errRayClusterReplicaFailure, errors.New("FailedCreateHeadPod")) + ErrFailedDeleteWorkerPod = errors.Join(errRayClusterReplicaFailure, errors.New("FailedDeleteWorkerPod")) + ErrFailedCreateWorkerPod = errors.Join(errRayClusterReplicaFailure, errors.New("FailedCreateWorkerPod")) ) func RayClusterReplicaFailureReason(err error) string { if e, ok := err.(interface{ Unwrap() []error }); ok { errs := e.Unwrap() - if len(errs) >= 2 && errors.Is(errs[0], ErrRayClusterReplicaFailure) { + if len(errs) >= 2 && errors.Is(errs[0], errRayClusterReplicaFailure) { return errs[1].Error() } } diff --git a/ray-operator/controllers/ray/utils/util_test.go b/ray-operator/controllers/ray/utils/util_test.go index 8b91745b187..1c781cef5b5 100644 --- a/ray-operator/controllers/ray/utils/util_test.go +++ b/ray-operator/controllers/ray/utils/util_test.go @@ -2,6 +2,7 @@ package utils import ( "context" + "errors" "testing" "github.com/stretchr/testify/assert" @@ -530,4 +531,5 @@ func TestErrRayClusterReplicaFailureReason(t *testing.T) { assert.Equal(t, RayClusterReplicaFailureReason(ErrFailedCreateHeadPod), "FailedCreateHeadPod") assert.Equal(t, RayClusterReplicaFailureReason(ErrFailedDeleteWorkerPod), "FailedDeleteWorkerPod") assert.Equal(t, RayClusterReplicaFailureReason(ErrFailedCreateWorkerPod), "FailedCreateWorkerPod") + assert.Equal(t, RayClusterReplicaFailureReason(errors.New("other error")), "") }