From 33a5e1a7af8723582768aee6bf935461db9af149 Mon Sep 17 00:00:00 2001 From: You-Cheng Lin Date: Wed, 23 Jul 2025 05:06:15 +0000 Subject: [PATCH 1/4] update Signed-off-by: You-Cheng Lin --- .../raycluster_autoscaler_part2_test.go | 17 ++------ ray-operator/test/support/support.go | 40 ++++++++++++++++--- 2 files changed, 39 insertions(+), 18 deletions(-) diff --git a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_part2_test.go b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_part2_test.go index a97dc7baf8b..d637a64ce5f 100644 --- a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_part2_test.go +++ b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_part2_test.go @@ -570,19 +570,10 @@ func TestRayClusterAutoscalerGCSFT(t *testing.T) { g.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium). Should(gomega.WithTransform(RayClusterDesiredWorkerReplicas, gomega.Equal(int32(0)))) - // Delete the head Pod - err = test.Client().Core().CoreV1().Pods(namespace.Name).Delete(test.Ctx(), headPod.Name, metav1.DeleteOptions{}) - g.Expect(err).NotTo(HaveOccurred()) - - PodUID := func(p *corev1.Pod) string { return string(p.UID) } - g.Eventually(HeadPod(test, rayCluster), TestTimeoutMedium). - ShouldNot(WithTransform(PodUID, Equal(string(headPod.UID)))) // Use UID to check if the new head pod is created. - - g.Eventually(HeadPod(test, rayCluster), TestTimeoutMedium). - Should(WithTransform(func(p *corev1.Pod) string { return string(p.Status.Phase) }, Equal("Running"))) - - headPod, err = GetHeadPod(test, rayCluster) // Replace the old head pod - g.Expect(err).NotTo(HaveOccurred()) + // Delete the head Pod and wait for the new head pod to be ready. + newHeadPod, err := DeletePodAndWait(test, rayCluster, namespace, headPod) + g.Expect(err).NotTo(gomega.HaveOccurred()) + headPod = newHeadPod // Create a detached actor, and a worker should be created after the new head pod is ready. ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "actor1"}) diff --git a/ray-operator/test/support/support.go b/ray-operator/test/support/support.go index 28b3eeefad9..c53d665da63 100644 --- a/ray-operator/test/support/support.go +++ b/ray-operator/test/support/support.go @@ -6,9 +6,12 @@ import ( "time" "github.com/onsi/gomega" + . "github.com/onsi/gomega" "github.com/onsi/gomega/format" - v1 "k8s.io/api/core/v1" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" ) var ( @@ -51,19 +54,19 @@ func init() { format.MaxLength = 0 } -func IsPodRunningAndReady(pod *v1.Pod) bool { - if pod.Status.Phase != v1.PodRunning { +func IsPodRunningAndReady(pod *corev1.Pod) bool { + if pod.Status.Phase != corev1.PodRunning { return false } for _, condition := range pod.Status.Conditions { - if condition.Type == v1.PodReady && condition.Status == v1.ConditionTrue { + if condition.Type == corev1.PodReady && condition.Status == corev1.ConditionTrue { return true } } return false } -func AllPodsRunningAndReady(pods []v1.Pod) bool { +func AllPodsRunningAndReady(pods []corev1.Pod) bool { for _, pod := range pods { if !IsPodRunningAndReady(&pod) { return false @@ -71,3 +74,30 @@ func AllPodsRunningAndReady(pods []v1.Pod) bool { } return true } + +func DeletePodAndWait(test Test, rayCluster *rayv1.RayCluster, namespace *corev1.Namespace, currentHeadPod *corev1.Pod) (*corev1.Pod, error) { + g := NewWithT(test.T()) + + err := test.Client().Core().CoreV1().Pods(namespace.Name).Delete(test.Ctx(), currentHeadPod.Name, metav1.DeleteOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to delete head pod %s: %w", currentHeadPod.Name, err) + } + + PodUID := func(p *corev1.Pod) string { return string(p.UID) } + + // Wait for a new head pod to be created (different UID) + g.Eventually(HeadPod(test, rayCluster), TestTimeoutMedium). + ShouldNot(WithTransform(PodUID, Equal(string(currentHeadPod.UID))), + "New head pod should have different UID than the deleted one") + + g.Eventually(HeadPod(test, rayCluster), TestTimeoutMedium). + Should(WithTransform(func(p *corev1.Pod) string { return string(p.Status.Phase) }, Equal("Running")), + "New head pod should be in Running state") + + newHeadPod, err := GetHeadPod(test, rayCluster) + if err != nil { + return nil, fmt.Errorf("failed to get new head pod: %w", err) + } + + return newHeadPod, nil +} From cfa43e53ffd4d06d6cb804c36e98186c54d7de07 Mon Sep 17 00:00:00 2001 From: You-Cheng Lin Date: Thu, 24 Jul 2025 08:27:15 +0000 Subject: [PATCH 2/4] update Signed-off-by: You-Cheng Lin --- .../test/e2eautoscaler/raycluster_autoscaler_part2_test.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_part2_test.go b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_part2_test.go index d637a64ce5f..afd2d60b16f 100644 --- a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_part2_test.go +++ b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_part2_test.go @@ -7,7 +7,6 @@ import ( "time" "github.com/onsi/gomega" - . "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" corev1ac "k8s.io/client-go/applyconfigurations/core/v1" @@ -518,7 +517,7 @@ func TestRayClusterAutoscalerGCSFT(t *testing.T) { LogWithTimestamp(test.T(), "Created ConfigMap %s/%s successfully", scripts.Namespace, scripts.Name) checkRedisDBSize := DeployRedis(test, namespace.Name, RedisPassword) - defer g.Eventually(checkRedisDBSize, time.Second*60, time.Second).Should(BeEquivalentTo("0")) + defer g.Eventually(checkRedisDBSize, time.Second*60, time.Second).Should(gomega.BeEquivalentTo("0")) rayClusterSpecAC := rayv1ac.RayClusterSpec(). WithEnableInTreeAutoscaling(true). @@ -586,7 +585,7 @@ func TestRayClusterAutoscalerGCSFT(t *testing.T) { Should(gomega.WithTransform(RayClusterDesiredWorkerReplicas, gomega.Equal(int32(0)))) err = test.Client().Ray().RayV1().RayClusters(namespace.Name).Delete(test.Ctx(), rayCluster.Name, metav1.DeleteOptions{}) - g.Expect(err).NotTo(HaveOccurred()) + g.Expect(err).NotTo(gomega.HaveOccurred()) }) } } From 3d1493029a339c1a28f18b24c127a173587a9b21 Mon Sep 17 00:00:00 2001 From: You-Cheng Lin Date: Thu, 24 Jul 2025 08:57:48 +0000 Subject: [PATCH 3/4] update Signed-off-by: You-Cheng Lin --- helm-chart/ray-cluster/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/helm-chart/ray-cluster/README.md b/helm-chart/ray-cluster/README.md index 57aef2f5f33..678a40ad765 100644 --- a/helm-chart/ray-cluster/README.md +++ b/helm-chart/ray-cluster/README.md @@ -78,6 +78,7 @@ helm uninstall raycluster | nameOverride | string | `"kuberay"` | String to partially override release name. | | fullnameOverride | string | `""` | String to fully override release name. | | imagePullSecrets | list | `[]` | Secrets with credentials to pull images from a private registry | +| gcsFaultTolerance.enabled | bool | `false` | | | common.containerEnv | list | `[]` | containerEnv specifies environment variables for the Ray head and worker containers. Follows standard K8s container env schema. | | head.initContainers | list | `[]` | Init containers to add to the head pod | | head.labels | object | `{}` | Labels for the head pod | From 7ce513be9e921372cafb4f606a01edcea2f144af Mon Sep 17 00:00:00 2001 From: You-Cheng Lin Date: Thu, 24 Jul 2025 09:04:20 +0000 Subject: [PATCH 4/4] update Signed-off-by: You-Cheng Lin --- ray-operator/test/support/support.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ray-operator/test/support/support.go b/ray-operator/test/support/support.go index c53d665da63..2a13dd0508a 100644 --- a/ray-operator/test/support/support.go +++ b/ray-operator/test/support/support.go @@ -6,7 +6,6 @@ import ( "time" "github.com/onsi/gomega" - . "github.com/onsi/gomega" "github.com/onsi/gomega/format" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -76,7 +75,7 @@ func AllPodsRunningAndReady(pods []corev1.Pod) bool { } func DeletePodAndWait(test Test, rayCluster *rayv1.RayCluster, namespace *corev1.Namespace, currentHeadPod *corev1.Pod) (*corev1.Pod, error) { - g := NewWithT(test.T()) + g := gomega.NewWithT(test.T()) err := test.Client().Core().CoreV1().Pods(namespace.Name).Delete(test.Ctx(), currentHeadPod.Name, metav1.DeleteOptions{}) if err != nil { @@ -87,11 +86,11 @@ func DeletePodAndWait(test Test, rayCluster *rayv1.RayCluster, namespace *corev1 // Wait for a new head pod to be created (different UID) g.Eventually(HeadPod(test, rayCluster), TestTimeoutMedium). - ShouldNot(WithTransform(PodUID, Equal(string(currentHeadPod.UID))), + ShouldNot(gomega.WithTransform(PodUID, gomega.Equal(string(currentHeadPod.UID))), "New head pod should have different UID than the deleted one") g.Eventually(HeadPod(test, rayCluster), TestTimeoutMedium). - Should(WithTransform(func(p *corev1.Pod) string { return string(p.Status.Phase) }, Equal("Running")), + Should(gomega.WithTransform(func(p *corev1.Pod) string { return string(p.Status.Phase) }, gomega.Equal("Running")), "New head pod should be in Running state") newHeadPod, err := GetHeadPod(test, rayCluster)