diff --git a/helm-chart/ray-cluster/README.md b/helm-chart/ray-cluster/README.md index 57aef2f5f33..678a40ad765 100644 --- a/helm-chart/ray-cluster/README.md +++ b/helm-chart/ray-cluster/README.md @@ -78,6 +78,7 @@ helm uninstall raycluster | nameOverride | string | `"kuberay"` | String to partially override release name. | | fullnameOverride | string | `""` | String to fully override release name. | | imagePullSecrets | list | `[]` | Secrets with credentials to pull images from a private registry | +| gcsFaultTolerance.enabled | bool | `false` | Whether to enable GCS fault tolerance for the Ray cluster | | common.containerEnv | list | `[]` | containerEnv specifies environment variables for the Ray head and worker containers. Follows standard K8s container env schema. | | head.initContainers | list | `[]` | Init containers to add to the head pod | | head.labels | object | `{}` | Labels for the head pod | diff --git a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_part2_test.go b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_part2_test.go index a97dc7baf8b..afd2d60b16f 100644 --- a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_part2_test.go +++ b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_part2_test.go @@ -7,7 +7,6 @@ import ( "time" "github.com/onsi/gomega" - . "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" corev1ac "k8s.io/client-go/applyconfigurations/core/v1" @@ -518,7 +517,7 @@ func TestRayClusterAutoscalerGCSFT(t *testing.T) { LogWithTimestamp(test.T(), "Created ConfigMap %s/%s successfully", scripts.Namespace, scripts.Name) checkRedisDBSize := DeployRedis(test, namespace.Name, RedisPassword) - defer g.Eventually(checkRedisDBSize, time.Second*60, time.Second).Should(BeEquivalentTo("0")) + defer g.Eventually(checkRedisDBSize, time.Second*60, time.Second).Should(gomega.BeEquivalentTo("0")) rayClusterSpecAC := rayv1ac.RayClusterSpec(). WithEnableInTreeAutoscaling(true). 
@@ -570,19 +569,10 @@ func TestRayClusterAutoscalerGCSFT(t *testing.T) { g.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium). Should(gomega.WithTransform(RayClusterDesiredWorkerReplicas, gomega.Equal(int32(0)))) - // Delete the head Pod - err = test.Client().Core().CoreV1().Pods(namespace.Name).Delete(test.Ctx(), headPod.Name, metav1.DeleteOptions{}) - g.Expect(err).NotTo(HaveOccurred()) - - PodUID := func(p *corev1.Pod) string { return string(p.UID) } - g.Eventually(HeadPod(test, rayCluster), TestTimeoutMedium). - ShouldNot(WithTransform(PodUID, Equal(string(headPod.UID)))) // Use UID to check if the new head pod is created. - - g.Eventually(HeadPod(test, rayCluster), TestTimeoutMedium). - Should(WithTransform(func(p *corev1.Pod) string { return string(p.Status.Phase) }, Equal("Running"))) - - headPod, err = GetHeadPod(test, rayCluster) // Replace the old head pod - g.Expect(err).NotTo(HaveOccurred()) + // Delete the head Pod and wait for the new head pod to be ready. + newHeadPod, err := DeletePodAndWait(test, rayCluster, namespace, headPod) + g.Expect(err).NotTo(gomega.HaveOccurred()) + headPod = newHeadPod // Create a detached actor, and a worker should be created after the new head pod is ready. 
ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "actor1"}) @@ -595,7 +585,7 @@ func TestRayClusterAutoscalerGCSFT(t *testing.T) { Should(gomega.WithTransform(RayClusterDesiredWorkerReplicas, gomega.Equal(int32(0)))) err = test.Client().Ray().RayV1().RayClusters(namespace.Name).Delete(test.Ctx(), rayCluster.Name, metav1.DeleteOptions{}) - g.Expect(err).NotTo(HaveOccurred()) + g.Expect(err).NotTo(gomega.HaveOccurred()) }) } } diff --git a/ray-operator/test/support/support.go b/ray-operator/test/support/support.go index 28b3eeefad9..2a13dd0508a 100644 --- a/ray-operator/test/support/support.go +++ b/ray-operator/test/support/support.go @@ -7,8 +7,10 @@ import ( "github.com/onsi/gomega" "github.com/onsi/gomega/format" - v1 "k8s.io/api/core/v1" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" ) var ( @@ -51,19 +53,19 @@ func init() { format.MaxLength = 0 } -func IsPodRunningAndReady(pod *v1.Pod) bool { - if pod.Status.Phase != v1.PodRunning { +func IsPodRunningAndReady(pod *corev1.Pod) bool { + if pod.Status.Phase != corev1.PodRunning { return false } for _, condition := range pod.Status.Conditions { - if condition.Type == v1.PodReady && condition.Status == v1.ConditionTrue { + if condition.Type == corev1.PodReady && condition.Status == corev1.ConditionTrue { return true } } return false } -func AllPodsRunningAndReady(pods []v1.Pod) bool { +func AllPodsRunningAndReady(pods []corev1.Pod) bool { for _, pod := range pods { if !IsPodRunningAndReady(&pod) { return false @@ -71,3 +73,30 @@ func AllPodsRunningAndReady(pods []v1.Pod) bool { } return true } + +func DeletePodAndWait(test Test, rayCluster *rayv1.RayCluster, namespace *corev1.Namespace, currentHeadPod *corev1.Pod) (*corev1.Pod, error) { + g := gomega.NewWithT(test.T()) + + err := test.Client().Core().CoreV1().Pods(namespace.Name).Delete(test.Ctx(), 
currentHeadPod.Name, metav1.DeleteOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to delete head pod %s: %w", currentHeadPod.Name, err) + } + + PodUID := func(p *corev1.Pod) string { return string(p.UID) } + + // Wait for a new head pod to be created (different UID) + g.Eventually(HeadPod(test, rayCluster), TestTimeoutMedium). + ShouldNot(gomega.WithTransform(PodUID, gomega.Equal(string(currentHeadPod.UID))), + "New head pod should have different UID than the deleted one") + + g.Eventually(HeadPod(test, rayCluster), TestTimeoutMedium). + Should(gomega.WithTransform(func(p *corev1.Pod) string { return string(p.Status.Phase) }, gomega.Equal("Running")), + "New head pod should be in Running state") + + newHeadPod, err := GetHeadPod(test, rayCluster) + if err != nil { + return nil, fmt.Errorf("failed to get new head pod: %w", err) + } + + return newHeadPod, nil +}