Skip to content

Commit 872fa9c

Browse files
Replaced the fixed 30-second sleep with polling logic that waits until the rayjob exists
1 parent c3b312f commit 872fa9c

File tree

3 files changed

+39
-34
lines changed

3 files changed

+39
-34
lines changed

tests/odh/mnist_ray_test.go

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -129,50 +129,56 @@ func mnistRay(t *testing.T, numGpus int) {
129129
),
130130
)
131131

132-
time.Sleep(30 * time.Second)
133-
134132
// Fetch created raycluster
135133
rayClusterName := "mnisttest"
134+
// Wait until raycluster is up and running
136135
rayCluster, err := test.Client().Ray().RayV1().RayClusters(namespace.Name).Get(test.Ctx(), rayClusterName, metav1.GetOptions{})
137136
test.Expect(err).ToNot(HaveOccurred())
138137

139138
// Initialise raycluster client to interact with raycluster to get rayjob details using REST-API
140139
dashboardUrl := GetDashboardUrl(test, namespace, rayCluster)
141140
rayClusterClientConfig := RayClusterClientConfig{Address: dashboardUrl.String(), Client: nil, InsecureSkipVerify: true}
142141
rayClient, err := NewRayClusterClient(rayClusterClientConfig, test.Config().BearerToken)
143-
if err != nil {
144-
test.T().Errorf("%s", err)
145-
}
142+
test.Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Failed to create new raycluster client: %s", err))
146143

144+
// wait until rayjob exists
145+
test.Eventually(func() []RayJobDetailsResponse {
146+
rayJobs, err := rayClient.GetJobs()
147+
test.Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Failed to fetch ray-jobs : %s", err))
148+
return *rayJobs
149+
}, TestTimeoutMedium, 2*time.Second).Should(HaveLen(1), "Ray job not found")
150+
151+
// Get test job-id
147152
jobID := GetTestJobId(test, rayClient, dashboardUrl.Host)
148-
test.Expect(jobID).ToNot(Equal(nil))
153+
test.Expect(jobID).ToNot(BeEmpty())
149154

150155
// Wait for the job to be succeeded or failed
151156
var rayJobStatus string
152-
fmt.Printf("Waiting for job to be Succeeded...\n")
157+
test.T().Logf("Waiting for job to be Succeeded...\n")
153158
test.Eventually(func() string {
154159
resp, err := rayClient.GetJobDetails(jobID)
155-
test.Expect(err).ToNot(HaveOccurred())
160+
test.Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Failed to get job details :%s", err))
156161
rayJobStatusVal := resp.Status
157162
if rayJobStatusVal == "SUCCEEDED" || rayJobStatusVal == "FAILED" {
158-
fmt.Printf("JobStatus : %s\n", rayJobStatusVal)
163+
test.T().Logf("JobStatus - %s\n", rayJobStatusVal)
159164
rayJobStatus = rayJobStatusVal
160165
return rayJobStatus
161166
}
162167
if rayJobStatus != rayJobStatusVal && rayJobStatusVal != "SUCCEEDED" {
163-
fmt.Printf("JobStatus : %s...\n", rayJobStatusVal)
168+
test.T().Logf("JobStatus - %s...\n", rayJobStatusVal)
164169
rayJobStatus = rayJobStatusVal
165170
}
166171
return rayJobStatus
167172
}, TestTimeoutDouble, 3*time.Second).Should(Or(Equal("SUCCEEDED"), Equal("FAILED")), "Job did not complete within the expected time")
168-
test.Expect(rayJobStatus).To(Equal("SUCCEEDED"), "RayJob failed !")
169-
170173
// Store job logs in output directory
171174
WriteRayJobAPILogs(test, rayClient, jobID)
172175

176+
// Assert ray-job status after job execution
177+
test.Expect(rayJobStatus).To(Equal("SUCCEEDED"), "RayJob failed !")
178+
173179
// Make sure the RayCluster finishes and is deleted
174180
test.Eventually(RayClusters(test, namespace.Name), TestTimeoutLong).
175-
Should(HaveLen(0))
181+
Should(BeEmpty())
176182
}
177183

178184
func readMnistScriptTemplate(test Test, filePath string) []byte {

tests/odh/mnist_raytune_hpo_test.go

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -129,48 +129,55 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
129129
ContainElement(WithTransform(KueueWorkloadAdmitted, BeTrueBecause("Workload failed to be admitted"))),
130130
),
131131
)
132-
time.Sleep(30 * time.Second)
133132

134133
// Fetch created raycluster
135134
rayClusterName := "mnisthpotest"
135+
// Wait until raycluster is up and running
136136
rayCluster, err := test.Client().Ray().RayV1().RayClusters(namespace.Name).Get(test.Ctx(), rayClusterName, metav1.GetOptions{})
137137
test.Expect(err).ToNot(HaveOccurred())
138138

139139
// Initialise raycluster client to interact with raycluster to get rayjob details using REST-API
140140
dashboardUrl := GetDashboardUrl(test, namespace, rayCluster)
141141
rayClusterClientConfig := RayClusterClientConfig{Address: dashboardUrl.String(), Client: nil, InsecureSkipVerify: true}
142142
rayClient, err := NewRayClusterClient(rayClusterClientConfig, test.Config().BearerToken)
143-
if err != nil {
144-
test.T().Errorf("%s", err)
145-
}
143+
test.Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Failed to create new raycluster client: %s", err))
144+
145+
// Wait until the rayjob is created and running
146+
test.Eventually(func() []RayJobDetailsResponse {
147+
rayJobs, err := rayClient.GetJobs()
148+
test.Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Failed to fetch ray-jobs : %s", err))
149+
return *rayJobs
150+
}, TestTimeoutMedium, 2*time.Second).Should(HaveLen(1), "Ray job not found")
146151

152+
// Get rayjob-ID
147153
jobID := GetTestJobId(test, rayClient, dashboardUrl.Host)
148-
test.Expect(jobID).ToNot(Equal(nil))
154+
test.Expect(jobID).ToNot(BeEmpty())
149155

150-
// Wait for the job to be succeeded or failed
156+
// Wait for the job to either succeed or fail
151157
var rayJobStatus string
152-
fmt.Printf("Waiting for job to be Succeeded...\n")
158+
test.T().Logf("Waiting for job to be Succeeded...\n")
153159
test.Eventually(func() string {
154160
resp, err := rayClient.GetJobDetails(jobID)
155-
test.Expect(err).ToNot(HaveOccurred())
161+
test.Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Failed to get job details :%s", err))
156162
rayJobStatusVal := resp.Status
157163
if rayJobStatusVal == "SUCCEEDED" || rayJobStatusVal == "FAILED" {
158-
fmt.Printf("JobStatus : %s\n", rayJobStatusVal)
164+
test.T().Logf("JobStatus - %s\n", rayJobStatusVal)
159165
rayJobStatus = rayJobStatusVal
160166
return rayJobStatus
161167
}
162168
if rayJobStatus != rayJobStatusVal && rayJobStatusVal != "SUCCEEDED" {
163-
fmt.Printf("JobStatus : %s...\n", rayJobStatusVal)
169+
test.T().Logf("JobStatus - %s...\n", rayJobStatusVal)
164170
rayJobStatus = rayJobStatusVal
165171
}
166172
return rayJobStatus
167173
}, TestTimeoutDouble, 3*time.Second).Should(Or(Equal("SUCCEEDED"), Equal("FAILED")), "Job did not complete within the expected time")
168-
test.Expect(rayJobStatus).To(Equal("SUCCEEDED"), "RayJob failed !")
169-
170174
// Store job logs in output directory
171175
WriteRayJobAPILogs(test, rayClient, jobID)
172176

177+
// Assert ray-job status after job execution
178+
test.Expect(rayJobStatus).To(Equal("SUCCEEDED"), "RayJob failed !")
179+
173180
// Make sure the RayCluster finishes and is deleted
174181
test.Eventually(RayClusters(test, namespace.Name), TestTimeoutLong).
175-
Should(HaveLen(0))
182+
Should(BeEmpty())
176183
}

tests/odh/support.go

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ package odh
1818

1919
import (
2020
"embed"
21-
"net/http"
2221
"net/url"
2322

2423
. "github.com/onsi/gomega"
@@ -41,7 +40,6 @@ func ReadFile(t support.Test, fileName string) []byte {
4140

4241
func GetDashboardUrl(test support.Test, namespace *v1.Namespace, rayCluster *rayv1.RayCluster) *url.URL {
4342
dashboardName := "ray-dashboard-" + rayCluster.Name
44-
test.T().Logf("Raycluster created : %s\n", rayCluster.Name)
4543
route := GetRoute(test, namespace.Name, dashboardName)
4644
hostname := route.Status.Ingress[0].Host
4745
dashboardUrl, _ := url.Parse("https://" + hostname)
@@ -51,12 +49,6 @@ func GetDashboardUrl(test support.Test, namespace *v1.Namespace, rayCluster *ray
5149
}
5250

5351
func GetTestJobId(test Test, rayClient RayClusterClient, hostName string) string {
54-
listJobsReq, err := http.NewRequest("GET", "https://"+hostName+"/api/jobs/", nil)
55-
if err != nil {
56-
test.T().Errorf("failed to do get request: %s\n", err)
57-
}
58-
listJobsReq.Header.Add("Authorization", "Bearer "+test.Config().BearerToken)
59-
6052
allJobsData, err := rayClient.GetJobs()
6153
test.Expect(err).ToNot(HaveOccurred())
6254

0 commit comments

Comments
 (0)