@@ -129,48 +129,55 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
129129 ContainElement (WithTransform (KueueWorkloadAdmitted , BeTrueBecause ("Workload failed to be admitted" ))),
130130 ),
131131 )
132- time .Sleep (30 * time .Second )
133132
134133 // Fetch created raycluster
135134 rayClusterName := "mnisthpotest"
135+ // Wait until raycluster is up and running
136136 rayCluster , err := test .Client ().Ray ().RayV1 ().RayClusters (namespace .Name ).Get (test .Ctx (), rayClusterName , metav1.GetOptions {})
137137 test .Expect (err ).ToNot (HaveOccurred ())
138138
139139 // Initialise raycluster client to interact with raycluster to get rayjob details using REST-API
140140 dashboardUrl := GetDashboardUrl (test , namespace , rayCluster )
141141 rayClusterClientConfig := RayClusterClientConfig {Address : dashboardUrl .String (), Client : nil , InsecureSkipVerify : true }
142142 rayClient , err := NewRayClusterClient (rayClusterClientConfig , test .Config ().BearerToken )
143- if err != nil {
144- test .T ().Errorf ("%s" , err )
145- }
143+ test .Expect (err ).ToNot (HaveOccurred (), fmt .Sprintf ("Failed to create new raycluster client: %s" , err ))
144+
145+ // Wait until the rayjob is created and running
146+ test .Eventually (func () []RayJobDetailsResponse {
147+ rayJobs , err := rayClient .GetJobs ()
148+ test .Expect (err ).ToNot (HaveOccurred (), fmt .Sprintf ("Failed to fetch ray-jobs : %s" , err ))
149+ return * rayJobs
150+ }, TestTimeoutMedium , 2 * time .Second ).Should (HaveLen (1 ), "Ray job not found" )
146151
152+ // Get rayjob-ID
147153 jobID := GetTestJobId (test , rayClient , dashboardUrl .Host )
148- test .Expect (jobID ).ToNot (Equal ( nil ))
154+ test .Expect (jobID ).ToNot (BeEmpty ( ))
149155
150- // Wait for the job to be succeeded or failed
156+ // Wait for the job to either succeed or fail
151157 var rayJobStatus string
152- fmt . Printf ("Waiting for job to be Succeeded...\n " )
158+ test . T (). Logf ("Waiting for job to be Succeeded...\n " )
153159 test .Eventually (func () string {
154160 resp , err := rayClient .GetJobDetails (jobID )
155- test .Expect (err ).ToNot (HaveOccurred ())
161+ test .Expect (err ).ToNot (HaveOccurred (), fmt . Sprintf ( "Failed to get job details :%s" , err ) )
156162 rayJobStatusVal := resp .Status
157163 if rayJobStatusVal == "SUCCEEDED" || rayJobStatusVal == "FAILED" {
158- fmt . Printf ( "JobStatus : %s\n " , rayJobStatusVal )
164+ test . T (). Logf ( "JobStatus - %s\n " , rayJobStatusVal )
159165 rayJobStatus = rayJobStatusVal
160166 return rayJobStatus
161167 }
162168 if rayJobStatus != rayJobStatusVal && rayJobStatusVal != "SUCCEEDED" {
163- fmt . Printf ( "JobStatus : %s...\n " , rayJobStatusVal )
169+ test . T (). Logf ( "JobStatus - %s...\n " , rayJobStatusVal )
164170 rayJobStatus = rayJobStatusVal
165171 }
166172 return rayJobStatus
167173 }, TestTimeoutDouble , 3 * time .Second ).Should (Or (Equal ("SUCCEEDED" ), Equal ("FAILED" )), "Job did not complete within the expected time" )
168- test .Expect (rayJobStatus ).To (Equal ("SUCCEEDED" ), "RayJob failed !" )
169-
170174 // Store job logs in output directory
171175 WriteRayJobAPILogs (test , rayClient , jobID )
172176
177+ // Assert ray-job status after job execution
178+ test .Expect (rayJobStatus ).To (Equal ("SUCCEEDED" ), "RayJob failed !" )
179+
173180 // Make sure the RayCluster finishes and is deleted
174181 test .Eventually (RayClusters (test , namespace .Name ), TestTimeoutLong ).
175- Should (HaveLen ( 0 ))
182+ Should (BeEmpty ( ))
176183}
0 commit comments