@@ -129,48 +129,55 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
129129				ContainElement (WithTransform (KueueWorkloadAdmitted , BeTrueBecause ("Workload failed to be admitted" ))),
130130			),
131131		)
132- 	time .Sleep (30  *  time .Second )
133132
134133	// Fetch created raycluster 
135134	rayClusterName  :=  "mnisthpotest" 
135+ 	// Wait until raycluster is up and running 
136136	rayCluster , err  :=  test .Client ().Ray ().RayV1 ().RayClusters (namespace .Name ).Get (test .Ctx (), rayClusterName , metav1.GetOptions {})
137137	test .Expect (err ).ToNot (HaveOccurred ())
138138
139139	// Initialise raycluster client to interact with raycluster to get rayjob details using REST-API 
140140	dashboardUrl  :=  GetDashboardUrl (test , namespace , rayCluster )
141141	rayClusterClientConfig  :=  RayClusterClientConfig {Address : dashboardUrl .String (), Client : nil , InsecureSkipVerify : true }
142142	rayClient , err  :=  NewRayClusterClient (rayClusterClientConfig , test .Config ().BearerToken )
143- 	if  err  !=  nil  {
144- 		test .T ().Errorf ("%s" , err )
145- 	}
143+ 	test .Expect (err ).ToNot (HaveOccurred (), fmt .Sprintf ("Failed to create new raycluster client: %s" , err ))
144+ 
145+ 	// Wait until the rayjob is created and running 
146+ 	test .Eventually (func () []RayJobDetailsResponse  {
147+ 		rayJobs , err  :=  rayClient .GetJobs ()
148+ 		test .Expect (err ).ToNot (HaveOccurred (), fmt .Sprintf ("Failed to fetch ray-jobs : %s" , err ))
149+ 		return  * rayJobs 
150+ 	}, TestTimeoutMedium , 2 * time .Second ).Should (HaveLen (1 ), "Ray job not found" )
146151
152+ 	// Get rayjob-ID 
147153	jobID  :=  GetTestJobId (test , rayClient , dashboardUrl .Host )
148- 	test .Expect (jobID ).ToNot (Equal ( nil ))
154+ 	test .Expect (jobID ).ToNot (BeEmpty ( ))
149155
150- 	// Wait for the job to be succeeded  or failed  
156+ 	// Wait for the job to either succeed  or fail  
151157	var  rayJobStatus  string 
152- 	fmt . Printf ("Waiting for job to be Succeeded...\n " )
158+ 	test . T (). Logf ("Waiting for job to be Succeeded...\n " )
153159	test .Eventually (func () string  {
154160		resp , err  :=  rayClient .GetJobDetails (jobID )
155- 		test .Expect (err ).ToNot (HaveOccurred ())
161+ 		test .Expect (err ).ToNot (HaveOccurred (),  fmt . Sprintf ( "Failed to get job details :%s" ,  err ) )
156162		rayJobStatusVal  :=  resp .Status 
157163		if  rayJobStatusVal  ==  "SUCCEEDED"  ||  rayJobStatusVal  ==  "FAILED"  {
158- 			fmt . Printf ( "JobStatus :  %s\n " , rayJobStatusVal )
164+ 			test . T (). Logf ( "JobStatus -  %s\n " , rayJobStatusVal )
159165			rayJobStatus  =  rayJobStatusVal 
160166			return  rayJobStatus 
161167		}
162168		if  rayJobStatus  !=  rayJobStatusVal  &&  rayJobStatusVal  !=  "SUCCEEDED"  {
163- 			fmt . Printf ( "JobStatus :  %s...\n " , rayJobStatusVal )
169+ 			test . T (). Logf ( "JobStatus -  %s...\n " , rayJobStatusVal )
164170			rayJobStatus  =  rayJobStatusVal 
165171		}
166172		return  rayJobStatus 
167173	}, TestTimeoutDouble , 3 * time .Second ).Should (Or (Equal ("SUCCEEDED" ), Equal ("FAILED" )), "Job did not complete within the expected time" )
168- 	test .Expect (rayJobStatus ).To (Equal ("SUCCEEDED" ), "RayJob failed !" )
169- 
170174	// Store job logs in output directory 
171175	WriteRayJobAPILogs (test , rayClient , jobID )
172176
177+ 	// Assert ray-job status after job execution 
178+ 	test .Expect (rayJobStatus ).To (Equal ("SUCCEEDED" ), "RayJob failed !" )
179+ 
173180	// Make sure the RayCluster finishes and is deleted 
174181	test .Eventually (RayClusters (test , namespace .Name ), TestTimeoutLong ).
175- 		Should (HaveLen ( 0 ))
182+ 		Should (BeEmpty ( ))
176183}
0 commit comments