
 # This test creates a Ray Cluster and then submits a RayJob against the existing cluster on a Kind cluster

+
 @pytest.mark.kind
 class TestRayJobExistingClusterKind:
     def setup_method(self):
@@ -30,12 +31,13 @@ def test_rayjob_ray_cluster_sdk_kind_nvidia_gpu(self):
         self.setup_method()
         create_namespace(self)
         create_kueue_resources(self)
-        self.run_rayjob_against_existing_cluster_kind(accelerator="gpu", number_of_gpus=1)
+        self.run_rayjob_against_existing_cluster_kind(
+            accelerator="gpu", number_of_gpus=1
+        )

     def run_rayjob_against_existing_cluster_kind(
         self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
     ):
-
         cluster_name = "existing-cluster"
         cluster = Cluster(
             ClusterConfiguration(
@@ -63,21 +65,25 @@ def run_rayjob_against_existing_cluster_kind(
         print(f"✅ Ray cluster '{cluster_name}' is ready")

         # test RayJob submission against the existing cluster
-        self.assert_rayjob_submit_against_existing_cluster(cluster, accelerator, number_of_gpus)
+        self.assert_rayjob_submit_against_existing_cluster(
+            cluster, accelerator, number_of_gpus
+        )

         # Cleanup - manually tear down the cluster since job won't do it
         print("🧹 Cleaning up Ray cluster")
         cluster.down()

-    def assert_rayjob_submit_against_existing_cluster(self, cluster, accelerator, number_of_gpus):
+    def assert_rayjob_submit_against_existing_cluster(
+        self, cluster, accelerator, number_of_gpus
+    ):
         """
         Test RayJob submission against an existing Ray cluster.
         """
         cluster_name = cluster.config.name
         job_name = f"mnist-rayjob-{accelerator}"
-
+
         print(f"🚀 Testing RayJob submission against existing cluster '{cluster_name}'")
-
+
         # Create RayJob targeting the existing cluster
         rayjob = RayJob(
             job_name=job_name,
@@ -94,7 +100,9 @@ def assert_rayjob_submit_against_existing_cluster(self, cluster, accelerator, nu

         # Submit the job
         submission_result = rayjob.submit()
-        assert submission_result == job_name, f"Job submission failed, expected {job_name}, got {submission_result}"
+        assert (
+            submission_result == job_name
+        ), f"Job submission failed, expected {job_name}, got {submission_result}"
         print(f"✅ Successfully submitted RayJob '{job_name}' against existing cluster")

         # Monitor the job status until completion
@@ -105,19 +113,19 @@ def assert_rayjob_submit_against_existing_cluster(self, cluster, accelerator, nu
     def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 900):
         """
         Monitor a RayJob until it completes or fails.
-
+
         Args:
             rayjob: The RayJob instance to monitor
             timeout: Maximum time to wait in seconds (default: 15 minutes)
         """
         print(f"⏳ Monitoring RayJob '{rayjob.name}' status...")
-
+
         elapsed_time = 0
         check_interval = 10  # Check every 10 seconds
-
+
         while elapsed_time < timeout:
             status, ready = rayjob.status(print_to_console=True)
-
+
             # Check if job has completed (either successfully or failed)
             if status == CodeflareRayJobStatus.COMPLETE:
                 print(f"✅ RayJob '{rayjob.name}' completed successfully!")
@@ -128,11 +136,11 @@ def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 900):
                 print(f"🏃 RayJob '{rayjob.name}' is still running...")
             elif status == CodeflareRayJobStatus.UNKNOWN:
                 print(f"❓ RayJob '{rayjob.name}' status is unknown")
-
+
             # Wait before next check
             sleep(check_interval)
             elapsed_time += check_interval
-
+
         # If we reach here, the job has timed out
         final_status, _ = rayjob.status(print_to_console=True)
         raise TimeoutError(
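
For quick reference outside the diff, here is a minimal standalone sketch of the submit-and-poll pattern the test above exercises. It reuses only the calls visible in the diff (RayJob.submit() returning the job name, RayJob.status(print_to_console=True) returning a (status, ready) tuple, and the CodeflareRayJobStatus enum); the helper name, the pre-built rayjob object, the import path, and the FAILED enum member are assumptions for illustration, not part of this change.

from time import sleep

# from codeflare_sdk import RayJob, CodeflareRayJobStatus  # import path assumed; match the test module's imports


def submit_and_wait(rayjob, timeout=900, check_interval=10):
    # Hypothetical helper mirroring the test's monitor loop; `rayjob` is assumed
    # to be an already-constructed RayJob targeting an existing cluster.
    assert rayjob.submit() == rayjob.name  # submit() returns the job name on success
    elapsed = 0
    while elapsed < timeout:
        status, _ = rayjob.status(print_to_console=True)
        if status == CodeflareRayJobStatus.COMPLETE:
            return
        if status == CodeflareRayJobStatus.FAILED:  # FAILED member assumed alongside COMPLETE/RUNNING/UNKNOWN
            raise AssertionError(f"RayJob '{rayjob.name}' failed")
        sleep(check_interval)
        elapsed += check_interval
    raise TimeoutError(f"RayJob '{rayjob.name}' did not finish within {timeout}s")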