Skip to content

Commit 80d1307

Browse files
committed
RHOAIENG-32532: Add additional test coverage
1 parent 7398069 commit 80d1307

File tree

4 files changed

+281
-3
lines changed

4 files changed

+281
-3
lines changed

src/codeflare_sdk/ray/rayjobs/test_rayjob.py

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2004,3 +2004,193 @@ def test_update_existing_cluster_for_scripts_api_errors(mocker, auto_mock_setup)
20042004
rayjob._update_existing_cluster_for_scripts(
20052005
"test-configmap", mock_config_builder
20062006
)
2007+
2008+
2009+
def test_rayjob_kueue_label_no_default_queue(auto_mock_setup, mocker, caplog):
    """Test RayJob falls back to 'default' queue when no default queue exists."""
    # Simulate a namespace with no default Kueue LocalQueue configured.
    mocker.patch(
        "codeflare_sdk.ray.rayjobs.rayjob.get_default_kueue_name",
        return_value=None,
    )

    api_mock = auto_mock_setup["rayjob_api"]
    api_mock.submit_job.return_value = {"metadata": {"name": "test-job"}}

    job = RayJob(
        job_name="test-job",
        cluster_config=ManagedClusterConfig(),
        entrypoint="python script.py",
    )

    with caplog.at_level("WARNING"):
        job.submit()

    # The job handed to the API should carry the fallback queue label.
    submitted = api_mock.submit_job.call_args.kwargs["job"]
    assert submitted["metadata"]["labels"]["kueue.x-k8s.io/queue-name"] == "default"

    # The fallback must be surfaced to the user via a warning.
    assert "No default Kueue LocalQueue found" in caplog.text
2036+
2037+
2038+
def test_rayjob_kueue_explicit_local_queue(auto_mock_setup):
    """Test RayJob uses explicitly specified local queue."""
    api_mock = auto_mock_setup["rayjob_api"]
    api_mock.submit_job.return_value = {"metadata": {"name": "test-job"}}

    job = RayJob(
        job_name="test-job",
        cluster_config=ManagedClusterConfig(),
        entrypoint="python script.py",
        local_queue="custom-queue",
    )
    job.submit()

    # The explicitly requested queue should win over any default.
    submitted = api_mock.submit_job.call_args.kwargs["job"]
    assert (
        submitted["metadata"]["labels"]["kueue.x-k8s.io/queue-name"]
        == "custom-queue"
    )
2060+
2061+
2062+
def test_rayjob_no_kueue_label_for_existing_cluster(auto_mock_setup):
    """Test RayJob doesn't add Kueue label for existing clusters."""
    api_mock = auto_mock_setup["rayjob_api"]
    api_mock.submit_job.return_value = {"metadata": {"name": "test-job"}}

    # Target a pre-existing cluster by name rather than passing a cluster_config.
    job = RayJob(
        job_name="test-job",
        cluster_name="existing-cluster",
        entrypoint="python script.py",
    )
    job.submit()

    # Kueue queueing only applies to managed clusters, so no label is expected.
    submitted = api_mock.submit_job.call_args.kwargs["job"]
    assert "kueue.x-k8s.io/queue-name" not in submitted["metadata"]["labels"]
2080+
2081+
2082+
def test_rayjob_with_ttl_and_deadline(auto_mock_setup):
    """Test RayJob with TTL and active deadline seconds."""
    api_mock = auto_mock_setup["rayjob_api"]
    api_mock.submit_job.return_value = {"metadata": {"name": "test-job"}}

    job = RayJob(
        job_name="test-job",
        cluster_config=ManagedClusterConfig(),
        entrypoint="python script.py",
        ttl_seconds_after_finished=300,
        active_deadline_seconds=600,
    )
    job.submit()

    # Both timing knobs should be copied verbatim into the RayJob spec.
    submitted = api_mock.submit_job.call_args.kwargs["job"]
    assert submitted["spec"]["ttlSecondsAfterFinished"] == 300
    assert submitted["spec"]["activeDeadlineSeconds"] == 600
2103+
2104+
2105+
def test_rayjob_shutdown_after_job_finishes(auto_mock_setup):
    """Test RayJob sets shutdownAfterJobFinishes correctly."""
    api_mock = auto_mock_setup["rayjob_api"]
    api_mock.submit_job.return_value = {"metadata": {"name": "test-job"}}

    # A managed (SDK-created) cluster should be torn down when the job ends.
    managed_job = RayJob(
        job_name="test-job",
        cluster_config=ManagedClusterConfig(),
        entrypoint="python script.py",
    )
    managed_job.submit()

    first_submitted = api_mock.submit_job.call_args.kwargs["job"]
    assert first_submitted["spec"]["shutdownAfterJobFinishes"] is True

    # A user-provided existing cluster must be left running afterwards.
    existing_job = RayJob(
        job_name="test-job2",
        cluster_name="existing-cluster",
        entrypoint="python script.py",
    )
    existing_job.submit()

    second_submitted = api_mock.submit_job.call_args.kwargs["job"]
    assert second_submitted["spec"]["shutdownAfterJobFinishes"] is False
2136+
2137+
2138+
def test_rayjob_stop_delete_resubmit_logging(auto_mock_setup, caplog):
    """Test logging for stop, delete, and resubmit operations."""
    api_mock = auto_mock_setup["rayjob_api"]

    job = RayJob(
        job_name="test-rayjob",
        cluster_name="test-cluster",
        namespace="test-namespace",
        entrypoint="python script.py",
    )

    # stop(): suspends the job and logs success at INFO.
    api_mock.suspend_job.return_value = {
        "metadata": {"name": "test-rayjob"},
        "spec": {"suspend": True},
    }
    with caplog.at_level("INFO"):
        stopped = job.stop()
    assert stopped is True
    assert "Successfully stopped the RayJob test-rayjob" in caplog.text

    # delete(): removes the job and logs success at INFO.
    caplog.clear()
    api_mock.delete_job.return_value = True
    with caplog.at_level("INFO"):
        deleted = job.delete()
    assert deleted is True
    assert "Successfully deleted the RayJob test-rayjob" in caplog.text

    # resubmit(): un-suspends the job and logs success at INFO.
    caplog.clear()
    api_mock.resubmit_job.return_value = {
        "metadata": {"name": "test-rayjob"},
        "spec": {"suspend": False},
    }
    with caplog.at_level("INFO"):
        resubmitted = job.resubmit()
    assert resubmitted is True
    assert "Successfully resubmitted the RayJob test-rayjob" in caplog.text
2183+
2184+
2185+
def test_rayjob_initialization_logging(auto_mock_setup, caplog):
    """Test RayJob initialization logging."""
    # Construction alone should emit the cluster-creation and init messages.
    with caplog.at_level("INFO"):
        RayJob(
            job_name="test-job",
            cluster_config=ManagedClusterConfig(),
            entrypoint="python script.py",
        )

    assert "Creating new cluster: test-job-cluster" in caplog.text
    assert "Initialized RayJob: test-job in namespace: test-namespace" in caplog.text

src/codeflare_sdk/ray/rayjobs/test_status.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,3 +302,72 @@ def test_rayjob_status_print_job_found(mocker):
302302

303303
assert status == CodeflareRayJobStatus.RUNNING
304304
assert ready == False
305+
306+
307+
def test_rayjob_status_all_deployment_states(mocker):
    """Test RayJob status method with all deployment states."""
    mocker.patch("kubernetes.config.load_kube_config")
    api_cls = mocker.patch("codeflare_sdk.ray.rayjobs.rayjob.RayjobApi")
    mocker.patch("codeflare_sdk.ray.rayjobs.rayjob.RayClusterApi")
    api_mock = api_cls.return_value

    job = RayJob(
        job_name="test-job",
        cluster_name="test-cluster",
        namespace="test-ns",
        entrypoint="python test.py",
    )

    # Each deployment status string should map to the matching
    # CodeflareRayJobStatus value and ready flag.
    expectations = [
        ("Complete", CodeflareRayJobStatus.COMPLETE, True),
        ("Running", CodeflareRayJobStatus.RUNNING, False),
        ("Failed", CodeflareRayJobStatus.FAILED, False),
        ("Suspended", CodeflareRayJobStatus.SUSPENDED, False),
    ]

    for deployment_status_str, want_status, want_ready in expectations:
        api_mock.get_job_status.return_value = {
            "jobId": "test-job-abc123",
            "jobDeploymentStatus": deployment_status_str,
            "startTime": "2025-07-28T11:37:07Z",
            "failed": 0,
            "succeeded": 0,
            "rayClusterName": "test-cluster",
        }
        got_status, got_ready = job.status(print_to_console=False)
        assert got_status == want_status, f"Failed for {deployment_status_str}"
        assert (
            got_ready == want_ready
        ), f"Failed ready check for {deployment_status_str}"
344+
345+
346+
def test_rayjob_status_with_end_time(mocker):
    """Test RayJob status with end time field.

    Verifies that a job reporting a "Complete" deployment status together
    with an ``endTime`` field maps to ``CodeflareRayJobStatus.COMPLETE``
    and is reported as ready.
    """
    mocker.patch("kubernetes.config.load_kube_config")
    mock_api_class = mocker.patch("codeflare_sdk.ray.rayjobs.rayjob.RayjobApi")
    mocker.patch("codeflare_sdk.ray.rayjobs.rayjob.RayClusterApi")
    mock_api_instance = mock_api_class.return_value

    rayjob = RayJob(
        job_name="test-job",
        cluster_name="test-cluster",
        namespace="test-ns",
        entrypoint="python test.py",
    )

    # Completed-job payload including the optional endTime field.
    mock_api_instance.get_job_status.return_value = {
        "jobId": "test-job-abc123",
        "jobDeploymentStatus": "Complete",
        "startTime": "2025-07-28T11:37:07Z",
        "endTime": "2025-07-28T11:47:07Z",
        "failed": 0,
        "succeeded": 1,
        "rayClusterName": "test-cluster",
    }

    status, ready = rayjob.status(print_to_console=False)
    assert status == CodeflareRayJobStatus.COMPLETE
    # Identity comparison for booleans (PEP 8 / flake8 E712), and it also
    # pins that status() returns a real bool rather than a truthy value.
    assert ready is True

tests/e2e/rayjob/rayjob_lifecycled_cluster_test.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,9 +120,28 @@ def test_lifecycled_kueue_resource_queueing(self):
120120
)
121121
assert job2.submit() == "waiter"
122122

123-
sleep(3)
123+
# Wait for Kueue to process the job
124+
sleep(5)
124125
job2_cr = self.job_api.get_job(name=job2.name, k8s_namespace=job2.namespace)
125-
assert job2_cr.get("spec", {}).get("suspend", False)
126+
127+
# For RayJobs with managed clusters, check if Kueue is holding resources
128+
job2_status = job2_cr.get("status", {})
129+
ray_cluster_name = job2_status.get("rayClusterName", "")
130+
131+
# If RayCluster is not created yet, it means Kueue is holding the job
132+
if not ray_cluster_name:
133+
# This is the expected behavior
134+
job_is_queued = True
135+
else:
136+
# Check RayCluster resources - if all are 0, it's queued
137+
ray_cluster_status = job2_status.get("rayClusterStatus", {})
138+
desired_cpu = ray_cluster_status.get("desiredCPU", "0")
139+
desired_memory = ray_cluster_status.get("desiredMemory", "0")
140+
141+
# Kueue creates the RayCluster but with 0 resources when queued
142+
job_is_queued = desired_cpu == "0" and desired_memory == "0"
143+
144+
assert job_is_queued, "Job2 should be queued by Kueue while Job1 is running"
126145

127146
assert self.job_api.wait_until_job_finished(
128147
name=job1.name, k8s_namespace=job1.namespace, timeout=60

tests/e2e/support.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -559,7 +559,7 @@ def create_limited_kueue_resources(self):
559559
if is_openshift():
560560
# MODH images need more memory, so higher quota but still limited to allow only 1 job
561561
cpu_quota = 3
562-
memory_quota = "15Gi"
562+
memory_quota = "8Gi" # Reduced from 15Gi to ensure second job is suspended (1 job needs 7Gi)
563563
else:
564564
# Standard Ray images need less memory
565565
cpu_quota = 3

0 commit comments

Comments
 (0)