Skip to content

Commit e2fc98b

Browse files
chipspeak authored and openshift-merge-bot[bot] committed
feat(RHOAIENG-26590): Report RayJob status via SDK
Signed-off-by: Pat O'Connor <[email protected]>
1 parent bfff951 commit e2fc98b

File tree

8 files changed

+830
-3
lines changed

8 files changed

+830
-3
lines changed

src/codeflare_sdk/ray/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66

77
from .rayjobs import (
88
RayJob,
9+
RayJobDeploymentStatus,
10+
CodeflareRayJobStatus,
11+
RayJobInfo,
912
)
1013

1114
from .cluster import (
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
from .rayjob import RayJob
2+
from .status import RayJobDeploymentStatus, CodeflareRayJobStatus, RayJobInfo
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
# Copyright 2025 IBM, Red Hat
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""
16+
This sub-module exists primarily to be used internally by the RayJob object
17+
(in the rayjob sub-module) for pretty-printing job status and details.
18+
"""
19+
20+
from rich.console import Console
21+
from rich.table import Table
22+
from rich.panel import Panel
23+
from typing import Tuple, Optional
24+
25+
from .status import RayJobDeploymentStatus, RayJobInfo
26+
27+
28+
def print_job_status(job_info: RayJobInfo):
    """
    Render a summary of a Ray job to the console.

    The header colour and the status badge are looked up from
    ``job_info.status``. The job id, raw status value, RayCluster name and
    namespace are always listed; the start time is listed only when it is
    set, and the failed-attempt count only when it is greater than zero.

    Args:
        job_info (RayJobInfo): The job details to display.
    """
    badge, header_markup = _get_status_display(job_info.status)

    # Header row, name/badge row and a blank separator come from the helper.
    summary = _create_info_table(header_markup, job_info.name, badge)

    # Rows that are shown for every job, in display order.
    fixed_rows = (
        f"[bold]Job ID:[/bold] {job_info.job_id}",
        f"[bold]Status:[/bold] {job_info.status.value}",
        f"[bold]RayCluster:[/bold] {job_info.cluster_name}",
        f"[bold]Namespace:[/bold] {job_info.namespace}",
    )
    for row in fixed_rows:
        summary.add_row(row)

    # Optional row: only shown once a start time has been recorded.
    if job_info.start_time:
        summary.add_row(f"[bold]Started:[/bold] {job_info.start_time}")

    # Optional row: only shown when at least one attempt has failed.
    if job_info.failed_attempts > 0:
        summary.add_row(f"[bold]Failed Attempts:[/bold] {job_info.failed_attempts}")

    _print_table_in_panel(summary)
50+
51+
52+
def print_no_job_found(job_name: str, namespace: str):
    """
    Tell the user that no RayJob exists for the given name.

    Prints a red-headed panel containing the job name, a hint to call
    ``rayjob.submit()``, and the namespace that was queried.

    Args:
        job_name (str): Name of the RayJob that was looked up.
        namespace (str): Namespace the lookup was made in.
    """
    notice = _create_info_table(
        "[white on red][bold]Name", job_name, "[bold red]No RayJob found"
    )

    # Each entry is the cell tuple for one row; an empty tuple is a blank row.
    body_rows = (
        (),
        ("Please run rayjob.submit() to submit a job.",),
        (),
        (f"[bold]Namespace:[/bold] {namespace}",),
    )
    for cells in body_rows:
        notice.add_row(*cells)

    _print_table_in_panel(notice)
66+
67+
68+
def _get_status_display(status: RayJobDeploymentStatus) -> Tuple[str, str]:
    """
    Look up how a deployment status should be presented.

    Args:
        status (RayJobDeploymentStatus): The status to present.

    Returns:
        Tuple of (status_display, header_color). Any status without an
        explicit entry is shown as "Unknown" on a red header.
    """
    # Used for every status that has no dedicated entry below.
    fallback = ("Unknown :question:", "[white on red][bold]Name")

    presentation = {
        RayJobDeploymentStatus.COMPLETE: (
            "Complete :white_heavy_check_mark:",
            "[white on green][bold]Name",
        ),
        RayJobDeploymentStatus.RUNNING: (
            "Running :gear:",
            "[white on blue][bold]Name",
        ),
        RayJobDeploymentStatus.FAILED: (
            "Failed :x:",
            "[white on red][bold]Name",
        ),
        RayJobDeploymentStatus.SUSPENDED: (
            "Suspended :pause_button:",
            "[white on yellow][bold]Name",
        ),
    }

    try:
        return presentation[status]
    except KeyError:
        return fallback
91+
92+
93+
def _create_info_table(header_color: str, name: str, status_display: str) -> Table:
    """
    Build the borderless, headerless table that the status panels start from.

    Args:
        header_color (str): Markup placed alone in the first row.
        name (str): Job name; shown bold and underlined in the second row.
        status_display (str): Status text shown next to the name.

    Returns:
        Table holding the header row, the name/status row and one empty
        separator row, in that order.
    """
    info = Table(box=None, show_header=False)

    leading_rows = (
        (header_color,),
        ("[bold underline]" + name, status_display),
        (),  # Empty separator row
    )
    for cells in leading_rows:
        info.add_row(*cells)

    return info
105+
106+
107+
def _print_table_in_panel(table: Table):
    """
    Print ``table`` inside a fitted panel under the shared RayJob status title.

    Args:
        table (Table): The already-populated table to display.
    """
    framed = Panel.fit(table)

    # The outer table exists only to carry the title above the panel.
    titled = Table(
        box=None, title="[bold] :package: CodeFlare RayJob Status :package:"
    )
    titled.add_row(framed)

    Console().print(titled)

src/codeflare_sdk/ray/rayjobs/rayjob.py

Lines changed: 93 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,32 @@
1+
# Copyright 2025 IBM, Red Hat
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
115
"""
216
RayJob client for submitting and managing Ray jobs using the odh-kuberay-client.
317
"""
418

519
import logging
6-
from typing import Dict, Any, Optional
20+
from typing import Dict, Any, Optional, Tuple
721
from odh_kuberay_client.kuberay_job_api import RayjobApi
822

23+
from .status import (
24+
RayJobDeploymentStatus,
25+
CodeflareRayJobStatus,
26+
RayJobInfo,
27+
)
28+
from . import pretty_print
29+
930
# Set up logging
1031
logger = logging.getLogger(__name__)
1132

@@ -15,7 +36,7 @@ class RayJob:
1536
A client for managing Ray jobs using the KubeRay operator.
1637
1738
This class provides a simplified interface for submitting and managing
18-
Ray jobs in a Kubernetes cluster with the KubeRay operator installed.
39+
RayJob CRs (using the KubeRay RayJob python client).
1940
"""
2041

2142
def __init__(
@@ -109,3 +130,73 @@ def _build_rayjob_cr(
109130
rayjob_cr["spec"]["runtimeEnvYAML"] = str(runtime_env)
110131

111132
return rayjob_cr
133+
134+
def status(
    self, print_to_console: bool = True
) -> Tuple[CodeflareRayJobStatus, bool]:
    """
    Get the status of the Ray job.

    Queries the RayJob API for this job's name and namespace, converts the
    reported ``jobDeploymentStatus`` into the SDK enums and optionally
    pretty-prints the result.

    Args:
        print_to_console (bool): Whether to print formatted status to console (default: True)

    Returns:
        Tuple of (CodeflareRayJobStatus, ready: bool) where ready indicates job completion.
        ``(CodeflareRayJobStatus.UNKNOWN, False)`` is returned when the API
        yields no status data for the job.
    """
    status_data = self._api.get_job_status(
        name=self.name, k8s_namespace=self.namespace
    )

    # A falsy result (None / empty) is treated as "no such job".
    if not status_data:
        if print_to_console:
            pretty_print.print_no_job_found(self.name, self.namespace)
        return CodeflareRayJobStatus.UNKNOWN, False

    # Map deployment status to our enums; any value the enum does not know
    # (including None) raises ValueError and is reported as UNKNOWN.
    deployment_status_str = status_data.get("jobDeploymentStatus", "Unknown")
    try:
        deployment_status = RayJobDeploymentStatus(deployment_status_str)
    except ValueError:
        deployment_status = RayJobDeploymentStatus.UNKNOWN

    # dict.get() only applies its default when the key is absent. Use `or`
    # so that a key present with a null value is normalised as well;
    # otherwise None would reach RayJobInfo's int/str fields and break the
    # `failed_attempts > 0` comparison done when printing.
    job_info = RayJobInfo(
        name=self.name,
        job_id=status_data.get("jobId") or "",
        status=deployment_status,
        namespace=self.namespace,
        cluster_name=self.cluster_name,
        start_time=status_data.get("startTime"),
        end_time=status_data.get("endTime"),
        failed_attempts=status_data.get("failed") or 0,
        succeeded_attempts=status_data.get("succeeded") or 0,
    )

    # Map to CodeFlare status and determine readiness
    codeflare_status, ready = self._map_to_codeflare_status(deployment_status)

    if print_to_console:
        pretty_print.print_job_status(job_info)

    return codeflare_status, ready
183+
184+
def _map_to_codeflare_status(
    self, deployment_status: RayJobDeploymentStatus
) -> Tuple[CodeflareRayJobStatus, bool]:
    """
    Translate a deployment status into the CodeFlare status and a ready flag.

    Only ``COMPLETE`` is reported as ready. Any deployment status without an
    explicit entry maps to ``(CodeflareRayJobStatus.UNKNOWN, False)``.

    Args:
        deployment_status (RayJobDeploymentStatus): Status to translate.

    Returns:
        Tuple of (CodeflareRayJobStatus, ready: bool)
    """
    not_recognised = (CodeflareRayJobStatus.UNKNOWN, False)

    translations = {
        RayJobDeploymentStatus.COMPLETE: (CodeflareRayJobStatus.COMPLETE, True),
        RayJobDeploymentStatus.RUNNING: (CodeflareRayJobStatus.RUNNING, False),
        RayJobDeploymentStatus.FAILED: (CodeflareRayJobStatus.FAILED, False),
        RayJobDeploymentStatus.SUSPENDED: (CodeflareRayJobStatus.SUSPENDED, False),
    }

    if deployment_status in translations:
        return translations[deployment_status]
    return not_recognised
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# Copyright 2025 IBM, Red Hat
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""
16+
The status sub-module defines Enums containing information for Ray job
17+
deployment states and CodeFlare job states, as well as
18+
dataclasses to store information for Ray jobs.
19+
"""
20+
21+
from dataclasses import dataclass
22+
from enum import Enum
23+
from typing import Optional
24+
25+
26+
class RayJobDeploymentStatus(Enum):
    """
    Deployment states of a Ray job, keyed by the status string reported for
    the job (from the KubeRay RayJob API).

    Each member's value is the exact string used for lookup-by-value, e.g.
    ``RayJobDeploymentStatus("Running")``.
    """

    # Terminal states
    COMPLETE = "Complete"

    # Non-terminal / in-progress
    RUNNING = "Running"

    FAILED = "Failed"
    SUSPENDED = "Suspended"

    # Stand-in for any state that has no dedicated member
    UNKNOWN = "Unknown"
36+
37+
38+
class CodeflareRayJobStatus(Enum):
    """
    States that the CodeFlare SDK reports for a Ray job.

    Members carry plain integer values; callers are expected to compare
    against the members themselves rather than the numbers.
    """

    COMPLETE = 1
    RUNNING = 2
    FAILED = 3
    SUSPENDED = 4

    # Reported when the job's state cannot be determined
    UNKNOWN = 5
48+
49+
50+
@dataclass
class RayJobInfo:
    """
    Snapshot of a Ray job's identifying details and reported progress.

    The first five fields are required; the timing and attempt-count fields
    are optional and default to "not set" / zero.
    """

    # --- required ---
    name: str
    job_id: str
    status: RayJobDeploymentStatus
    namespace: str
    cluster_name: str

    # --- optional timing, None when not reported ---
    start_time: Optional[str] = None
    end_time: Optional[str] = None

    # --- optional attempt counters ---
    failed_attempts: int = 0
    succeeded_attempts: int = 0

0 commit comments

Comments
 (0)