Skip to content

Commit 974629c

Browse files
committed
test: revert e2e workflow
1 parent 40c3449 commit 974629c

File tree

9 files changed

+113
-84
lines changed

9 files changed

+113
-84
lines changed

.github/workflows/e2e_tests.yaml

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@ on:
55
pull_request:
66
branches:
77
- main
8-
- 'release-*'
8+
- "release-*"
99
- ray-jobs-feature
1010
paths-ignore:
11-
- 'docs/**'
12-
- '**.adoc'
13-
- '**.md'
14-
- 'LICENSE'
11+
- "docs/**"
12+
- "**.adoc"
13+
- "**.md"
14+
- "LICENSE"
1515

1616
concurrency:
1717
group: ${{ github.head_ref }}-${{ github.workflow }}
@@ -33,9 +33,9 @@ jobs:
3333
- name: Checkout common repo code
3434
uses: actions/checkout@v4
3535
with:
36-
repository: 'project-codeflare/codeflare-common'
37-
ref: 'main'
38-
path: 'common'
36+
repository: "project-codeflare/codeflare-common"
37+
ref: "main"
38+
path: "common"
3939

4040
- name: Checkout CodeFlare operator repository
4141
uses: actions/checkout@v4
@@ -46,7 +46,7 @@ jobs:
4646
- name: Set Go
4747
uses: actions/setup-go@v5
4848
with:
49-
go-version-file: './codeflare-operator/go.mod'
49+
go-version-file: "./codeflare-operator/go.mod"
5050
cache-dependency-path: "./codeflare-operator/go.sum"
5151

5252
- name: Set up gotestfmt
@@ -57,8 +57,8 @@ jobs:
5757
- name: Set up specific Python version
5858
uses: actions/setup-python@v5
5959
with:
60-
python-version: '3.11'
61-
cache: 'pip' # caching pip dependencies
60+
python-version: "3.11"
61+
cache: "pip" # caching pip dependencies
6262

6363
- name: Setup NVidia GPU environment for KinD
6464
uses: ./common/github-actions/nvidia-gpu-setup
@@ -76,7 +76,7 @@ jobs:
7676
run: |
7777
cd codeflare-operator
7878
echo Setting up CodeFlare stack
79-
make setup-e2e
79+
make setup-e2e KUEUE_VERSION=${KUEUE_VERSION} KUERAY_VERSION=${KUERAY_VERSION}
8080
echo Deploying CodeFlare operator
8181
make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
8282
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager

src/codeflare_sdk/ray/cluster/cluster.py

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,7 @@ def is_dashboard_ready(self) -> bool:
391391
bool:
392392
True if the dashboard is ready, False otherwise.
393393
"""
394+
394395
dashboard_uri = self.cluster_dashboard_uri()
395396
if dashboard_uri is None:
396397
return False
@@ -516,6 +517,8 @@ def cluster_dashboard_uri(self) -> str:
516517
):
517518
protocol = "https" if route["spec"].get("tls") else "http"
518519
return f"{protocol}://{route['spec']['host']}"
520+
# No route found for this cluster
521+
return "Dashboard not available yet, have you run cluster.up()?"
519522
else:
520523
try:
521524
api_instance = client.NetworkingV1Api(get_api_client())
@@ -534,25 +537,9 @@ def cluster_dashboard_uri(self) -> str:
534537
protocol = "http"
535538
elif "route.openshift.io/termination" in annotations:
536539
protocol = "https"
537-
return f"{protocol}://{ingress.spec.rules[0].host}"
540+
return f"{protocol}://{ingress.spec.rules[0].host}"
538541

539-
# For local/test environments without ingress controller (e.g., KIND)
540-
# Try to find the Ray head service
541-
try:
542-
api_instance = client.CoreV1Api(get_api_client())
543-
services = api_instance.list_namespaced_service(
544-
self.config.namespace,
545-
label_selector=f"ray.io/cluster={self.config.name},ray.io/node-type=head",
546-
)
547-
for service in services.items:
548-
if service.metadata.name == f"{self.config.name}-head-svc":
549-
# For ClusterIP services in local environments, return a placeholder
550-
# The actual connection would need port-forwarding or NodePort
551-
return f"http://{service.metadata.name}.{self.config.namespace}.svc.cluster.local:8265"
552-
except Exception: # pragma: no cover
553-
pass
554-
555-
return None
542+
return "Dashboard not available yet, have you run cluster.up()?"
556543

557544
def list_jobs(self) -> List:
558545
"""
@@ -813,11 +800,6 @@ def remove_autogenerated_fields(resource):
813800
else:
814801
remove_autogenerated_fields(resource[key])
815802

816-
# After cleaning, remove empty metadata sections
817-
if "metadata" in resource and isinstance(resource["metadata"], dict):
818-
if len(resource["metadata"]) == 0:
819-
del resource["metadata"]
820-
821803
elif isinstance(resource, list):
822804
for item in resource:
823805
remove_autogenerated_fields(item)

src/codeflare_sdk/ray/cluster/test_cluster.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -322,8 +322,10 @@ def test_cluster_uris(mocker):
322322
mocker.patch(
323323
"kubernetes.client.NetworkingV1Api.list_namespaced_ingress",
324324
)
325-
# When no ingress/route/service is found, the method should return None
326-
assert cluster.cluster_dashboard_uri() is None
325+
assert (
326+
cluster.cluster_dashboard_uri()
327+
== "Dashboard not available yet, have you run cluster.up()?"
328+
)
327329

328330
mocker.patch(
329331
"codeflare_sdk.ray.cluster.cluster._is_openshift_cluster", return_value=True

tests/e2e/cluster_apply_kind_test.py

Lines changed: 68 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1+
from calendar import c
2+
from time import sleep
13
from codeflare_sdk import Cluster, ClusterConfiguration
24
import pytest
3-
import time
45
from kubernetes import client
5-
from codeflare_sdk.common.utils import constants
66

77
from support import (
88
initialize_kubernetes_client,
@@ -40,7 +40,6 @@ def test_cluster_apply(self):
4040
worker_cpu_limits="1",
4141
worker_memory_requests="1Gi",
4242
worker_memory_limits="2Gi",
43-
image=f"rayproject/ray:{constants.RAY_VERSION}",
4443
write_to_file=True,
4544
verify_tls=False,
4645
)
@@ -50,9 +49,9 @@ def test_cluster_apply(self):
5049
cluster.apply()
5150

5251
# Wait for the cluster to be ready
53-
cluster.wait_ready(dashboard_check=False)
52+
cluster.wait_ready()
5453
status, ready = cluster.status()
55-
assert ready, f"Cluster {cluster_name} is not ready: {status}"
54+
assert ready, f"Cluster {cluster_name} is not ready"
5655

5756
# Verify the cluster is created
5857
ray_cluster = get_ray_cluster(cluster_name, namespace)
@@ -61,7 +60,7 @@ def test_cluster_apply(self):
6160
ray_cluster["spec"]["workerGroupSpecs"][0]["replicas"] == 1
6261
), "Initial worker count does not match"
6362

64-
# Update configuration with 2 workers
63+
# Update configuration with 3 workers
6564
updated_config = ClusterConfiguration(
6665
name=cluster_name,
6766
namespace=namespace,
@@ -74,7 +73,6 @@ def test_cluster_apply(self):
7473
worker_cpu_limits="1",
7574
worker_memory_requests="1Gi",
7675
worker_memory_limits="2Gi",
77-
image=f"rayproject/ray:{constants.RAY_VERSION}",
7876
write_to_file=True,
7977
verify_tls=False,
8078
)
@@ -83,15 +81,10 @@ def test_cluster_apply(self):
8381
cluster.config = updated_config
8482
cluster.apply()
8583

86-
# Give Kubernetes a moment to process the update
87-
time.sleep(5)
88-
8984
# Wait for the updated cluster to be ready
90-
cluster.wait_ready(dashboard_check=False)
85+
cluster.wait_ready()
9186
updated_status, updated_ready = cluster.status()
92-
assert (
93-
updated_ready
94-
), f"Cluster {cluster_name} is not ready after update: {updated_status}"
87+
assert updated_ready, f"Cluster {cluster_name} is not ready after update"
9588

9689
# Verify the cluster is updated
9790
updated_ray_cluster = get_ray_cluster(cluster_name, namespace)
@@ -101,19 +94,67 @@ def test_cluster_apply(self):
10194

10295
# Clean up
10396
cluster.down()
97+
sleep(10)
98+
ray_cluster = get_ray_cluster(cluster_name, namespace)
99+
assert ray_cluster is None, "Cluster was not deleted successfully"
104100

105-
# Wait for deletion to complete (finalizers may delay deletion)
106-
max_wait = 30 # seconds
107-
wait_interval = 2
108-
elapsed = 0
101+
def test_apply_invalid_update(self):
102+
self.setup_method()
103+
create_namespace(self)
109104

110-
while elapsed < max_wait:
111-
ray_cluster = get_ray_cluster(cluster_name, namespace)
112-
if ray_cluster is None:
113-
break
114-
time.sleep(wait_interval)
115-
elapsed += wait_interval
105+
cluster_name = "test-cluster-apply-invalid"
106+
namespace = self.namespace
116107

117-
assert (
118-
ray_cluster is None
119-
), f"Cluster was not deleted successfully after {max_wait}s"
108+
# Initial configuration
109+
initial_config = ClusterConfiguration(
110+
name=cluster_name,
111+
namespace=namespace,
112+
num_workers=1,
113+
head_cpu_requests="500m",
114+
head_cpu_limits="1",
115+
head_memory_requests="1Gi",
116+
head_memory_limits="2Gi",
117+
worker_cpu_requests="500m",
118+
worker_cpu_limits="1",
119+
worker_memory_requests="1Gi",
120+
worker_memory_limits="2Gi",
121+
write_to_file=True,
122+
verify_tls=False,
123+
)
124+
125+
# Create the cluster
126+
cluster = Cluster(initial_config)
127+
cluster.apply()
128+
129+
# Wait for the cluster to be ready
130+
cluster.wait_ready()
131+
status, ready = cluster.status()
132+
assert ready, f"Cluster {cluster_name} is not ready"
133+
134+
# Update with an invalid configuration (e.g., immutable field change)
135+
invalid_config = ClusterConfiguration(
136+
name=cluster_name,
137+
namespace=namespace,
138+
num_workers=2,
139+
head_cpu_requests="1",
140+
head_cpu_limits="2", # Changing CPU limits (immutable)
141+
head_memory_requests="1Gi",
142+
head_memory_limits="2Gi",
143+
worker_cpu_requests="500m",
144+
worker_cpu_limits="1",
145+
worker_memory_requests="1Gi",
146+
worker_memory_limits="2Gi",
147+
write_to_file=True,
148+
verify_tls=False,
149+
)
150+
151+
# Apply the changed configuration; the checks below expect the cluster to report ready afterwards
152+
cluster.config = invalid_config
153+
cluster.apply()
154+
155+
cluster.wait_ready()
156+
status, ready = cluster.status()
157+
assert ready, f"Cluster {cluster_name} is not ready"
158+
159+
# Clean up
160+
cluster.down()

tests/e2e/heterogeneous_clusters_kind_test.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,16 +48,17 @@ def run_heterogeneous_clusters(
4848
namespace=self.namespace,
4949
num_workers=1,
5050
head_cpu_requests="500m",
51-
head_cpu_limits="500m",
52-
head_memory_requests=2,
53-
head_memory_limits=2,
51+
head_cpu_limits="1",
52+
head_memory_requests="1Gi",
53+
head_memory_limits="2Gi",
5454
worker_cpu_requests="500m",
5555
worker_cpu_limits=1,
56-
worker_memory_requests=1,
57-
worker_memory_limits=4,
56+
worker_memory_requests="1Gi",
57+
worker_memory_limits="2Gi",
5858
worker_extended_resource_requests={
5959
gpu_resource_name: number_of_gpus
6060
},
61+
image=get_ray_image(),
6162
write_to_file=True,
6263
verify_tls=False,
6364
local_queue=queue_name,

tests/e2e/local_interactive_sdk_kind_test.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,17 +55,19 @@ def run_local_interactives(
5555
namespace=self.namespace,
5656
num_workers=1,
5757
head_cpu_requests="500m",
58-
head_cpu_limits="500m",
58+
head_cpu_limits="1",
59+
head_memory_requests="1Gi",
60+
head_memory_limits="2Gi",
5961
worker_cpu_requests="500m",
6062
worker_cpu_limits=1,
61-
worker_memory_requests=1,
62-
worker_memory_limits=4,
63+
worker_memory_requests="1Gi",
64+
worker_memory_limits="2Gi",
6365
worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
6466
verify_tls=False,
6567
)
6668
)
6769

68-
cluster.up()
70+
cluster.apply()
6971

7072
cluster.wait_ready()
7173
cluster.status()

tests/e2e/mnist_raycluster_sdk_aw_kind_test.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,19 +43,21 @@ def run_mnist_raycluster_sdk_kind(
4343
namespace=self.namespace,
4444
num_workers=1,
4545
head_cpu_requests="500m",
46-
head_cpu_limits="500m",
46+
head_cpu_limits="1",
47+
head_memory_requests="1Gi",
48+
head_memory_limits="2Gi",
4749
worker_cpu_requests="500m",
4850
worker_cpu_limits=1,
49-
worker_memory_requests=1,
50-
worker_memory_limits=4,
51+
worker_memory_requests="1Gi",
52+
worker_memory_limits="2Gi",
5153
worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
5254
write_to_file=True,
5355
verify_tls=False,
5456
appwrapper=True,
5557
)
5658
)
5759

58-
cluster.up()
60+
cluster.apply()
5961

6062
cluster.status()
6163

tests/e2e/mnist_raycluster_sdk_kind_test.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,18 +43,20 @@ def run_mnist_raycluster_sdk_kind(
4343
namespace=self.namespace,
4444
num_workers=1,
4545
head_cpu_requests="500m",
46-
head_cpu_limits="500m",
46+
head_cpu_limits="1",
47+
head_memory_requests="1Gi",
48+
head_memory_limits="2Gi",
4749
worker_cpu_requests="500m",
4850
worker_cpu_limits=1,
49-
worker_memory_requests=1,
50-
worker_memory_limits=4,
51+
worker_memory_requests="1Gi",
52+
worker_memory_limits="2Gi",
5153
worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
5254
write_to_file=True,
5355
verify_tls=False,
5456
)
5557
)
5658

57-
cluster.up()
59+
cluster.apply()
5860

5961
cluster.status()
6062

tests/e2e/support.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -383,9 +383,6 @@ def create_kueue_resources(
383383

384384

385385
def delete_kueue_resources(self):
386-
# Delete if given cluster-queue exists
387-
if not hasattr(self, "cluster_queues"):
388-
return
389386
for cq in self.cluster_queues:
390387
try:
391388
self.custom_api.delete_cluster_custom_object(

0 commit comments

Comments
 (0)