
Commit 9a22414

test: revert e2e workflow
1 parent: dfeeb39 · commit: 9a22414

File tree

9 files changed: +138 −195 lines changed

.github/workflows/e2e_tests.yaml

Lines changed: 35 additions & 115 deletions
@@ -5,14 +5,13 @@ on:
   pull_request:
     branches:
       - main
-      - "release-*"
+      - 'release-*'
       - ray-jobs-feature
-      - kueue-integration
     paths-ignore:
-      - "docs/**"
-      - "**.adoc"
-      - "**.md"
-      - "LICENSE"
+      - 'docs/**'
+      - '**.adoc'
+      - '**.md'
+      - 'LICENSE'
 
 concurrency:
   group: ${{ github.head_ref }}-${{ github.workflow }}
@@ -21,6 +20,7 @@ concurrency:
 env:
   CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
   KUEUE_VERSION: "v0.13.4"
+  KUBERAY_VERSION: "v1.4.0"
 
 jobs:
   kubernetes:
@@ -35,9 +35,9 @@ jobs:
       - name: Checkout common repo code
         uses: actions/checkout@v4
         with:
-          repository: "project-codeflare/codeflare-common"
-          ref: "main"
-          path: "common"
+          repository: 'project-codeflare/codeflare-common'
+          ref: 'main'
+          path: 'common'
 
       - name: Checkout CodeFlare operator repository
         uses: actions/checkout@v4
@@ -48,7 +48,7 @@ jobs:
       - name: Set Go
         uses: actions/setup-go@v5
         with:
-          go-version-file: "./codeflare-operator/go.mod"
+          go-version-file: './codeflare-operator/go.mod'
           cache-dependency-path: "./codeflare-operator/go.sum"
 
       - name: Set up gotestfmt
@@ -59,8 +59,8 @@ jobs:
       - name: Set up specific Python version
         uses: actions/setup-python@v5
         with:
-          python-version: "3.12"
-          cache: "pip" # caching pip dependencies
+          python-version: '3.12'
+          cache: 'pip' # caching pip dependencies
 
       - name: Setup NVidia GPU environment for KinD
         uses: ./common/github-actions/nvidia-gpu-setup
@@ -73,28 +73,6 @@ jobs:
       - name: Install NVidia GPU operator for KinD
         uses: ./common/github-actions/nvidia-gpu-operator
 
-      - name: Wait for nodes to be ready
-        run: |
-          echo "Waiting for all nodes to be ready..."
-          kubectl wait --for=condition=Ready nodes --all --timeout=300s
-
-          echo "Checking node status..."
-          kubectl get nodes -o wide
-
-          echo "Checking for CNI readiness..."
-          for i in {1..30}; do
-            if kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' | grep -v "True"; then
-              echo "Waiting for CNI to initialize (attempt $i/30)..."
-              sleep 10
-            else
-              echo "All nodes are ready!"
-              break
-            fi
-          done
-
-          # Final verification
-          kubectl describe nodes | grep -E "Ready|NetworkReady|RuntimeReady|PodCIDR"
-
       - name: Deploy CodeFlare stack
         id: deploy
         run: |
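
Note: the removed "Wait for nodes to be ready" step polled node and CNI readiness from the shell. For reference, the same Ready-condition check can be expressed against the Kubernetes API; a minimal sketch using the official kubernetes Python client (illustrative only, not part of this workflow):

    from kubernetes import client, config

    def all_nodes_ready() -> bool:
        # Mirrors the removed shell loop: every node must report the
        # Ready condition with status "True".
        config.load_kube_config()
        for node in client.CoreV1Api().list_node().items:
            conditions = {c.type: c.status for c in node.status.conditions or []}
            if conditions.get("Ready") != "True":
                return False
        return True

    if __name__ == "__main__":
        print("all nodes ready:", all_nodes_ready())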
@@ -106,62 +84,27 @@ jobs:
           kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
           cd ..
 
-      - name: Verify CodeFlare deployment
-        run: |
-          # Wait for Kueue to be ready
-          echo "Waiting for Kueue controller to be ready..."
-          kubectl wait --for=condition=Available --timeout=300s deployment -n kueue-system kueue-controller-manager || {
-            echo "Kueue deployment status:"
-            kubectl get all -n kueue-system
-            exit 1
-          }
-
-          # Wait for KubeRay to be ready
-          echo "Waiting for KubeRay operator to be ready..."
-          kubectl wait --for=condition=Available --timeout=300s deployment -n default kuberay-operator || {
-            echo "KubeRay deployment status:"
-            kubectl get all -n default
-            exit 1
-          }
-
-          # Verify webhook certificates
-          echo "Checking CodeFlare operator webhook certificates..."
-          kubectl get secret -n openshift-operators codeflare-operator-webhook-server-cert -o jsonpath='{.data.ca\.crt}' | base64 -d | openssl x509 -noout -text || {
-            echo "Warning: Webhook certificate might be missing or invalid"
-          }
-
       - name: Add user to KinD
         uses: ./common/github-actions/kind-add-user
         with:
           user-name: sdk-user
 
       - name: Configure RBAC for sdk user with limited permissions
         run: |
-          # CRD permissions for discovering resource types
-          kubectl create clusterrole crd-reader --verb=get,list,watch --resource=customresourcedefinitions.apiextensions.k8s.io
-          kubectl create clusterrolebinding sdk-user-crd-reader --clusterrole=crd-reader --user=sdk-user
-
-          # AppWrapper permissions for CodeFlare workloads
-          kubectl create clusterrole appwrapper-creator --verb=get,list,watch,create,update,patch,delete --resource=appwrappers.workload.codeflare.dev
-          kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
-
-          # Existing permissions
           kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
           kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
           kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
           kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
-          kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch,watch --resource=rayclusters.ray.io
+          kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
           kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
-          kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch,watch,update --resource=rayjobs.ray.io,rayjobs/status
-          kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user
-          kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors.kueue.x-k8s.io
+          kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
+          kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
+          kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
           kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
-          kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues.kueue.x-k8s.io
+          kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
           kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
-          kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues.kueue.x-k8s.io
+          kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
           kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
-          kubectl create clusterrole workload-creator --verb=get,list,watch --resource=workloads.kueue.x-k8s.io
-          kubectl create clusterrolebinding sdk-user-workload-creator --clusterrole=workload-creator --user=sdk-user
           kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
           kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
           kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
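
Note: this step now leaves sdk-user with a deliberately narrow permission set (the CRD-reader, rayjob, workload, and node-reader roles are dropped, and watch on rayclusters is removed). A hedged sketch of how the resulting permissions could be spot-checked with the kubernetes Python client via SelfSubjectAccessReview — the context name sdk-user comes from the kind-add-user step above, and the expected results are assumptions about the post-revert roles:

    from kubernetes import client, config

    def can_i(verb: str, resource: str, group: str = "") -> bool:
        # Ask the API server whether the current kubeconfig context may
        # perform `verb` on `resource` in API `group`.
        review = client.V1SelfSubjectAccessReview(
            spec=client.V1SelfSubjectAccessReviewSpec(
                resource_attributes=client.V1ResourceAttributes(
                    verb=verb, resource=resource, group=group
                )
            )
        )
        resp = client.AuthorizationV1Api().create_self_subject_access_review(review)
        return resp.status.allowed

    config.load_kube_config(context="sdk-user")
    print(can_i("create", "rayclusters", "ray.io"))  # expected: True
    print(can_i("watch", "rayclusters", "ray.io"))   # expected: False (watch dropped)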
@@ -170,72 +113,50 @@ jobs:
           kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
           kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
           kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
-          kubectl create clusterrole node-reader --verb=get,list --resource=nodes
-          kubectl create clusterrolebinding sdk-user-node-reader --clusterrole=node-reader --user=sdk-user
           kubectl config use-context sdk-user
 
-      - name: Verify cluster readiness before tests
-        run: |
-          echo "=== Pre-test cluster verification ==="
-          echo "Current context:"
-          kubectl config current-context
-
-          echo -e "\nNode status:"
-          kubectl get nodes -o wide
-
-          echo -e "\nSystem pods status:"
-          kubectl get pods -A | grep -E "(kube-system|ray-system|kueue-system|openshift-operators)" || true
-
-          echo -e "\nChecking for any pods in error state:"
-          kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded | grep -v "NAMESPACE" || echo "No pods in error state"
-
-          echo -e "\nKueue resources:"
-          kubectl get resourceflavors,clusterqueues,localqueues -A || true
-
-          echo -e "\nRay CRDs:"
-          kubectl get crd | grep ray || true
-
       - name: Run e2e tests
         run: |
-          export CODEFLARE_TEST_OUTPUT_DIR="${{ runner.temp }}/test-logs"
-          mkdir -p ${CODEFLARE_TEST_OUTPUT_DIR}
+          export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
           echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
 
           set -euo pipefail
           pip install poetry
           poetry install --with test,docs
           echo "Running e2e tests..."
-          poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log 2>&1
+          poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
         env:
           GRPC_DNS_RESOLVER: "native"
 
-      - name: Run RayJob e2e tests
+      - name: Run RayJob E2E tests
         run: |
+          export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
+          echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
+
           set -euo pipefail
+          pip install poetry
+          poetry install --with test,docs
           echo "Running RayJob e2e tests..."
-          # Set environment variable to prevent default queue assignment for non-Kueue tests
-          export DISABLE_DEFAULT_KUEUE_QUEUE=true
-
-          # Run only the tests that are designed for Kueue integration
-          poetry run pytest -v -s ./tests/e2e/rayjob/ -x > ${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log 2>&1
+          poetry run pytest -v -s ./tests/e2e/rayjob > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output_rayjob.log 2>&1
         env:
           GRPC_DNS_RESOLVER: "native"
 
       - name: Switch to kind-cluster context to print logs
         if: always() && steps.deploy.outcome == 'success'
         run: kubectl config use-context kind-cluster
 
-      - name: Print RayJob E2E Pytest output log
+      - name: Print Pytest output log
         if: always() && steps.deploy.outcome == 'success'
         run: |
-          echo "Printing RayJob Pytest output logs"
-          cat ${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log || echo "No RayJob test output found"
+          echo "Printing Pytest output logs"
+          cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log
 
-      - name: Print E2E Pytest output log
+      - name: Print RayJob Pytest output log
         if: always() && steps.deploy.outcome == 'success'
         run: |
-          echo "Printing E2E Pytest output logs"
-          cat ${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log || echo "No E2E test output found"
+          echo "Printing Pytest output logs"
+          cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output_rayjob.log
+
 
       - name: Print CodeFlare operator logs
         if: always() && steps.deploy.outcome == 'success'
@@ -253,7 +174,7 @@ jobs:
         uses: ./common/github-actions/kind-export-logs
         if: always() && steps.deploy.outcome == 'success'
         with:
-          output-directory: ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}
+          output-directory: ${CODEFLARE_TEST_OUTPUT_DIR}
 
       - name: Upload logs
         uses: actions/upload-artifact@v4
@@ -263,4 +184,3 @@ jobs:
         retention-days: 10
         path: |
           ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
-        if-no-files-found: warn
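
Note: the -m 'kind and nvidia_gpu' expression selects only tests that carry both pytest markers. A minimal sketch of how such a test might be marked (illustrative; the real markers are registered in the project's pytest configuration):

    import pytest

    @pytest.mark.kind
    @pytest.mark.nvidia_gpu
    def test_gpu_raycluster_smoke():
        # Collected by `pytest -m 'kind and nvidia_gpu'`; deselected when
        # either marker is absent from the -m expression.
        assert True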

src/codeflare_sdk/ray/cluster/cluster.py

Lines changed: 5 additions & 31 deletions
@@ -209,7 +209,6 @@ def apply(self, force=False):
         namespace = self.config.namespace
         name = self.config.name
 
-        # Regenerate resource_yaml to reflect any configuration changes
         self.resource_yaml = self.create_resource()
 
         try:
@@ -391,23 +390,17 @@ def is_dashboard_ready(self) -> bool:
             bool:
                 True if the dashboard is ready, False otherwise.
         """
-        dashboard_uri = self.cluster_dashboard_uri()
-        if dashboard_uri is None:
-            return False
 
         try:
             response = requests.get(
-                dashboard_uri,
+                self.cluster_dashboard_uri(),
                 headers=self._client_headers,
                 timeout=5,
                 verify=self._client_verify_tls,
             )
         except requests.exceptions.SSLError:  # pragma no cover
             # SSL exception occurs when oauth ingress has been created but cluster is not up
             return False
-        except Exception:  # pragma no cover
-            # Any other exception (connection errors, timeouts, etc.)
-            return False
 
         if response.status_code == 200:
             return True
@@ -516,6 +509,8 @@ def cluster_dashboard_uri(self) -> str:
             ):
                 protocol = "https" if route["spec"].get("tls") else "http"
                 return f"{protocol}://{route['spec']['host']}"
+            # No route found for this cluster
+            return "Dashboard not available yet, have you run cluster.up()?"
         else:
             try:
                 api_instance = client.NetworkingV1Api(get_api_client())
@@ -534,25 +529,9 @@ def cluster_dashboard_uri(self) -> str:
                     protocol = "http"
                 elif "route.openshift.io/termination" in annotations:
                     protocol = "https"
-                    return f"{protocol}://{ingress.spec.rules[0].host}"
+            return f"{protocol}://{ingress.spec.rules[0].host}"
 
-        # For local/test environments without ingress controller (e.g., KIND)
-        # Try to find the Ray head service
-        try:
-            api_instance = client.CoreV1Api(get_api_client())
-            services = api_instance.list_namespaced_service(
-                self.config.namespace,
-                label_selector=f"ray.io/cluster={self.config.name},ray.io/node-type=head",
-            )
-            for service in services.items:
-                if service.metadata.name == f"{self.config.name}-head-svc":
-                    # For ClusterIP services in local environments, return a placeholder
-                    # The actual connection would need port-forwarding or NodePort
-                    return f"http://{service.metadata.name}.{self.config.namespace}.svc.cluster.local:8265"
-        except Exception:  # pragma: no cover
-            pass
-
-        return None
+        return "Dashboard not available yet, have you run cluster.up()?"
 
     def list_jobs(self) -> List:
         """
@@ -813,11 +792,6 @@ def remove_autogenerated_fields(resource):
             else:
                 remove_autogenerated_fields(resource[key])
 
-        # After cleaning, remove empty metadata sections
-        if "metadata" in resource and isinstance(resource["metadata"], dict):
-            if len(resource["metadata"]) == 0:
-                del resource["metadata"]
-
     elif isinstance(resource, list):
         for item in resource:
             remove_autogenerated_fields(item)
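
Note: the deleted block pruned metadata dicts left empty after autogenerated keys were stripped; after the revert, an emptied dict is kept as metadata: {}. A self-contained sketch of the removed pattern (illustrative only, not the SDK's exact implementation):

    def prune_empty_metadata(resource):
        # Recursively drop "metadata" entries that ended up as empty dicts.
        if isinstance(resource, dict):
            for value in resource.values():
                prune_empty_metadata(value)
            if resource.get("metadata") == {}:
                del resource["metadata"]
        elif isinstance(resource, list):
            for item in resource:
                prune_empty_metadata(item)

    spec = {"metadata": {}, "spec": {"template": {"metadata": {}}}}
    prune_empty_metadata(spec)
    print(spec)  # {'spec': {'template': {}}}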

src/codeflare_sdk/ray/cluster/test_cluster.py

Lines changed: 4 additions & 2 deletions
@@ -322,8 +322,10 @@ def test_cluster_uris(mocker):
     mocker.patch(
         "kubernetes.client.NetworkingV1Api.list_namespaced_ingress",
     )
-    # When no ingress/route/service is found, the method should return None
-    assert cluster.cluster_dashboard_uri() is None
+    assert (
+        cluster.cluster_dashboard_uri()
+        == "Dashboard not available yet, have you run cluster.up()?"
+    )
 
     mocker.patch(
         "codeflare_sdk.ray.cluster.cluster._is_openshift_cluster", return_value=True
