5
5
pull_request :
6
6
branches :
7
7
- main
8
- - " release-*"
8
+ - ' release-*'
9
9
- ray-jobs-feature
10
- - kueue-integration
11
10
paths-ignore :
12
- - " docs/**"
13
- - " **.adoc"
14
- - " **.md"
15
- - " LICENSE"
11
+ - ' docs/**'
12
+ - ' **.adoc'
13
+ - ' **.md'
14
+ - ' LICENSE'
16
15
17
16
concurrency :
18
17
group : ${{ github.head_ref }}-${{ github.workflow }}
@@ -21,6 +20,7 @@ concurrency:
21
20
env :
22
21
CODEFLARE_OPERATOR_IMG : " quay.io/project-codeflare/codeflare-operator:dev"
23
22
KUEUE_VERSION : " v0.13.4"
23
+ KUERAY_VERSION : " v1.4.0"
24
24
25
25
jobs :
26
26
kubernetes :
35
35
- name : Checkout common repo code
36
36
uses : actions/checkout@v4
37
37
with :
38
- repository : " project-codeflare/codeflare-common"
39
- ref : " main"
40
- path : " common"
38
+ repository : ' project-codeflare/codeflare-common'
39
+ ref : ' main'
40
+ path : ' common'
41
41
42
42
- name : Checkout CodeFlare operator repository
43
43
uses : actions/checkout@v4
48
48
- name : Set Go
49
49
uses : actions/setup-go@v5
50
50
with :
51
- go-version-file : " ./codeflare-operator/go.mod"
51
+ go-version-file : ' ./codeflare-operator/go.mod'
52
52
cache-dependency-path : " ./codeflare-operator/go.sum"
53
53
54
54
- name : Set up gotestfmt
59
59
- name : Set up specific Python version
60
60
uses : actions/setup-python@v5
61
61
with :
62
- python-version : " 3.12"
63
- cache : " pip" # caching pip dependencies
62
+ python-version : ' 3.12'
63
+ cache : ' pip' # caching pip dependencies
64
64
65
65
- name : Setup NVidia GPU environment for KinD
66
66
uses : ./common/github-actions/nvidia-gpu-setup
73
73
- name : Install NVidia GPU operator for KinD
74
74
uses : ./common/github-actions/nvidia-gpu-operator
75
75
76
- - name : Wait for nodes to be ready
77
- run : |
78
- echo "Waiting for all nodes to be ready..."
79
- kubectl wait --for=condition=Ready nodes --all --timeout=300s
80
-
81
- echo "Checking node status..."
82
- kubectl get nodes -o wide
83
-
84
- echo "Checking for CNI readiness..."
85
- for i in {1..30}; do
86
- if kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' | grep -v "True"; then
87
- echo "Waiting for CNI to initialize (attempt $i/30)..."
88
- sleep 10
89
- else
90
- echo "All nodes are ready!"
91
- break
92
- fi
93
- done
94
-
95
- # Final verification
96
- kubectl describe nodes | grep -E "Ready|NetworkReady|RuntimeReady|PodCIDR"
97
-
98
76
- name : Deploy CodeFlare stack
99
77
id : deploy
100
78
run : |
@@ -106,62 +84,27 @@ jobs:
106
84
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
107
85
cd ..
108
86
109
- - name : Verify CodeFlare deployment
110
- run : |
111
- # Wait for Kueue to be ready
112
- echo "Waiting for Kueue controller to be ready..."
113
- kubectl wait --for=condition=Available --timeout=300s deployment -n kueue-system kueue-controller-manager || {
114
- echo "Kueue deployment status:"
115
- kubectl get all -n kueue-system
116
- exit 1
117
- }
118
-
119
- # Wait for KubeRay to be ready
120
- echo "Waiting for KubeRay operator to be ready..."
121
- kubectl wait --for=condition=Available --timeout=300s deployment -n default kuberay-operator || {
122
- echo "KubeRay deployment status:"
123
- kubectl get all -n default
124
- exit 1
125
- }
126
-
127
- # Verify webhook certificates
128
- echo "Checking CodeFlare operator webhook certificates..."
129
- kubectl get secret -n openshift-operators codeflare-operator-webhook-server-cert -o jsonpath='{.data.ca\.crt}' | base64 -d | openssl x509 -noout -text || {
130
- echo "Warning: Webhook certificate might be missing or invalid"
131
- }
132
-
133
87
- name : Add user to KinD
134
88
uses : ./common/github-actions/kind-add-user
135
89
with :
136
90
user-name : sdk-user
137
91
138
92
- name : Configure RBAC for sdk user with limited permissions
139
93
run : |
140
- # CRD permissions for discovering resource types
141
- kubectl create clusterrole crd-reader --verb=get,list,watch --resource=customresourcedefinitions.apiextensions.k8s.io
142
- kubectl create clusterrolebinding sdk-user-crd-reader --clusterrole=crd-reader --user=sdk-user
143
-
144
- # AppWrapper permissions for CodeFlare workloads
145
- kubectl create clusterrole appwrapper-creator --verb=get,list,watch,create,update,patch,delete --resource=appwrappers.workload.codeflare.dev
146
- kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
147
-
148
- # Existing permissions
149
94
kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
150
95
kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
151
96
kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
152
97
kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
153
- kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch,watch --resource=rayclusters.ray.io
98
+ kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
154
99
kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
155
- kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch,watch,update --resource=rayjobs.ray.io,rayjobs/status
156
- kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user
157
- kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors.kueue.x-k8s.io
100
+ kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
101
+ kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
102
+ kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
158
103
kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
159
- kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues.kueue.x-k8s.io
104
+ kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
160
105
kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
161
- kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues.kueue.x-k8s.io
106
+ kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
162
107
kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
163
- kubectl create clusterrole workload-creator --verb=get,list,watch --resource=workloads.kueue.x-k8s.io
164
- kubectl create clusterrolebinding sdk-user-workload-creator --clusterrole=workload-creator --user=sdk-user
165
108
kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
166
109
kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
167
110
kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
@@ -170,72 +113,50 @@ jobs:
170
113
kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
171
114
kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
172
115
kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
173
- kubectl create clusterrole node-reader --verb=get,list --resource=nodes
174
- kubectl create clusterrolebinding sdk-user-node-reader --clusterrole=node-reader --user=sdk-user
175
116
kubectl config use-context sdk-user
176
117
177
- - name : Verify cluster readiness before tests
178
- run : |
179
- echo "=== Pre-test cluster verification ==="
180
- echo "Current context:"
181
- kubectl config current-context
182
-
183
- echo -e "\nNode status:"
184
- kubectl get nodes -o wide
185
-
186
- echo -e "\nSystem pods status:"
187
- kubectl get pods -A | grep -E "(kube-system|ray-system|kueue-system|openshift-operators)" || true
188
-
189
- echo -e "\nChecking for any pods in error state:"
190
- kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded | grep -v "NAMESPACE" || echo "No pods in error state"
191
-
192
- echo -e "\nKueue resources:"
193
- kubectl get resourceflavors,clusterqueues,localqueues -A || true
194
-
195
- echo -e "\nRay CRDs:"
196
- kubectl get crd | grep ray || true
197
-
198
118
- name : Run e2e tests
199
119
run : |
200
- export CODEFLARE_TEST_OUTPUT_DIR="${{ runner.temp }}/test-logs"
201
- mkdir -p ${CODEFLARE_TEST_OUTPUT_DIR}
120
+ export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
202
121
echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
203
122
204
123
set -euo pipefail
205
124
pip install poetry
206
125
poetry install --with test,docs
207
126
echo "Running e2e tests..."
208
- poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log 2>&1
127
+ poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
209
128
env :
210
129
GRPC_DNS_RESOLVER : " native"
211
130
212
- - name : Run RayJob e2e tests
131
+ - name : Run RayJob E2E tests
213
132
run : |
133
+ export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
134
+ echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
135
+
214
136
set -euo pipefail
137
+ pip install poetry
138
+ poetry install --with test,docs
215
139
echo "Running RayJob e2e tests..."
216
- # Set environment variable to prevent default queue assignment for non-Kueue tests
217
- export DISABLE_DEFAULT_KUEUE_QUEUE=true
218
-
219
- # Run only the tests that are designed for Kueue integration
220
- poetry run pytest -v -s ./tests/e2e/rayjob/ -x > ${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log 2>&1
140
+ poetry run pytest -v -s ./tests/e2e/rayjob > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output_rayjob.log 2>&1
221
141
env :
222
142
GRPC_DNS_RESOLVER : " native"
223
143
224
144
- name : Switch to kind-cluster context to print logs
225
145
if : always() && steps.deploy.outcome == 'success'
226
146
run : kubectl config use-context kind-cluster
227
147
228
- - name : Print RayJob E2E Pytest output log
148
+ - name : Print Pytest output log
229
149
if : always() && steps.deploy.outcome == 'success'
230
150
run : |
231
- echo "Printing RayJob Pytest output logs"
232
- cat ${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log || echo "No RayJob test output found"
151
+ echo "Printing Pytest output logs"
152
+ cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log
233
153
234
- - name : Print E2E Pytest output log
154
+ - name : Print RayJobPytest output log
235
155
if : always() && steps.deploy.outcome == 'success'
236
156
run : |
237
- echo "Printing E2E Pytest output logs"
238
- cat ${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log || echo "No E2E test output found"
157
+ echo "Printing Pytest output logs"
158
+ cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output_rayjob.log
159
+
239
160
240
161
- name : Print CodeFlare operator logs
241
162
if : always() && steps.deploy.outcome == 'success'
@@ -253,7 +174,7 @@ jobs:
253
174
uses : ./common/github-actions/kind-export-logs
254
175
if : always() && steps.deploy.outcome == 'success'
255
176
with :
256
- output-directory : ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}
177
+ output-directory : ${CODEFLARE_TEST_OUTPUT_DIR}
257
178
258
179
- name : Upload logs
259
180
uses : actions/upload-artifact@v4
@@ -263,4 +184,3 @@ jobs:
263
184
retention-days : 10
264
185
path : |
265
186
${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
266
- if-no-files-found : warn
0 commit comments