Skip to content

Commit 344e935

Browse files
committed
RHOAIENG-32532: Fix broken E2E tests
1 parent 80d1307 commit 344e935

File tree

4 files changed

+146
-4
lines changed

4 files changed

+146
-4
lines changed

.github/workflows/e2e_tests.yaml

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ concurrency:
1919

2020
env:
2121
CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
22+
# Explicitly pin the Docker network KIND attaches its nodes to (the default "bridge") to avoid network issues
23+
KIND_EXPERIMENTAL_DOCKER_NETWORK: "bridge"
2224

2325
jobs:
2426
kubernetes:
@@ -60,13 +62,150 @@ jobs:
6062
python-version: '3.11'
6163
cache: 'pip' # caching pip dependencies
6264

65+
- name: Diagnose Docker environment on GPU runner
66+
run: |
67+
echo "=== Docker Environment Diagnostics ==="
68+
echo "Docker version:"
69+
docker version || true
70+
echo ""
71+
echo "Docker info:"
72+
docker info || true
73+
echo ""
74+
echo "System info:"
75+
uname -a
76+
echo ""
77+
echo "Network interfaces:"
78+
ip addr show || true
79+
echo ""
80+
echo "Checking cgroup version:"
81+
stat -fc %T /sys/fs/cgroup/ || true
82+
echo ""
83+
echo "Checking if running in container:"
84+
if [ -f /.dockerenv ]; then echo "Running inside Docker"; else echo "Not in Docker"; fi
85+
echo ""
86+
echo "Available disk space:"
87+
df -h
88+
echo ""
89+
echo "Memory info:"
90+
free -h
91+
echo "=== End Diagnostics ==="
92+
6393
- name: Setup NVidia GPU environment for KinD
6494
uses: ./common/github-actions/nvidia-gpu-setup
6595

96+
- name: Create KIND config with explicit networking
97+
run: |
98+
cat > /tmp/kind-config.yaml <<EOF
99+
kind: Cluster
100+
apiVersion: kind.x-k8s.io/v1alpha4
101+
networking:
102+
# Explicitly set the pod and service subnets to avoid conflicts
103+
podSubnet: "10.244.0.0/16"
104+
serviceSubnet: "10.96.0.0/16"
105+
# Keep KIND's default CNI (kindnet) enabled so pod networking is installed with the cluster
106+
disableDefaultCNI: false
107+
# Run kube-proxy in iptables mode for better compatibility
108+
kubeProxyMode: "iptables"
109+
nodes:
110+
- role: control-plane
111+
# Extra mounts that might be needed for GPU
112+
extraMounts:
113+
- containerPath: /dev/shm
114+
hostPath: /dev/shm
115+
propagation: HostToContainer
116+
- role: worker
117+
extraMounts:
118+
- containerPath: /dev/shm
119+
hostPath: /dev/shm
120+
propagation: HostToContainer
121+
containerdConfigPatches:
122+
- |-
123+
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
124+
runtime_type = "io.containerd.runc.v2"
125+
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
126+
SystemdCgroup = true
127+
EOF
128+
129+
echo "KIND configuration:"
130+
cat /tmp/kind-config.yaml
131+
66132
- name: Setup and start KinD cluster
67133
uses: ./common/github-actions/kind
68134
with:
69135
worker-nodes: 1
136+
kind-config: /tmp/kind-config.yaml
137+
138+
- name: Wait for KIND cluster and CNI initialization
139+
run: |
140+
echo "Waiting for KIND cluster to initialize..."
141+
# First ensure cluster API is responsive
142+
for i in {1..60}; do
143+
if kubectl cluster-info &>/dev/null; then
144+
echo "✓ Cluster API is responsive"
145+
break
146+
fi
147+
echo "Waiting for cluster API... ($i/60)"
148+
sleep 5
149+
done
150+
151+
# Check initial node status
152+
echo "Initial node status:"
153+
kubectl get nodes -o wide || true
154+
155+
# Wait for CNI to initialize on all nodes
156+
echo "Waiting for CNI plugin to initialize..."
157+
for i in {1..120}; do
158+
# Check if nodes exist
159+
node_count=$(kubectl get nodes --no-headers 2>/dev/null | wc -l || echo "0")
160+
if [ "$node_count" -eq "0" ]; then
161+
echo "No nodes found yet... ($i/120)"
162+
sleep 5
163+
continue
164+
fi
165+
166+
# Check if CNI is initialized (no node reports a NetworkUnavailable condition with a status other than "False")
167+
if kubectl get nodes -o json | jq -r '.items[].status.conditions[] | select(.type=="NetworkUnavailable") | .status' | grep -v "False" > /dev/null 2>&1; then
168+
echo "CNI still initializing... ($i/120)"
169+
if [ $((i % 10)) -eq 0 ]; then
170+
echo "Current node conditions:"
171+
kubectl describe nodes | grep -A10 "Conditions:" || true
172+
fi
173+
else
174+
echo "✓ CNI initialized on all nodes"
175+
break
176+
fi
177+
178+
if [ $i -eq 120 ]; then
179+
echo "ERROR: CNI failed to initialize"
180+
echo "Node details:"
181+
kubectl describe nodes
182+
echo "KIND logs:"
183+
docker ps -a | grep kind
184+
docker logs kind-control-plane 2>&1 | tail -100 || true
185+
exit 1
186+
fi
187+
sleep 5
188+
done
189+
190+
# Wait for nodes to be fully ready
191+
echo "Waiting for all nodes to be ready..."
192+
kubectl wait --for=condition=Ready nodes --all --timeout=300s || {
193+
echo "ERROR: Nodes failed to become ready"
194+
kubectl describe nodes
195+
kubectl get pods -A -o wide
196+
exit 1
197+
}
198+
199+
echo "✓ All nodes are ready:"
200+
kubectl get nodes -o wide
201+
202+
# Verify CNI with a test pod
203+
echo "Verifying CNI functionality..."
204+
kubectl run test-cni --image=busybox:latest --rm -it --restart=Never --command -- sh -c "echo 'CNI test successful'" || {
205+
echo "WARNING: CNI test pod failed, checking kindnet pods..."
206+
kubectl get pods -n kube-system -l app=kindnet -o wide
207+
kubectl logs -n kube-system -l app=kindnet --tail=50 || true
208+
}
70209
71210
- name: Install NVidia GPU operator for KinD
72211
uses: ./common/github-actions/nvidia-gpu-operator

.github/workflows/rayjob_e2e_tests.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: rayjob-e2e-with-kueue
1+
name: rayjob-e2e
22

33
on:
44
pull_request:

src/codeflare_sdk/ray/cluster/cluster.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -783,6 +783,12 @@ def remove_autogenerated_fields(resource):
783783
del resource[key]
784784
else:
785785
remove_autogenerated_fields(resource[key])
786+
787+
# After cleaning, remove empty metadata sections
788+
if "metadata" in resource and isinstance(resource["metadata"], dict):
789+
if len(resource["metadata"]) == 0:
790+
del resource["metadata"]
791+
786792
elif isinstance(resource, list):
787793
for item in resource:
788794
remove_autogenerated_fields(item)

tests/e2e/rayjob/ray_version_validation_oauth_test.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,7 @@
1111
ManagedClusterConfig,
1212
)
1313

14-
# This test validates Ray version compatibility checking for RayJob with cluster lifecycling scenarios
1514

16-
17-
@pytest.mark.openshift
1815
class TestRayJobRayVersionValidationOauth:
1916
def setup_method(self):
2017
initialize_kubernetes_client(self)

0 commit comments

Comments
 (0)