@@ -19,6 +19,8 @@ concurrency:
19
19
20
20
# Workflow-wide environment variables, visible to every job and step.
env:
  # Operator image reference used by this workflow.
  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
  # Attach KIND node containers to Docker's "bridge" network rather than the
  # network KIND would otherwise use. This only selects the Docker network;
  # it does NOT configure an MTU.
  # NOTE(review): this is an experimental KIND override - confirm that
  # multi-node clusters still resolve each other on this network.
  KIND_EXPERIMENTAL_DOCKER_NETWORK: "bridge"
22
24
23
25
jobs :
24
26
kubernetes :
@@ -60,13 +62,150 @@ jobs:
60
62
python-version : ' 3.11'
61
63
cache : ' pip' # caching pip dependencies
62
64
65
# Informational only: dumps Docker, kernel, network, cgroup, disk and memory
# details to the job log to help debug KIND start-up problems on the runner.
# Every probe is best-effort so a missing tool or a non-zero exit status can
# never fail the job from this step.
- name: Diagnose Docker environment on GPU runner
  run: |
    # Print a heading, run the given command without letting its exit
    # status abort the script, then print a blank separator line.
    probe() {
      local heading="$1"
      shift
      echo "${heading}"
      "$@" || true
      echo ""
    }

    echo "=== Docker Environment Diagnostics ==="
    probe "Docker version:" docker version
    probe "Docker info:" docker info
    probe "System info:" uname -a
    probe "Network interfaces:" ip addr show
    # Prints the filesystem type mounted at /sys/fs/cgroup/.
    probe "Checking cgroup version:" stat -fc %T /sys/fs/cgroup/

    echo "Checking if running in container:"
    if [ -f /.dockerenv ]; then
      echo "Running inside Docker"
    else
      echo "Not in Docker"
    fi
    echo ""

    probe "Available disk space:" df -h

    echo "Memory info:"
    free -h || true
    echo "=== End Diagnostics ==="
92
+
63
93
# Delegates to the repository-local action in
# ./common/github-actions/nvidia-gpu-setup; no inputs are passed.
- uses: ./common/github-actions/nvidia-gpu-setup
  name: Setup NVidia GPU environment for KinD
65
95
96
# Writes the KIND cluster configuration to /tmp/kind-config.yaml and echoes it
# to the job log. The file is consumed by the cluster start-up step through
# its `kind-config` input.
- name: Create KIND config with explicit networking
  run: |
    # The heredoc delimiter is quoted so the config is written verbatim,
    # with no shell expansion of its contents.
    cat > /tmp/kind-config.yaml <<'EOF'
    kind: Cluster
    apiVersion: kind.x-k8s.io/v1alpha4
    networking:
      # Explicitly set pod and service subnets to avoid conflicts
      podSubnet: "10.244.0.0/16"
      serviceSubnet: "10.96.0.0/16"
      # Keep KIND's default CNI enabled (it is NOT disabled here)
      disableDefaultCNI: false
      # Run kube-proxy in iptables mode
      kubeProxyMode: "iptables"
    nodes:
    - role: control-plane
      # Extra mounts that might be needed for GPU
      extraMounts:
      - containerPath: /dev/shm
        hostPath: /dev/shm
        propagation: HostToContainer
    - role: worker
      extraMounts:
      - containerPath: /dev/shm
        hostPath: /dev/shm
        propagation: HostToContainer
    containerdConfigPatches:
    - |-
      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
        runtime_type = "io.containerd.runc.v2"
      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
        SystemdCgroup = true
    EOF

    echo "KIND configuration:"
    cat /tmp/kind-config.yaml
131
+
66
132
# Starts the cluster through the repository-local action in
# ./common/github-actions/kind, passing the worker count and the path of the
# KIND config file written to /tmp/kind-config.yaml.
- uses: ./common/github-actions/kind
  name: Setup and start KinD cluster
  with:
    kind-config: /tmp/kind-config.yaml
    worker-nodes: 1
137
+
138
# Gate for the steps that follow: blocks until the cluster API answers, no
# node reports a NetworkUnavailable condition other than "False", and every
# node is Ready. Any timeout fails the job after dumping diagnostics.
# A final smoke-test pod is advisory only (its failure logs a WARNING).
- name: Wait for KIND cluster and CNI initialization
  run: |
    # Collect as much failure context as possible; each command is
    # best-effort so one failing probe does not hide the others.
    dump_cluster_state() {
      echo "Node details:"
      kubectl describe nodes || true
      echo "KIND logs:"
      docker ps -a | grep kind || true
      docker logs kind-control-plane 2>&1 | tail -100 || true
    }

    echo "Waiting for KIND cluster to initialize..."
    # First ensure cluster API is responsive (up to 60 attempts, 5s apart)
    api_ready=false
    for i in {1..60}; do
      if kubectl cluster-info &>/dev/null; then
        echo "✓ Cluster API is responsive"
        api_ready=true
        break
      fi
      echo "Waiting for cluster API... ($i/60)"
      sleep 5
    done
    if [ "$api_ready" != "true" ]; then
      echo "ERROR: Cluster API did not become responsive"
      dump_cluster_state
      exit 1
    fi

    # Check initial node status
    echo "Initial node status:"
    kubectl get nodes -o wide || true

    # Wait for CNI to initialize on all nodes (up to 120 attempts, 5s apart)
    echo "Waiting for CNI plugin to initialize..."
    cni_ready=false
    for i in {1..120}; do
      # A failed query must count as "not ready yet", never as success.
      if ! nodes_json=$(kubectl get nodes -o json 2>/dev/null); then
        echo "Unable to list nodes yet... ($i/120)"
        sleep 5
        continue
      fi

      # Same rule for jq: an unparsable response is "not ready yet".
      if ! node_count=$(jq '.items | length' <<<"$nodes_json" 2>/dev/null); then
        echo "Unable to parse node list yet... ($i/120)"
        sleep 5
        continue
      fi
      if [ "$node_count" -eq 0 ]; then
        echo "No nodes found yet... ($i/120)"
        sleep 5
        continue
      fi

      # Names of nodes whose NetworkUnavailable condition is anything other
      # than "False". Nodes that do not report the condition at all are
      # treated as initialized; the Ready wait below is the final gate.
      if ! pending_nodes=$(jq -r '.items[]
        | select(any(.status.conditions[]?; .type == "NetworkUnavailable" and .status != "False"))
        | .metadata.name' <<<"$nodes_json" 2>/dev/null); then
        echo "Unable to evaluate node conditions yet... ($i/120)"
        sleep 5
        continue
      fi
      if [ -z "$pending_nodes" ]; then
        echo "✓ CNI initialized on all nodes"
        cni_ready=true
        break
      fi

      echo "CNI still initializing... ($i/120)"
      # Every 10th attempt, show the node conditions for debugging.
      if [ $((i % 10)) -eq 0 ]; then
        echo "Current node conditions:"
        kubectl describe nodes | grep -A10 "Conditions:" || true
      fi
      sleep 5
    done
    if [ "$cni_ready" != "true" ]; then
      echo "ERROR: CNI failed to initialize"
      dump_cluster_state
      exit 1
    fi

    # Wait for nodes to be fully ready
    echo "Waiting for all nodes to be ready..."
    if ! kubectl wait --for=condition=Ready nodes --all --timeout=300s; then
      echo "ERROR: Nodes failed to become ready"
      kubectl describe nodes || true
      kubectl get pods -A -o wide || true
      exit 1
    fi

    echo "✓ All nodes are ready:"
    kubectl get nodes -o wide

    # Verify CNI with a test pod. --attach replaces -it because CI has no
    # TTY; --rm deletes the pod once the attached command finishes.
    echo "Verifying CNI functionality..."
    kubectl run test-cni --image=busybox:latest --rm --attach --restart=Never --command -- sh -c "echo 'CNI test successful'" || {
      echo "WARNING: CNI test pod failed, checking kindnet pods..."
      kubectl get pods -n kube-system -l app=kindnet -o wide || true
      kubectl logs -n kube-system -l app=kindnet --tail=50 || true
      # Do not leave a failed test pod behind for later steps.
      kubectl delete pod test-cni --ignore-not-found --wait=false || true
    }
70
209
71
210
- name : Install NVidia GPU operator for KinD
72
211
uses : ./common/github-actions/nvidia-gpu-operator
0 commit comments