RHOAIENG-32532: Update kueue integration #1905
Workflow file for this run
# e2e tests workflow for CodeFlare-SDK
name: e2e

on:
  pull_request:
    branches:
      - main
      - 'release-*'
      - ray-jobs-feature
    paths-ignore:
      - 'docs/**'
      - '**.adoc'
      - '**.md'
      - 'LICENSE'

concurrency:
  group: ${{ github.head_ref }}-${{ github.workflow }}
  cancel-in-progress: true

env:
  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
  # Run KIND on the default Docker bridge network to avoid network issues
  KIND_EXPERIMENTAL_DOCKER_NETWORK: "bridge"
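  # KIND_EXPERIMENTAL_DOCKER_NETWORK is an experimental kind setting that selects
  # the Docker network kind attaches its node containers to (normally kind creates
  # its own dedicated "kind" network). To reproduce this setup locally:
  #   export KIND_EXPERIMENTAL_DOCKER_NETWORK=bridge
  #   kind create cluster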

jobs:
  kubernetes:
    runs-on: gpu-t4-4-core
    strategy:
      fail-fast: false
      matrix:
        # Try with and without GPU setup to isolate the issue
        gpu-setup: [true, false]
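        # The matrix expands this job into two independent runs, one with
        # gpu-setup=true and one with gpu-setup=false; GPU-only steps below are
        # gated with `if: matrix.gpu-setup == true`, so a failure can be
        # attributed to either the GPU stack or the base cluster setup.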

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Checkout common repo code
        uses: actions/checkout@v4
        with:
          repository: 'project-codeflare/codeflare-common'
          ref: 'main'
          path: 'common'

      - name: Checkout CodeFlare operator repository
        uses: actions/checkout@v4
        with:
          repository: project-codeflare/codeflare-operator
          path: codeflare-operator

      - name: Set Go
        uses: actions/setup-go@v5
        with:
          go-version-file: './codeflare-operator/go.mod'
          cache-dependency-path: "./codeflare-operator/go.sum"

      - name: Set up gotestfmt
        uses: gotesttools/gotestfmt-action@v2
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up specific Python version
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'pip' # caching pip dependencies

      - name: Diagnose Docker environment on GPU runner
        run: |
          echo "=== Docker Environment Diagnostics ==="
          echo "Docker version:"
          docker version || true
          echo ""
          echo "Docker info:"
          docker info || true
          echo ""
          echo "System info:"
          uname -a
          echo ""
          echo "Network interfaces:"
          ip addr show || true
          echo ""
          echo "Checking cgroup version:"
          stat -fc %T /sys/fs/cgroup/ || true
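          # (prints "cgroup2fs" under cgroup v2, "tmpfs" under cgroup v1)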
echo "" | |
echo "Checking if running in container:" | |
if [ -f /.dockerenv ]; then echo "Running inside Docker"; else echo "Not in Docker"; fi | |
echo "" | |
echo "Available disk space:" | |
df -h | |
echo "" | |
echo "Memory info:" | |
free -h | |
echo "" | |
echo "DNS Configuration:" | |
cat /etc/resolv.conf || true | |
echo "" | |
echo "Testing DNS resolution:" | |
nslookup google.com || true | |
echo "=== End Diagnostics ===" | |

      - name: Setup NVidia GPU environment for KinD
        if: matrix.gpu-setup == true
        uses: ./common/github-actions/nvidia-gpu-setup

      - name: Create KIND config with explicit networking
        run: |
          cat > /tmp/kind-config.yaml <<EOF
          kind: Cluster
          apiVersion: kind.x-k8s.io/v1alpha4
          networking:
            # Explicitly set pod and service subnets to avoid conflicts
            podSubnet: "10.244.0.0/16"
            serviceSubnet: "10.96.0.0/16"
            # Keep the default CNI (kindnet) enabled so pod networking is provisioned automatically
            disableDefaultCNI: false
            # Use the iptables kube-proxy mode for better compatibility
            kubeProxyMode: "iptables"
          nodes:
          - role: control-plane
            # Extra mounts that might be needed for GPU
            extraMounts:
            - containerPath: /dev/shm
              hostPath: /dev/shm
              propagation: HostToContainer
          - role: worker
            extraMounts:
            - containerPath: /dev/shm
              hostPath: /dev/shm
              propagation: HostToContainer
          containerdConfigPatches:
          - |-
            [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
              runtime_type = "io.containerd.runc.v2"
            [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
              SystemdCgroup = true
          kubeadmConfigPatches:
          - |
            kind: ClusterConfiguration
            apiServer:
              extraArgs:
                "enable-admission-plugins": "NodeRestriction,ResourceQuota"
          - |
            kind: KubeletConfiguration
            serverTLSBootstrap: true
            cgroupDriver: systemd
            containerRuntimeEndpoint: unix:///run/containerd/containerd.sock
          - |
            kind: InitConfiguration
            nodeRegistration:
              kubeletExtraArgs:
                pod-infra-container-image: registry.k8s.io/pause:3.9
          - |
            kind: JoinConfiguration
            discovery:
              bootstrapToken:
                apiServerEndpoint: "{{ .ControlPlaneEndpoint }}"
                token: "{{ .Token }}"
                unsafeSkipCAVerification: true
            nodeRegistration:
              kubeletExtraArgs:
                pod-infra-container-image: registry.k8s.io/pause:3.9
          EOF
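          # Note: the heredoc delimiter above is unquoted, so the shell would expand
          # any $-variables in the config; the {{ .ControlPlaneEndpoint }} and
          # {{ .Token }} placeholders pass through untouched (no shell-special
          # characters) and are presumably substituted later by kind/kubeadm,
          # not by this script.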
echo "KIND configuration:" | |
cat /tmp/kind-config.yaml | |
- name: Setup and start KinD cluster | |
uses: ./common/github-actions/kind | |
with: | |
worker-nodes: 1 | |
kind-config: /tmp/kind-config.yaml | |
continue-on-error: true | |
id: kind-setup | |
- name: Fallback KIND setup if custom config fails | |
if: steps.kind-setup.outcome == 'failure' | |
run: | | |
echo "Custom KIND config failed, trying with default settings..." | |
# Clean up any failed attempts | |
kind delete cluster --name kind || true | |
docker rm -f $(docker ps -aq --filter name=kind-) || true | |
# Create cluster with simpler config | |
cat > /tmp/kind-simple.yaml <<EOF | |
kind: Cluster | |
apiVersion: kind.x-k8s.io/v1alpha4 | |
nodes: | |
- role: control-plane | |
- role: worker | |
EOF | |
kind create cluster --config /tmp/kind-simple.yaml --wait 5m || { | |
echo "ERROR: KIND cluster creation failed" | |
docker ps -a | |
exit 1 | |
} | |
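      # Note: the fallback config drops the /dev/shm mounts, containerd patches,
      # and kubeadm patches used above, so a cluster created this way may behave
      # differently on the gpu-setup=true matrix leg.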

      - name: Fix KIND DNS and wait for cluster initialization
        run: |
          echo "=== KIND Cluster Setup Diagnostics ==="
          # Check KIND containers
          echo "KIND containers:"
          docker ps -a --filter name=kind-
          # Get control plane container name
          CONTROL_PLANE=$(docker ps --filter name=kind-control-plane --format "{{.Names}}" | head -1)
          if [ -z "$CONTROL_PLANE" ]; then
            CONTROL_PLANE="kind-control-plane"
          fi
          echo "Control plane container: $CONTROL_PLANE"
          # Get control plane IP
          CONTROL_PLANE_IP=$(docker inspect -f '{{range.NetworkSettings.Networks}}{{.IPAddress}}{{end}}' $CONTROL_PLANE 2>/dev/null || echo "")
          echo "Control plane IP: $CONTROL_PLANE_IP"
          # Check Docker network
          echo "Docker networks:"
          docker network ls
          KIND_NETWORK=$(docker network ls | grep kind | awk '{print $2}' | head -1)
          if [ -n "$KIND_NETWORK" ]; then
            echo "KIND network: $KIND_NETWORK"
            echo "Containers on KIND network:"
            docker network inspect $KIND_NETWORK | jq -r '.Containers | to_entries | .[] | "\(.value.Name): \(.value.IPv4Address)"' || true
          fi
          # Ensure all KIND containers are on the same network
          for container in $(docker ps -a --filter name=kind- --format "{{.Names}}"); do
            echo "Checking network for $container"
            docker inspect $container | jq -r '.[0].NetworkSettings.Networks | keys[]' || true
          done
          echo "=== Waiting for cluster initialization ==="
          # Wait for API server
          for i in {1..60}; do
            if kubectl cluster-info &>/dev/null; then
              echo "✓ Cluster API is responsive"
              break
            fi
            echo "Waiting for cluster API... ($i/60)"
            # Try to diagnose connection issues
            if [ $i -eq 30 ]; then
              echo "Debugging cluster connection..."
              kubectl cluster-info dump --output-directory=/tmp/cluster-dump || true
              echo "Kubeconfig:"
              kubectl config view || true
            fi
            sleep 5
          done
          # Check initial node status
          echo "Initial node status:"
          kubectl get nodes -o wide || true
          # Wait for CNI to initialize on all nodes
          echo "Waiting for CNI plugin to initialize..."
          for i in {1..120}; do
            # Check if nodes exist
            node_count=$(kubectl get nodes --no-headers 2>/dev/null | wc -l || echo "0")
            if [ "$node_count" -eq "0" ]; then
              echo "No nodes found yet... ($i/120)"
              sleep 5
              continue
            fi
            # Check if CNI is initialized (ready nodes won't report a NetworkUnavailable condition)
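            # (jq prints each node's NetworkUnavailable status; grep -v "False"
            # succeeds only if some node still reports True/Unknown, i.e. the CNI
            # is not ready everywhere)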
            if kubectl get nodes -o json | jq -r '.items[].status.conditions[] | select(.type=="NetworkUnavailable") | .status' | grep -v "False" > /dev/null 2>&1; then
              echo "CNI still initializing... ($i/120)"
              if [ $((i % 10)) -eq 0 ]; then
                echo "Current node conditions:"
                kubectl describe nodes | grep -A10 "Conditions:" || true
              fi
            else
              echo "✓ CNI initialized on all nodes"
              break
            fi
            if [ $i -eq 120 ]; then
              echo "ERROR: CNI failed to initialize"
              echo "Node details:"
              kubectl describe nodes
              echo "KIND logs:"
              docker ps -a | grep kind
              docker logs kind-control-plane 2>&1 | tail -100 || true
              exit 1
            fi
            sleep 5
          done
          # Wait for nodes to be fully ready
          echo "Waiting for all nodes to be ready..."
          kubectl wait --for=condition=Ready nodes --all --timeout=300s || {
            echo "ERROR: Nodes failed to become ready"
            kubectl describe nodes
            kubectl get pods -A -o wide
            exit 1
          }
          echo "✓ All nodes are ready:"
          kubectl get nodes -o wide
          # Verify CNI with a test pod (-i only; no TTY is available in CI)
          echo "Verifying CNI functionality..."
          kubectl run test-cni --image=busybox:latest --rm -i --restart=Never --command -- sh -c "echo 'CNI test successful'" || {
            echo "WARNING: CNI test pod failed, checking kindnet pods..."
            kubectl get pods -n kube-system -l app=kindnet -o wide
            kubectl logs -n kube-system -l app=kindnet --tail=50 || true
          }
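      # kindnet is KIND's default CNI, kept enabled via disableDefaultCNI: false
      # in the config above; the busybox pod exercises scheduling and pod
      # networking end to end.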

      - name: Install NVidia GPU operator for KinD
        if: matrix.gpu-setup == true
        uses: ./common/github-actions/nvidia-gpu-operator

      - name: Deploy CodeFlare stack
        id: deploy
        run: |
          cd codeflare-operator
          echo Setting up CodeFlare stack
          make setup-e2e
          echo Deploying CodeFlare operator
          make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
          kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
          cd ..
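      # setup-e2e and deploy are Makefile targets in the codeflare-operator
      # repository checked out earlier in this job; `make -e` lets the IMG and
      # ENV values here override the Makefile defaults.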

      - name: Add user to KinD
        uses: ./common/github-actions/kind-add-user
        with:
          user-name: sdk-user

      - name: Configure RBAC for sdk-user with limited permissions
        run: |
          kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
          kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
          kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
          kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
          kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
          kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
          kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
          kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
          kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
          kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
          kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
          kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
          kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
          kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
          kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
          kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
          kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
          kubectl create clusterrolebinding sdk-user-pod-creator --clusterrole=pod-creator --user=sdk-user
          kubectl create clusterrole service-reader --verb=get,list,watch --resource=services
          kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
          kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
          kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
          kubectl config use-context sdk-user
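      # Illustrative sanity check for the restricted user (not run in CI); from
      # the admin context:
      #   kubectl auth can-i create rayclusters --as=sdk-user   # expect "yes"
      #   kubectl auth can-i create deployments --as=sdk-user   # expect "no"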

      - name: Run e2e tests
        run: |
          export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
          echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
          set -euo pipefail
          pip install poetry
          poetry install --with test,docs
          echo "Running e2e tests..."
          if [ "${{ matrix.gpu-setup }}" == "true" ]; then
            echo "Running GPU tests..."
            poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
          else
            echo "Running non-GPU tests (GPU setup disabled for debugging)..."
            poetry run pytest -v -s ./tests/e2e -m 'kind and not nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1 || echo "No non-GPU tests found"
          fi
        env:
          GRPC_DNS_RESOLVER: "native"
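      # GRPC_DNS_RESOLVER=native makes gRPC clients (used by the Ray client under
      # test) resolve hostnames with the system resolver instead of the built-in
      # c-ares resolver, which presumably sidesteps DNS flakiness on this runner.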

      - name: Switch to kind-cluster context to print logs
        if: always() && steps.deploy.outcome == 'success'
        run: kubectl config use-context kind-cluster

      - name: Print Pytest output log
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing Pytest output logs"
          cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log

      - name: Print CodeFlare operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing CodeFlare operator logs"
          kubectl logs -n openshift-operators --tail=-1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log

      - name: Print KubeRay operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing KubeRay operator logs"
          kubectl logs -n ray-system --tail=-1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log

      - name: Export all KinD pod logs
        uses: ./common/github-actions/kind-export-logs
        if: always() && steps.deploy.outcome == 'success'
        with:
          # with: inputs are not shell-expanded, so use the expression form
          output-directory: ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}

      - name: Upload logs
        uses: actions/upload-artifact@v4
        if: always() && steps.deploy.outcome == 'success'
        with:
          name: logs
          retention-days: 10
          path: |
            ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log