RHOAIENG-32532: Update kueue integration #1905
Workflow file for this run

# e2e tests workflow for CodeFlare-SDK
name: e2e
on:
pull_request:
branches:
- main
- 'release-*'
- ray-jobs-feature
paths-ignore:
- 'docs/**'
- '**.adoc'
- '**.md'
- 'LICENSE'
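# Only one run per branch: a newer run for the same head ref cancels any run still in progress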
concurrency:
group: ${{ github.head_ref }}-${{ github.workflow }}
cancel-in-progress: true
env:
CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
# Put the KinD nodes on Docker's default bridge network to avoid network issues
KIND_EXPERIMENTAL_DOCKER_NETWORK: "bridge"
jobs:
kubernetes:
runs-on: gpu-t4-4-core
strategy:
fail-fast: false
matrix:
# Run with and without GPU setup to help isolate GPU-related failures
gpu-setup: [true, false]
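# Each matrix value runs as a separate job; with fail-fast disabled, a failure in one leg does not cancel the other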
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
submodules: recursive
- name: Checkout common repo code
uses: actions/checkout@v4
with:
repository: 'project-codeflare/codeflare-common'
ref: 'main'
path: 'common'
- name: Checkout CodeFlare operator repository
uses: actions/checkout@v4
with:
repository: project-codeflare/codeflare-operator
path: codeflare-operator
- name: Set Go
uses: actions/setup-go@v5
with:
go-version-file: './codeflare-operator/go.mod'
cache-dependency-path: "./codeflare-operator/go.sum"
- name: Set up gotestfmt
uses: gotesttools/gotestfmt-action@v2
with:
token: ${{ secrets.GITHUB_TOKEN }}
- name: Set up specific Python version
uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: 'pip' # caching pip dependencies
- name: Diagnose Docker environment on GPU runner
run: |
echo "=== Docker Environment Diagnostics ==="
echo "Docker version:"
docker version || true
echo ""
echo "Docker info:"
docker info || true
echo ""
echo "System info:"
uname -a
echo ""
echo "Network interfaces:"
ip addr show || true
echo ""
echo "Checking cgroup version:"
stat -fc %T /sys/fs/cgroup/ || true
echo ""
echo "Checking if running in container:"
if [ -f /.dockerenv ]; then echo "Running inside Docker"; else echo "Not in Docker"; fi
echo ""
echo "Available disk space:"
df -h
echo ""
echo "Memory info:"
free -h
echo ""
echo "DNS Configuration:"
cat /etc/resolv.conf || true
echo ""
echo "Testing DNS resolution:"
nslookup google.com || true
echo "=== End Diagnostics ==="
- name: Setup NVidia GPU environment for KinD
if: matrix.gpu-setup == true
uses: ./common/github-actions/nvidia-gpu-setup
- name: Create KIND config with explicit networking
run: |
cat > /tmp/kind-config.yaml <<EOF
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
networking:
# Explicitly set pod subnet to avoid conflicts
podSubnet: "10.244.0.0/16"
serviceSubnet: "10.96.0.0/16"
# Keep the default CNI (kindnet) enabled so pod networking is installed automatically
disableDefaultCNI: false
# Use the iptables kube-proxy mode for better compatibility
kubeProxyMode: "iptables"
nodes:
- role: control-plane
# Extra mounts that might be needed for GPU
extraMounts:
- containerPath: /dev/shm
hostPath: /dev/shm
propagation: HostToContainer
- role: worker
extraMounts:
- containerPath: /dev/shm
hostPath: /dev/shm
propagation: HostToContainer
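# The containerd patch below switches runc to the systemd cgroup driver so it matches the kubelet cgroupDriver setting further down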
containerdConfigPatches:
- |-
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
SystemdCgroup = true
kubeadmConfigPatches:
- |
kind: ClusterConfiguration
apiServer:
extraArgs:
"enable-admission-plugins": "NodeRestriction,ResourceQuota"
- |
kind: KubeletConfiguration
serverTLSBootstrap: true
cgroupDriver: systemd
containerRuntimeEndpoint: unix:///run/containerd/containerd.sock
- |
kind: InitConfiguration
nodeRegistration:
kubeletExtraArgs:
pod-infra-container-image: registry.k8s.io/pause:3.9
- |
kind: JoinConfiguration
discovery:
bootstrapToken:
apiServerEndpoint: "{{ .ControlPlaneEndpoint }}"
token: "{{ .Token }}"
unsafeSkipCAVerification: true
nodeRegistration:
kubeletExtraArgs:
pod-infra-container-image: registry.k8s.io/pause:3.9
EOF
echo "KIND configuration:"
cat /tmp/kind-config.yaml
- name: Setup and start KinD cluster
uses: ./common/github-actions/kind
with:
worker-nodes: 1
kind-config: /tmp/kind-config.yaml
continue-on-error: true
id: kind-setup
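# continue-on-error together with the step id lets the fallback step below check steps.kind-setup.outcome instead of failing the whole job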
- name: Fallback KIND setup if custom config fails
if: steps.kind-setup.outcome == 'failure'
run: |
echo "Custom KIND config failed, trying with default settings..."
# Clean up any failed attempts
kind delete cluster --name kind || true
docker rm -f $(docker ps -aq --filter name=kind-) || true
# Create cluster with simpler config
cat > /tmp/kind-simple.yaml <<EOF
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
- role: worker
EOF
kind create cluster --config /tmp/kind-simple.yaml --wait 5m || {
echo "ERROR: KIND cluster creation failed"
docker ps -a
exit 1
}
- name: Fix KIND DNS and wait for cluster initialization
run: |
echo "=== KIND Cluster Setup Diagnostics ==="
# Check KIND containers
echo "KIND containers:"
docker ps -a --filter name=kind-
# Get control plane container name
CONTROL_PLANE=$(docker ps --filter name=kind-control-plane --format "{{.Names}}" | head -1)
if [ -z "$CONTROL_PLANE" ]; then
CONTROL_PLANE="kind-control-plane"
fi
echo "Control plane container: $CONTROL_PLANE"
# Get control plane IP
CONTROL_PLANE_IP=$(docker inspect -f '{{range.NetworkSettings.Networks}}{{.IPAddress}}{{end}}' $CONTROL_PLANE 2>/dev/null || echo "")
echo "Control plane IP: $CONTROL_PLANE_IP"
# Check Docker network
echo "Docker networks:"
docker network ls
KIND_NETWORK=$(docker network ls | grep kind | awk '{print $2}' | head -1)
if [ -n "$KIND_NETWORK" ]; then
echo "KIND network: $KIND_NETWORK"
echo "Containers on KIND network:"
docker network inspect $KIND_NETWORK | jq -r '.Containers | to_entries | .[] | "\(.value.Name): \(.value.IPv4Address)"' || true
fi
# Ensure all KIND containers are on the same network
for container in $(docker ps -a --filter name=kind- --format "{{.Names}}"); do
echo "Checking network for $container"
docker inspect $container | jq -r '.[0].NetworkSettings.Networks | keys[]' || true
done
echo "=== Waiting for cluster initialization ==="
# Wait for API server
for i in {1..60}; do
if kubectl cluster-info &>/dev/null; then
echo "✓ Cluster API is responsive"
break
fi
echo "Waiting for cluster API... ($i/60)"
# Try to diagnose connection issues
if [ $i -eq 30 ]; then
echo "Debugging cluster connection..."
kubectl cluster-info dump --output-directory=/tmp/cluster-dump || true
echo "Kubeconfig:"
kubectl config view || true
fi
sleep 5
done
# Check initial node status
echo "Initial node status:"
kubectl get nodes -o wide || true
# Wait for CNI to initialize on all nodes
echo "Waiting for CNI plugin to initialize..."
for i in {1..120}; do
# Check if nodes exist
node_count=$(kubectl get nodes --no-headers 2>/dev/null | wc -l || echo "0")
if [ "$node_count" -eq "0" ]; then
echo "No nodes found yet... ($i/120)"
sleep 5
continue
fi
# Check if CNI is initialized (nodes won't have NetworkUnavailable condition)
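# jq prints each node's NetworkUnavailable status; any value other than "False" surviving the grep means the CNI is still coming up, while empty output (condition absent or all "False") counts as ready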
if kubectl get nodes -o json | jq -r '.items[].status.conditions[] | select(.type=="NetworkUnavailable") | .status' | grep -v "False" > /dev/null 2>&1; then
echo "CNI still initializing... ($i/120)"
if [ $((i % 10)) -eq 0 ]; then
echo "Current node conditions:"
kubectl describe nodes | grep -A10 "Conditions:" || true
fi
else
echo "✓ CNI initialized on all nodes"
break
fi
if [ $i -eq 120 ]; then
echo "ERROR: CNI failed to initialize"
echo "Node details:"
kubectl describe nodes
echo "KIND logs:"
docker ps -a | grep kind
docker logs kind-control-plane 2>&1 | tail -100 || true
exit 1
fi
sleep 5
done
# Wait for nodes to be fully ready
echo "Waiting for all nodes to be ready..."
kubectl wait --for=condition=Ready nodes --all --timeout=300s || {
echo "ERROR: Nodes failed to become ready"
kubectl describe nodes
kubectl get pods -A -o wide
exit 1
}
echo "✓ All nodes are ready:"
kubectl get nodes -o wide
# Verify CNI with a test pod
echo "Verifying CNI functionality..."
kubectl run test-cni --image=busybox:latest --rm -it --restart=Never --command -- sh -c "echo 'CNI test successful'" || {
echo "WARNING: CNI test pod failed, checking kindnet pods..."
kubectl get pods -n kube-system -l app=kindnet -o wide
kubectl logs -n kube-system -l app=kindnet --tail=50 || true
}
- name: Install NVidia GPU operator for KinD
if: matrix.gpu-setup == true
uses: ./common/github-actions/nvidia-gpu-operator
- name: Deploy CodeFlare stack
id: deploy
run: |
cd codeflare-operator
echo Setting up CodeFlare stack
make setup-e2e
echo Deploying CodeFlare operator
make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
cd ..
- name: Add user to KinD
uses: ./common/github-actions/kind-add-user
with:
user-name: sdk-user
- name: Configure RBAC for sdk user with limited permissions
run: |
kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
kubectl create clusterrolebinding sdk-user-pod-creator --clusterrole=pod-creator --user=sdk-user
kubectl create clusterrole service-reader --verb=get,list,watch --resource=services
kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
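# Switch the active kubeconfig context to sdk-user so the e2e tests below run with only the permissions granted above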
kubectl config use-context sdk-user
- name: Run e2e tests
run: |
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
set -euo pipefail
pip install poetry
poetry install --with test,docs
echo "Running e2e tests..."
if [ "${{ matrix.gpu-setup }}" == "true" ]; then
echo "Running GPU tests..."
poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
else
echo "Running non-GPU tests (GPU setup disabled for debugging)..."
poetry run pytest -v -s ./tests/e2e -m 'kind and not nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1 || echo "No non-GPU tests found"
fi
env:
GRPC_DNS_RESOLVER: "native"
- name: Switch to kind-cluster context to print logs
if: always() && steps.deploy.outcome == 'success'
run: kubectl config use-context kind-cluster
- name: Print Pytest output log
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing Pytest output logs"
cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log
- name: Print CodeFlare operator logs
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing CodeFlare operator logs"
kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log
- name: Print KubeRay operator logs
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing KubeRay operator logs"
kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
- name: Export all KinD pod logs
uses: ./common/github-actions/kind-export-logs
if: always() && steps.deploy.outcome == 'success'
with:
output-directory: ${CODEFLARE_TEST_OUTPUT_DIR}
- name: Upload logs
uses: actions/upload-artifact@v4
if: always() && steps.deploy.outcome == 'success'
with:
name: logs
retention-days: 10
path: |
${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log