RHOAIENG-32532: Update kueue integration #1904

Workflow file for this run
# e2e tests workflow for CodeFlare-SDK
name: e2e
on:
  pull_request:
    branches:
      - main
      - 'release-*'
      - ray-jobs-feature
    paths-ignore:
      - 'docs/**'
      - '**.adoc'
      - '**.md'
      - 'LICENSE'

concurrency:
  group: ${{ github.head_ref }}-${{ github.workflow }}
  cancel-in-progress: true

env:
  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
  # Run KinD on the default Docker bridge network to avoid MTU-related network issues
  KIND_EXPERIMENTAL_DOCKER_NETWORK: "bridge"
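
# Note: KIND_EXPERIMENTAL_DOCKER_NETWORK tells KinD to attach its node containers
# to an existing Docker network (here the default "bridge") instead of creating
# its own "kind" network. A minimal sketch for checking the MTU that network
# actually carries on the runner (assuming the Docker CLI is available there):
#
#   docker network inspect bridge -f '{{ index .Options "com.docker.network.driver.mtu" }}'
#
# Empty output means the driver default (1500) is in effect.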
jobs:
  kubernetes:
    runs-on: gpu-t4-4-core

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Checkout common repo code
        uses: actions/checkout@v4
        with:
          repository: 'project-codeflare/codeflare-common'
          ref: 'main'
          path: 'common'

      - name: Checkout CodeFlare operator repository
        uses: actions/checkout@v4
        with:
          repository: project-codeflare/codeflare-operator
          path: codeflare-operator

      - name: Set Go
        uses: actions/setup-go@v5
        with:
          go-version-file: './codeflare-operator/go.mod'
          cache-dependency-path: "./codeflare-operator/go.sum"

      - name: Set up gotestfmt
        uses: gotesttools/gotestfmt-action@v2
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up specific Python version
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'pip' # caching pip dependencies
      - name: Diagnose Docker environment on GPU runner
        run: |
          echo "=== Docker Environment Diagnostics ==="
          echo "Docker version:"
          docker version || true
          echo ""
          echo "Docker info:"
          docker info || true
          echo ""
          echo "System info:"
          uname -a
          echo ""
          echo "Network interfaces:"
          ip addr show || true
          echo ""
          echo "Checking cgroup version:"
          stat -fc %T /sys/fs/cgroup/ || true
          echo ""
          echo "Checking if running in container:"
          if [ -f /.dockerenv ]; then echo "Running inside Docker"; else echo "Not in Docker"; fi
          echo ""
          echo "Available disk space:"
          df -h
          echo ""
          echo "Memory info:"
          free -h
          echo "=== End Diagnostics ==="
      - name: Setup NVidia GPU environment for KinD
        uses: ./common/github-actions/nvidia-gpu-setup

      - name: Create KIND config with explicit networking
        run: |
          cat > /tmp/kind-config.yaml <<EOF
          kind: Cluster
          apiVersion: kind.x-k8s.io/v1alpha4
          networking:
            # Explicitly set pod and service subnets to avoid conflicts
            podSubnet: "10.244.0.0/16"
            serviceSubnet: "10.96.0.0/16"
            # Keep the default CNI (kindnet); disabling it would require installing one manually
            disableDefaultCNI: false
            # Use the iptables kube-proxy mode for better compatibility
            kubeProxyMode: "iptables"
          nodes:
            - role: control-plane
              # Extra mounts that might be needed for GPU workloads
              extraMounts:
                - containerPath: /dev/shm
                  hostPath: /dev/shm
                  propagation: HostToContainer
            - role: worker
              extraMounts:
                - containerPath: /dev/shm
                  hostPath: /dev/shm
                  propagation: HostToContainer
          containerdConfigPatches:
            - |-
              [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
                runtime_type = "io.containerd.runc.v2"
              [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
                SystemdCgroup = true
          EOF
          echo "KIND configuration:"
          cat /tmp/kind-config.yaml
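
      # The generated config can be exercised locally before touching CI; a
      # minimal sketch, assuming the kind CLI is installed (the cluster name
      # "cfg-check" is an arbitrary placeholder):
      #
      #   kind create cluster --name cfg-check --config /tmp/kind-config.yaml
      #   kubectl get nodes -o wide    # expect one control-plane and one worker
      #   kind delete cluster --name cfg-check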
      - name: Setup and start KinD cluster
        uses: ./common/github-actions/kind
        with:
          worker-nodes: 1
          kind-config: /tmp/kind-config.yaml

      - name: Wait for KIND cluster and CNI initialization
        run: |
          echo "Waiting for KIND cluster to initialize..."

          # First ensure cluster API is responsive
          for i in {1..60}; do
            if kubectl cluster-info &>/dev/null; then
              echo "✓ Cluster API is responsive"
              break
            fi
            echo "Waiting for cluster API... ($i/60)"
            sleep 5
          done

          # Check initial node status
          echo "Initial node status:"
          kubectl get nodes -o wide || true

          # Wait for CNI to initialize on all nodes
          echo "Waiting for CNI plugin to initialize..."
          for i in {1..120}; do
            # Check if nodes exist
            node_count=$(kubectl get nodes --no-headers 2>/dev/null | wc -l || echo "0")
            if [ "$node_count" -eq "0" ]; then
              echo "No nodes found yet... ($i/120)"
              sleep 5
              continue
            fi

            # Check if CNI is initialized (ready nodes report no NetworkUnavailable
            # condition, or report it with status "False")
            if kubectl get nodes -o json | jq -r '.items[].status.conditions[] | select(.type=="NetworkUnavailable") | .status' | grep -v "False" > /dev/null 2>&1; then
              echo "CNI still initializing... ($i/120)"
              if [ $((i % 10)) -eq 0 ]; then
                echo "Current node conditions:"
                kubectl describe nodes | grep -A10 "Conditions:" || true
              fi
            else
              echo "✓ CNI initialized on all nodes"
              break
            fi

            if [ $i -eq 120 ]; then
              echo "ERROR: CNI failed to initialize"
              echo "Node details:"
              kubectl describe nodes
              echo "KIND logs:"
              docker ps -a | grep kind
              docker logs kind-control-plane 2>&1 | tail -100 || true
              exit 1
            fi
            sleep 5
          done

          # Wait for nodes to be fully ready
          echo "Waiting for all nodes to be ready..."
          kubectl wait --for=condition=Ready nodes --all --timeout=300s || {
            echo "ERROR: Nodes failed to become ready"
            kubectl describe nodes
            kubectl get pods -A -o wide
            exit 1
          }
          echo "✓ All nodes are ready:"
          kubectl get nodes -o wide

          # Verify CNI with a test pod
          echo "Verifying CNI functionality..."
          kubectl run test-cni --image=busybox:latest --rm -it --restart=Never --command -- sh -c "echo 'CNI test successful'" || {
            echo "WARNING: CNI test pod failed, checking kindnet pods..."
            kubectl get pods -n kube-system -l app=kindnet -o wide
            kubectl logs -n kube-system -l app=kindnet --tail=50 || true
          }
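
      # The NetworkUnavailable probe above can be replayed by hand when debugging;
      # a minimal sketch, assuming a kubeconfig pointing at the KinD cluster:
      #
      #   kubectl get nodes -o json \
      #     | jq -r '.items[] | .metadata.name + " " + ([.status.conditions[] | select(.type=="NetworkUnavailable") | .status] | join(","))'
      #
      # No status or "False" for a node means its CNI reported the network as up.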
      - name: Install NVidia GPU operator for KinD
        uses: ./common/github-actions/nvidia-gpu-operator

      - name: Deploy CodeFlare stack
        id: deploy
        run: |
          cd codeflare-operator
          echo Setting up CodeFlare stack
          make setup-e2e
          echo Deploying CodeFlare operator
          make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
          kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
          cd ..
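
      # `make setup-e2e` also installs the supporting operator stack. If readiness
      # needs checking by hand, a minimal sketch (the namespace names here are
      # assumptions; adjust to what setup-e2e actually deploys):
      #
      #   kubectl get deployments -n ray-system      # KubeRay operator
      #   kubectl get deployments -n kueue-system    # Kueue controller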
      - name: Add user to KinD
        uses: ./common/github-actions/kind-add-user
        with:
          user-name: sdk-user

      - name: Configure RBAC for sdk user with limited permissions
        run: |
          kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
          kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
          kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
          kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
          kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
          kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
          kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
          kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
          kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
          kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
          kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
          kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
          kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
          kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
          kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
          kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
          kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
          kubectl create clusterrolebinding sdk-user-pod-creator --clusterrole=pod-creator --user=sdk-user
          kubectl create clusterrole service-reader --verb=get,list,watch --resource=services
          kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
          kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
          kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
          kubectl config use-context sdk-user
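
      # The bindings can be confirmed with kubectl's built-in authorization check;
      # a minimal sketch, run with cluster-admin credentials (e.g. the kind-cluster
      # context) using impersonation:
      #
      #   kubectl auth can-i create rayclusters --as=sdk-user                      # expect: yes
      #   kubectl auth can-i delete secrets --as=sdk-user                          # expect: no
      #   kubectl auth can-i create pods --subresource=portforward --as=sdk-user   # expect: yes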
      - name: Run e2e tests
        run: |
          export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
          echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV

          set -euo pipefail
          pip install poetry
          poetry install --with test,docs
          echo "Running e2e tests..."
          poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
        env:
          GRPC_DNS_RESOLVER: "native"
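
      # The -m flag selects pytest markers, so subsets can be run the same way;
      # a minimal local sketch, assuming the e2e tests carry these markers and a
      # KinD cluster plus kubeconfig are already in place:
      #
      #   poetry run pytest -v -s ./tests/e2e -m 'kind and not nvidia_gpu'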
      - name: Switch to kind-cluster context to print logs
        if: always() && steps.deploy.outcome == 'success'
        run: kubectl config use-context kind-cluster

      - name: Print Pytest output log
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing Pytest output logs"
          cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log

      - name: Print CodeFlare operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing CodeFlare operator logs"
          kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log

      - name: Print KubeRay operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing KubeRay operator logs"
          kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
      - name: Export all KinD pod logs
        uses: ./common/github-actions/kind-export-logs
        if: always() && steps.deploy.outcome == 'success'
        with:
          # Action inputs are not shell-expanded, so reference the value written
          # to GITHUB_ENV by the test step via a workflow expression
          output-directory: ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}
      - name: Upload logs
        uses: actions/upload-artifact@v4
        if: always() && steps.deploy.outcome == 'success'
        with:
          name: logs
          retention-days: 10
          path: |
            ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
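
# Uploaded artifacts can be pulled locally after a run; a minimal sketch,
# assuming the GitHub CLI is authenticated against this repository
# (<run-id> is a placeholder for the workflow run ID):
#
#   gh run download <run-id> --name logs --dir ./e2e-logs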