RHOAIENG-32532: Update kueue integration #1904
Workflow file for this run
# e2e tests workflow for CodeFlare-SDK
name: e2e
on:
  pull_request:
    branches:
      - main
      - 'release-*'
      - ray-jobs-feature
    paths-ignore:
      - 'docs/**'
      - '**.adoc'
      - '**.md'
      - 'LICENSE'

concurrency:
  group: ${{ github.head_ref }}-${{ github.workflow }}
  cancel-in-progress: true

env:
  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
  # Use Docker's default bridge network for KinD node containers to avoid network issues
  KIND_EXPERIMENTAL_DOCKER_NETWORK: "bridge"
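
# The concurrency group above cancels any in-flight run for the same PR branch
# when a new commit is pushed, so only the latest e2e run occupies the GPU runner.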
jobs:
  kubernetes:
    runs-on: gpu-t4-4-core
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Checkout common repo code
        uses: actions/checkout@v4
        with:
          repository: 'project-codeflare/codeflare-common'
          ref: 'main'
          path: 'common'

      - name: Checkout CodeFlare operator repository
        uses: actions/checkout@v4
        with:
          repository: project-codeflare/codeflare-operator
          path: codeflare-operator

      - name: Set Go
        uses: actions/setup-go@v5
        with:
          go-version-file: './codeflare-operator/go.mod'
          cache-dependency-path: "./codeflare-operator/go.sum"

      - name: Set up gotestfmt
        uses: gotesttools/gotestfmt-action@v2
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up specific Python version
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'pip' # caching pip dependencies
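
      # Python itself is only provisioned here; the SDK and its test
      # dependencies are installed with poetry in the "Run e2e tests" step below.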
      - name: Diagnose Docker environment on GPU runner
        run: |
          echo "=== Docker Environment Diagnostics ==="
          echo "Docker version:"
          docker version || true
          echo ""
          echo "Docker info:"
          docker info || true
          echo ""
          echo "System info:"
          uname -a
          echo ""
          echo "Network interfaces:"
          ip addr show || true
          echo ""
          echo "Checking cgroup version:"
          stat -fc %T /sys/fs/cgroup/ || true
          echo ""
          echo "Checking if running in container:"
          if [ -f /.dockerenv ]; then echo "Running inside Docker"; else echo "Not in Docker"; fi
          echo ""
          echo "Available disk space:"
          df -h
          echo ""
          echo "Memory info:"
          free -h
          echo "=== End Diagnostics ==="
      - name: Setup NVidia GPU environment for KinD
        uses: ./common/github-actions/nvidia-gpu-setup

      - name: Create KIND config with explicit networking
        run: |
          cat > /tmp/kind-config.yaml <<EOF
          kind: Cluster
          apiVersion: kind.x-k8s.io/v1alpha4
          networking:
            # Explicitly set pod and service subnets to avoid conflicts
            podSubnet: "10.244.0.0/16"
            serviceSubnet: "10.96.0.0/16"
            # Keep the default CNI (kindnet) enabled
            disableDefaultCNI: false
            # Use the iptables kube-proxy mode for better compatibility
            kubeProxyMode: "iptables"
          nodes:
          - role: control-plane
            # Extra mounts that might be needed for GPU
            extraMounts:
            - containerPath: /dev/shm
              hostPath: /dev/shm
              propagation: HostToContainer
          - role: worker
            extraMounts:
            - containerPath: /dev/shm
              hostPath: /dev/shm
              propagation: HostToContainer
          containerdConfigPatches:
          - |-
            [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
              runtime_type = "io.containerd.runc.v2"
            [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
              SystemdCgroup = true
          EOF
          echo "KIND configuration:"
          cat /tmp/kind-config.yaml
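
      # The containerd patch above aligns runc with the systemd cgroup driver,
      # which avoids cgroup mismatches between the KinD nodes and a cgroup v2
      # host (the diagnostics step prints the host cgroup version for comparison).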
      - name: Setup and start KinD cluster
        uses: ./common/github-actions/kind
        with:
          worker-nodes: 1
          kind-config: /tmp/kind-config.yaml

      - name: Wait for KIND cluster and CNI initialization
        run: |
          echo "Waiting for KIND cluster to initialize..."
          # First ensure cluster API is responsive
          for i in {1..60}; do
            if kubectl cluster-info &>/dev/null; then
              echo "✓ Cluster API is responsive"
              break
            fi
            echo "Waiting for cluster API... ($i/60)"
            sleep 5
          done

          # Check initial node status
          echo "Initial node status:"
          kubectl get nodes -o wide || true

          # Wait for CNI to initialize on all nodes
          echo "Waiting for CNI plugin to initialize..."
          for i in {1..120}; do
            # Check if nodes exist
            node_count=$(kubectl get nodes --no-headers 2>/dev/null | wc -l || echo "0")
            if [ "$node_count" -eq "0" ]; then
              echo "No nodes found yet... ($i/120)"
              sleep 5
              continue
            fi

            # CNI is initialized once every node's NetworkUnavailable condition is "False" (or absent)
            if kubectl get nodes -o json | jq -r '.items[].status.conditions[] | select(.type=="NetworkUnavailable") | .status' | grep -v "False" > /dev/null 2>&1; then
              echo "CNI still initializing... ($i/120)"
              if [ $((i % 10)) -eq 0 ]; then
                echo "Current node conditions:"
                kubectl describe nodes | grep -A10 "Conditions:" || true
              fi
            else
              echo "✓ CNI initialized on all nodes"
              break
            fi

            if [ $i -eq 120 ]; then
              echo "ERROR: CNI failed to initialize"
              echo "Node details:"
              kubectl describe nodes
              echo "KIND logs:"
              docker ps -a | grep kind
              docker logs kind-control-plane 2>&1 | tail -100 || true
              exit 1
            fi
            sleep 5
          done

          # Wait for nodes to be fully ready
          echo "Waiting for all nodes to be ready..."
          kubectl wait --for=condition=Ready nodes --all --timeout=300s || {
            echo "ERROR: Nodes failed to become ready"
            kubectl describe nodes
            kubectl get pods -A -o wide
            exit 1
          }
          echo "✓ All nodes are ready:"
          kubectl get nodes -o wide

          # Verify CNI with a test pod
          echo "Verifying CNI functionality..."
          kubectl run test-cni --image=busybox:latest --rm -it --restart=Never --command -- sh -c "echo 'CNI test successful'" || {
            echo "WARNING: CNI test pod failed, checking kindnet pods..."
            kubectl get pods -n kube-system -l app=kindnet -o wide
            kubectl logs -n kube-system -l app=kindnet --tail=50 || true
          }
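
      # The explicit wait above guards against a race where the GPU operator
      # install starts before kindnet has marked every node network-ready.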
      - name: Install NVidia GPU operator for KinD
        uses: ./common/github-actions/nvidia-gpu-operator

      - name: Deploy CodeFlare stack
        id: deploy
        run: |
          cd codeflare-operator
          echo Setting up CodeFlare stack
          make setup-e2e
          echo Deploying CodeFlare operator
          make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
          kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
          cd ..
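
      # "make setup-e2e" comes from the codeflare-operator checkout; it is
      # expected to install the stack prerequisites (e.g. KubeRay and Kueue)
      # that the SDK tests exercise.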
      - name: Add user to KinD
        uses: ./common/github-actions/kind-add-user
        with:
          user-name: sdk-user

      - name: Configure RBAC for sdk-user with limited permissions
        run: |
          kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
          kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
          kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
          kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
          kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
          kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
          kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
          kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
          kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
          kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
          kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
          kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
          kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
          kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
          kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
          kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
          kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
          kubectl create clusterrolebinding sdk-user-pod-creator --clusterrole=pod-creator --user=sdk-user
          kubectl create clusterrole service-reader --verb=get,list,watch --resource=services
          kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
          kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
          kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
          kubectl config use-context sdk-user
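
      # The roles above deliberately scope sdk-user to the Kueue objects
      # (resourceflavors, clusterqueues, localqueues), Ray clusters, and the
      # supporting resources the SDK needs, so the tests exercise a non-admin
      # path; the final use-context switch makes all subsequent kubectl and
      # SDK calls run as that user.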
      - name: Run e2e tests
        run: |
          export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
          echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV

          set -euo pipefail
          pip install poetry
          poetry install --with test,docs

          echo "Running e2e tests..."
          poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
        env:
          GRPC_DNS_RESOLVER: "native"
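
      # GRPC_DNS_RESOLVER=native makes gRPC (used by the Ray client) resolve
      # hostnames through the OS resolver instead of its built-in c-ares
      # resolver, a common workaround for DNS flakiness in CI environments.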
      - name: Switch to kind-cluster context to print logs
        if: always() && steps.deploy.outcome == 'success'
        run: kubectl config use-context kind-cluster

      - name: Print Pytest output log
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing Pytest output logs"
          cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log

      - name: Print CodeFlare operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing CodeFlare operator logs"
          kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log

      - name: Print KubeRay operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing KubeRay operator logs"
          kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log

      - name: Export all KinD pod logs
        uses: ./common/github-actions/kind-export-logs
        if: always() && steps.deploy.outcome == 'success'
        with:
          output-directory: ${CODEFLARE_TEST_OUTPUT_DIR}

      - name: Upload logs
        uses: actions/upload-artifact@v4
        if: always() && steps.deploy.outcome == 'success'
        with:
          name: logs
          retention-days: 10
          path: |
            ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log