RHOAIENG-32532: Update kueue integration #1916
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# e2e tests workflow for CodeFlare-SDK
name: e2e
# Trigger on pull requests that target the listed base branches, unless the
# change set only touches documentation or license files.
on:
  pull_request:
    branches:
      - main
      - 'release-*'
      - ray-jobs-feature
    paths-ignore:
      - 'docs/**'
      - '**.adoc'
      - '**.md'
      - 'LICENSE'
# One active run per PR head branch and workflow: a newer run cancels the
# in-progress one that shares the same group key.
concurrency:
  group: ${{ github.head_ref }}-${{ github.workflow }}
  cancel-in-progress: true
# Workflow-wide environment; the operator image is consumed by the
# "Deploy CodeFlare stack" step (`make deploy -e IMG=...`).
env:
  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
jobs:
  # Single job: bring up a GPU-enabled KinD cluster, deploy the CodeFlare
  # operator, switch to a limited-permission user, and run the SDK e2e tests.
  kubernetes:
    runs-on: gpu-t4-4-core

    steps:
      # --- Source checkouts -------------------------------------------------
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive

      # Checked out under ./common so its composite actions can be referenced
      # by local path in later steps.
      - name: Checkout common repo code
        uses: actions/checkout@v4
        with:
          repository: 'project-codeflare/codeflare-common'
          ref: 'main'
          path: 'common'

      - name: Checkout CodeFlare operator repository
        uses: actions/checkout@v4
        with:
          repository: project-codeflare/codeflare-operator
          path: codeflare-operator

      # --- Toolchains -------------------------------------------------------
      # The Go version follows the operator's go.mod, not this repository.
      - name: Set Go
        uses: actions/setup-go@v5
        with:
          go-version-file: './codeflare-operator/go.mod'
          cache-dependency-path: "./codeflare-operator/go.sum"

      - name: Set up gotestfmt
        uses: gotesttools/gotestfmt-action@v2
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up specific Python version
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
          cache: 'pip'  # caching pip dependencies

      # --- Cluster bring-up -------------------------------------------------
      - name: Setup NVidia GPU environment for KinD
        uses: ./common/github-actions/nvidia-gpu-setup

      - name: Setup and start KinD cluster
        uses: ./common/github-actions/kind
        with:
          worker-nodes: 1

      # Block until every node is Ready; on timeout dump diagnostics and fail.
      - name: Wait for KIND nodes to be ready
        run: |
          echo "=== Waiting for KIND nodes to be ready ==="
          echo "Initial node status:"
          kubectl get nodes -o wide

          # Wait for all nodes to be ready
          echo "Waiting for all nodes to reach Ready state..."
          kubectl wait --for=condition=Ready nodes --all --timeout=300s || {
            echo "ERROR: Nodes did not become ready!"
            echo "Node status:"
            kubectl get nodes -o wide
            echo "Node conditions:"
            kubectl describe nodes
            echo "System pods:"
            kubectl get pods -n kube-system -o wide
            exit 1
          }

          # Verify CNI is ready
          echo "Verifying CNI (kindnet) pods are ready..."
          kubectl wait --for=condition=Ready pods -n kube-system -l app=kindnet --timeout=60s

          echo "Final cluster state:"
          kubectl get nodes -o wide
          kubectl get pods -n kube-system -o wide

      - name: Install NVidia GPU operator for KinD
        uses: ./common/github-actions/nvidia-gpu-operator

      # The `deploy` id is referenced by the `if:` guards of the log-collection
      # steps at the end of the job.
      - name: Deploy CodeFlare stack
        id: deploy
        run: |
          cd codeflare-operator
          echo Setting up CodeFlare stack
          make setup-e2e
          echo Deploying CodeFlare operator
          make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
          kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
          cd ..

      # --- Restricted test user ---------------------------------------------
      - name: Add user to KinD
        uses: ./common/github-actions/kind-add-user
        with:
          user-name: sdk-user

      # Creates the sdk-user ClusterRole and binding, then switches the
      # kubectl context to sdk-user so the tests run with these permissions.
      # The batch rule lists only the "batch" API group: RBAC apiGroups take
      # group names, so a group/version string such as "batch/v1" never matches.
      - name: Configure RBAC for sdk user with limited permissions
        run: |
          echo "=== Configuring RBAC for sdk-user ==="

          # Create a comprehensive ClusterRole with all needed permissions
          cat <<EOF | kubectl apply -f -
          apiVersion: rbac.authorization.k8s.io/v1
          kind: ClusterRole
          metadata:
            name: sdk-user-role
          rules:
          # Core resources
          - apiGroups: [""]
            resources: ["pods", "pods/log", "pods/status", "pods/portforward", "services", "endpoints", "persistentvolumeclaims", "events", "configmaps", "secrets", "nodes", "namespaces", "serviceaccounts", "replicationcontrollers"]
            verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
          # Apps resources
          - apiGroups: ["apps"]
            resources: ["deployments", "daemonsets", "replicasets", "statefulsets"]
            verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
          # Batch resources
          - apiGroups: ["batch"]
            resources: ["jobs", "cronjobs"]
            verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
          # Autoscaling resources
          - apiGroups: ["autoscaling"]
            resources: ["horizontalpodautoscalers"]
            verbs: ["get", "list", "watch"]
          # Networking resources
          - apiGroups: ["networking.k8s.io"]
            resources: ["ingresses", "networkpolicies"]
            verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
          # RBAC resources (read-only)
          - apiGroups: ["rbac.authorization.k8s.io"]
            resources: ["roles", "rolebindings", "clusterroles", "clusterrolebindings"]
            verbs: ["get", "list", "watch"]
          # CRD resources
          - apiGroups: ["apiextensions.k8s.io"]
            resources: ["customresourcedefinitions"]
            verbs: ["get", "list", "watch"]
          # Ray resources
          - apiGroups: ["ray.io"]
            resources: ["rayclusters", "rayjobs", "rayservices", "rayclusters/status", "rayjobs/status", "rayservices/status"]
            verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
          # AppWrapper resources (MCAD)
          - apiGroups: ["workload.codeflare.dev"]
            resources: ["appwrappers", "appwrappers/status"]
            verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
          # Kueue resources
          - apiGroups: ["kueue.x-k8s.io"]
            resources: ["clusterqueues", "localqueues", "resourceflavors", "workloads", "workloads/status"]
            verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
          # Metrics
          - apiGroups: ["metrics.k8s.io"]
            resources: ["pods", "nodes"]
            verbs: ["get", "list"]
          EOF

          # Create ClusterRoleBinding
          cat <<EOF | kubectl apply -f -
          apiVersion: rbac.authorization.k8s.io/v1
          kind: ClusterRoleBinding
          metadata:
            name: sdk-user-role-binding
          roleRef:
            apiGroup: rbac.authorization.k8s.io
            kind: ClusterRole
            name: sdk-user-role
          subjects:
          - kind: User
            name: sdk-user
            apiGroup: rbac.authorization.k8s.io
          EOF

          echo "RBAC configuration complete. Switching context to sdk-user..."
          kubectl config use-context sdk-user

          # Verify permissions
          echo "Verifying sdk-user permissions..."
          kubectl auth can-i --list || true

      # --- Tests ------------------------------------------------------------
      # Runs the pytest e2e suite and captures all output in pytest_output.log;
      # the log is printed by a later step. The output directory is exported
      # through GITHUB_ENV so the log-collection steps can find it.
      - name: Run e2e tests
        run: |
          set -euo pipefail

          # TEMP_DIR is not defined in this workflow; presumably an earlier
          # composite action exports it (TODO confirm). Fall back to a fresh
          # temporary directory so the log redirect below never resolves to
          # "/pytest_output.log" when it is empty.
          export CODEFLARE_TEST_OUTPUT_DIR="${{ env.TEMP_DIR }}"
          if [ -z "${CODEFLARE_TEST_OUTPUT_DIR}" ]; then
            CODEFLARE_TEST_OUTPUT_DIR="$(mktemp -d)"
          fi
          echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> "$GITHUB_ENV"

          pip install poetry
          poetry install --with test,docs
          echo "Running e2e tests..."
          poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > "${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log" 2>&1
        env:
          GRPC_DNS_RESOLVER: "native"

      # --- Diagnostics (run even when tests fail, if deploy succeeded) ------
      # Switch away from the sdk-user context selected during RBAC setup before
      # collecting logs.
      - name: Switch to kind-cluster context to print logs
        if: always() && steps.deploy.outcome == 'success'
        run: kubectl config use-context kind-cluster

      - name: Print Pytest output log
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing Pytest output logs"
          cat "${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log"

      - name: Print CodeFlare operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing CodeFlare operator logs"
          kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee "${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log"

      - name: Print KubeRay operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing KubeRay operator logs"
          kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee "${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log"

      # The directory is passed as an expression so the action receives the
      # resolved path rather than a literal, unexpanded shell variable.
      - name: Export all KinD pod logs
        uses: ./common/github-actions/kind-export-logs
        if: always() && steps.deploy.outcome == 'success'
        with:
          output-directory: ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}

      - name: Upload logs
        uses: actions/upload-artifact@v4
        if: always() && steps.deploy.outcome == 'success'
        with:
          name: logs
          retention-days: 10
          path: |
            ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log