# Workflow file for the run "RHOAIENG-32532: Update kueue integration #1916"

# e2e tests workflow for CodeFlare-SDK
#
# Provisions a KinD cluster on a GPU runner, deploys the CodeFlare operator
# into it, and runs the SDK's pytest suite in ./tests/e2e (marker expression
# 'kind and nvidia_gpu') as a dedicated "sdk-user" Kubernetes user.
name: e2e

# Triggered by pull requests targeting the listed branches. Changes that only
# touch the ignored paths (docs, *.adoc, *.md, LICENSE) do not start a run.
on:
  pull_request:
    branches:
      - main
      - 'release-*'
      - ray-jobs-feature
    paths-ignore:
      - 'docs/**'
      - '**.adoc'
      - '**.md'
      - 'LICENSE'

# At most one active run per PR head branch and workflow; a newer run cancels
# the one still in progress.
concurrency:
  group: ${{ github.head_ref }}-${{ github.workflow }}
  cancel-in-progress: true

env:
  # Operator image passed to `make deploy` in the "Deploy CodeFlare stack" step.
  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"

jobs:
  kubernetes:
    runs-on: gpu-t4-4-core

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive

      # Provides the local composite actions referenced below as
      # ./common/github-actions/*.
      - name: Checkout common repo code
        uses: actions/checkout@v4
        with:
          repository: 'project-codeflare/codeflare-common'
          ref: 'main'
          path: 'common'

      - name: Checkout CodeFlare operator repository
        uses: actions/checkout@v4
        with:
          repository: project-codeflare/codeflare-operator
          path: codeflare-operator

      # The Go version and module cache key are taken from the operator
      # checkout, since the operator is built/deployed from source below.
      - name: Set Go
        uses: actions/setup-go@v5
        with:
          go-version-file: './codeflare-operator/go.mod'
          cache-dependency-path: "./codeflare-operator/go.sum"

      - name: Set up gotestfmt
        uses: gotesttools/gotestfmt-action@v2
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up specific Python version
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
          cache: 'pip'  # caching pip dependencies

      - name: Setup NVidia GPU environment for KinD
        uses: ./common/github-actions/nvidia-gpu-setup

      - name: Setup and start KinD cluster
        uses: ./common/github-actions/kind
        with:
          worker-nodes: 1

      # Fail early, with diagnostics, if the cluster never becomes usable.
      - name: Wait for KIND nodes to be ready
        run: |
          echo "=== Waiting for KIND nodes to be ready ==="
          echo "Initial node status:"
          kubectl get nodes -o wide

          # Wait for all nodes to be ready
          echo "Waiting for all nodes to reach Ready state..."
          kubectl wait --for=condition=Ready nodes --all --timeout=300s || {
            echo "ERROR: Nodes did not become ready!"
            echo "Node status:"
            kubectl get nodes -o wide
            echo "Node conditions:"
            kubectl describe nodes
            echo "System pods:"
            kubectl get pods -n kube-system -o wide
            exit 1
          }

          # Verify CNI is ready
          echo "Verifying CNI (kindnet) pods are ready..."
          kubectl wait --for=condition=Ready pods -n kube-system -l app=kindnet --timeout=60s

          echo "Final cluster state:"
          kubectl get nodes -o wide
          kubectl get pods -n kube-system -o wide

      - name: Install NVidia GPU operator for KinD
        uses: ./common/github-actions/nvidia-gpu-operator

      # The `deploy` id is referenced by the `if:` conditions of the
      # log-collection steps at the end of the job.
      - name: Deploy CodeFlare stack
        id: deploy
        run: |
          cd codeflare-operator
          echo Setting up CodeFlare stack
          make setup-e2e
          echo Deploying CodeFlare operator
          make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
          kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
          cd ..

      - name: Add user to KinD
        uses: ./common/github-actions/kind-add-user
        with:
          user-name: sdk-user

      # Grants sdk-user the ClusterRole below and then switches the kubectl
      # context to that user, so the tests run with these permissions.
      - name: Configure RBAC for sdk user with limited permissions
        run: |
          echo "=== Configuring RBAC for sdk-user ==="

          # Create a comprehensive ClusterRole with all needed permissions.
          # Note: apiGroups entries are API group names only ("batch"), never
          # group/version strings such as "batch/v1".
          cat <<EOF | kubectl apply -f -
          apiVersion: rbac.authorization.k8s.io/v1
          kind: ClusterRole
          metadata:
            name: sdk-user-role
          rules:
            # Core resources
            - apiGroups: [""]
              resources: [
                "pods", "pods/log", "pods/status", "pods/portforward",
                "services", "endpoints", "persistentvolumeclaims", "events",
                "configmaps", "secrets", "nodes", "namespaces",
                "serviceaccounts", "replicationcontrollers"
              ]
              verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
            # Apps resources
            - apiGroups: ["apps"]
              resources: ["deployments", "daemonsets", "replicasets", "statefulsets"]
              verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
            # Batch resources
            - apiGroups: ["batch"]
              resources: ["jobs", "cronjobs"]
              verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
            # Autoscaling resources
            - apiGroups: ["autoscaling"]
              resources: ["horizontalpodautoscalers"]
              verbs: ["get", "list", "watch"]
            # Networking resources
            - apiGroups: ["networking.k8s.io"]
              resources: ["ingresses", "networkpolicies"]
              verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
            # RBAC resources (read-only)
            - apiGroups: ["rbac.authorization.k8s.io"]
              resources: ["roles", "rolebindings", "clusterroles", "clusterrolebindings"]
              verbs: ["get", "list", "watch"]
            # CRD resources
            - apiGroups: ["apiextensions.k8s.io"]
              resources: ["customresourcedefinitions"]
              verbs: ["get", "list", "watch"]
            # Ray resources
            - apiGroups: ["ray.io"]
              resources: [
                "rayclusters", "rayjobs", "rayservices",
                "rayclusters/status", "rayjobs/status", "rayservices/status"
              ]
              verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
            # AppWrapper resources (MCAD)
            - apiGroups: ["workload.codeflare.dev"]
              resources: ["appwrappers", "appwrappers/status"]
              verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
            # Kueue resources
            - apiGroups: ["kueue.x-k8s.io"]
              resources: ["clusterqueues", "localqueues", "resourceflavors", "workloads", "workloads/status"]
              verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
            # Metrics
            - apiGroups: ["metrics.k8s.io"]
              resources: ["pods", "nodes"]
              verbs: ["get", "list"]
          EOF

          # Create ClusterRoleBinding
          cat <<EOF | kubectl apply -f -
          apiVersion: rbac.authorization.k8s.io/v1
          kind: ClusterRoleBinding
          metadata:
            name: sdk-user-role-binding
          roleRef:
            apiGroup: rbac.authorization.k8s.io
            kind: ClusterRole
            name: sdk-user-role
          subjects:
            - kind: User
              name: sdk-user
              apiGroup: rbac.authorization.k8s.io
          EOF

          echo "RBAC configuration complete. Switching context to sdk-user..."
          kubectl config use-context sdk-user

          # Verify permissions (informational only; never fails the step)
          echo "Verifying sdk-user permissions..."
          kubectl auth can-i --list || true

      # Exports CODEFLARE_TEST_OUTPUT_DIR to later steps via GITHUB_ENV and
      # captures the full pytest output in a log file that is printed and
      # uploaded afterwards.
      # NOTE(review): TEMP_DIR is not defined in this workflow; presumably it
      # is exported by an earlier action (e.g. the KinD setup) - confirm.
      - name: Run e2e tests
        run: |
          set -euo pipefail

          export CODEFLARE_TEST_OUTPUT_DIR="${{ env.TEMP_DIR }}"
          echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> "$GITHUB_ENV"

          pip install poetry
          poetry install --with test,docs
          echo "Running e2e tests..."
          poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > "${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log" 2>&1
        env:
          GRPC_DNS_RESOLVER: "native"

      # Every step from here on uses `always()` so that logs are still
      # collected when the tests fail, but only if the CodeFlare stack was
      # actually deployed.

      # The RBAC step left kubectl on the sdk-user context; switch back to the
      # kind-cluster context before collecting logs.
      - name: Switch to kind-cluster context to print logs
        if: always() && steps.deploy.outcome == 'success'
        run: kubectl config use-context kind-cluster

      - name: Print Pytest output log
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing Pytest output logs"
          cat "${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log"

      - name: Print CodeFlare operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing CodeFlare operator logs"
          kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee "${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log"

      - name: Print KubeRay operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing KubeRay operator logs"
          kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee "${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log"

      # The directory is passed through the `env` context (as in the upload
      # step below) so the action receives the resolved path rather than
      # unexpanded shell text.
      - name: Export all KinD pod logs
        uses: ./common/github-actions/kind-export-logs
        if: always() && steps.deploy.outcome == 'success'
        with:
          output-directory: ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}

      - name: Upload logs
        uses: actions/upload-artifact@v4
        if: always() && steps.deploy.outcome == 'success'
        with:
          name: logs
          retention-days: 10
          path: |
            ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log