# RHOAIENG-32532: Update kueue integration #1906
# Workflow file for this run
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# e2e tests workflow for CodeFlare-SDK
name: e2e

# Runs for pull requests that target the listed branches. A pull request whose
# changed files all match the `paths-ignore` patterns (docs, AsciiDoc/Markdown
# files, LICENSE) does not trigger the workflow.
on:
  pull_request:
    branches:
      - main
      - 'release-*'
      - ray-jobs-feature
    paths-ignore:
      - 'docs/**'
      - '**.adoc'
      - '**.md'
      - 'LICENSE'
# One active run per (PR head branch, workflow) pair: a newer run in the same
# group cancels the run that is still in progress.
concurrency:
  group: ${{ github.head_ref }}-${{ github.workflow }}
  cancel-in-progress: true
# Workflow-wide environment variables, visible to every step of every job.
env:
  # Operator image passed as IMG to `make deploy` in the "Deploy CodeFlare stack" step.
  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
  # Have KinD attach its nodes to Docker's default "bridge" network to avoid
  # network (e.g. MTU) issues. Note: this variable selects the Docker network;
  # it does not set an MTU value itself.
  KIND_EXPERIMENTAL_DOCKER_NETWORK: "bridge"
jobs:
  # Single job: checks out the SDK plus its helper repositories, brings up a
  # GPU-enabled KinD cluster, deploys KubeRay and the CodeFlare operator, runs
  # the SDK e2e tests as a restricted user, then collects and uploads logs.
  kubernetes:
    runs-on: gpu-t4-4-core
    steps:
      # --- Source checkouts -------------------------------------------------
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive

      # Checked out into ./common; it provides the local actions referenced
      # below as ./common/github-actions/*.
      - name: Checkout common repo code
        uses: actions/checkout@v4
        with:
          repository: 'project-codeflare/codeflare-common'
          ref: 'main'
          path: 'common'

      # Checked out into ./codeflare-operator; its go.mod/go.sum drive the Go
      # setup and its Makefile deploys the operator later in the job.
      - name: Checkout CodeFlare operator repository
        uses: actions/checkout@v4
        with:
          repository: project-codeflare/codeflare-operator
          path: codeflare-operator

      # --- Toolchain --------------------------------------------------------
      - name: Set Go
        uses: actions/setup-go@v5
        with:
          go-version-file: './codeflare-operator/go.mod'
          cache-dependency-path: "./codeflare-operator/go.sum"

      - name: Set up gotestfmt
        uses: gotesttools/gotestfmt-action@v2
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up specific Python version
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'pip' # caching pip dependencies

      # Installs kind (pinned to v0.22.0), kubectl (latest stable, only when
      # missing) and podman (best effort, only when missing).
      - name: Install required tools
        run: |
          echo "Installing KIND..."
          # -f: fail on HTTP errors instead of saving an error page as the binary.
          curl -fLo ./kind https://kind.sigs.k8s.io/dl/v0.22.0/kind-linux-amd64
          chmod +x ./kind
          sudo mv ./kind /usr/local/bin/kind
          kind version
          echo "Installing kubectl if not present..."
          if ! command -v kubectl &> /dev/null; then
            curl -fLO "https://dl.k8s.io/release/$(curl -fL -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
            chmod +x kubectl
            sudo mv kubectl /usr/local/bin/
          fi
          kubectl version --client
          echo "Checking for podman..."
          if ! command -v podman &> /dev/null; then
            echo "Podman not found, installing..."
            sudo apt-get update
            # Deliberately best effort: a failed podman install does not fail the step.
            sudo apt-get install -y podman || echo "Podman installation failed, might not be needed"
          fi

      # --- Cluster bring-up -------------------------------------------------
      - name: Setup NVidia GPU environment for KinD
        uses: ./common/github-actions/nvidia-gpu-setup

      - name: Setup and start KinD cluster
        uses: ./common/github-actions/kind
        with:
          worker-nodes: 1

      # Prints cluster/context/node information and blocks (up to 300s) until
      # every node reports Ready.
      - name: Verify Kind cluster
        run: |
          echo "Checking Kind clusters..."
          kind get clusters
          echo "Current kubectl context:"
          kubectl config current-context
          echo "Checking nodes:"
          kubectl get nodes
          echo "Waiting for nodes to be ready..."
          kubectl wait --for=condition=Ready nodes --all --timeout=300s

      - name: Install NVidia GPU operator for KinD
        uses: ./common/github-actions/nvidia-gpu-operator

      # Applies the KubeRay default kustomization at a pinned version and waits
      # for the operator deployment; on timeout it lists kuberay pods and fails.
      - name: Deploy KubeRay operator
        run: |
          KUBERAY_VERSION="v1.2.2"
          echo "Deploying KubeRay ${KUBERAY_VERSION}"
          kubectl apply --server-side -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=${KUBERAY_VERSION}&timeout=180s"
          echo "Waiting for KubeRay operator to become ready..."
          kubectl wait --for=condition=Available --timeout=300s deployment/kuberay-operator -n ray-system || {
            echo "KubeRay operator not found, checking pods..."
            kubectl get pods -A | grep kuberay
            exit 1
          }

      # The `deploy` id is referenced by the `if:` conditions of the log
      # collection steps at the end of the job.
      - name: Deploy CodeFlare stack
        id: deploy
        run: |
          cd codeflare-operator
          echo Setting up CodeFlare stack
          make setup-e2e
          echo Deploying CodeFlare operator
          make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
          kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
          cd ..

      # --- Restricted test user ---------------------------------------------
      - name: Add user to KinD
        uses: ./common/github-actions/kind-add-user
        with:
          user-name: sdk-user

      # Creates one ClusterRole + ClusterRoleBinding pair per resource type for
      # sdk-user, then switches the kubectl context to sdk-user so the steps
      # that follow run with only these permissions.
      # NOTE: despite its name, `pod-creator` grants read-only verbs (get/list/watch).
      - name: Configure RBAC for sdk user with limited permissions
        run: |
          kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
          kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
          kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
          kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
          kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
          kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
          kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
          kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
          kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
          kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
          kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
          kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
          kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
          kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
          kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
          kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
          kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
          kubectl create clusterrolebinding sdk-user-pod-creator --clusterrole=pod-creator --user=sdk-user
          kubectl create clusterrole service-reader --verb=get,list,watch --resource=services
          kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
          kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
          kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
          kubectl config use-context sdk-user

      # --- Tests ------------------------------------------------------------
      # Creates the log directory under the runner's temp dir and exports its
      # path to later steps through GITHUB_ENV.
      - name: Setup test output directory
        run: |
          CODEFLARE_TEST_OUTPUT_DIR="${RUNNER_TEMP}/test-logs"
          mkdir -p "${CODEFLARE_TEST_OUTPUT_DIR}"
          echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> "$GITHUB_ENV"

      # pytest output (stdout and stderr) is written only to pytest_output.log;
      # the "Print Pytest output log" step echoes it into the job log afterwards.
      - name: Run e2e tests
        run: |
          set -euo pipefail
          pip install poetry
          poetry install --with test,docs
          echo "Running e2e tests..."
          poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > "${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log" 2>&1
        env:
          GRPC_DNS_RESOLVER: "native"

      # --- Log collection ---------------------------------------------------
      # Every step below runs even when an earlier step failed (`always()`),
      # provided the CodeFlare stack was deployed successfully.

      # Leaves the restricted sdk-user context so the log commands below are
      # not limited by the RBAC configured above.
      - name: Switch to kind-cluster context to print logs
        if: always() && steps.deploy.outcome == 'success'
        run: kubectl config use-context kind-cluster

      - name: Print Pytest output log
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing Pytest output logs"
          # The log only exists if the test step actually started pytest.
          if [ -f "${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log" ]; then
            cat "${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log"
          else
            echo "No pytest output log found (the e2e test step may not have run)"
          fi

      - name: Print CodeFlare operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing CodeFlare operator logs"
          kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee "${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log"

      - name: Print KubeRay operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing KubeRay operator logs"
          kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee "${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log"

      - name: Export all KinD pod logs
        uses: ./common/github-actions/kind-export-logs
        if: always() && steps.deploy.outcome == 'success'
        with:
          output-directory: ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}

      # Uploads every *.log file found below the test output directory; an
      # empty match only produces a warning.
      - name: Upload logs
        uses: actions/upload-artifact@v4
        if: always() && steps.deploy.outcome == 'success'
        with:
          name: logs
          retention-days: 10
          path: |
            ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
          if-no-files-found: warn