feat(RHOAIENG-26487): Cluster lifecycling via RayJob #1768
Workflow file for this run
# e2e tests workflow for CodeFlare-SDK
name: e2e
on:
  pull_request:
    branches:
      - main
      - 'release-*'
      - ray-jobs-feature
    paths-ignore:
      - 'docs/**'
      - '**.adoc'
      - '**.md'
      - 'LICENSE'
concurrency:
  group: ${{ github.head_ref }}-${{ github.workflow }}
  cancel-in-progress: true
env:
  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
jobs:
  kubernetes:
    runs-on: gpu-t4-4-core
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive
      - name: Checkout common repo code
        uses: actions/checkout@v4
        with:
          repository: 'project-codeflare/codeflare-common'
          ref: 'main'
          path: 'common'
      - name: Checkout CodeFlare operator repository
        uses: actions/checkout@v4
        with:
          repository: project-codeflare/codeflare-operator
          path: codeflare-operator
      - name: Set Go
        uses: actions/setup-go@v5
        with:
          go-version-file: './codeflare-operator/go.mod'
          cache-dependency-path: "./codeflare-operator/go.sum"
      - name: Set up gotestfmt
        uses: gotesttools/gotestfmt-action@v2
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
      - name: Set up specific Python version
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'pip' # caching pip dependencies
      - name: Setup NVidia GPU environment for KinD
        uses: ./common/github-actions/nvidia-gpu-setup
      - name: Init directories
        run: |
          TEMP_DIR="$(pwd)/tmp"
          mkdir -p "${TEMP_DIR}"
          echo "TEMP_DIR=${TEMP_DIR}" >> $GITHUB_ENV
          mkdir -p "$(pwd)/bin"
          echo "$(pwd)/bin" >> $GITHUB_PATH
      - name: Container image registry
        run: |
          podman run -d -p 5000:5000 --name registry registry:2.8.1
          export NODE_IMAGE="kindest/node:v1.26.0@sha256:691e24bd2417609db7e589e1a479b902d2e209892a10ce375fab60a8407c7352"
          export REGISTRY_ADDRESS=$(hostname -i):5000
          echo "REGISTRY_ADDRESS=${REGISTRY_ADDRESS}" >> $GITHUB_ENV
          echo "Container image registry started at ${REGISTRY_ADDRESS}"
          KIND_CONFIG_FILE=${TEMP_DIR}/kind.yaml
          # Create KIND config with 1 worker node
          cat > ${KIND_CONFIG_FILE} <<EOF
          kind: Cluster
          apiVersion: kind.x-k8s.io/v1alpha4
          containerdConfigPatches:
          - |-
            [plugins."io.containerd.grpc.v1.cri".registry.mirrors."${REGISTRY_ADDRESS}"]
              endpoint = ["http://${REGISTRY_ADDRESS}"]
          nodes:
            - role: control-plane
              image: ${NODE_IMAGE}
              kubeadmConfigPatches:
                - |
                  kind: InitConfiguration
                  nodeRegistration:
                    kubeletExtraArgs:
                      node-labels: "ingress-ready=true"
              extraPortMappings:
                - containerPort: 80
                  hostPort: 80
                  protocol: TCP
                - containerPort: 443
                  hostPort: 443
                  protocol: TCP
            - role: worker
              image: ${NODE_IMAGE}
              labels:
                worker-1: true
              extraMounts:
                - hostPath: /dev/null
                  containerPath: /var/run/nvidia-container-devices/all
          EOF
          echo "KIND_CONFIG_FILE=${KIND_CONFIG_FILE}" >> $GITHUB_ENV
          sudo --preserve-env=REGISTRY_ADDRESS sh -c 'cat > /etc/containers/registries.conf.d/local.conf <<EOF
          [[registry]]
          prefix = "$REGISTRY_ADDRESS"
          insecure = true
          location = "$REGISTRY_ADDRESS"
          EOF'
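      # Create the KinD cluster from the config generated above (one control-plane node and one GPU-enabled worker)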
      - name: Setup KinD cluster
        uses: helm/[email protected]
        with:
          cluster_name: cluster
          version: v0.17.0
          config: ${{ env.KIND_CONFIG_FILE }}
      - name: Print cluster info
        run: |
          echo "KinD cluster:"
          kubectl cluster-info
          kubectl describe nodes
      - name: Install Ingress controller
        run: |
          VERSION=controller-v1.9.6
          echo "Deploying Ingress controller into KinD cluster"
          curl https://raw.githubusercontent.com/kubernetes/ingress-nginx/"${VERSION}"/deploy/static/provider/kind/deploy.yaml | sed "s/--publish-status-address=localhost/--report-node-internal-ip-address\\n - --status-update-interval=10/g" | kubectl apply -f -
          kubectl annotate ingressclass nginx "ingressclass.kubernetes.io/is-default-class=true"
          # Turn on SSL Passthrough
          kubectl patch deploy --type json --patch '[{"op":"add","path": "/spec/template/spec/containers/0/args/-","value":"--enable-ssl-passthrough"}]' ingress-nginx-controller -n ingress-nginx
          kubectl -n ingress-nginx wait --timeout=300s --for=condition=Available deployments --all
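      # Point the runner's DNS at a local dnsmasq so wildcard *.kind hostnames resolve to 127.0.0.1, where the Ingress ports are mapped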
      - name: Setup Dnsmasq to resolve hostnames with domain name kind
        run: |
          # Based on https://sixfeetup.com/blog/local-development-with-wildcard-dns-on-linux
          sudo apt-get install -y dnsmasq
          sudo sed -i -E "s/#DNS=/DNS=127.0.0.2/" /etc/systemd/resolved.conf
          sudo sed -i -E "s/#Domains=/Domains=~kind/" /etc/systemd/resolved.conf
          sudo systemctl restart systemd-resolved
          sudo sed -i -E "s/#IGNORE_RESOLVCONF=yes/IGNORE_RESOLVCONF=yes/" /etc/default/dnsmasq
          sudo sed -i -E "s/#listen-address=/listen-address=127.0.0.2/" /etc/dnsmasq.conf
          sudo sed -i -E "s/#bind-interfaces/bind-interfaces/" /etc/dnsmasq.conf
          sudo sed -i -E "s|#(address=).*|\1/kind/127.0.0.1|" /etc/dnsmasq.conf
          sudo systemctl restart dnsmasq
          systemctl status dnsmasq
      - name: Set env variables for tests to properly leverage KinD cluster
        run: |
          echo "CLUSTER_TYPE=KIND" >> $GITHUB_ENV
          echo "CLUSTER_HOSTNAME=kind" >> $GITHUB_ENV
      - name: Install NVidia GPU operator for KinD
        uses: ./common/github-actions/nvidia-gpu-operator
      - name: Deploy CodeFlare stack
        id: deploy
        run: |
          cd codeflare-operator
          echo Setting up CodeFlare stack
          make setup-e2e
          echo Deploying CodeFlare operator
          make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
          kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
          cd ..
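      # Create a non-admin "sdk-user" context and grant it only the RBAC the tests need, so the SDK is exercised without cluster-admin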
      - name: Add user to KinD
        uses: ./common/github-actions/kind-add-user
        with:
          user-name: sdk-user
      - name: Configure RBAC for sdk user with limited permissions
        run: |
          kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
          kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
          kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
          kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
          kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
          kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
          kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
          kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
          kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
          kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
          kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
          kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
          kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
          kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
          kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
          kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
          kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
          kubectl create clusterrolebinding sdk-user-pod-creator --clusterrole=pod-creator --user=sdk-user
          kubectl create clusterrole service-reader --verb=get,list,watch --resource=services
          kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
          kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
          kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
          kubectl config use-context sdk-user
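      # Install the SDK and its test dependencies with Poetry, then run the tests marked 'kind and nvidia_gpu' as the restricted sdk-user; output is captured for the log steps below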
      - name: Run e2e tests
        run: |
          export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
          echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
          set -euo pipefail
          pip install poetry
          poetry install --with test,docs
          echo "Running e2e tests..."
          poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
        env:
          GRPC_DNS_RESOLVER: "native"
      - name: Switch to kind-cluster context to print logs
        if: always() && steps.deploy.outcome == 'success'
        run: kubectl config use-context kind-cluster
      - name: Print Pytest output log
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing Pytest output logs"
          cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log
      - name: Print CodeFlare operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing CodeFlare operator logs"
          kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log
      - name: Print KubeRay operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing KubeRay operator logs"
          kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
      - name: Export all KinD pod logs
        uses: ./common/github-actions/kind-export-logs
        if: always() && steps.deploy.outcome == 'success'
        with:
          output-directory: ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}
      - name: Upload logs
        uses: actions/upload-artifact@v4
        if: always() && steps.deploy.outcome == 'success'
        with:
          name: logs
          retention-days: 10
          path: |
            ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
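For context on the feature this workflow gates (cluster lifecycling via RayJob), the sketch below shows the KubeRay mechanism the feature name refers to: a RayJob custom resource that creates its own RayCluster and, with shutdownAfterJobFinishes set, deletes it when the job completes. This is a minimal illustration against a generic cluster, not the CodeFlare SDK API or the actual e2e test code; the resource name, namespace, and Ray image are placeholder assumptions.

# Minimal sketch of RayJob-driven cluster lifecycling with KubeRay.
# Assumes a reachable cluster with the KubeRay operator installed; names and image are placeholders.
from kubernetes import client, config

config.load_kube_config()

ray_job = {
    "apiVersion": "ray.io/v1",
    "kind": "RayJob",
    "metadata": {"name": "lifecycle-demo"},
    "spec": {
        "entrypoint": "python -c \"import ray; ray.init(); print(ray.cluster_resources())\"",
        # Ask KubeRay to delete the RayCluster it created once the job finishes.
        "shutdownAfterJobFinishes": True,
        # Cluster created on demand for this job rather than targeting an existing one.
        "rayClusterSpec": {
            "headGroupSpec": {
                "rayStartParams": {},
                "template": {"spec": {"containers": [{"name": "ray-head", "image": "rayproject/ray:2.9.0"}]}},
            },
            "workerGroupSpecs": [
                {
                    "groupName": "small-group",
                    "replicas": 1,
                    "minReplicas": 1,
                    "maxReplicas": 1,
                    "rayStartParams": {},
                    "template": {"spec": {"containers": [{"name": "ray-worker", "image": "rayproject/ray:2.9.0"}]}},
                }
            ],
        },
    },
}

# Create the RayJob; KubeRay provisions the cluster, runs the entrypoint, then tears the cluster down.
client.CustomObjectsApi().create_namespaced_custom_object(
    group="ray.io", version="v1", namespace="default", plural="rayjobs", body=ray_job
)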