feat(RHOAIENG-26487): Cluster lifecycling via RayJob #1768

Workflow file for this run

# e2e tests workflow for CodeFlare-SDK
name: e2e
on:
pull_request:
branches:
- main
- 'release-*'
- ray-jobs-feature
paths-ignore:
- 'docs/**'
- '**.adoc'
- '**.md'
- 'LICENSE'
concurrency:
group: ${{ github.head_ref }}-${{ github.workflow }}
cancel-in-progress: true
env:
CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
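# Dev operator image consumed later by the "Deploy CodeFlare stack" step (make deploy -e IMG=...)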
jobs:
kubernetes:
runs-on: gpu-t4-4-core
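# Presumably a custom runner label that provides an NVIDIA T4 GPU for the GPU-marked tests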
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
submodules: recursive
- name: Checkout common repo code
uses: actions/checkout@v4
with:
repository: 'project-codeflare/codeflare-common'
ref: 'main'
path: 'common'
- name: Checkout CodeFlare operator repository
uses: actions/checkout@v4
with:
repository: project-codeflare/codeflare-operator
path: codeflare-operator
- name: Set Go
uses: actions/setup-go@v5
with:
go-version-file: './codeflare-operator/go.mod'
cache-dependency-path: "./codeflare-operator/go.sum"
- name: Set up gotestfmt
uses: gotesttools/gotestfmt-action@v2
with:
token: ${{ secrets.GITHUB_TOKEN }}
- name: Set up specific Python version
uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: 'pip' # caching pip dependencies
- name: Setup NVidia GPU environment for KinD
uses: ./common/github-actions/nvidia-gpu-setup
- name: Init directories
run: |
TEMP_DIR="$(pwd)/tmp"
mkdir -p "${TEMP_DIR}"
echo "TEMP_DIR=${TEMP_DIR}" >> $GITHUB_ENV
mkdir -p "$(pwd)/bin"
echo "$(pwd)/bin" >> $GITHUB_PATH
- name: Container image registry
run: |
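# Start a throwaway local image registry and point the KinD nodes at it via the containerd mirror configured below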
podman run -d -p 5000:5000 --name registry registry:2.8.1
export NODE_IMAGE="kindest/node:v1.26.0@sha256:691e24bd2417609db7e589e1a479b902d2e209892a10ce375fab60a8407c7352"
export REGISTRY_ADDRESS=$(hostname -i):5000
echo "REGISTRY_ADDRESS=${REGISTRY_ADDRESS}" >> $GITHUB_ENV
echo "Container image registry started at ${REGISTRY_ADDRESS}"
KIND_CONFIG_FILE=${TEMP_DIR}/kind.yaml
# Create KIND config with 1 worker node
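# Mounting /dev/null at /var/run/nvidia-container-devices/all on the worker is the usual KinD trick for having the NVIDIA container runtime expose all host GPUs to that node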
cat > ${KIND_CONFIG_FILE} <<EOF
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
containerdConfigPatches:
- |-
[plugins."io.containerd.grpc.v1.cri".registry.mirrors."${REGISTRY_ADDRESS}"]
endpoint = ["http://${REGISTRY_ADDRESS}"]
nodes:
- role: control-plane
image: ${NODE_IMAGE}
kubeadmConfigPatches:
- |
kind: InitConfiguration
nodeRegistration:
kubeletExtraArgs:
node-labels: "ingress-ready=true"
extraPortMappings:
- containerPort: 80
hostPort: 80
protocol: TCP
- containerPort: 443
hostPort: 443
protocol: TCP
- role: worker
image: ${NODE_IMAGE}
labels:
worker-1: true
extraMounts:
- hostPath: /dev/null
containerPath: /var/run/nvidia-container-devices/all
EOF

echo "KIND_CONFIG_FILE=${KIND_CONFIG_FILE}" >> $GITHUB_ENV
sudo --preserve-env=REGISTRY_ADDRESS sh -c 'cat > /etc/containers/registries.conf.d/local.conf <<EOF
[[registry]]
prefix = "$REGISTRY_ADDRESS"
insecure = true
location = "$REGISTRY_ADDRESS"
EOF'
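# Mark the local registry as insecure for podman, since registry:2 is served over plain HTTP here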
- name: Setup KinD cluster
uses: helm/[email protected]
with:
cluster_name: cluster
version: v0.17.0
config: ${{ env.KIND_CONFIG_FILE }}
- name: Print cluster info
run: |
echo "KinD cluster:"
kubectl cluster-info
kubectl describe nodes
- name: Install Ingress controller
run: |
VERSION=controller-v1.9.6
echo "Deploying Ingress controller into KinD cluster"
curl https://raw.githubusercontent.com/kubernetes/ingress-nginx/"${VERSION}"/deploy/static/provider/kind/deploy.yaml | sed "s/--publish-status-address=localhost/--report-node-internal-ip-address\\n - --status-update-interval=10/g" | kubectl apply -f -
kubectl annotate ingressclass nginx "ingressclass.kubernetes.io/is-default-class=true"
# Turn on SSL Passthrough
kubectl patch deploy --type json --patch '[{"op":"add","path": "/spec/template/spec/containers/0/args/-","value":"--enable-ssl-passthrough"}]' ingress-nginx-controller -n ingress-nginx
kubectl -n ingress-nginx wait --timeout=300s --for=condition=Available deployments --all
- name: Setup Dnsmasq to resolve hostnames with domain name kind
run: |
# Based on https://sixfeetup.com/blog/local-development-with-wildcard-dns-on-linux
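# Net effect: systemd-resolved forwards the ~kind domain to dnsmasq on 127.0.0.2, which answers every *.kind name with 127.0.0.1; e.g. `dig +short foo.kind @127.0.0.2` should return 127.0.0.1 once this step completes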
sudo apt-get install -y dnsmasq
sudo sed -i -E "s/#DNS=/DNS=127.0.0.2/" /etc/systemd/resolved.conf
sudo sed -i -E "s/#Domains=/Domains=~kind/" /etc/systemd/resolved.conf
sudo systemctl restart systemd-resolved
sudo sed -i -E "s/#IGNORE_RESOLVCONF=yes/IGNORE_RESOLVCONF=yes/" /etc/default/dnsmasq
sudo sed -i -E "s/#listen-address=/listen-address=127.0.0.2/" /etc/dnsmasq.conf
sudo sed -i -E "s/#bind-interfaces/bind-interfaces/" /etc/dnsmasq.conf
sudo sed -i -E "s|#(address=).*|\1/kind/127.0.0.1|" /etc/dnsmasq.conf
sudo systemctl restart dnsmasq
systemctl status dnsmasq
- name: Set env variables for tests to properly leverage KinD cluster
run: |
echo "CLUSTER_TYPE=KIND" >> $GITHUB_ENV
echo "CLUSTER_HOSTNAME=kind" >> $GITHUB_ENV
- name: Install NVidia GPU operator for KinD
uses: ./common/github-actions/nvidia-gpu-operator
- name: Deploy CodeFlare stack
id: deploy
run: |
cd codeflare-operator
echo Setting up CodeFlare stack
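# make setup-e2e is expected to install the operator's e2e dependencies (KubeRay, Kueue, etc.); later steps assume the KubeRay operator runs in ray-system and that Kueue CRDs such as ClusterQueue/LocalQueue exist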
make setup-e2e
echo Deploying CodeFlare operator
make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
cd ..
- name: Add user to KinD
uses: ./common/github-actions/kind-add-user
with:
user-name: sdk-user
- name: Configure RBAC for sdk user with limited permissions
run: |
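# Give sdk-user only the verbs the SDK tests need, then switch context so everything below runs as that limited user; from the admin context, a check such as `kubectl auth can-i create rayclusters --as=sdk-user` should report yes once the bindings exist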
kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
kubectl create clusterrolebinding sdk-user-pod-creator --clusterrole=pod-creator --user=sdk-user
kubectl create clusterrole service-reader --verb=get,list,watch --resource=services
kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
kubectl config use-context sdk-user
- name: Run e2e tests
run: |
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
set -euo pipefail
pip install poetry
poetry install --with test,docs
echo "Running e2e tests..."
poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
env:
GRPC_DNS_RESOLVER: "native"
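# Native resolver presumably so gRPC clients go through the host resolver (systemd-resolved/dnsmasq) and can resolve the *.kind hostnames set up above
# From here on, log-collection steps run even when the tests fail (if: always()) but are skipped if the CodeFlare deploy step did not succeed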
- name: Switch to kind-cluster context to print logs
if: always() && steps.deploy.outcome == 'success'
run: kubectl config use-context kind-cluster
- name: Print Pytest output log
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing Pytest output logs"
cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log
- name: Print CodeFlare operator logs
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing CodeFlare operator logs"
kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log
- name: Print KubeRay operator logs
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing KubeRay operator logs"
kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
- name: Export all KinD pod logs
uses: ./common/github-actions/kind-export-logs
if: always() && steps.deploy.outcome == 'success'
with:
output-directory: ${CODEFLARE_TEST_OUTPUT_DIR}
- name: Upload logs
uses: actions/upload-artifact@v4
if: always() && steps.deploy.outcome == 'success'
with:
name: logs
retention-days: 10
path: |
${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log