RHOAIENG-32532: Update kueue integration #1905
Workflow file for this run

# e2e tests workflow for CodeFlare-SDK
name: e2e
on:
pull_request:
branches:
- main
- 'release-*'
- ray-jobs-feature
paths-ignore:
- 'docs/**'
- '**.adoc'
- '**.md'
- 'LICENSE'
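# Only one run per branch: a newer run for the same head ref cancels any run still in progress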
concurrency:
group: ${{ github.head_ref }}-${{ github.workflow }}
cancel-in-progress: true
env:
CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
# Put the KinD nodes on Docker's default bridge network to avoid network issues
KIND_EXPERIMENTAL_DOCKER_NETWORK: "bridge"
jobs:
kubernetes:
runs-on: gpu-t4-4-core
strategy:
fail-fast: false
matrix:
# Run with and without GPU setup to help isolate GPU-related failures
gpu-setup: [true, false]
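# Each matrix value runs as a separate job; with fail-fast disabled, a failure in one leg does not cancel the other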
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
submodules: recursive
- name: Checkout common repo code
uses: actions/checkout@v4
with:
repository: 'project-codeflare/codeflare-common'
ref: 'main'
path: 'common'
- name: Checkout CodeFlare operator repository
uses: actions/checkout@v4
with:
repository: project-codeflare/codeflare-operator
path: codeflare-operator
- name: Set Go
uses: actions/setup-go@v5
with:
go-version-file: './codeflare-operator/go.mod'
cache-dependency-path: "./codeflare-operator/go.sum"
- name: Set up gotestfmt
uses: gotesttools/gotestfmt-action@v2
with:
token: ${{ secrets.GITHUB_TOKEN }}
- name: Set up specific Python version
uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: 'pip' # caching pip dependencies
- name: Diagnose Docker environment on GPU runner
run: |
echo "=== Docker Environment Diagnostics ==="
echo "Docker version:"
docker version || true
echo ""
echo "Docker info:"
docker info || true
echo ""
echo "System info:"
uname -a
echo ""
echo "Network interfaces:"
ip addr show || true
echo ""
echo "Checking cgroup version:"
stat -fc %T /sys/fs/cgroup/ || true
echo ""
echo "Checking if running in container:"
if [ -f /.dockerenv ]; then echo "Running inside Docker"; else echo "Not in Docker"; fi
echo ""
echo "Available disk space:"
df -h
echo ""
echo "Memory info:"
free -h
echo ""
echo "DNS Configuration:"
cat /etc/resolv.conf || true
echo ""
echo "Testing DNS resolution:"
nslookup google.com || true
echo "=== End Diagnostics ==="
- name: Setup NVidia GPU environment for KinD
if: matrix.gpu-setup == true
uses: ./common/github-actions/nvidia-gpu-setup
- name: Create KIND config with explicit networking
run: |
cat > /tmp/kind-config.yaml <<EOF
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
networking:
# Explicitly set pod subnet to avoid conflicts
podSubnet: "10.244.0.0/16"
serviceSubnet: "10.96.0.0/16"
# Keep the default CNI (kindnet) enabled so pod networking is installed automatically
disableDefaultCNI: false
# Use the iptables kube-proxy mode for better compatibility
kubeProxyMode: "iptables"
nodes:
- role: control-plane
# Extra mounts that might be needed for GPU
extraMounts:
- containerPath: /dev/shm
hostPath: /dev/shm
propagation: HostToContainer
- role: worker
extraMounts:
- containerPath: /dev/shm
hostPath: /dev/shm
propagation: HostToContainer
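# The containerd patch below switches runc to the systemd cgroup driver so it matches the kubelet cgroupDriver setting further down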
containerdConfigPatches:
- |-
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
SystemdCgroup = true
kubeadmConfigPatches:
- |
kind: ClusterConfiguration
apiServer:
extraArgs:
"enable-admission-plugins": "NodeRestriction,ResourceQuota"
- |
kind: KubeletConfiguration
serverTLSBootstrap: true
cgroupDriver: systemd
containerRuntimeEndpoint: unix:///run/containerd/containerd.sock
- |
kind: InitConfiguration
nodeRegistration:
kubeletExtraArgs:
pod-infra-container-image: registry.k8s.io/pause:3.9
- |
kind: JoinConfiguration
discovery:
bootstrapToken:
apiServerEndpoint: "{{ .ControlPlaneEndpoint }}"
token: "{{ .Token }}"
unsafeSkipCAVerification: true
nodeRegistration:
kubeletExtraArgs:
pod-infra-container-image: registry.k8s.io/pause:3.9
EOF
echo "KIND configuration:"
cat /tmp/kind-config.yaml
- name: Setup and start KinD cluster
uses: ./common/github-actions/kind
with:
worker-nodes: 1
kind-config: /tmp/kind-config.yaml
continue-on-error: true
id: kind-setup
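# continue-on-error together with the step id lets the fallback step below check steps.kind-setup.outcome instead of failing the whole job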
- name: Fallback KIND setup if custom config fails
if: steps.kind-setup.outcome == 'failure'
run: |
echo "Custom KIND config failed, trying with default settings..."
# Clean up any failed attempts
kind delete cluster --name kind || true
docker rm -f $(docker ps -aq --filter name=kind-) || true
# Create cluster with simpler config
cat > /tmp/kind-simple.yaml <<EOF
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
- role: worker
EOF
kind create cluster --config /tmp/kind-simple.yaml --wait 5m || {
echo "ERROR: KIND cluster creation failed"
docker ps -a
exit 1
}
- name: Fix KIND DNS and wait for cluster initialization
run: |
echo "=== KIND Cluster Setup Diagnostics ==="
# Check KIND containers
echo "KIND containers:"
docker ps -a --filter name=kind-
# Get control plane container name
CONTROL_PLANE=$(docker ps --filter name=kind-control-plane --format "{{.Names}}" | head -1)
if [ -z "$CONTROL_PLANE" ]; then
CONTROL_PLANE="kind-control-plane"
fi
echo "Control plane container: $CONTROL_PLANE"
# Get control plane IP
CONTROL_PLANE_IP=$(docker inspect -f '{{range.NetworkSettings.Networks}}{{.IPAddress}}{{end}}' $CONTROL_PLANE 2>/dev/null || echo "")
echo "Control plane IP: $CONTROL_PLANE_IP"
# Check Docker network
echo "Docker networks:"
docker network ls
KIND_NETWORK=$(docker network ls | grep kind | awk '{print $2}' | head -1)
if [ -n "$KIND_NETWORK" ]; then
echo "KIND network: $KIND_NETWORK"
echo "Containers on KIND network:"
docker network inspect $KIND_NETWORK | jq -r '.Containers | to_entries | .[] | "\(.value.Name): \(.value.IPv4Address)"' || true
fi
# Ensure all KIND containers are on the same network
for container in $(docker ps -a --filter name=kind- --format "{{.Names}}"); do
echo "Checking network for $container"
docker inspect $container | jq -r '.[0].NetworkSettings.Networks | keys[]' || true
done
echo "=== Waiting for cluster initialization ==="
# Wait for API server
for i in {1..60}; do
if kubectl cluster-info &>/dev/null; then
echo "✓ Cluster API is responsive"
break
fi
echo "Waiting for cluster API... ($i/60)"
# Try to diagnose connection issues
if [ $i -eq 30 ]; then
echo "Debugging cluster connection..."
kubectl cluster-info dump --output-directory=/tmp/cluster-dump || true
echo "Kubeconfig:"
kubectl config view || true
fi
sleep 5
done
# Check initial node status
echo "Initial node status:"
kubectl get nodes -o wide || true
# Wait for CNI to initialize on all nodes
echo "Waiting for CNI plugin to initialize..."
for i in {1..120}; do
# Check if nodes exist
node_count=$(kubectl get nodes --no-headers 2>/dev/null | wc -l || echo "0")
if [ "$node_count" -eq "0" ]; then
echo "No nodes found yet... ($i/120)"
sleep 5
continue
fi
# Check if CNI is initialized (nodes won't have NetworkUnavailable condition)
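# jq prints each node's NetworkUnavailable status; any value other than "False" surviving the grep means the CNI is still coming up, while empty output (condition absent or all "False") counts as ready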
if kubectl get nodes -o json | jq -r '.items[].status.conditions[] | select(.type=="NetworkUnavailable") | .status' | grep -v "False" > /dev/null 2>&1; then
echo "CNI still initializing... ($i/120)"
if [ $((i % 10)) -eq 0 ]; then
echo "Current node conditions:"
kubectl describe nodes | grep -A10 "Conditions:" || true
fi
else
echo "✓ CNI initialized on all nodes"
break
fi
if [ $i -eq 120 ]; then
echo "ERROR: CNI failed to initialize"
echo "Node details:"
kubectl describe nodes
echo "KIND logs:"
docker ps -a | grep kind
docker logs kind-control-plane 2>&1 | tail -100 || true
exit 1
fi
sleep 5
done
# Wait for nodes to be fully ready
echo "Waiting for all nodes to be ready..."
kubectl wait --for=condition=Ready nodes --all --timeout=300s || {
echo "ERROR: Nodes failed to become ready"
kubectl describe nodes
kubectl get pods -A -o wide
exit 1
}
echo "✓ All nodes are ready:"
kubectl get nodes -o wide
# Verify CNI with a test pod
echo "Verifying CNI functionality..."
kubectl run test-cni --image=busybox:latest --rm -it --restart=Never --command -- sh -c "echo 'CNI test successful'" || {
echo "WARNING: CNI test pod failed, checking kindnet pods..."
kubectl get pods -n kube-system -l app=kindnet -o wide
kubectl logs -n kube-system -l app=kindnet --tail=50 || true
}
- name: Install NVidia GPU operator for KinD
if: matrix.gpu-setup == true
uses: ./common/github-actions/nvidia-gpu-operator
- name: Deploy CodeFlare stack
id: deploy
run: |
cd codeflare-operator
echo Setting up CodeFlare stack
make setup-e2e
echo Deploying CodeFlare operator
make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
cd ..
- name: Add user to KinD
uses: ./common/github-actions/kind-add-user
with:
user-name: sdk-user
- name: Configure RBAC for sdk user with limited permissions
run: |
kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
kubectl create clusterrolebinding sdk-user-pod-creator --clusterrole=pod-creator --user=sdk-user
kubectl create clusterrole service-reader --verb=get,list,watch --resource=services
kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
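# Switch the active kubeconfig context to sdk-user so the e2e tests below run with only the permissions granted above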
kubectl config use-context sdk-user
- name: Run e2e tests
run: |
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
set -euo pipefail
pip install poetry
poetry install --with test,docs
echo "Running e2e tests..."
if [ "${{ matrix.gpu-setup }}" == "true" ]; then
echo "Running GPU tests..."
poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
else
echo "Running non-GPU tests (GPU setup disabled for debugging)..."
poetry run pytest -v -s ./tests/e2e -m 'kind and not nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1 || echo "No non-GPU tests found"
fi
env:
GRPC_DNS_RESOLVER: "native"
- name: Switch to kind-cluster context to print logs
if: always() && steps.deploy.outcome == 'success'
run: kubectl config use-context kind-cluster
- name: Print Pytest output log
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing Pytest output logs"
cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log
- name: Print CodeFlare operator logs
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing CodeFlare operator logs"
kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log
- name: Print KubeRay operator logs
if: always() && steps.deploy.outcome == 'success'
run: |
echo "Printing KubeRay operator logs"
kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
- name: Export all KinD pod logs
uses: ./common/github-actions/kind-export-logs
if: always() && steps.deploy.outcome == 'success'
with:
output-directory: ${CODEFLARE_TEST_OUTPUT_DIR}
- name: Upload logs
uses: actions/upload-artifact@v4
if: always() && steps.deploy.outcome == 'success'
with:
name: logs
retention-days: 10
path: |
${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log