diff --git a/docs/usage/advanced/cuda.md b/docs/usage/advanced/cuda.md index be2d0ba48f..1ca34fd7a5 100644 --- a/docs/usage/advanced/cuda.md +++ b/docs/usage/advanced/cuda.md @@ -5,6 +5,21 @@ CUDA workloads require the NVIDIA Container Runtime, so containerd needs to be c The K3s container itself also needs to run with this runtime. If you are using Docker you can install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). +## Preparing a server to run CUDA workloads + +To create a server that has all the required drivers installed, you can use the example script prepared for Ubuntu 24.04: + +[install_gpu_drivers_server.sh](cuda/install_gpu_drivers_server.sh): + +```install_gpu_drivers_server.sh +{% + include-markdown "./cuda/install_gpu_drivers_server.sh" + comments=false +%} +``` + +This will install all required drivers on an Ubuntu 24.04 server. + ## Building a customized K3s image To get the NVIDIA container runtime in the K3s image you need to build your own K3s image. @@ -25,7 +40,7 @@ To get around this we need to build the image with a supported base image. This Dockerfile is based on the [K3s Dockerfile](https://github.com/rancher/k3s/blob/master/package/Dockerfile) The following changes are applied: -1. Change the base images to nvidia/cuda:12.4.1-base-ubuntu22.04 so the NVIDIA Container Toolkit can be installed. The version of `cuda:xx.x.x` must match the one you're planning to use. +1. Change the base images to nvidia/cuda:12.8.1-base-ubuntu24.04 so the NVIDIA Container Toolkit can be installed. The version of `cuda:xx.x.x` must match the one you're planning to use. 2. Add a manifest for the NVIDIA driver plugin for Kubernetes with an added RuntimeClass definition. See [k3s documentation](https://docs.k3s.io/advanced#nvidia-container-runtime-support). 
### The NVIDIA device plugin @@ -126,3 +141,4 @@ Most of the information in this article was obtained from various sources: * [@vainkop](https://github.com/vainkop) * [@iwilltry42](https://github.com/iwilltry42) * [@dbreyfogle](https://github.com/dbreyfogle) +* [@omerfsen](https://github.com/omerfsen) diff --git a/docs/usage/advanced/cuda/Dockerfile b/docs/usage/advanced/cuda/Dockerfile index 728be226c1..69c6c2afc4 100644 --- a/docs/usage/advanced/cuda/Dockerfile +++ b/docs/usage/advanced/cuda/Dockerfile @@ -1,30 +1,60 @@ -ARG K3S_TAG="v1.28.8-k3s1" -ARG CUDA_TAG="12.4.1-base-ubuntu22.04" +ARG K3S_TAG="v1.31.7-k3s1" +ARG CUDA_TAG="12.8.1-base-ubuntu24.04" +ARG NVIDIA_DRIVER_VERS="570" -FROM rancher/k3s:$K3S_TAG as k3s -FROM nvcr.io/nvidia/cuda:$CUDA_TAG +# Stage 1: Pull k3s base image +FROM rancher/k3s:${K3S_TAG} AS k3s +# Nothing else needed here except the base -# Install the NVIDIA container toolkit -RUN apt-get update && apt-get install -y curl \ - && curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ +# Stage 2: CUDA + NVIDIA Toolkit layer +FROM nvcr.io/nvidia/cuda:${CUDA_TAG} + +# Re-declare all ARGs you want to use in this stage +ARG NVIDIA_DRIVER_VERS + +# Optional: useful for runtime debugging +ENV NVIDIA_DRIVER_VERS=${NVIDIA_DRIVER_VERS} +ENV PAGER=less + +# Install the NVIDIA container toolkit and the matching driver utilities +# (curl and gnupg are needed to add the NVIDIA apt repository below) +# NOTE: the base image is Ubuntu 24.04, but the NVIDIA Container Toolkit does not officially support 24.04 yet; see +# https://github.com/NVIDIA/nvidia-container-toolkit/issues/482 +RUN apt-get update && apt-get install -y \ + curl \ + gnupg \ + less \ + ca-certificates \ + software-properties-common \ + lsb-release \ + less \ + && curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \ + gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + && curl -s -L 
https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ - sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ - tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \ - && apt-get update && apt-get install -y nvidia-container-toolkit \ - && nvidia-ctk runtime configure --runtime=containerd + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \ + && apt-get update \ + && apt-get install -y \ + nvidia-container-toolkit \ + nvidia-utils-${NVIDIA_DRIVER_VERS}-server \ + && nvidia-ctk runtime configure --runtime=containerd \ + && apt-get autoremove -y && apt-get clean \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* +# Copy full K3s filesystem, split /bin for layering purposes COPY --from=k3s / / --exclude=/bin COPY --from=k3s /bin /bin -# Deploy the nvidia driver plugin on startup +# Inject NVIDIA plugin manifest COPY device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml +# Define required volumes for K3s runtime VOLUME /var/lib/kubelet VOLUME /var/lib/rancher/k3s VOLUME /var/lib/cni VOLUME /var/log +# Set path and defaults ENV PATH="$PATH:/bin/aux" - ENTRYPOINT ["/bin/k3s"] -CMD ["agent"] \ No newline at end of file +CMD ["agent"] diff --git a/docs/usage/advanced/cuda/build.sh b/docs/usage/advanced/cuda/build.sh index afbc475b75..9d69dc38a6 100755 --- a/docs/usage/advanced/cuda/build.sh +++ b/docs/usage/advanced/cuda/build.sh @@ -1,19 +1,52 @@ -#!/bin/bash +#!/usr/bin/env bash +# set -euxo pipefail -set -euxo pipefail - -K3S_TAG=${K3S_TAG:="v1.28.8-k3s1"} # replace + with -, if needed -CUDA_TAG=${CUDA_TAG:="12.4.1-base-ubuntu22.04"} -IMAGE_REGISTRY=${IMAGE_REGISTRY:="MY_REGISTRY"} -IMAGE_REPOSITORY=${IMAGE_REPOSITORY:="rancher/k3s"} -IMAGE_TAG="$K3S_TAG-cuda-$CUDA_TAG" +# Set default values 
+K3S_TAG=${K3S_TAG:="v1.31.7-k3s1"} # replace + with -, if needed +CUDA_TAG=${CUDA_TAG:="12.8.1-base-ubuntu24.04"} +#IMAGE_REGISTRY=${IMAGE_REGISTRY:="techmakers.azurecr.io"} +IMAGE_REGISTRY=${IMAGE_REGISTRY:="docker.io"} +IMAGE_REPOSITORY=${IMAGE_REPOSITORY:="k3s"} +IMAGE_TAG="${K3S_TAG//+/-}-cuda-$CUDA_TAG" IMAGE=${IMAGE:="$IMAGE_REGISTRY/$IMAGE_REPOSITORY:$IMAGE_TAG"} echo "IMAGE=$IMAGE" -docker build \ +# Check if Docker is installed +if ! command -v docker &> /dev/null; then + echo "Docker is not installed. Please install Docker first." >&2 + exit 1 +fi + +# Check if Docker service is running +if ! systemctl is-active --quiet docker; then + echo "Docker service is not running. Attempting to start it..." >&2 + sudo systemctl start docker +fi + +# Check if user is in docker group +if ! groups | grep -q '\bdocker\b'; then + echo "WARNING: You are not in the 'docker' group. You may need to use sudo for docker commands." +fi + +# Check if az CLI is installed +if ! command -v az &> /dev/null; then + echo "Azure CLI (az) is not installed. Please install it first." >&2 + exit 1 +fi + +# Login to Azure container registry +# echo "Logging into Azure..." +# az acr login --name "$(echo $IMAGE_REGISTRY | cut -d. -f1)" + +# --- Build and Push --- +echo "Building image..." +docker build --debug \ --build-arg K3S_TAG=$K3S_TAG \ --build-arg CUDA_TAG=$CUDA_TAG \ - -t $IMAGE . -docker push $IMAGE -echo "Done!" \ No newline at end of file + -t "$IMAGE" . + +# echo "Pushing image..." +# docker push "$IMAGE" + +echo "Done!" 
diff --git a/docs/usage/advanced/cuda/device-plugin-daemonset.yaml b/docs/usage/advanced/cuda/device-plugin-daemonset.yaml index a52bb06d27..fadd15a0c2 100644 --- a/docs/usage/advanced/cuda/device-plugin-daemonset.yaml +++ b/docs/usage/advanced/cuda/device-plugin-daemonset.yaml @@ -1,3 +1,4 @@ +--- apiVersion: node.k8s.io/v1 kind: RuntimeClass metadata: @@ -31,7 +32,7 @@ spec: # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ priorityClassName: "system-node-critical" containers: - - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2 + - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.1 name: nvidia-device-plugin-ctr env: - name: FAIL_ON_INIT_ERROR @@ -46,4 +47,4 @@ spec: volumes: - name: device-plugin hostPath: - path: /var/lib/kubelet/device-plugins \ No newline at end of file + path: /var/lib/kubelet/device-plugins diff --git a/docs/usage/advanced/cuda/install_gpu_drivers_server.sh b/docs/usage/advanced/cuda/install_gpu_drivers_server.sh new file mode 100755 index 0000000000..357fdef273 --- /dev/null +++ b/docs/usage/advanced/cuda/install_gpu_drivers_server.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +NVIDIA_DRIVER_VERS="570" +sudo apt-get remove --purge '^nvidia-.*' 'libnvidia-*' -y && \ +sudo apt-get autoremove -y && \ +sudo apt-get autoclean -y && \ +sudo apt update && \ +sudo apt-get install -y software-properties-common && \ +sudo add-apt-repository -y ppa:graphics-drivers/ppa && \ +sudo apt-get update && \ +sudo apt install nvidia-driver-${NVIDIA_DRIVER_VERS} -y && \ +sudo reboot +# Then install following to have nvidia-smi +# sudo apt install nvidia-utils-570 -y +# +# +# Add NVIDIA GPG key +distribution="ubuntu22.04" +curl -s -L https://nvidia.github.io/libnvidia-container/gpgkey | \ + sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg && \ +curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \ + sed 's#deb https://#deb 
[signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list + +# Install +sudo apt-get update +sudo apt-get install -y nvidia-container-toolkit + +# Configure Docker to use NVIDIA runtime +sudo nvidia-ctk runtime configure --runtime=docker + +# Restart Docker +sudo systemctl restart docker + + + +# Add NVIDIA GPG key +curl -s -L https://nvidia.github.io/libnvidia-container/gpgkey | \ + sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg + +# Add the repository using the ubuntu22.04 distribution path (no ubuntu24.04 path is published yet) +curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list + + +sudo apt-get update +sudo apt-get install -y nvidia-container-toolkit + +sudo nvidia-ctk runtime configure --runtime=docker + +sudo systemctl restart docker + +docker run --rm --gpus all nvidia/cuda:12.8.1-base-ubuntu24.04 nvidia-smi