
Commit 7403a4b

add Konflux dockerfile for universal training image
Signed-off-by: Brian Gallagher <[email protected]>
1 parent 80b4713

1 file changed: 280 additions, 0 deletions
@@ -0,0 +1,280 @@
# Universal Image Dockerfile
#
# FIPS-friendly Features:
# - uv is used only in build stage (not shipped in runtime image)
# - Build tools are isolated in intermediate stages
# - Final image contains only runtime dependencies
# - OpenSSL FIPS mode supported via base image

################################################################################
# Build Arguments
################################################################################
ARG BASE_IMAGE=quay.io/opendatahub/workbench-images:cuda-jupyter-minimal-ubi9-python-3.12-2025a_20250903
# CUDA_VERSION: Used for environment variables and documentation purposes
# - Sets CUDA_VERSION env var (helps with debugging and tooling)
# - Not used for package installation (specific versions hardcoded below)
ARG CUDA_VERSION=12.8
# PYTHON_VERSION: Critical for path resolution in multi-stage build
# - Used to locate site-packages directory (e.g., /opt/app-root/lib/python3.12/site-packages)
# - Must match base image Python version
ARG PYTHON_VERSION=3.12

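# Example build invocation (a sketch, not part of the image build itself; the tag is
# an assumption, and docker accepts the same flags as podman here):
#   podman build -t universal:py312-cuda128-torch280 \
#     --build-arg CUDA_VERSION=12.8 \
#     --build-arg PYTHON_VERSION=3.12 .
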
################################################################################
# Builder Stage - Install uv for dependency resolution
################################################################################
FROM ${BASE_IMAGE} AS builder

USER 0
WORKDIR /tmp/builder

# Install latest version of uv in builder stage
# Why: Even if base image has uv, we want the latest version for:
# - Latest bug fixes and performance improvements
# - Consistent behavior across builds
# - Specific version control independent of base image
# Note: This uv is isolated in builder stage and copied selectively to other stages
RUN pip install --no-cache-dir uv

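# If full reproducibility is preferred over "latest", the install above can be pinned
# instead (the version string below is a placeholder, not a tested recommendation):
#   RUN pip install --no-cache-dir "uv==<pinned-version>"
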
################################################################################
# Base Stage
################################################################################
FROM ${BASE_IMAGE} AS base

LABEL name="universal:py312-cuda128-torch280" \
    summary="Universal CUDA 12.8 Python 3.12 image with PyTorch 2.8.0" \
    description="Universal image combining minimal Jupyter workbench and runtime ML stack (CUDA 12.8, PyTorch 2.8.0, FlashAttention 2.8.3) on UBI9" \
    io.k8s.display-name="Universal CUDA 12.8 Python 3.12 (Workbench + Runtime)" \
    maintainer="['[email protected]']" \
    io.openshift.expose-services="" \
    com.redhat.component="odh-th03-cuda128-torch280-py312" \
    com.redhat.license_terms="https://www.redhat.com/licenses/Red_Hat_Standard_EULA_20191108.pdf" \
    io.k8s.description="Universal image: Jupyter workbench by default; runtime when command provided."

# Copy license file
COPY LICENSE.md /licenses/cuda-license.md

USER 0
WORKDIR /opt/app-root/bin

# Environment variables for NVIDIA and CUDA
ENV NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    CUDA_VERSION=${CUDA_VERSION} \
    CUDA_HOME=/usr/local/cuda \
    PATH=/usr/local/cuda/bin:$PATH \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
    TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0" \
    XLA_FLAGS=--xla_gpu_cuda_data_dir=/usr/local/cuda

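# For reference, TORCH_CUDA_ARCH_LIST above targets compute capabilities 8.0 (A100, Ampere),
# 8.6 (A10/RTX 30-series), 8.9 (L4/L40S/RTX 40-series, Ada) and 9.0 (H100, Hopper).
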
################################################################################
# System Dependencies Stage
################################################################################
FROM base AS system-deps

USER 0
WORKDIR /opt/app-root/bin

# Copy repository configuration files
COPY cuda.repo mellanox.repo /etc/yum.repos.d/

# Install system packages (RDMA, CUDA tools, build toolchain)
# Package list defined in rpms.in.yaml for documentation
#
# RDMA/InfiniBand packages (from mellanox.repo):
# - libibverbs-utils, infiniband-diags: RDMA diagnostics and utilities
# - libibumad: User-space MAD (Management Datagram) library for InfiniBand
# - librdmacm, librdmacm-utils: RDMA connection management
# - rdma-core: Core RDMA user-space libraries
#
# NOTE: mlnx-tools is intentionally NOT included
# - mlnx-tools provides Mellanox-specific diagnostics (mlxlink, mlxconfig, etc.)
# - For containerized ML workloads, standard InfiniBand tools (infiniband-diags) are sufficient
# - Reduces image size and dependency complexity
# - If needed, can be added: mlnx-tools \
#
# CUDA packages (from cuda.repo):
# - cuda-command-line-tools-12-8: CUDA CLI utilities
# - cuda-cudart-devel-12-8: CUDA runtime development headers
# - cuda-nvcc-12-8-12.8.93-1: CUDA compiler (specific version for reproducibility)
#
# Build toolchain (from UBI repos):
# - gcc, gcc-c++, make: C/C++ compilation tools
# - python3-devel: Python headers for building native extensions
# - cmake: Build system (required by some Python packages)
# - git: Version control (some pip installs need it)
#
# --setopt=install_weak_deps=False: Don't install recommended packages (minimizes image size)
RUN dnf install -y --setopt=install_weak_deps=False \
    libibverbs-utils \
    infiniband-diags \
    libibumad \
    librdmacm \
    librdmacm-utils \
    rdma-core \
    cuda-command-line-tools-12-8 \
    cuda-cudart-devel-12-8 \
    cuda-nvcc-12-8-12.8.93-1 \
    gcc \
    gcc-c++ \
    make \
    python3-devel \
    cmake \
    git && dnf clean all && rm -rf /var/cache/dnf/*

# Verify CUDA toolkit
RUN /usr/local/cuda/bin/nvcc -V
RUN ldconfig -p | grep -E 'libcudart|libcublas|libcudnn' || \
    (echo "[fail-fast] CUDA libs not found" >&2; exit 1)

# Bundle RDMA runtime libs to a staging dir
RUN mkdir -p /opt/rdma-runtime \
    && cp -a /usr/lib64/libibverbs* /opt/rdma-runtime/ || true \
    && cp -a /usr/lib64/librdmacm* /opt/rdma-runtime/ || true \
    && cp -a /usr/lib64/libibumad* /opt/rdma-runtime/ || true \
    && cp -a /usr/lib64/libmlx* /opt/rdma-runtime/ || true \
    && cp -a /usr/lib64/libibnetdisc* /opt/rdma-runtime/ || true

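# Optional sanity check (a sketch, not part of the build): list what was staged and
# confirm the verbs tooling resolves; actual RDMA devices only appear at runtime.
#   RUN ls -l /opt/rdma-runtime/ && command -v ibv_devices
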
################################################################################
# Python Dependencies Stage
################################################################################
FROM system-deps AS python-deps

USER 0
WORKDIR /tmp/deps

# Copy uv from builder stage (FIPS: uv only used during build, not in runtime)
COPY --from=builder /opt/app-root/bin/uv /usr/local/bin/uv

# Copy pyproject.toml, pylock.toml, and requirements-special.txt
# pylock.toml contains most dependencies
# requirements-special.txt contains packages needing --no-build-isolation
COPY --chown=1001:0 pyproject.toml pylock.toml requirements-special.txt ./

# Switch to user 1001 for pip installations
USER 1001
WORKDIR /opt/app-root/src

# Install main dependencies from pylock.toml using uv pip sync
# This syncs the environment to match exactly what's in the lockfile
#
# UV_NO_CACHE explained:
# What: Sets UV_NO_CACHE=1 temporarily, then unsets it (empty string)
# Why: Running as user 1001 causes uv to try writing to cache directory
#      which may have permission issues. Disabling cache avoids this.
# Why unset: ENV changes persist across layers. Unsetting prevents
#      the variable from affecting subsequent operations or runtime.
#      Empty value effectively unsets the environment variable.
ENV UV_NO_CACHE=1
RUN uv pip sync --python-platform=linux --python-version=3.12 /tmp/deps/pylock.toml
ENV UV_NO_CACHE=

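# Optional: a cheap consistency check right after the sync (a sketch, not required;
# pip ships in the base image, so this adds nothing to the final stage):
#   RUN pip check
# pylock.toml itself is expected to be regenerated from pyproject.toml with uv on a
# developer machine, not inside this build.
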
# Install kubeflow-sdk from Git (not in pylock.toml or requirements-special.txt)
# TODO: use aipcc index
RUN pip install --retries 5 --timeout 300 --no-cache-dir \
    "git+https://github.com/opendatahub-io/kubeflow-sdk@main"

# Install special packages with proper flags
# These packages require --no-build-isolation to use pre-installed CUDA tools
# and must be installed in a specific order

# Copy requirements-special.txt for installation
COPY --chown=1001:0 requirements-special.txt /tmp/deps/

# 1. Flash Attention (standalone, needs --no-build-isolation --no-deps)
RUN pip install --no-build-isolation --no-cache-dir --no-deps \
    $(grep "^flash-attn" /tmp/deps/requirements-special.txt)

# 2. Mamba SSM dependencies (order matters!)
# - causal-conv1d first (needs --no-build-isolation)
# - mamba-ssm second (needs --no-build-isolation --no-deps)
RUN pip install --no-build-isolation --no-cache-dir \
    $(grep "^causal-conv1d" /tmp/deps/requirements-special.txt) \
    && pip install --no-build-isolation --no-cache-dir --no-deps \
    $(grep "^mamba-ssm" /tmp/deps/requirements-special.txt)

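# Post-build smoke test (a sketch; the image tag is an assumption, and a GPU node is
# the realistic place to run it since these packages ship CUDA extensions):
#   podman run --rm universal:py312-cuda128-torch280 \
#     python -c "import flash_attn, causal_conv1d, mamba_ssm; print('ok')"
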
# Fix permissions for OpenShift
# What: Adjusts file permissions for OpenShift/Kubernetes compatibility
# Why: OpenShift runs containers with arbitrary user IDs but fixed group ID (root group)
# - chmod g+w: Allows group write access to site-packages (for pip installs at runtime)
# - fix-permissions: UBI-provided script that ensures group ownership/permissions
# When needed: Required for any container that may run in OpenShift with arbitrary UIDs
ARG PYTHON_VERSION
USER 0
RUN chmod -R g+w /opt/app-root/lib/python${PYTHON_VERSION}/site-packages \
    && fix-permissions /opt/app-root -P

# Clean up uv and build artifacts (FIPS: remove build-only tools)
RUN rm -f /usr/local/bin/uv \
    && rm -rf /tmp/deps \
    && dnf remove -y gcc gcc-c++ cmake python3-devel \
    && dnf clean all \
    && rm -rf /var/cache/dnf/*

################################################################################
# Final Stage - FIPS-friendly Runtime
################################################################################
FROM ${BASE_IMAGE} AS final

USER 0
WORKDIR /opt/app-root/src

# Copy Python site-packages and CLI entry points from python-deps stage
# This excludes build tools like gcc, cmake, uv (FIPS friendly)
ARG PYTHON_VERSION
COPY --from=python-deps /opt/app-root/lib/python${PYTHON_VERSION}/site-packages /opt/app-root/lib/python${PYTHON_VERSION}/site-packages
COPY --from=python-deps /opt/app-root/bin /opt/app-root/bin

# Copy CUDA runtime from system-deps (built Python packages need CUDA libs)
# Contains all necessary CUDA libraries - no need to install via dnf
COPY --from=system-deps /usr/local/cuda /usr/local/cuda

# Copy RDMA runtime libraries from system-deps
# These are needed for InfiniBand/RDMA support at runtime
COPY --from=system-deps /opt/rdma-runtime/ /usr/lib64/

# Update dynamic linker cache for CUDA libraries
# What: ldconfig updates the runtime linker's cache of shared libraries
# Why: After copying CUDA libraries to /usr/local/cuda, the system needs to know where to find them
# - Scans directories like /usr/local/cuda/lib64 (defined in /etc/ld.so.conf.d/)
# - Updates /etc/ld.so.cache so programs can locate libcudart.so, libcublas.so, etc.
# When needed: Required after installing/copying shared libraries to non-standard locations
# Test: Run "ldconfig -p | grep cuda" to see if CUDA libs are in the cache
RUN ldconfig

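# Runtime verification that the copied CUDA stack resolves correctly (a sketch; the
# image tag and GPU flag are assumptions, and torch.cuda.is_available() needs a GPU):
#   podman run --rm --device nvidia.com/gpu=all universal:py312-cuda128-torch280 \
#     python -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())"
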
# FIPS-friendly: Remove uv from final image (inherited from base image)
# uv is only needed during build, not at runtime
RUN rm -f /opt/app-root/bin/uv

# Environment variables for NVIDIA and CUDA
ENV NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    CUDA_VERSION=${CUDA_VERSION} \
    CUDA_HOME=/usr/local/cuda \
    PATH=/usr/local/cuda/bin:$PATH \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
    TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0" \
    XLA_FLAGS=--xla_gpu_cuda_data_dir=/usr/local/cuda

# Copy license file
COPY LICENSE.md /licenses/cuda-license.md

# Copy entrypoint
COPY --chmod=0755 entrypoint-universal.sh /usr/local/bin/entrypoint-universal.sh

# Fix permissions for OpenShift (final stage)
# What: Ensures proper permissions for OpenShift/Kubernetes arbitrary UIDs
# Why: After copying site-packages from python-deps stage, permissions need adjustment
# - OpenShift assigns random UID but fixed GID (usually 0, root group)
# - Group write permissions allow pip to install packages at runtime
# - fix-permissions ensures all files have correct group ownership
# When: Required in final stage because COPY operations reset permissions
# Context: This is the second time we do this - once after building packages,
#          and again after copying them to the final stage
RUN fix-permissions /opt/app-root -P \
    && chmod -R g+w /opt/app-root/lib/python${PYTHON_VERSION}/site-packages

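# To confirm the arbitrary-UID behaviour OpenShift relies on (a sketch; the UID, image
# tag, and package name are placeholders):
#   podman run --rm --user 1000680000:0 universal:py312-cuda128-torch280 \
#     python -m pip install --no-cache-dir <some-package>
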
USER 1001
WORKDIR /opt/app-root/src

ENTRYPOINT ["/usr/local/bin/entrypoint-universal.sh"]
CMD ["start-notebook.sh"]
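
# How the two modes described in the labels are exercised (a sketch; image tag, port,
# and GPU flags are assumptions and depend on the container runtime and base image):
#   Workbench (default CMD launches the notebook):
#     podman run --rm -p 8888:8888 --device nvidia.com/gpu=all \
#       universal:py312-cuda128-torch280
#   Runtime (any explicit command bypasses the notebook):
#     podman run --rm --device nvidia.com/gpu=all \
#       universal:py312-cuda128-torch280 python train.py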
