# Universal Image Dockerfile
#
# FIPS-friendly Features:
# - uv is used only in build stage (not shipped in runtime image)
# - Build tools are isolated in intermediate stages
# - Final image contains only runtime dependencies
# - OpenSSL FIPS mode supported via base image

################################################################################
# Build Arguments
################################################################################
ARG BASE_IMAGE=quay.io/opendatahub/workbench-images:cuda-jupyter-minimal-ubi9-python-3.12-2025a_20250903
# CUDA_VERSION: Used for environment variables and documentation purposes
# - Sets CUDA_VERSION env var (helps with debugging and tooling)
# - Not used for package installation (specific versions hardcoded below)
ARG CUDA_VERSION=12.8
# PYTHON_VERSION: Critical for path resolution in multi-stage build
# - Used to locate site-packages directory (e.g., /opt/app-root/lib/python3.12/site-packages)
# - Must match base image Python version
ARG PYTHON_VERSION=3.12
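
# Build-time usage (illustrative sketch; the tag and any overridden values
# are placeholders, not tested defaults):
#   podman build \
#     --build-arg PYTHON_VERSION=3.12 \
#     --build-arg CUDA_VERSION=12.8 \
#     -t universal:py312-cuda128-torch280 -f Dockerfile .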

################################################################################
# Builder Stage - Install uv for dependency resolution
################################################################################
FROM ${BASE_IMAGE} AS builder

USER 0
WORKDIR /tmp/builder

# Install latest version of uv in builder stage
# Why: Even if base image has uv, we want the latest version for:
# - Latest bug fixes and performance improvements
# - Consistent behavior across builds
# - Specific version control independent of base image
# Note: This uv is isolated in builder stage and copied selectively to other stages
RUN pip install --no-cache-dir uv
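
# If reproducibility is preferred over freshness, the same line can pin uv
# (hypothetical placeholder version shown; substitute a real release):
#   RUN pip install --no-cache-dir "uv==<pinned-version>"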

################################################################################
# Base Stage
################################################################################
FROM ${BASE_IMAGE} AS base

LABEL name="universal:py312-cuda128-torch280" \
    summary="Universal CUDA 12.8 Python 3.12 image with PyTorch 2.8.0" \
    description="Universal image combining minimal Jupyter workbench and runtime ML stack (CUDA 12.8, PyTorch 2.8.0, FlashAttention 2.8.3) on UBI9" \
    io.k8s.display-name="Universal CUDA 12.8 Python 3.12 (Workbench + Runtime)" \
    maintainer="['[email protected]']" \
    io.openshift.expose-services="" \
    com.redhat.component="odh-th03-cuda128-torch280-py312" \
    com.redhat.license_terms="https://www.redhat.com/licenses/Red_Hat_Standard_EULA_20191108.pdf" \
    io.k8s.description="Universal image: Jupyter workbench by default; runtime when command provided."

# Copy license file
COPY LICENSE.md /licenses/cuda-license.md

USER 0
WORKDIR /opt/app-root/bin

# Re-declare so the global ARG is visible inside this stage
# (ARGs declared before the first FROM are not available in a stage unless re-declared)
ARG CUDA_VERSION

# Environment variables for NVIDIA and CUDA
ENV NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    CUDA_VERSION=${CUDA_VERSION} \
    CUDA_HOME=/usr/local/cuda \
    PATH=/usr/local/cuda/bin:$PATH \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
    TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0" \
    XLA_FLAGS=--xla_gpu_cuda_data_dir=/usr/local/cuda
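
# Runtime sanity check (illustrative; run inside a GPU-enabled container once
# the ML stack below is installed - torch.version.cuda should report 12.8):
#   echo "$CUDA_HOME" && nvcc --version
#   python -c "import torch; print(torch.version.cuda, torch.cuda.is_available())"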

################################################################################
# System Dependencies Stage
################################################################################
FROM base AS system-deps

USER 0
WORKDIR /opt/app-root/bin

# Copy repository configuration files
COPY cuda.repo mellanox.repo /etc/yum.repos.d/

# Install system packages (RDMA, CUDA tools, build toolchain)
# Package list defined in rpms.in.yaml for documentation
#
# RDMA/InfiniBand packages (from mellanox.repo):
# - libibverbs-utils, infiniband-diags: RDMA diagnostics and utilities
# - libibumad: User-space MAD (Management Datagram) library for InfiniBand
# - librdmacm, librdmacm-utils: RDMA connection management
# - rdma-core: Core RDMA user-space libraries
#
# NOTE: mlnx-tools is intentionally NOT included
# - mlnx-tools provides Mellanox-specific diagnostics (mlxlink, mlxconfig, etc.)
# - For containerized ML workloads, standard InfiniBand tools (infiniband-diags) are sufficient
# - Reduces image size and dependency complexity
# - If needed, add mlnx-tools to the package list below
#
# CUDA packages (from cuda.repo):
# - cuda-command-line-tools-12-8: CUDA CLI utilities
# - cuda-cudart-devel-12-8: CUDA runtime development headers
# - cuda-nvcc-12-8-12.8.93-1: CUDA compiler (specific version for reproducibility)
#
# Build toolchain (from UBI repos):
# - gcc, gcc-c++, make: C/C++ compilation tools
# - python3-devel: Python headers for building native extensions
# - cmake: Build system (required by some Python packages)
# - git: Version control (some pip installs need it)
#
# --setopt=install_weak_deps=False: Don't install recommended packages (minimizes image size)
RUN dnf install -y --setopt=install_weak_deps=False \
    libibverbs-utils \
    infiniband-diags \
    libibumad \
    librdmacm \
    librdmacm-utils \
    rdma-core \
    cuda-command-line-tools-12-8 \
    cuda-cudart-devel-12-8 \
    cuda-nvcc-12-8-12.8.93-1 \
    gcc \
    gcc-c++ \
    make \
    python3-devel \
    cmake \
    git && dnf clean all && rm -rf /var/cache/dnf/*

# Verify CUDA toolkit
RUN /usr/local/cuda/bin/nvcc -V
RUN ldconfig -p | grep -E 'libcudart|libcublas|libcudnn' || \
    (echo "[fail-fast] CUDA libs not found" >&2; exit 1)

# Bundle RDMA runtime libs to a staging dir for the final stage.
# Each cp may fail individually (not every library set is present on all
# builds), so failures are masked per-copy rather than for the whole chain.
RUN mkdir -p /opt/rdma-runtime \
    && { cp -a /usr/lib64/libibverbs* /opt/rdma-runtime/ || true; } \
    && { cp -a /usr/lib64/librdmacm* /opt/rdma-runtime/ || true; } \
    && { cp -a /usr/lib64/libibumad* /opt/rdma-runtime/ || true; } \
    && { cp -a /usr/lib64/libmlx* /opt/rdma-runtime/ || true; } \
    && { cp -a /usr/lib64/libibnetdisc* /opt/rdma-runtime/ || true; }
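
# Staging-dir sanity check (illustrative; the exact library set varies with
# the rdma-core/Mellanox packages actually installed):
#   ls -l /opt/rdma-runtime
#   ibv_devinfo    # from libibverbs-utils; lists devices only on RDMA-capable hosts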

################################################################################
# Python Dependencies Stage
################################################################################
FROM system-deps AS python-deps

USER 0
WORKDIR /tmp/deps

# Copy uv from builder stage (FIPS: uv only used during build, not in runtime)
COPY --from=builder /opt/app-root/bin/uv /usr/local/bin/uv

# Copy pyproject.toml, pylock.toml, and requirements-special.txt
# pylock.toml contains most dependencies
# requirements-special.txt contains packages needing --no-build-isolation
COPY --chown=1001:0 pyproject.toml pylock.toml requirements-special.txt ./

# Switch to user 1001 for pip installations
USER 1001
WORKDIR /opt/app-root/src

# Install main dependencies from pylock.toml using uv pip sync
# This syncs the environment to match exactly what's in the lockfile
#
# UV_NO_CACHE explained:
# What: Sets UV_NO_CACHE=1 for the sync, then resets it to an empty string
# Why: Running as user 1001 makes uv try writing to a cache directory
#      it may not have permission for. Disabling the cache avoids this.
# Why reset: ENV values persist across layers. Resetting prevents the
#      variable from affecting later operations or the runtime image.
#      An empty value is effectively treated as unset.
ENV UV_NO_CACHE=1
RUN uv pip sync --python-platform=linux --python-version=3.12 /tmp/deps/pylock.toml
ENV UV_NO_CACHE=
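
# Alternative sketch, assuming uv's --no-cache flag: disabling the cache per
# invocation would avoid the set/reset ENV pair above entirely:
#   RUN uv pip sync --no-cache --python-platform=linux --python-version=3.12 /tmp/deps/pylock.toml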

# Install kubeflow-sdk from Git (not in pylock.toml or requirements-special.txt)
# TODO: use aipcc index
RUN pip install --retries 5 --timeout 300 --no-cache-dir \
    "git+https://github.com/opendatahub-io/kubeflow-sdk@main"

# Install special packages with proper flags
# These packages require --no-build-isolation to use pre-installed CUDA tools
# and must be installed in a specific order

# Copy requirements-special.txt for installation
COPY --chown=1001:0 requirements-special.txt /tmp/deps/

# 1. Flash Attention (standalone, needs --no-build-isolation --no-deps)
RUN pip install --no-build-isolation --no-cache-dir --no-deps \
    $(grep "^flash-attn" /tmp/deps/requirements-special.txt)

# 2. Mamba SSM dependencies (order matters!)
# - causal-conv1d first (needs --no-build-isolation)
# - mamba-ssm second (needs --no-build-isolation --no-deps)
RUN pip install --no-build-isolation --no-cache-dir \
    $(grep "^causal-conv1d" /tmp/deps/requirements-special.txt) \
    && pip install --no-build-isolation --no-cache-dir --no-deps \
    $(grep "^mamba-ssm" /tmp/deps/requirements-special.txt)
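
# Import smoke test (a sketch; the module names below are the conventional
# import names for these packages and worth confirming against the installed
# versions):
#   RUN python -c "import flash_attn, causal_conv1d, mamba_ssm"
# This only proves the modules import; the CUDA kernels still need a GPU.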

# Fix permissions for OpenShift
# What: Adjusts file permissions for OpenShift/Kubernetes compatibility
# Why: OpenShift runs containers with arbitrary user IDs but fixed group ID (root group)
# - chmod g+w: Allows group write access to site-packages (for pip installs at runtime)
# - fix-permissions: UBI-provided script that ensures group ownership/permissions
# When needed: Required for any container that may run in OpenShift with arbitrary UIDs
ARG PYTHON_VERSION
USER 0
RUN chmod -R g+w /opt/app-root/lib/python${PYTHON_VERSION}/site-packages \
    && fix-permissions /opt/app-root -P

# Clean up uv and build artifacts (FIPS: remove build-only tools)
RUN rm -f /usr/local/bin/uv \
    && rm -rf /tmp/deps \
    && dnf remove -y gcc gcc-c++ cmake python3-devel \
    && dnf clean all \
    && rm -rf /var/cache/dnf/*

################################################################################
# Final Stage - FIPS-friendly Runtime
################################################################################
FROM ${BASE_IMAGE} AS final

USER 0
WORKDIR /opt/app-root/src

# Copy Python site-packages and CLI entry points from python-deps stage
# This excludes build tools like gcc, cmake, uv (FIPS friendly)
ARG PYTHON_VERSION
COPY --from=python-deps /opt/app-root/lib/python${PYTHON_VERSION}/site-packages /opt/app-root/lib/python${PYTHON_VERSION}/site-packages
COPY --from=python-deps /opt/app-root/bin /opt/app-root/bin

# Copy CUDA runtime from system-deps (built Python packages need CUDA libs)
# Contains all necessary CUDA libraries - no need to install via dnf
COPY --from=system-deps /usr/local/cuda /usr/local/cuda

# Copy RDMA runtime libraries from system-deps
# These are needed for InfiniBand/RDMA support at runtime
COPY --from=system-deps /opt/rdma-runtime/ /usr/lib64/

# Update dynamic linker cache for CUDA libraries
# What: ldconfig updates the runtime linker's cache of shared libraries
# Why: After copying CUDA libraries to /usr/local/cuda, the system needs to know where to find them
# - Scans directories like /usr/local/cuda/lib64 (defined in /etc/ld.so.conf.d/)
# - Updates /etc/ld.so.cache so programs can locate libcudart.so, libcublas.so, etc.
# When needed: Required after installing/copying shared libraries to non-standard locations
# Test: Run "ldconfig -p | grep cuda" to see if CUDA libs are in the cache
RUN ldconfig

# FIPS-friendly: Remove uv from final image (inherited from base image)
# uv is only needed during build, not at runtime
RUN rm -f /opt/app-root/bin/uv
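
# Verification sketch (illustrative; <image> is a placeholder): the final
# image should have no uv anywhere on PATH:
#   podman run --rm <image> sh -c 'command -v uv || echo "uv absent"'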

# Re-declare so the global ARG is visible inside this stage
ARG CUDA_VERSION

# Environment variables for NVIDIA and CUDA
ENV NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    CUDA_VERSION=${CUDA_VERSION} \
    CUDA_HOME=/usr/local/cuda \
    PATH=/usr/local/cuda/bin:$PATH \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
    TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0" \
    XLA_FLAGS=--xla_gpu_cuda_data_dir=/usr/local/cuda

# Copy license file
COPY LICENSE.md /licenses/cuda-license.md

# Copy entrypoint
COPY --chmod=0755 entrypoint-universal.sh /usr/local/bin/entrypoint-universal.sh

# Fix permissions for OpenShift (final stage)
# What: Ensures proper permissions for OpenShift/Kubernetes arbitrary UIDs
# Why: After copying site-packages from python-deps stage, permissions need adjustment
# - OpenShift assigns random UID but fixed GID (usually 0, root group)
# - Group write permissions allow pip to install packages at runtime
# - fix-permissions ensures all files have correct group ownership
# When: Required in final stage because COPY operations reset permissions
# Context: This is the second time we do this - once after building packages,
#          and again after copying them to the final stage
RUN fix-permissions /opt/app-root -P \
    && chmod -R g+w /opt/app-root/lib/python${PYTHON_VERSION}/site-packages

USER 1001
WORKDIR /opt/app-root/src

ENTRYPOINT ["/usr/local/bin/entrypoint-universal.sh"]
CMD ["start-notebook.sh"]
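
# Usage (illustrative; image tag and script name are placeholders):
#   # Workbench mode - no command, so CMD falls through to start-notebook.sh
#   podman run --rm -p 8888:8888 universal:py312-cuda128-torch280
#   # Runtime mode - any provided command replaces the notebook CMD
#   podman run --rm universal:py312-cuda128-torch280 python train.py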