From 2a493b48c574e283e766b265e0136c6af23c7711 Mon Sep 17 00:00:00 2001 From: Arun Karthik Date: Tue, 16 Sep 2025 21:14:47 +0000 Subject: [PATCH 1/6] Install libfabric manually and use efa_installer only to install rdma-core with EFA support - Add libfabric installation and configuration to build script - Update Dockerfiles to include libfabric build steps - Configure libfabric with EFA, CUDA, and GDRCopy support --- .gitlab/build.sh | 42 +++++++++++++++++++++----- benchmark/nixlbench/contrib/Dockerfile | 30 +++++++++++++++--- contrib/Dockerfile | 30 +++++++++++++++--- 3 files changed, 84 insertions(+), 18 deletions(-) diff --git a/.gitlab/build.sh b/.gitlab/build.sh index 52226d497..4998713ff 100755 --- a/.gitlab/build.sh +++ b/.gitlab/build.sh @@ -22,14 +22,16 @@ set -o pipefail # and second argument being the UCX installation directory. INSTALL_DIR=$1 UCX_INSTALL_DIR=$2 -EXTRA_BUILD_ARGS=${3:-""} +LIBFABRIC_INSTALL_DIR=$3 +EXTRA_BUILD_ARGS=${4:-""} # UCX_VERSION is the version of UCX to build override default with env variable. UCX_VERSION=${UCX_VERSION:-v1.19.0} # EFA_INSTALLER_VERSION is the version of EFA installer to use, defaults to "latest" EFA_INSTALLER_VERSION=${EFA_INSTALLER_VERSION:-latest} - +# LIBFABRIC_VERSION is the version of libfabric to build override default with env variable. 
+LIBFABRIC_VERSION=${LIBFABRIC_VERSION:-v2.3.0} if [ -z "$INSTALL_DIR" ]; then - echo "Usage: $0 " + echo "Usage: $0 " exit 1 fi @@ -37,6 +39,10 @@ if [ -z "$UCX_INSTALL_DIR" ]; then UCX_INSTALL_DIR=$INSTALL_DIR fi +if [ -z "$LIBFABRIC_INSTALL_DIR" ]; then + LIBFABRIC_INSTALL_DIR=$INSTALL_DIR +fi + # For running as user - check if running as root, if not set sudo variable if [ "$(id -u)" -ne 0 ]; then @@ -126,7 +132,27 @@ curl -fSsL "https://github.com/openucx/ucx/tarball/${UCX_VERSION}" | tar xz curl -fsSL "https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz" | tar xz ( \ cd aws-efa-installer && \ - $SUDO ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify && \ + $SUDO ./efa_installer.sh -y --minimal --skip-kmod --skip-limit-conf --no-verify && \ + $SUDO ldconfig \ +) + +curl -fSsL "https://github.com/ofiwg/libfabric/releases/download/${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" | tar xj +( \ + cd libfabric-* && \ + ./autogen.sh && \ + ./configure --prefix="${LIBFABRIC_INSTALL_DIR}" \ + --disable-verbs \ + --disable-psm3 \ + --disable-opx \ + --disable-usnic \ + --disable-rstream \ + --enable-efa \ + --with-cuda=/usr/local/cuda \ + --enable-cuda-dlopen \ + --with-gdrcopy \ + --enable-gdrcopy-dlopen && \ + make -j && \ + make install && \ $SUDO ldconfig \ ) @@ -152,10 +178,10 @@ curl -fsSL "https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLE ) export LIBRARY_PATH="$LIBRARY_PATH:/usr/local/cuda/lib64" -export LD_LIBRARY_PATH="${INSTALL_DIR}/lib:${INSTALL_DIR}/lib/$ARCH-linux-gnu:${INSTALL_DIR}/lib64:$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:${INSTALL_DIR}/lib:/opt/amazon/efa/lib" -export CPATH="${INSTALL_DIR}/include:/opt/amazon/efa/include:$CPATH" +export 
LD_LIBRARY_PATH="${INSTALL_DIR}/lib:${INSTALL_DIR}/lib/$ARCH-linux-gnu:${INSTALL_DIR}/lib64:$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:${INSTALL_DIR}/lib:${LIBFABRIC_INSTALL_DIR}/lib" +export CPATH="${INSTALL_DIR}/include:${LIBFABRIC_INSTALL_DIR}/include:$CPATH" export PATH="${INSTALL_DIR}/bin:$PATH" -export PKG_CONFIG_PATH="${INSTALL_DIR}/lib/pkgconfig:${INSTALL_DIR}/lib64/pkgconfig:${INSTALL_DIR}:/opt/amazon/efa/lib/pkgconfig:$PKG_CONFIG_PATH" +export PKG_CONFIG_PATH="${INSTALL_DIR}/lib/pkgconfig:${INSTALL_DIR}/lib64/pkgconfig:${INSTALL_DIR}:${LIBFABRIC_INSTALL_DIR}/lib/pkgconfig:$PKG_CONFIG_PATH" export NIXL_PLUGIN_DIR="${INSTALL_DIR}/lib/$ARCH-linux-gnu/plugins" export CMAKE_PREFIX_PATH="${INSTALL_DIR}:${CMAKE_PREFIX_PATH}" @@ -164,7 +190,7 @@ export CMAKE_PREFIX_PATH="${INSTALL_DIR}:${CMAKE_PREFIX_PATH}" export UCX_TLS=^cuda_ipc # shellcheck disable=SC2086 -meson setup nixl_build --prefix=${INSTALL_DIR} -Ducx_path=${UCX_INSTALL_DIR} -Dbuild_docs=true -Drust=false ${EXTRA_BUILD_ARGS} -Dlibfabric_path="/opt/amazon/efa" +meson setup nixl_build --prefix=${INSTALL_DIR} -Ducx_path=${UCX_INSTALL_DIR} -Dbuild_docs=true -Drust=false ${EXTRA_BUILD_ARGS} -Dlibfabric_path="${LIBFABRIC_INSTALL_DIR}" ninja -C nixl_build && ninja -C nixl_build install # TODO(kapila): Copy the nixl.pc file to the install directory if needed. diff --git a/benchmark/nixlbench/contrib/Dockerfile b/benchmark/nixlbench/contrib/Dockerfile index dff89c482..54778753b 100644 --- a/benchmark/nixlbench/contrib/Dockerfile +++ b/benchmark/nixlbench/contrib/Dockerfile @@ -105,7 +105,8 @@ ARG WHL_PYTHON_VERSIONS="3.12" ARG WHL_PLATFORM="manylinux_2_39_$ARCH" ARG BUILD_TYPE="release" ARG EFA_INSTALLER_VERSION="latest" -ARG EFA_INSTALL_PATH="/opt/amazon/efa" +ARG LIBFABRIC_VERSION="v2.3.0" +ARG LIBFABRIC_INSTALL_PATH="/usr/local" ARG NPROC WORKDIR /workspace @@ -122,10 +123,29 @@ COPY --from=nixlbench . 
/workspace/nixlbench # Install AWS SDK C++ dependencies and build RUN apt-get update && apt-get install -y libcurl4-openssl-dev libssl-dev uuid-dev zlib1g-dev hwloc libhwloc-dev -# Install EFA (Elastic Fabric Adapter) +# Install EFA (Elastic Fabric Adapter) - minimal installation RUN curl -fsSL "https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz" | tar xz && \ cd aws-efa-installer && \ - ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify && \ + ./efa_installer.sh -y --minimal --skip-kmod --skip-limit-conf --no-verify && \ + ldconfig + +# Build libfabric from source +RUN curl -fSsL "https://github.com/ofiwg/libfabric/releases/download/${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" | tar xj && \ + cd libfabric-* && \ + ./autogen.sh && \ + ./configure --prefix="${LIBFABRIC_INSTALL_PATH}" \ + --disable-verbs \ + --disable-psm3 \ + --disable-opx \ + --disable-usnic \ + --disable-rstream \ + --enable-efa \ + --with-cuda=/usr/local/cuda \ + --enable-cuda-dlopen \ + --with-gdrcopy \ + --enable-gdrcopy-dlopen && \ + make -j${NPROC:-$(nproc)} && \ + make install && \ ldconfig RUN git clone --recurse-submodules --depth 1 --shallow-submodules https://github.com/aws/aws-sdk-cpp.git --branch 1.11.581 && \ @@ -137,7 +157,7 @@ RUN git clone --recurse-submodules --depth 1 --shallow-submodules https://github WORKDIR /workspace/nixl -ENV LD_LIBRARY_PATH=/usr/local/lib:$EFA_INSTALL_PATH/lib:$LD_LIBRARY_PATH +ENV LD_LIBRARY_PATH=/usr/local/lib:$LIBFABRIC_INSTALL_PATH/lib:$LD_LIBRARY_PATH ENV VIRTUAL_ENV=/workspace/nixl/.venv RUN uv venv $VIRTUAL_ENV --python $DEFAULT_PYTHON_VERSION && \ @@ -149,7 +169,7 @@ RUN CUDA_SHORT_VERSION=cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d .) 
&& \ RUN rm -rf build && \ mkdir build && \ - uv run meson setup build -Dlibfabric_path=$EFA_INSTALL_PATH --prefix=/usr/local/nixl --buildtype=$BUILD_TYPE && \ + uv run meson setup build -Dlibfabric_path=$LIBFABRIC_INSTALL_PATH --prefix=/usr/local/nixl --buildtype=$BUILD_TYPE && \ cd build && \ ninja && \ ninja install diff --git a/contrib/Dockerfile b/contrib/Dockerfile index ba16ae0aa..4d0af2a97 100644 --- a/contrib/Dockerfile +++ b/contrib/Dockerfile @@ -31,7 +31,8 @@ ARG NIXL_PLUGIN_DIR="$NIXL_PREFIX/lib/$ARCH-linux-gnu/plugins" ARG NPROC ARG WHL_DEFAULT_PYTHON_VERSIONS="3.12" ARG EFA_INSTALLER_VERSION="latest" -ARG EFA_INSTALL_PATH="/opt/amazon/efa" +ARG LIBFABRIC_VERSION="v2.3.0" +ARG LIBFABRIC_INSTALL_PATH="/usr/local" RUN apt-get update -y && \ apt-get install -y ubuntu-keyring && \ @@ -74,10 +75,29 @@ RUN git clone --depth 1 https://github.com/etcd-cpp-apiv3/etcd-cpp-apiv3.git && mkdir build && cd build && \ cmake .. -DBUILD_ETCD_CORE_ONLY=ON -DCMAKE_BUILD_TYPE=Release && make -j${NPROC:-$(nproc)} && make install -# Install EFA (Elastic Fabric Adapter) +# Install EFA (Elastic Fabric Adapter) - minimal installation RUN curl -fsSL "https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz" | tar xz && \ cd aws-efa-installer && \ - ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify && \ + ./efa_installer.sh -y --minimal --skip-kmod --skip-limit-conf --no-verify && \ + ldconfig + +# Build libfabric from source +RUN curl -fSsL "https://github.com/ofiwg/libfabric/releases/download/${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" | tar xj && \ + cd libfabric-* && \ + ./autogen.sh && \ + ./configure --prefix="${LIBFABRIC_INSTALL_PATH}" \ + --disable-verbs \ + --disable-psm3 \ + --disable-opx \ + --disable-usnic \ + --disable-rstream \ + --enable-efa \ + --with-cuda=/usr/local/cuda \ + --enable-cuda-dlopen \ + --with-gdrcopy \ + --enable-gdrcopy-dlopen && \ + make -j${NPROC:-$(nproc)} && \ + make install && 
\ ldconfig RUN git clone --recurse-submodules --depth 1 --shallow-submodules https://github.com/aws/aws-sdk-cpp.git --branch 1.11.581 && \ @@ -143,7 +163,7 @@ RUN cd /usr/local/src && \ WORKDIR /workspace/nixl COPY . /workspace/nixl -ENV LD_LIBRARY_PATH=/usr/local/lib:$EFA_INSTALL_PATH/lib:$LD_LIBRARY_PATH +ENV LD_LIBRARY_PATH=/usr/local/lib:$LIBFABRIC_INSTALL_PATH/lib:$LD_LIBRARY_PATH ENV VIRTUAL_ENV=/workspace/nixl/.venv RUN rm -rf $VIRTUAL_ENV && uv venv $VIRTUAL_ENV --python $DEFAULT_PYTHON_VERSION && \ @@ -156,7 +176,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends pybind11-dev ENV NIXL_PREFIX=$NIXL_PREFIX RUN rm -rf build && \ mkdir build && \ - uv run meson setup -Dlibfabric_path=$EFA_INSTALL_PATH build/ --prefix=$NIXL_PREFIX && \ + uv run meson setup -Dlibfabric_path=$LIBFABRIC_INSTALL_PATH build/ --prefix=$NIXL_PREFIX && \ cd build && \ ninja && \ ninja install From 15049e0a8516a842675b754b74445ea2e671013f Mon Sep 17 00:00:00 2001 From: Arun Karthik Date: Thu, 25 Sep 2025 01:09:02 +0000 Subject: [PATCH 2/6] Refactor build script to use environment variables and improve download reliability - Change libfabric install dir from required parameter to environment variable - Replace curl with wget for more reliable downloads with retry logic - Add libfabric installation to Dockerfile.manylinux --- .gitlab/build.sh | 20 ++++++------ benchmark/nixlbench/contrib/Dockerfile | 30 ++--------------- contrib/Dockerfile | 45 ++++++++++++-------------- contrib/Dockerfile.manylinux | 22 +++++++++++++ 4 files changed, 56 insertions(+), 61 deletions(-) diff --git a/.gitlab/build.sh b/.gitlab/build.sh index 4998713ff..901433742 100755 --- a/.gitlab/build.sh +++ b/.gitlab/build.sh @@ -22,16 +22,18 @@ set -o pipefail # and second argument being the UCX installation directory. 
INSTALL_DIR=$1 UCX_INSTALL_DIR=$2 -LIBFABRIC_INSTALL_DIR=$3 -EXTRA_BUILD_ARGS=${4:-""} +EXTRA_BUILD_ARGS=${3:-""} # UCX_VERSION is the version of UCX to build override default with env variable. UCX_VERSION=${UCX_VERSION:-v1.19.0} # EFA_INSTALLER_VERSION is the version of EFA installer to use, defaults to "latest" EFA_INSTALLER_VERSION=${EFA_INSTALLER_VERSION:-latest} # LIBFABRIC_VERSION is the version of libfabric to build override default with env variable. LIBFABRIC_VERSION=${LIBFABRIC_VERSION:-v2.3.0} +# LIBFABRIC_INSTALL_DIR can be set via environment variable, defaults to INSTALL_DIR +LIBFABRIC_INSTALL_DIR=${LIBFABRIC_INSTALL_DIR:-$INSTALL_DIR} + if [ -z "$INSTALL_DIR" ]; then - echo "Usage: $0 " + echo "Usage: $0 " exit 1 fi @@ -39,10 +41,6 @@ if [ -z "$UCX_INSTALL_DIR" ]; then UCX_INSTALL_DIR=$INSTALL_DIR fi -if [ -z "$LIBFABRIC_INSTALL_DIR" ]; then - LIBFABRIC_INSTALL_DIR=$INSTALL_DIR -fi - # For running as user - check if running as root, if not set sudo variable if [ "$(id -u)" -ne 0 ]; then @@ -129,14 +127,18 @@ curl -fSsL "https://github.com/openucx/ucx/tarball/${UCX_VERSION}" | tar xz $SUDO ldconfig \ ) -curl -fsSL "https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz" | tar xz +wget --tries=3 --waitretry=5 -O "aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz" "https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz" +tar xzf "aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz" +rm "aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz" ( \ cd aws-efa-installer && \ $SUDO ./efa_installer.sh -y --minimal --skip-kmod --skip-limit-conf --no-verify && \ $SUDO ldconfig \ ) -curl -fSsL "https://github.com/ofiwg/libfabric/releases/download/${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" | tar xj +wget --tries=3 --waitretry=5 -O "libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" "https://github.com/ofiwg/libfabric/releases/download/${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" 
+tar xjf "libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" +rm "libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" ( \ cd libfabric-* && \ ./autogen.sh && \ diff --git a/benchmark/nixlbench/contrib/Dockerfile b/benchmark/nixlbench/contrib/Dockerfile index 54778753b..42e178c4a 100644 --- a/benchmark/nixlbench/contrib/Dockerfile +++ b/benchmark/nixlbench/contrib/Dockerfile @@ -105,8 +105,6 @@ ARG WHL_PYTHON_VERSIONS="3.12" ARG WHL_PLATFORM="manylinux_2_39_$ARCH" ARG BUILD_TYPE="release" ARG EFA_INSTALLER_VERSION="latest" -ARG LIBFABRIC_VERSION="v2.3.0" -ARG LIBFABRIC_INSTALL_PATH="/usr/local" ARG NPROC WORKDIR /workspace @@ -123,30 +121,6 @@ COPY --from=nixlbench . /workspace/nixlbench # Install AWS SDK C++ dependencies and build RUN apt-get update && apt-get install -y libcurl4-openssl-dev libssl-dev uuid-dev zlib1g-dev hwloc libhwloc-dev -# Install EFA (Elastic Fabric Adapter) - minimal installation -RUN curl -fsSL "https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz" | tar xz && \ - cd aws-efa-installer && \ - ./efa_installer.sh -y --minimal --skip-kmod --skip-limit-conf --no-verify && \ - ldconfig - -# Build libfabric from source -RUN curl -fSsL "https://github.com/ofiwg/libfabric/releases/download/${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" | tar xj && \ - cd libfabric-* && \ - ./autogen.sh && \ - ./configure --prefix="${LIBFABRIC_INSTALL_PATH}" \ - --disable-verbs \ - --disable-psm3 \ - --disable-opx \ - --disable-usnic \ - --disable-rstream \ - --enable-efa \ - --with-cuda=/usr/local/cuda \ - --enable-cuda-dlopen \ - --with-gdrcopy \ - --enable-gdrcopy-dlopen && \ - make -j${NPROC:-$(nproc)} && \ - make install && \ - ldconfig RUN git clone --recurse-submodules --depth 1 --shallow-submodules https://github.com/aws/aws-sdk-cpp.git --branch 1.11.581 && \ mkdir sdk_build && \ @@ -157,7 +131,7 @@ RUN git clone --recurse-submodules --depth 1 --shallow-submodules https://github WORKDIR /workspace/nixl -ENV 
LD_LIBRARY_PATH=/usr/local/lib:$LIBFABRIC_INSTALL_PATH/lib:$LD_LIBRARY_PATH +ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH ENV VIRTUAL_ENV=/workspace/nixl/.venv RUN uv venv $VIRTUAL_ENV --python $DEFAULT_PYTHON_VERSION && \ @@ -169,7 +143,7 @@ RUN CUDA_SHORT_VERSION=cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d .) && \ RUN rm -rf build && \ mkdir build && \ - uv run meson setup build -Dlibfabric_path=$LIBFABRIC_INSTALL_PATH --prefix=/usr/local/nixl --buildtype=$BUILD_TYPE && \ + uv run meson setup build --prefix=/usr/local/nixl --buildtype=$BUILD_TYPE && \ cd build && \ ninja && \ ninja install diff --git a/contrib/Dockerfile b/contrib/Dockerfile index 4d0af2a97..e05060784 100644 --- a/contrib/Dockerfile +++ b/contrib/Dockerfile @@ -75,30 +75,6 @@ RUN git clone --depth 1 https://github.com/etcd-cpp-apiv3/etcd-cpp-apiv3.git && mkdir build && cd build && \ cmake .. -DBUILD_ETCD_CORE_ONLY=ON -DCMAKE_BUILD_TYPE=Release && make -j${NPROC:-$(nproc)} && make install -# Install EFA (Elastic Fabric Adapter) - minimal installation -RUN curl -fsSL "https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz" | tar xz && \ - cd aws-efa-installer && \ - ./efa_installer.sh -y --minimal --skip-kmod --skip-limit-conf --no-verify && \ - ldconfig - -# Build libfabric from source -RUN curl -fSsL "https://github.com/ofiwg/libfabric/releases/download/${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" | tar xj && \ - cd libfabric-* && \ - ./autogen.sh && \ - ./configure --prefix="${LIBFABRIC_INSTALL_PATH}" \ - --disable-verbs \ - --disable-psm3 \ - --disable-opx \ - --disable-usnic \ - --disable-rstream \ - --enable-efa \ - --with-cuda=/usr/local/cuda \ - --enable-cuda-dlopen \ - --with-gdrcopy \ - --enable-gdrcopy-dlopen && \ - make -j${NPROC:-$(nproc)} && \ - make install && \ - ldconfig RUN git clone --recurse-submodules --depth 1 --shallow-submodules https://github.com/aws/aws-sdk-cpp.git --branch 1.11.581 && \ mkdir aws_sdk_build && cd 
aws_sdk_build && \ @@ -160,6 +136,27 @@ RUN cd /usr/local/src && \ make -j${NPROC:-$(nproc)} install-strip && \ ldconfig +# Build libfabric from source +RUN wget --tries=3 --waitretry=5 --timeout=30 --read-timeout=60 \ + "https://github.com/ofiwg/libfabric/releases/download/${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" -O libfabric.tar.bz2 && \ + tar xjf libfabric.tar.bz2 && rm libfabric.tar.bz2 && \ + cd libfabric-* && \ + ./autogen.sh && \ + ./configure --prefix="${LIBFABRIC_INSTALL_PATH}" \ + --disable-verbs \ + --disable-psm3 \ + --disable-opx \ + --disable-usnic \ + --disable-rstream \ + --enable-efa \ + --with-cuda=/usr/local/cuda \ + --enable-cuda-dlopen \ + --with-gdrcopy \ + --enable-gdrcopy-dlopen && \ + make -j${NPROC:-$(nproc)} && \ + make install && \ + ldconfig + WORKDIR /workspace/nixl COPY . /workspace/nixl diff --git a/contrib/Dockerfile.manylinux b/contrib/Dockerfile.manylinux index 507f0f12e..76c99fa9e 100644 --- a/contrib/Dockerfile.manylinux +++ b/contrib/Dockerfile.manylinux @@ -21,6 +21,7 @@ FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} ARG DEFAULT_PYTHON_VERSION="3.12" ARG ARCH="x86_64" ARG UCX_REF="v1.19.0" +ARG LIBFABRIC_VERSION="v2.3.0" RUN yum groupinstall -y 'Development Tools' && \ dnf install -y almalinux-release-synergy && \ @@ -198,6 +199,27 @@ RUN cd /usr/local/src && \ make -j install-strip && \ ldconfig +# Build libfabric from source +RUN wget --tries=3 --waitretry=5 --timeout=30 --read-timeout=60 \ + "https://github.com/ofiwg/libfabric/releases/download/${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" -O libfabric.tar.bz2 && \ + tar xjf libfabric.tar.bz2 && rm libfabric.tar.bz2 && \ + cd libfabric-* && \ + ./autogen.sh && \ + ./configure --prefix=/usr/local \ + --disable-verbs \ + --disable-psm3 \ + --disable-opx \ + --disable-usnic \ + --disable-rstream \ + --enable-efa \ + --with-cuda=/usr/local/cuda \ + --enable-cuda-dlopen \ + --with-gdrcopy \ + --enable-gdrcopy-dlopen && \ + make -j$(nproc) && \ + make 
install && \ + ldconfig + COPY . /workspace/nixl RUN rm -rf build && \ From a1a28ec1dd100b7a00647180202c3d6ed1d7e2df Mon Sep 17 00:00:00 2001 From: Arun Karthik Date: Fri, 26 Sep 2025 19:54:51 +0000 Subject: [PATCH 3/6] Remove CUDA and GDRCopy dependencies from libfabric build Disable CUDA and GDRCopy support in libfabric configuration to simplify the build process and remove unnecessary GPU-related dependencies. --- .gitlab/build.sh | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.gitlab/build.sh b/.gitlab/build.sh index 901433742..aca925a8c 100755 --- a/.gitlab/build.sh +++ b/.gitlab/build.sh @@ -148,11 +148,7 @@ rm "libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" --disable-opx \ --disable-usnic \ --disable-rstream \ - --enable-efa \ - --with-cuda=/usr/local/cuda \ - --enable-cuda-dlopen \ - --with-gdrcopy \ - --enable-gdrcopy-dlopen && \ + --enable-efa && \ make -j && \ make install && \ $SUDO ldconfig \ From ed12136ae41e790cb2d43234d2a1570ed638fb3f Mon Sep 17 00:00:00 2001 From: Arun Karthik Date: Fri, 26 Sep 2025 23:05:22 +0000 Subject: [PATCH 4/6] Fix libfabric include path handling for absolute paths Use compile_args instead of include_directories when libfabric_path is an absolute path to avoid meson build issues with external library includes. 
--- meson.build | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/meson.build b/meson.build index adf891c6e..15dadfdab 100644 --- a/meson.build +++ b/meson.build @@ -155,9 +155,16 @@ libfabric_path = get_option('libfabric_path') if libfabric_path != '' libfabric_lib_path = libfabric_path + '/lib' libfabric_inc_path = libfabric_path + '/include' - libfabric_dep = declare_dependency( - link_args : ['-L' + libfabric_lib_path, '-lfabric'], - include_directories : include_directories(libfabric_inc_path)) + # Check if path is absolute + if libfabric_inc_path.startswith('/') + libfabric_dep = declare_dependency( + link_args : ['-L' + libfabric_lib_path, '-lfabric'], + compile_args : ['-I' + libfabric_inc_path]) + else + libfabric_dep = declare_dependency( + link_args : ['-L' + libfabric_lib_path, '-lfabric'], + include_directories : include_directories(libfabric_inc_path)) + endif else libfabric_dep = dependency('libfabric', required: false) endif From 7889d3fd2d48432984042efbcf2e30d55167dc3e Mon Sep 17 00:00:00 2001 From: Arun Karthik Date: Sat, 27 Sep 2025 04:48:22 +0000 Subject: [PATCH 5/6] Add hwloc dependencies to manylinux Docker build Add hwloc and hwloc-devel packages to support hardware locality detection. 
--- contrib/Dockerfile.manylinux | 2 ++ 1 file changed, 2 insertions(+) diff --git a/contrib/Dockerfile.manylinux b/contrib/Dockerfile.manylinux index 76c99fa9e..5ba854ef6 100644 --- a/contrib/Dockerfile.manylinux +++ b/contrib/Dockerfile.manylinux @@ -54,6 +54,8 @@ RUN yum groupinstall -y 'Development Tools' && \ libibumad-devel \ numactl-devel \ librdmacm-devel \ + hwloc \ + hwloc-devel \ wget \ zlib From bc82fc8ad850b7265a919e968e313d354ee79aaf Mon Sep 17 00:00:00 2001 From: Arun Karthik Date: Sat, 27 Sep 2025 07:57:22 +0000 Subject: [PATCH 6/6] Update nixlbench Dockerfile to support libfabric backend - Add hwloc and libhwloc-dev packages to nixlbench/contrib Dockerfile - Add rdma-core dependencies to the nixlbench/contrib Dockerfile - Add LIBFABRIC_VERSION ARG (v2.3.0) and build libfabric from source - Remove stale EFA_INSTALLER_VERSION ARG from contrib Dockerfile --- benchmark/nixlbench/contrib/Dockerfile | 32 +++++++++++++++++++++++++- contrib/Dockerfile | 1 - 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/benchmark/nixlbench/contrib/Dockerfile b/benchmark/nixlbench/contrib/Dockerfile index 42e178c4a..29ce19e3b 100644 --- a/benchmark/nixlbench/contrib/Dockerfile +++ b/benchmark/nixlbench/contrib/Dockerfile @@ -48,6 +48,8 @@ RUN apt-get update -y && \ libz-dev \ flex \ libgtest-dev \ + hwloc \ + libhwloc-dev \ build-essential # Add DOCA repository and install packages @@ -104,10 +106,38 @@ ARG DEFAULT_PYTHON_VERSION ARG WHL_PYTHON_VERSIONS="3.12" ARG WHL_PLATFORM="manylinux_2_39_$ARCH" ARG BUILD_TYPE="release" -ARG EFA_INSTALLER_VERSION="latest" +ARG LIBFABRIC_VERSION="v2.3.0" ARG NPROC WORKDIR /workspace + +# Build libfabric from source +# Install RDMA/EFA packages required for libfabric EFA provider +RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get -y install \ + --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev \ + libnuma-dev librdmacm-dev ibverbs-providers + +# Build libfabric from source +RUN wget --tries=3 --waitretry=5 --timeout=30 
--read-timeout=60 \ + "https://github.com/ofiwg/libfabric/releases/download/${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" -O libfabric.tar.bz2 && \ + tar xjf libfabric.tar.bz2 && rm libfabric.tar.bz2 && \ + cd libfabric-* && \ + ./autogen.sh && \ + ./configure --prefix=/usr/local \ + --disable-verbs \ + --disable-psm3 \ + --disable-opx \ + --disable-usnic \ + --disable-rstream \ + --enable-efa \ + --with-cuda=/usr/local/cuda \ + --enable-cuda-dlopen \ + --with-gdrcopy \ + --enable-gdrcopy-dlopen && \ + make -j${NPROC:-$(nproc)} && \ + make install && \ + ldconfig + RUN git clone --depth 1 https://github.com/etcd-cpp-apiv3/etcd-cpp-apiv3.git && \ cd etcd-cpp-apiv3 && \ sed -i '/^find_dependency(cpprestsdk)$/d' etcd-cpp-api-config.in.cmake && \ diff --git a/contrib/Dockerfile b/contrib/Dockerfile index e05060784..c02880eeb 100644 --- a/contrib/Dockerfile +++ b/contrib/Dockerfile @@ -30,7 +30,6 @@ ARG NIXL_PREFIX="/usr/local/nixl" ARG NIXL_PLUGIN_DIR="$NIXL_PREFIX/lib/$ARCH-linux-gnu/plugins" ARG NPROC ARG WHL_DEFAULT_PYTHON_VERSIONS="3.12" -ARG EFA_INSTALLER_VERSION="latest" ARG LIBFABRIC_VERSION="v2.3.0" ARG LIBFABRIC_INSTALL_PATH="/usr/local"