diff --git a/.gitlab/build.sh b/.gitlab/build.sh
index 52226d497..aca925a8c 100755
--- a/.gitlab/build.sh
+++ b/.gitlab/build.sh
@@ -27,6 +27,10 @@ EXTRA_BUILD_ARGS=${3:-""}
 UCX_VERSION=${UCX_VERSION:-v1.19.0}
 # EFA_INSTALLER_VERSION is the version of EFA installer to use, defaults to "latest"
 EFA_INSTALLER_VERSION=${EFA_INSTALLER_VERSION:-latest}
+# LIBFABRIC_VERSION is the libfabric release tag to build; override the default via the environment.
+LIBFABRIC_VERSION=${LIBFABRIC_VERSION:-v2.3.0}
+# LIBFABRIC_INSTALL_DIR can be set via environment variable, defaults to INSTALL_DIR
+LIBFABRIC_INSTALL_DIR=${LIBFABRIC_INSTALL_DIR:-$INSTALL_DIR}
 
 if [ -z "$INSTALL_DIR" ]; then
     echo "Usage: $0 "
@@ -123,10 +127,30 @@ curl -fSsL "https://github.com/openucx/ucx/tarball/${UCX_VERSION}" | tar xz
   $SUDO ldconfig \
 )
 
-curl -fsSL "https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz" | tar xz
+wget --tries=3 --waitretry=5 --timeout=30 --read-timeout=60 -O "aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz" "https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz"
+tar xzf "aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz"
+rm "aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz"
 ( \
   cd aws-efa-installer && \
-  $SUDO ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify && \
+  $SUDO ./efa_installer.sh -y --minimal --skip-kmod --skip-limit-conf --no-verify && \
+  $SUDO ldconfig \
+)
+
+wget --tries=3 --waitretry=5 --timeout=30 --read-timeout=60 -O "libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" "https://github.com/ofiwg/libfabric/releases/download/${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION#v}.tar.bz2"
+tar xjf "libfabric-${LIBFABRIC_VERSION#v}.tar.bz2"
+rm "libfabric-${LIBFABRIC_VERSION#v}.tar.bz2"
+( \
+  cd "libfabric-${LIBFABRIC_VERSION#v}" && \
+  ./autogen.sh && \
+  ./configure --prefix="${LIBFABRIC_INSTALL_DIR}" \
+              --disable-verbs \
+              --disable-psm3 \
+              --disable-opx \
+              --disable-usnic \
+              --disable-rstream \
+              --enable-efa && \
+  make -j"$(nproc)" && \
+  make install && \
   $SUDO ldconfig \
 )
 
@@ -152,10 +176,10 @@ curl -fsSL "https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLE
 )
 
 export LIBRARY_PATH="$LIBRARY_PATH:/usr/local/cuda/lib64"
-export LD_LIBRARY_PATH="${INSTALL_DIR}/lib:${INSTALL_DIR}/lib/$ARCH-linux-gnu:${INSTALL_DIR}/lib64:$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:${INSTALL_DIR}/lib:/opt/amazon/efa/lib"
-export CPATH="${INSTALL_DIR}/include:/opt/amazon/efa/include:$CPATH"
+export LD_LIBRARY_PATH="${INSTALL_DIR}/lib:${INSTALL_DIR}/lib/$ARCH-linux-gnu:${INSTALL_DIR}/lib64:$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:${INSTALL_DIR}/lib:${LIBFABRIC_INSTALL_DIR}/lib"
+export CPATH="${INSTALL_DIR}/include:${LIBFABRIC_INSTALL_DIR}/include:$CPATH"
 export PATH="${INSTALL_DIR}/bin:$PATH"
-export PKG_CONFIG_PATH="${INSTALL_DIR}/lib/pkgconfig:${INSTALL_DIR}/lib64/pkgconfig:${INSTALL_DIR}:/opt/amazon/efa/lib/pkgconfig:$PKG_CONFIG_PATH"
+export PKG_CONFIG_PATH="${INSTALL_DIR}/lib/pkgconfig:${INSTALL_DIR}/lib64/pkgconfig:${INSTALL_DIR}:${LIBFABRIC_INSTALL_DIR}/lib/pkgconfig:$PKG_CONFIG_PATH"
 export NIXL_PLUGIN_DIR="${INSTALL_DIR}/lib/$ARCH-linux-gnu/plugins"
 export CMAKE_PREFIX_PATH="${INSTALL_DIR}:${CMAKE_PREFIX_PATH}"
 
@@ -164,7 +188,7 @@ export CMAKE_PREFIX_PATH="${INSTALL_DIR}:${CMAKE_PREFIX_PATH}"
 export UCX_TLS=^cuda_ipc
 
 # shellcheck disable=SC2086
-meson setup nixl_build --prefix=${INSTALL_DIR} -Ducx_path=${UCX_INSTALL_DIR} -Dbuild_docs=true -Drust=false ${EXTRA_BUILD_ARGS} -Dlibfabric_path="/opt/amazon/efa"
+meson setup nixl_build --prefix=${INSTALL_DIR} -Ducx_path=${UCX_INSTALL_DIR} -Dbuild_docs=true -Drust=false ${EXTRA_BUILD_ARGS} -Dlibfabric_path="${LIBFABRIC_INSTALL_DIR}"
 ninja -C nixl_build && ninja -C nixl_build install
 
 # TODO(kapila): Copy the nixl.pc file to the install directory if needed.
diff --git a/benchmark/nixlbench/contrib/Dockerfile b/benchmark/nixlbench/contrib/Dockerfile
index dff89c482..29ce19e3b 100644
--- a/benchmark/nixlbench/contrib/Dockerfile
+++ b/benchmark/nixlbench/contrib/Dockerfile
@@ -48,6 +48,8 @@ RUN apt-get update -y && \
     libz-dev \
     flex \
     libgtest-dev \
+    hwloc \
+    libhwloc-dev \
     build-essential
 
 # Add DOCA repository and install packages
@@ -104,11 +106,38 @@ ARG DEFAULT_PYTHON_VERSION
 ARG WHL_PYTHON_VERSIONS="3.12"
 ARG WHL_PLATFORM="manylinux_2_39_$ARCH"
 ARG BUILD_TYPE="release"
-ARG EFA_INSTALLER_VERSION="latest"
-ARG EFA_INSTALL_PATH="/opt/amazon/efa"
+ARG LIBFABRIC_VERSION="v2.3.0"
 ARG NPROC
 
 WORKDIR /workspace
+
+# Install RDMA/EFA packages required for the libfabric EFA provider
+# (prerequisite for the libfabric source build in the next step)
+RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get -y install \
+    --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev \
+    libnuma-dev librdmacm-dev ibverbs-providers
+
+# Build libfabric from source
+RUN wget --tries=3 --waitretry=5 --timeout=30 --read-timeout=60 \
+    "https://github.com/ofiwg/libfabric/releases/download/${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" -O libfabric.tar.bz2 && \
+    tar xjf libfabric.tar.bz2 && rm libfabric.tar.bz2 && \
+    cd "libfabric-${LIBFABRIC_VERSION#v}" && \
+    ./autogen.sh && \
+    ./configure --prefix=/usr/local \
+        --disable-verbs \
+        --disable-psm3 \
+        --disable-opx \
+        --disable-usnic \
+        --disable-rstream \
+        --enable-efa \
+        --with-cuda=/usr/local/cuda \
+        --enable-cuda-dlopen \
+        --with-gdrcopy \
+        --enable-gdrcopy-dlopen && \
+    make -j${NPROC:-$(nproc)} && \
+    make install && \
+    ldconfig
+
 RUN git clone --depth 1 https://github.com/etcd-cpp-apiv3/etcd-cpp-apiv3.git && \
     cd etcd-cpp-apiv3 && \
     sed -i '/^find_dependency(cpprestsdk)$/d' etcd-cpp-api-config.in.cmake && \
@@ -122,11 +151,6 @@ COPY --from=nixlbench . /workspace/nixlbench
 # Install AWS SDK C++ dependencies and build
 RUN apt-get update && apt-get install -y libcurl4-openssl-dev libssl-dev uuid-dev zlib1g-dev hwloc libhwloc-dev
 
-# Install EFA (Elastic Fabric Adapter)
-RUN curl -fsSL "https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz" | tar xz && \
-    cd aws-efa-installer && \
-    ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify && \
-    ldconfig
 
 RUN git clone --recurse-submodules --depth 1 --shallow-submodules https://github.com/aws/aws-sdk-cpp.git --branch 1.11.581 && \
     mkdir sdk_build && \
@@ -137,7 +161,7 @@ RUN git clone --recurse-submodules --depth 1 --shallow-submodules https://github
 
 WORKDIR /workspace/nixl
 
-ENV LD_LIBRARY_PATH=/usr/local/lib:$EFA_INSTALL_PATH/lib:$LD_LIBRARY_PATH
+ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
 ENV VIRTUAL_ENV=/workspace/nixl/.venv
 
 RUN uv venv $VIRTUAL_ENV --python $DEFAULT_PYTHON_VERSION && \
@@ -149,7 +173,7 @@ RUN CUDA_SHORT_VERSION=cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d .) && \
 
 RUN rm -rf build && \
     mkdir build && \
-    uv run meson setup build -Dlibfabric_path=$EFA_INSTALL_PATH --prefix=/usr/local/nixl --buildtype=$BUILD_TYPE && \
+    uv run meson setup build --prefix=/usr/local/nixl --buildtype=$BUILD_TYPE && \
     cd build && \
     ninja && \
     ninja install
diff --git a/contrib/Dockerfile b/contrib/Dockerfile
index ba16ae0aa..c02880eeb 100644
--- a/contrib/Dockerfile
+++ b/contrib/Dockerfile
@@ -30,8 +30,8 @@ ARG NIXL_PREFIX="/usr/local/nixl"
 ARG NIXL_PLUGIN_DIR="$NIXL_PREFIX/lib/$ARCH-linux-gnu/plugins"
 ARG NPROC
 ARG WHL_DEFAULT_PYTHON_VERSIONS="3.12"
-ARG EFA_INSTALLER_VERSION="latest"
-ARG EFA_INSTALL_PATH="/opt/amazon/efa"
+ARG LIBFABRIC_VERSION="v2.3.0"
+ARG LIBFABRIC_INSTALL_PATH="/usr/local"
 
 RUN apt-get update -y && \
     apt-get install -y ubuntu-keyring && \
@@ -74,11 +74,6 @@ RUN git clone --depth 1 https://github.com/etcd-cpp-apiv3/etcd-cpp-apiv3.git &&
     mkdir build && cd build && \
     cmake .. -DBUILD_ETCD_CORE_ONLY=ON -DCMAKE_BUILD_TYPE=Release && make -j${NPROC:-$(nproc)} && make install
 
-# Install EFA (Elastic Fabric Adapter)
-RUN curl -fsSL "https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz" | tar xz && \
-    cd aws-efa-installer && \
-    ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify && \
-    ldconfig
 
 RUN git clone --recurse-submodules --depth 1 --shallow-submodules https://github.com/aws/aws-sdk-cpp.git --branch 1.11.581 && \
     mkdir aws_sdk_build && cd aws_sdk_build && \
@@ -140,10 +135,31 @@ RUN cd /usr/local/src && \
     make -j${NPROC:-$(nproc)} install-strip && \
     ldconfig
 
+# Build libfabric from source
+RUN wget --tries=3 --waitretry=5 --timeout=30 --read-timeout=60 \
+    "https://github.com/ofiwg/libfabric/releases/download/${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" -O libfabric.tar.bz2 && \
+    tar xjf libfabric.tar.bz2 && rm libfabric.tar.bz2 && \
+    cd "libfabric-${LIBFABRIC_VERSION#v}" && \
+    ./autogen.sh && \
+    ./configure --prefix="${LIBFABRIC_INSTALL_PATH}" \
+        --disable-verbs \
+        --disable-psm3 \
+        --disable-opx \
+        --disable-usnic \
+        --disable-rstream \
+        --enable-efa \
+        --with-cuda=/usr/local/cuda \
+        --enable-cuda-dlopen \
+        --with-gdrcopy \
+        --enable-gdrcopy-dlopen && \
+    make -j${NPROC:-$(nproc)} && \
+    make install && \
+    ldconfig
+
 WORKDIR /workspace/nixl
 COPY . /workspace/nixl
 
-ENV LD_LIBRARY_PATH=/usr/local/lib:$EFA_INSTALL_PATH/lib:$LD_LIBRARY_PATH
+ENV LD_LIBRARY_PATH=/usr/local/lib:$LIBFABRIC_INSTALL_PATH/lib:$LD_LIBRARY_PATH
 ENV VIRTUAL_ENV=/workspace/nixl/.venv
 
 RUN rm -rf $VIRTUAL_ENV && uv venv $VIRTUAL_ENV --python $DEFAULT_PYTHON_VERSION && \
@@ -156,7 +172,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends pybind11-dev
 ENV NIXL_PREFIX=$NIXL_PREFIX
 RUN rm -rf build && \
     mkdir build && \
-    uv run meson setup -Dlibfabric_path=$EFA_INSTALL_PATH build/ --prefix=$NIXL_PREFIX && \
+    uv run meson setup -Dlibfabric_path=$LIBFABRIC_INSTALL_PATH build/ --prefix=$NIXL_PREFIX && \
     cd build && \
     ninja && \
     ninja install
diff --git a/contrib/Dockerfile.manylinux b/contrib/Dockerfile.manylinux
index 507f0f12e..5ba854ef6 100644
--- a/contrib/Dockerfile.manylinux
+++ b/contrib/Dockerfile.manylinux
@@ -21,6 +21,7 @@ FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG}
 ARG DEFAULT_PYTHON_VERSION="3.12"
 ARG ARCH="x86_64"
 ARG UCX_REF="v1.19.0"
+ARG LIBFABRIC_VERSION="v2.3.0"
 
 RUN yum groupinstall -y 'Development Tools' && \
     dnf install -y almalinux-release-synergy && \
@@ -53,6 +54,8 @@ RUN yum groupinstall -y 'Development Tools' && \
     libibumad-devel \
     numactl-devel \
     librdmacm-devel \
+    hwloc \
+    hwloc-devel \
     wget \
     zlib
 
@@ -198,6 +201,27 @@ RUN cd /usr/local/src && \
     make -j install-strip && \
     ldconfig
 
+# Build libfabric from source
+RUN wget --tries=3 --waitretry=5 --timeout=30 --read-timeout=60 \
+    "https://github.com/ofiwg/libfabric/releases/download/${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" -O libfabric.tar.bz2 && \
+    tar xjf libfabric.tar.bz2 && rm libfabric.tar.bz2 && \
+    cd "libfabric-${LIBFABRIC_VERSION#v}" && \
+    ./autogen.sh && \
+    ./configure --prefix=/usr/local \
+        --disable-verbs \
+        --disable-psm3 \
+        --disable-opx \
+        --disable-usnic \
+        --disable-rstream \
+        --enable-efa \
+        --with-cuda=/usr/local/cuda \
+        --enable-cuda-dlopen \
+        --with-gdrcopy \
+        --enable-gdrcopy-dlopen && \
+    make -j$(nproc) && \
+    make install && \
+    ldconfig
+
 COPY . /workspace/nixl
 
 RUN rm -rf build && \
diff --git a/meson.build b/meson.build
index 19f3f3b51..8a7be09c9 100644
--- a/meson.build
+++ b/meson.build
@@ -155,9 +155,16 @@ libfabric_path = get_option('libfabric_path')
 if libfabric_path != ''
     libfabric_lib_path = libfabric_path + '/lib'
     libfabric_inc_path = libfabric_path + '/include'
-    libfabric_dep = declare_dependency(
-        link_args : ['-L' + libfabric_lib_path, '-lfabric'],
-        include_directories : include_directories(libfabric_inc_path))
+    # Absolute path: pass it as a raw -I flag; otherwise fall back to include_directories()
+    if libfabric_inc_path.startswith('/')
+        libfabric_dep = declare_dependency(
+            link_args : ['-L' + libfabric_lib_path, '-lfabric'],
+            compile_args : ['-I' + libfabric_inc_path])
+    else
+        libfabric_dep = declare_dependency(
+            link_args : ['-L' + libfabric_lib_path, '-lfabric'],
+            include_directories : include_directories(libfabric_inc_path))
+    endif
 else
     libfabric_dep = dependency('libfabric', required: false)
 endif