Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
b10bce6
NIXLBENCH: Add GPU tests to CI
ovidiusm Sep 5, 2025
7a47b98
Remove redundant parameter
ovidiusm Sep 5, 2025
a7cdcde
Use two workers in UCX tests
ovidiusm Sep 5, 2025
49305eb
Remove CUDA compat paths
ovidiusm Sep 5, 2025
2c18242
Build UCX with CUDA support if possible
ovidiusm Sep 5, 2025
083dc6b
Debug
ovidiusm Sep 5, 2025
62b94f0
Fix build
ovidiusm Sep 5, 2025
607fdfc
Fix build
ovidiusm Sep 5, 2025
344d019
Cleanup
ovidiusm Sep 5, 2025
28d6601
Fix tests on non-GPU workers
ovidiusm Sep 5, 2025
e2ae443
Skip broken test
ovidiusm Sep 5, 2025
e099bfe
Merge remote-tracking branch 'dynamo/main' into nixlbench-gpu-tests
ovidiusm Sep 16, 2025
2e234ca
Silence telemetry errors
ovidiusm Sep 16, 2025
090b9c6
Merge remote-tracking branch 'dynamo/main' into nixlbench-gpu-tests
ovidiusm Sep 16, 2025
f08de65
Track registrations
ovidiusm Sep 16, 2025
af73eca
Revert "Track registrations"
ovidiusm Sep 17, 2025
93d91fb
Silence error
ovidiusm Sep 17, 2025
36b6fe7
Isolate gtest
ovidiusm Sep 17, 2025
f27977c
Run gtest via gtest-parallel
ovidiusm Sep 17, 2025
0a102e5
Revert code removal
ovidiusm Sep 17, 2025
cbb5dbf
Add timeout for build
ovidiusm Sep 17, 2025
35e75b9
Install gtest-parallel in Dockerfile
ovidiusm Sep 17, 2025
1656495
Cleanup scripts
ovidiusm Sep 17, 2025
10ba539
Cleanup unrelated changes
ovidiusm Sep 18, 2025
5e336c1
Use CUDA stubs when there is no GPU, to fix CI envs without GPU
ovidiusm Sep 18, 2025
25d2b7b
Merge remote-tracking branch 'dynamo/main' into nixlbench-gpu-tests
ovidiusm Sep 18, 2025
842eea0
Try another way of loading CUDA libs on CI workers without GPUs
ovidiusm Sep 18, 2025
9aab08a
Revert changes to CUDA load path
ovidiusm Sep 18, 2025
43e0d5a
Check what happens with UCX CUDA auto-detection
ovidiusm Sep 19, 2025
4091b1e
Try another way of setting lib path
ovidiusm Sep 22, 2025
00f5df0
Simplify
ovidiusm Sep 22, 2025
c2bbc2b
Revert removal of UCX cuda option
ovidiusm Sep 22, 2025
ee12fe4
Move back the lib loading path
ovidiusm Sep 22, 2025
4e600b9
Fix SIGINT, add more workers
ovidiusm Sep 22, 2025
de81063
Merge remote-tracking branch 'dynamo/main' into nixlbench-gpu-tests
ovidiusm Sep 22, 2025
4993804
Use a single worker in tests
ovidiusm Sep 22, 2025
9b34995
Adjust number of parallel workers depending on environment
ovidiusm Sep 23, 2025
6d12cd1
Add etcd namespace isolation for unit tests
ovidiusm Sep 23, 2025
2e9703f
Use a single gtest worker without GPU
ovidiusm Sep 23, 2025
a9ebaa5
Reduce the number of gtest workers to 1 (gitlab CI fails with paralle…
ovidiusm Sep 24, 2025
c3e6a28
Merge remote-tracking branch 'dynamo/main' into nixlbench-gpu-tests
ovidiusm Sep 24, 2025
b7bc101
Refactor scripts
ovidiusm Sep 24, 2025
78deae5
Attempt run without gtest-parallel
ovidiusm Sep 24, 2025
f3dd55d
Revert "Attempt run without gtest-parallel"
ovidiusm Sep 24, 2025
ec3f080
Merge remote-tracking branch 'dynamo/main' into nixlbench-gpu-tests
ovidiusm Oct 1, 2025
8f4bb95
Workaround DGX issue
ovidiusm Oct 1, 2025
1c4b7e8
Merge remote-tracking branch 'dynamo/main' into nixlbench-gpu-tests
ovidiusm Oct 2, 2025
d0b3178
Fix for std::runtime_error exception on connection close
ovidiusm Oct 2, 2025
7ba49db
Remove redundant tests
ovidiusm Oct 2, 2025
d60cbf4
Remove peer in place
ovidiusm Oct 2, 2025
154b398
Revert "Remove peer in place"
ovidiusm Oct 2, 2025
99749f5
Revert "Fix for std::runtime_error exception on connection close"
ovidiusm Oct 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .ci/scripts/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,15 @@ gtest_offset=$((tcp_port_range / 2))
min_gtest_port=$((tcp_port_min + gtest_offset))
# shellcheck disable=SC2034
max_gtest_port=$((tcp_port_max + gtest_offset))

# Check if a GPU is present.
# grep is run without -q on purpose: with -q it exits on the first match, which
# can kill nvidia-smi with SIGPIPE and make the whole pipeline fail in callers
# that enable `pipefail`, misreporting a GPU worker as GPU-less.
# HAS_GPU is always set to the literal string "true" or "false".
if nvidia-smi -L | grep '^GPU' >/dev/null
then
    HAS_GPU=true
else
    HAS_GPU=false
fi

# UCX_CUDA_BUILD_ARGS is consumed by the build script that sources this file;
# it is either empty or a single "--with-cuda=<dir>" configure argument.
if [ "$HAS_GPU" = true ] && [ -d "${CUDA_HOME:-}" ]
then
    # shellcheck disable=SC2034
    UCX_CUDA_BUILD_ARGS="--with-cuda=${CUDA_HOME}"
else
    # shellcheck disable=SC2034
    UCX_CUDA_BUILD_ARGS=""
    # This sequence ensures that we can link and load the binaries in all CI environments, even if a GPU is not present.
    # The previous value is appended only when it is non-empty: a trailing ':'
    # would add an empty entry, which the dynamic loader treats as the current
    # working directory.
    export LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/usr/local/cuda/compat:/usr/local/cuda/compat/lib.real${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
fi
14 changes: 12 additions & 2 deletions .gitlab/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# shellcheck disable=SC1091
. "$(dirname "$0")/../.ci/scripts/common.sh"

set -e
set -x
set -o pipefail
Expand Down Expand Up @@ -122,6 +125,7 @@ curl -fSsL "https://github.com/openucx/ucx/tarball/${UCX_VERSION}" | tar xz
--enable-devel-headers \
--with-verbs \
--with-dm \
${UCX_CUDA_BUILD_ARGS} \
--enable-mt && \
make -j && \
make -j install-strip && \
Expand Down Expand Up @@ -167,8 +171,14 @@ rm "libfabric-${LIBFABRIC_VERSION#v}.tar.bz2"
$SUDO make install
)

export LIBRARY_PATH="$LIBRARY_PATH:/usr/local/cuda/lib64"
export LD_LIBRARY_PATH="${INSTALL_DIR}/lib:${INSTALL_DIR}/lib/$ARCH-linux-gnu:${INSTALL_DIR}/lib64:$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:${INSTALL_DIR}/lib:${LIBFABRIC_INSTALL_DIR}/lib"
# Install gtest-parallel into ${INSTALL_DIR}/bin.
# Runs in a subshell so the `cd` does not leak into the rest of the script.
# Only the launcher script and its gtest_parallel.py module are copied (the
# same two files contrib/Dockerfile installs) instead of every file in the
# repository. A stale clone from a previous run is removed first so the script
# can be re-run in the same environment.
(
    cd /tmp &&
    rm -rf gtest-parallel &&
    git clone --depth 1 https://github.com/google/gtest-parallel.git &&
    mkdir -p "${INSTALL_DIR}/bin" &&
    cp gtest-parallel/gtest-parallel gtest-parallel/gtest_parallel.py "${INSTALL_DIR}/bin/"
)

export LD_LIBRARY_PATH="${INSTALL_DIR}/lib:${INSTALL_DIR}/lib/$ARCH-linux-gnu:${INSTALL_DIR}/lib64:$LD_LIBRARY_PATH:${LIBFABRIC_INSTALL_DIR}/lib"
export CPATH="${INSTALL_DIR}/include:${LIBFABRIC_INSTALL_DIR}/include:$CPATH"
export PATH="${INSTALL_DIR}/bin:$PATH"
export PKG_CONFIG_PATH="${INSTALL_DIR}/lib/pkgconfig:${INSTALL_DIR}/lib64/pkgconfig:${INSTALL_DIR}:${LIBFABRIC_INSTALL_DIR}/lib/pkgconfig:$PKG_CONFIG_PATH"
Expand Down
22 changes: 6 additions & 16 deletions .gitlab/test_cpp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,6 @@ set -x
TEXT_YELLOW="\033[1;33m"
TEXT_CLEAR="\033[0m"

# For running as user - check if running as root, if not set sudo variable
if [ "$(id -u)" -ne 0 ]; then
SUDO=sudo
else
SUDO=""
fi

$SUDO apt-get update
$SUDO apt-get -qq install -y libaio-dev


# Parse commandline arguments with first argument being the install directory.
INSTALL_DIR=$1

Expand All @@ -46,8 +35,6 @@ ARCH=$(uname -m)
[ "$ARCH" = "arm64" ] && ARCH="aarch64"

export LD_LIBRARY_PATH=${INSTALL_DIR}/lib:${INSTALL_DIR}/lib/$ARCH-linux-gnu:${INSTALL_DIR}/lib/$ARCH-linux-gnu/plugins:/usr/local/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/usr/local/cuda-12.8/compat:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/usr/local/cuda/compat/lib.real:$LD_LIBRARY_PATH

export CPATH=${INSTALL_DIR}/include:$CPATH
export PATH=${INSTALL_DIR}/bin:$PATH
Expand Down Expand Up @@ -77,14 +64,17 @@ cd ${INSTALL_DIR}
./bin/nixl_example
./bin/nixl_etcd_example
./bin/ucx_backend_test
./bin/ucx_mo_backend_test
# Skip UCX_MO backend test on GPU worker, fails VRAM transfers.
# HAS_GPU is compared as a string rather than executed as a command: an unset
# or empty value would otherwise run as an empty command (which succeeds) and
# silently skip the test. Anything other than "true" is treated as "no GPU".
if [ "${HAS_GPU:-false}" != true ]; then
    ./bin/ucx_mo_backend_test
fi
mkdir -p /tmp/telemetry_test
NIXL_TELEMETRY_ENABLE=y NIXL_TELEMETRY_DIR=/tmp/telemetry_test ./bin/agent_example &
sleep 1
./bin/telemetry_reader /tmp/telemetry_test/Agent001 &
telePID=$!
sleep 6
kill -s SIGINT $telePID
kill -s INT $telePID

# POSIX test disabled until we solve io_uring and Docker compatibility

Expand All @@ -94,7 +84,7 @@ kill -s SIGINT $telePID
./bin/serdes_test

# shellcheck disable=SC2154
./bin/gtest --min-tcp-port="$min_gtest_port" --max-tcp-port="$max_gtest_port"
gtest-parallel --workers=1 --serialize_test_cases ./bin/gtest -- --min-tcp-port="$min_gtest_port" --max-tcp-port="$max_gtest_port"
./bin/test_plugin

# Run NIXL client-server test
Expand Down
28 changes: 21 additions & 7 deletions .gitlab/test_nixlbench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,6 @@ ARCH=$(uname -m)
[ "$ARCH" = "arm64" ] && ARCH="aarch64"

export LD_LIBRARY_PATH=${INSTALL_DIR}/lib:${INSTALL_DIR}/lib/$ARCH-linux-gnu:${INSTALL_DIR}/lib/$ARCH-linux-gnu/plugins:/usr/local/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/usr/local/cuda-12.8/compat:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/usr/local/cuda/compat/lib.real:$LD_LIBRARY_PATH

export CPATH=${INSTALL_DIR}/include:$CPATH
export PATH=${INSTALL_DIR}/bin:$PATH
Expand Down Expand Up @@ -62,7 +60,7 @@ cd ${INSTALL_DIR}

# Run ./bin/nixlbench (relative to the current directory) with the common
# benchmark settings used by every test in this script.
# Globals:   NIXL_ETCD_ENDPOINTS (read) - passed to --etcd-endpoints
# Arguments: any extra nixlbench options; they are appended after the common
#            ones and forwarded verbatim ("$@"), so an argument containing
#            whitespace or glob characters stays a single argument.
# Returns:   the exit status of nixlbench
run_nixlbench() {
    ./bin/nixlbench \
        --etcd-endpoints "${NIXL_ETCD_ENDPOINTS}" \
        --filepath /tmp \
        --total_buffer_size 80000000 \
        --start_block_size 4096 \
        --max_block_size 16384 \
        --start_batch_size 1 \
        --max_batch_size 4 \
        "$@"
}

run_nixlbench_one_worker() {
Expand All @@ -81,9 +79,25 @@ run_nixlbench_two_workers() {
wait $pid
}

run_nixlbench_two_workers --backend UCX --op_type READ
run_nixlbench_two_workers --backend UCX --op_type WRITE
run_nixlbench_one_worker --backend POSIX --op_type READ
run_nixlbench_one_worker --backend POSIX --op_type WRITE
# Choose the memory segment types to exercise.
# HAS_GPU is compared as a string rather than executed as a command: an unset
# or empty value would otherwise run as an empty command (which succeeds) and
# enable the VRAM tests on a worker where GPU detection never ran.
if [ "${HAS_GPU:-false}" = true ]; then
    seg_types="VRAM DRAM"
else
    seg_types="DRAM"
    echo "Worker without GPU, skipping VRAM tests"
fi

# UCX: every initiator/target segment type combination, for both operations.
# $seg_types is intentionally left unquoted so it splits into separate words.
for op_type in READ WRITE; do
    for initiator in $seg_types; do
        for target in $seg_types; do
            run_nixlbench_two_workers --backend UCX --op_type "$op_type" --initiator_seg_type "$initiator" --target_seg_type "$target"
        done
    done
done

# POSIX: single worker, varying only the target segment type.
for op_type in READ WRITE; do
    for target in $seg_types; do
        run_nixlbench_one_worker --backend POSIX --op_type "$op_type" --target_seg_type "$target"
    done
done

pkill etcd
2 changes: 0 additions & 2 deletions .gitlab/test_plugins.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,6 @@ ARCH=$(uname -m)
[ "$ARCH" = "arm64" ] && ARCH="aarch64"

export LD_LIBRARY_PATH=${INSTALL_DIR}/lib:${INSTALL_DIR}/lib/$ARCH-linux-gnu:${INSTALL_DIR}/lib/$ARCH-linux-gnu/plugins:/usr/local/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/usr/local/cuda-12.8/compat:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/usr/local/cuda/compat/lib.real:$LD_LIBRARY_PATH

export CPATH=${INSTALL_DIR}/include:$CPATH
export PATH=${INSTALL_DIR}/bin:$PATH
Expand Down
12 changes: 1 addition & 11 deletions .gitlab/test_python.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,21 +28,11 @@ if [ -z "$INSTALL_DIR" ]; then
exit 1
fi

# For running as user - check if running as root, if not set sudo variable
if [ "$(id -u)" -ne 0 ]; then
SUDO=sudo
else
SUDO=""
fi

$SUDO apt-get -qq install liburing-dev

ARCH=$(uname -m)
[ "$ARCH" = "arm64" ] && ARCH="aarch64"

export LD_LIBRARY_PATH=${INSTALL_DIR}/lib:${INSTALL_DIR}/lib/$ARCH-linux-gnu:${INSTALL_DIR}/lib/$ARCH-linux-gnu/plugins:/usr/local/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/usr/local/cuda/lib64:/usr/local/cuda-12.8/compat:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/usr/local/cuda/compat/lib.real:/opt/amazon/efa/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/amazon/efa/lib:$LD_LIBRARY_PATH
export CPATH=${INSTALL_DIR}/include:/opt/amazon/efa/include:$CPATH
export PATH=${INSTALL_DIR}/bin:$PATH
export PKG_CONFIG_PATH=${INSTALL_DIR}/lib/pkgconfig:/opt/amazon/efa/lib/pkgconfig:$PKG_CONFIG_PATH
Expand Down
6 changes: 6 additions & 0 deletions contrib/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,12 @@ RUN cd /usr/local/src && \
make -j${NPROC:-$(nproc)} install-strip && \
ldconfig

# Install gtest-parallel: the launcher script and its gtest_parallel.py module
# are copied to /usr/local/bin. The clone is deleted in the same RUN step so it
# is not stored in the image layer.
RUN cd /tmp && \
    git clone --depth 1 https://github.com/google/gtest-parallel.git && \
    mkdir -p /usr/local/bin && \
    cp gtest-parallel/gtest-parallel gtest-parallel/gtest_parallel.py /usr/local/bin/ && \
    rm -rf /tmp/gtest-parallel
ENV PATH=/usr/local/bin:$PATH

# Build libfabric from source
RUN wget --tries=3 --waitretry=5 --timeout=30 --read-timeout=60 \
"https://github.com/ofiwg/libfabric/releases/download/${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION#v}.tar.bz2" -O libfabric.tar.bz2 && \
Expand Down