[None][fix] Clean up linking to CUDA stub libraries in build_wheel.py (#6823)

MartinMarciniszyn · Linda-Stadter · web-flow · commit 425dad01fdf2 · 2025-08-18T11:20:51.000-04:00
Signed-off-by: Linda-Stadter <57756729+Linda-Stadter@users.noreply.github.com>
Signed-off-by: Martin Marciniszyn Mehringer <11665257+MartinMarciniszyn@users.noreply.github.com>
Co-authored-by: Linda-Stadter <57756729+Linda-Stadter@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/nanobind/CMakeLists.txt b/cpp/tensorrt_llm/nanobind/CMakeLists.txt
@@ -43,6 +43,7 @@ target_link_libraries(
          ${Python3_LIBRARIES}
          ${TORCH_LIBRARIES}
          torch_python
+         CUDA::cuda_driver
          ${CUDA_NVML_LIB}
          th_common)
 target_compile_definitions(
@@ -54,6 +55,6 @@ if(NOT WIN32)
     ${TRTLLM_NB_MODULE}
     PROPERTIES
       LINK_FLAGS
-      "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
+      "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
   )
 endif()
diff --git a/cpp/tensorrt_llm/pybind/CMakeLists.txt b/cpp/tensorrt_llm/pybind/CMakeLists.txt
@@ -44,6 +44,7 @@ target_link_libraries(
          ${Python3_LIBRARIES}
          ${TORCH_LIBRARIES}
          torch_python
+         CUDA::cuda_driver
          ${CUDA_NVML_LIB}
          th_common)
 target_compile_definitions(
@@ -55,6 +56,6 @@ if(NOT WIN32)
     ${TRTLLM_PYBIND_MODULE}
     PROPERTIES
       LINK_FLAGS
-      "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
+      "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
   )
 endif()
diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi
@@ -71,8 +71,9 @@ RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh
 ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999"
 
 # Install OpenCV with FFMPEG support
-RUN pip3 uninstall -y opencv && rm -rf /usr/local/lib/python3*/dist-packages/cv2/
-RUN pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
+RUN pip3 uninstall -y opencv && \
+    rm -rf /usr/local/lib/python3*/dist-packages/cv2/ && \
+    pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
 
 # WARs against security issues inherited from pytorch:25.06
 # * https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py
@@ -16,8 +16,10 @@
 
 import os
 import platform
+import re
 import sys
 import sysconfig
+import tempfile
 import warnings
 from argparse import ArgumentParser
 from contextlib import contextmanager
@@ -27,7 +29,7 @@
 from shutil import copy, copytree, rmtree
 from subprocess import DEVNULL, CalledProcessError, check_output, run
 from textwrap import dedent
-from typing import List
+from typing import Sequence
 
 try:
     from packaging.requirements import Requirement
@@ -120,7 +122,8 @@ def create_venv(project_dir: Path):
     return venv_prefix
 
 
-def setup_venv(project_dir: Path, requirements_file: Path, no_venv: bool):
+def setup_venv(project_dir: Path, requirements_file: Path,
+               no_venv: bool) -> tuple[Path, Path]:
     """Creates/updates a venv and installs requirements.
 
     Args:
@@ -279,14 +282,147 @@ def generate_fmha_cu(project_dir, venv_python):
     os.chdir(project_dir)
 
 
+def create_cuda_stub_links(cuda_stub_dir: str, missing_libs: list[str]) -> str:
+    """
+    Creates symbolic links for CUDA stub libraries in a temporary directory.
+
+    Args:
+        cuda_stub_dir (str): Path to the directory containing CUDA stubs.
+        missing_libs: Versioned names of the missing libraries.
+
+    Returns:
+        str: Path to the temporary directory where links were created.
+    """
+    cuda_stub_path = Path(cuda_stub_dir)
+    if not cuda_stub_path.exists():
+        raise RuntimeError(
+            f"CUDA stub directory '{cuda_stub_dir}' does not exist.")
+
+    # Create a temporary directory for the symbolic links
+    temp_dir = tempfile.mkdtemp(prefix="cuda_stub_links_")
+    temp_dir_path = Path(temp_dir)
+
+    version_pattern = r'\.\d+'
+    for missing_lib in filter(lambda x: re.search(version_pattern, x),
+                              missing_libs):
+        # Define `so` as the first part of `missing_lib` with trailing '.' and digits removed
+        so = cuda_stub_path / re.sub(version_pattern, '', missing_lib)
+        so_versioned = temp_dir_path / missing_lib
+
+        # Check if the library exists in the original directory
+        if so.exists():
+            try:
+                # Create the symbolic link in the temporary directory
+                so_versioned.symlink_to(so)
+            except OSError as e:
+                # Clean up the temporary directory on error
+                rmtree(temp_dir)
+                raise RuntimeError(
+                    f"Failed to create symbolic link for '{missing_lib}' in temporary directory '{temp_dir}': {e}"
+                )
+        else:
+            warnings.warn(
+                f"Warning: Source library '{so}' does not exist and was skipped."
+            )
+
+    # Return the path to the temporary directory where the links were created
+    return str(temp_dir_path)
+
+
+def check_missing_libs(so_prefix: str) -> list[str]:
+    result = build_run(f"ldd {so_prefix}.cpython*.so",
+                       capture_output=True,
+                       text=True)
+    missing = []
+    for line in result.stdout.splitlines():
+        if "not found" in line:
+            lib_name = line.split()[
+                0]  # Extract the library name before "=> not found"
+            if lib_name not in missing:
+                missing.append(lib_name)
+    return missing
+
+
+def generate_python_stubs_linux(binding_type: str, venv_python: Path,
+                                deep_ep: bool):
+    is_nanobind = binding_type == "nanobind"
+    if is_nanobind:
+        build_run(f"\"{venv_python}\" -m pip install nanobind")
+    build_run(f"\"{venv_python}\" -m pip install pybind11-stubgen")
+
+    env_stub_gen = os.environ.copy()
+    cuda_home_dir = env_stub_gen.get("CUDA_HOME") or env_stub_gen.get(
+        "CUDA_PATH") or "/usr/local/cuda"
+    missing_libs = check_missing_libs("bindings")
+    cuda_stub_dir = f"{cuda_home_dir}/lib64/stubs"
+
+    if missing_libs and Path(cuda_stub_dir).exists():
+        # Create symbolic links for the CUDA stubs
+        link_dir = create_cuda_stub_links(cuda_stub_dir, missing_libs)
+        ld_library_path = env_stub_gen.get("LD_LIBRARY_PATH")
+        env_stub_gen["LD_LIBRARY_PATH"] = ":".join(
+            filter(None, [link_dir, cuda_stub_dir, ld_library_path]))
+    else:
+        link_dir = None
+
+    try:
+        if is_nanobind:
+            build_run(f"\"{venv_python}\" -m nanobind.stubgen -m bindings -O .",
+                      env=env_stub_gen)
+        else:
+            build_run(
+                f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code",
+                env=env_stub_gen)
+        build_run(
+            f"\"{venv_python}\" -m pybind11_stubgen -o . deep_gemm_cpp_tllm --exit-code",
+            env=env_stub_gen)
+        if deep_ep:
+            build_run(
+                f"\"{venv_python}\" -m pybind11_stubgen -o . deep_ep_cpp_tllm --exit-code",
+                env=env_stub_gen)
+    finally:
+        if link_dir:
+            rmtree(link_dir)
+
+
+def generate_python_stubs_windows(binding_type: str, venv_python: Path,
+                                  pkg_dir: Path, lib_dir: Path):
+    if binding_type == "nanobind":
+        print("Windows not yet supported for nanobind stubs")
+        exit(1)
+    else:
+        build_run(f"\"{venv_python}\" -m pip install pybind11-stubgen")
+        stubgen = "stubgen.py"
+        stubgen_contents = """
+                        # Loading torch, trt before bindings is required to avoid import errors on windows.
+                        # isort: off
+                        import torch
+                        import tensorrt as trt
+                        # isort: on
+                        import os
+                        import platform
+
+                        from pybind11_stubgen import main
+
+                        if __name__ == "__main__":
+                            # Load dlls from `libs` directory before launching bindings.
+                            if platform.system() == "Windows":
+                                os.add_dll_directory(r\"{lib_dir}\")
+                            main()
+                        """.format(lib_dir=lib_dir)
+        (pkg_dir / stubgen).write_text(dedent(stubgen_contents))
+        build_run(f"\"{venv_python}\" {stubgen} -o . bindings")
+        (pkg_dir / stubgen).unlink()
+
+
 def main(*,
          build_type: str = "Release",
          generator: str = "",
          build_dir: Path = None,
          dist_dir: Path = None,
          cuda_architectures: str = None,
          job_count: int = None,
-         extra_cmake_vars: List[str] = list(),
+         extra_cmake_vars: Sequence[str] = tuple(),
          extra_make_targets: str = "",
          trt_root: str = '/usr/local/tensorrt',
          nccl_root: str = None,
@@ -361,7 +497,7 @@ def main(*,
 
     if on_windows:
         # Windows does not support multi-device currently.
-        extra_cmake_vars.extend(["ENABLE_MULTI_DEVICE=0"])
+        extra_cmake_vars = list(extra_cmake_vars) + ["ENABLE_MULTI_DEVICE=0"]
 
         # The Ninja CMake generator is used for our Windows build
         # (Easier than MSBuild to make compatible with our Docker image)
@@ -703,81 +839,14 @@ def get_binding_lib(subdirectory, name):
                      dirs_exist_ok=True)
 
         if not skip_stubs:
-            with working_directory(project_dir):
-                if binding_type == "nanobind":
-                    build_run(f"\"{venv_python}\" -m pip install nanobind")
-                else:
-                    build_run(
-                        f"\"{venv_python}\" -m pip install pybind11-stubgen")
             with working_directory(pkg_dir):
                 if on_windows:
-                    if binding_type == "nanobind":
-                        print("Windows not yet supported for nanobind stubs")
-                        exit(1)
-                    else:
-                        stubgen = "stubgen.py"
-                        stubgen_contents = """
-                        # Loading torch, trt before bindings is required to avoid import errors on windows.
-                        # isort: off
-                        import torch
-                        import tensorrt as trt
-                        # isort: on
-                        import os
-                        import platform
-
-                        from pybind11_stubgen import main
-
-                        if __name__ == "__main__":
-                            # Load dlls from `libs` directory before launching bindings.
-                            if platform.system() == "Windows":
-                                os.add_dll_directory(r\"{lib_dir}\")
-                            main()
-                        """.format(lib_dir=lib_dir)
-                        (pkg_dir / stubgen).write_text(dedent(stubgen_contents))
-                        build_run(f"\"{venv_python}\" {stubgen} -o . bindings")
-                        (pkg_dir / stubgen).unlink()
-                else:
-                    env_ld = os.environ.copy()
-
-                    new_library_path = "/usr/local/cuda/compat:/usr/local/cuda/compat/lib:/usr/local/cuda/compat/lib.real"
-                    if 'LD_LIBRARY_PATH' in env_ld:
-                        new_library_path += f":{env_ld['LD_LIBRARY_PATH']}"
-
-                    result = build_run("find /usr -name *libnvidia-ml.so*",
-                                       capture_output=True,
-                                       text=True)
-                    assert result.returncode == 0, f"Failed to run find *libnvidia-ml.so*: {result.stderr}"
-
-                    # Build containers only contain stub version of libnvidia-ml.so and not the real version.
-                    # If real version not in system, we need to create symbolic link to stub version to prevent import errors.
-                    if "libnvidia-ml.so.1" not in result.stdout:
-                        if "libnvidia-ml.so" in result.stdout:
-                            line = result.stdout.splitlines()[0]
-                            path = os.path.dirname(line)
-                            new_library_path += f":{path}"
-                            build_run(f"ln -s {line} {path}/libnvidia-ml.so.1")
-                        else:
-                            print(
-                                f"Failed to find libnvidia-ml.so: {result.stderr}",
-                                file=sys.stderr)
-                            exit(1)
-
-                    env_ld["LD_LIBRARY_PATH"] = new_library_path
-                    if binding_type == "nanobind":
-                        build_run(
-                            f"\"{venv_python}\" -m nanobind.stubgen -m bindings -O .",
-                            env=env_ld)
-                    else:
-                        build_run(
-                            f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code",
-                            env=env_ld)
-                        if deep_ep_cuda_architectures:
-                            build_run(
-                                f"\"{venv_python}\" -m pybind11_stubgen -o . deep_ep_cpp_tllm --exit-code",
-                                env=env_ld)
-                        build_run(
-                            f"\"{venv_python}\" -m pybind11_stubgen -o . deep_gemm_cpp_tllm --exit-code",
-                            env=env_ld)
+                    generate_python_stubs_windows(binding_type, venv_python,
+                                                  pkg_dir, lib_dir)
+                else:  # on linux
+                    generate_python_stubs_linux(
+                        binding_type, venv_python,
+                        bool(deep_ep_cuda_architectures))
 
     if not skip_building_wheel:
         if dist_dir is None: