NVIDIA
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 5 additions & 0 deletions b/‎.pre-commit-config.yaml‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎cpp/include/tensorrt_llm/executor/executor.h‎
Lines changed: 5 additions & 2 deletions b/‎cpp/include/tensorrt_llm/executor/executor.h‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎cpp/tensorrt_llm/executor/cacheTransceiverConfig.cpp‎
Lines changed: 18 additions & 2 deletions b/‎cpp/tensorrt_llm/executor/cacheTransceiverConfig.cpp‎
Lines changed: 18 additions & 2 deletions
diff --git a/‎cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp‎
Lines changed: 10 additions & 6 deletions b/‎cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp‎
Lines changed: 10 additions & 6 deletions
diff --git a/‎cpp/tensorrt_llm/pybind/executor/executorConfig.cpp‎
Lines changed: 10 additions & 6 deletions b/‎cpp/tensorrt_llm/pybind/executor/executorConfig.cpp‎
Lines changed: 10 additions & 6 deletions
diff --git a/‎cpp/tensorrt_llm/runtime/moeLoadBalancer/moeLoadBalancer.cpp‎
Lines changed: 4 additions & 2 deletions b/‎cpp/tensorrt_llm/runtime/moeLoadBalancer/moeLoadBalancer.cpp‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎cpp/tensorrt_llm/runtime/moeLoadBalancer/moeLoadBalancer.h‎
Lines changed: 2 additions & 1 deletion b/‎cpp/tensorrt_llm/runtime/moeLoadBalancer/moeLoadBalancer.h‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎examples/disaggregated/README.md‎
Lines changed: 3 additions & 0 deletions b/‎examples/disaggregated/README.md‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎scripts/build_wheel.py‎
Lines changed: 6 additions & 5 deletions b/‎scripts/build_wheel.py‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎scripts/format_test_list.py‎
Lines changed: 48 additions & 0 deletions b/‎scripts/format_test_list.py‎
Lines changed: 48 additions & 0 deletions
@@ -84,6 +84,11 @@ repos:
         files: ".*/auto_deploy/.*"
 -   repo: local
     hooks:
+    -   id: test lists format
+        name: Check for tabs and multiple spaces in test_lists txt files
+        entry: ./scripts/format_test_list.py  # rewrites offending files in place and exits 1 when it changed anything
+        language: script
+        files: tests/integration/test_lists/.*\.txt$  # only .txt files under test_lists are passed to the script
     -   id: DCO check
         name: Checks the commit message for a developer certificate of origin signature
         entry: ./scripts/dco_check.py
 
@@ -1456,13 +1456,15 @@ class CacheTransceiverConfig
         UCX = 2,
         NIXL = 3
     };
-    explicit CacheTransceiverConfig(
-        std::optional<BackendType> backendType = std::nullopt, std::optional<size_t> maxNumTokens = std::nullopt);
+    explicit CacheTransceiverConfig(std::optional<BackendType> backendType = std::nullopt,
+        std::optional<size_t> maxNumTokens = std::nullopt, std::optional<int> kvTransferTimeoutMs = std::nullopt);
 
     bool operator==(CacheTransceiverConfig const& other) const;
     void setBackendType(std::optional<BackendType> backendType);
     void setMaxTokensInBuffer(std::optional<size_t> maxTokensInBuffer);
+    void setKvTransferTimeoutMs(std::optional<int> kvTransferTimeoutMs);
 
+    [[nodiscard]] std::optional<int> getKvTransferTimeoutMs() const;
     [[nodiscard]] std::optional<size_t> getMaxTokensInBuffer() const;
     [[nodiscard]] std::optional<BackendType> getBackendType() const;
 
@@ -1472,6 +1474,7 @@ class CacheTransceiverConfig
     /// kvCache tokens to be transferred for a single request is greater than this value, the performance of the cache
     /// transfer may be degraded.
     std::optional<size_t> mMaxTokensInBuffer;
+    std::optional<int> mKvTransferTimeoutMs;
 };
 
 /// @brief Configuration class for the model executor
 
@@ -22,15 +22,19 @@ namespace tensorrt_llm::executor
 {
 
 CacheTransceiverConfig::CacheTransceiverConfig(
-    std::optional<BackendType> backendType, std::optional<size_t> maxNumTokens)
+    std::optional<BackendType> backendType, std::optional<size_t> maxNumTokens, std::optional<int> kvTransferTimeoutMs)
     : mBackendType(backendType)
     , mMaxTokensInBuffer(maxNumTokens)
 {
+    // Store the timeout through the setter so construction rejects non-positive values exactly like
+    // setKvTransferTimeoutMs() does; std::nullopt is stored unchanged.
+    setKvTransferTimeoutMs(kvTransferTimeoutMs);
 }
 
 bool CacheTransceiverConfig::operator==(CacheTransceiverConfig const& other) const
 {
-    return mMaxTokensInBuffer == other.mMaxTokensInBuffer && mBackendType == other.mBackendType;
+    return mMaxTokensInBuffer == other.mMaxTokensInBuffer && mBackendType == other.mBackendType
+        && mKvTransferTimeoutMs == other.mKvTransferTimeoutMs; // the transfer timeout participates in equality too
 }
 
 void CacheTransceiverConfig::setBackendType(std::optional<BackendType> backendType)
@@ -43,6 +45,15 @@ void CacheTransceiverConfig::setMaxTokensInBuffer(std::optional<size_t> maxToken
     mMaxTokensInBuffer = maxTokensInBuffer;
 }
 
+void CacheTransceiverConfig::setKvTransferTimeoutMs(std::optional<int> kvTransferTimeoutMs)
+{
+    if (kvTransferTimeoutMs.has_value() && kvTransferTimeoutMs.value() <= 0) // std::nullopt skips the range check
+    {
+        TLLM_THROW("kvTransferTimeoutMs must be positive"); // zero and negative values are both rejected
+    }
+    mKvTransferTimeoutMs = kvTransferTimeoutMs;
+}
+
 std::optional<CacheTransceiverConfig::BackendType> CacheTransceiverConfig::getBackendType() const
 {
     return mBackendType;
@@ -53,4 +64,9 @@ std::optional<size_t> CacheTransceiverConfig::getMaxTokensInBuffer() const
     return mMaxTokensInBuffer;
 }
 
+std::optional<int> CacheTransceiverConfig::getKvTransferTimeoutMs() const
+{
+    return mKvTransferTimeoutMs; // returned by value; empty when no timeout was stored
+}
+
 } // namespace tensorrt_llm::executor
@@ -433,15 +433,18 @@ void initConfigBindings(nb::module_& m)
         .def("__setstate__", guidedDecodingConfigSetstate);
 
     auto cacheTransceiverConfigGetstate = [](tle::CacheTransceiverConfig const& self)
-    { return nb::make_tuple(self.getBackendType(), self.getMaxTokensInBuffer()); };
+    { return nb::make_tuple(self.getBackendType(), self.getMaxTokensInBuffer(), self.getKvTransferTimeoutMs()); };
     auto cacheTransceiverConfigSetstate = [](tle::CacheTransceiverConfig& self, nb::tuple const& state)
     {
-        if (state.size() != 2)
+        if (state.size() != 3)
         {
             throw std::runtime_error("Invalid CacheTransceiverConfig state!");
         }
-        new (&self) tle::CacheTransceiverConfig(
-            nb::cast<tle::CacheTransceiverConfig::BackendType>(state[0]), nb::cast<std::optional<size_t>>(state[1]));
+        // __getstate__ stores getBackendType() as an optional, so state[0] is None for a config without a
+        // backend; cast it back as an optional instead of a bare enum so such a state can be restored.
+        new (&self) tle::CacheTransceiverConfig(
+            nb::cast<std::optional<tle::CacheTransceiverConfig::BackendType>>(state[0]),
+            nb::cast<std::optional<size_t>>(state[1]), nb::cast<std::optional<int>>(state[2]));
     };
 
     nb::enum_<tle::CacheTransceiverConfig::BackendType>(m, "CacheTransceiverBackendType")
@@ -464,12 +464,16 @@ void initConfigBindings(nb::module_& m)
             });
 
     nb::class_<tle::CacheTransceiverConfig>(m, "CacheTransceiverConfig")
-        .def(nb::init<std::optional<tle::CacheTransceiverConfig::BackendType>, std::optional<size_t>>(),
-            nb::arg("backend") = std::nullopt, nb::arg("max_tokens_in_buffer") = std::nullopt)
+        .def(nb::init<std::optional<tle::CacheTransceiverConfig::BackendType>, std::optional<size_t>,
+                 std::optional<int>>(),
+            nb::arg("backend") = std::nullopt, nb::arg("max_tokens_in_buffer") = std::nullopt,
+            nb::arg("kv_transfer_timeout_ms") = std::nullopt)
         .def_prop_rw(
             "backend", &tle::CacheTransceiverConfig::getBackendType, &tle::CacheTransceiverConfig::setBackendType)
         .def_prop_rw("max_tokens_in_buffer", &tle::CacheTransceiverConfig::getMaxTokensInBuffer,
             &tle::CacheTransceiverConfig::setMaxTokensInBuffer)
+        .def_prop_rw("kv_transfer_timeout_ms", &tle::CacheTransceiverConfig::getKvTransferTimeoutMs,
+            &tle::CacheTransceiverConfig::setKvTransferTimeoutMs)
         .def("__getstate__", cacheTransceiverConfigGetstate)
         .def("__setstate__", cacheTransceiverConfigSetstate);
 
 
@@ -415,15 +415,17 @@ void initConfigBindings(pybind11::module_& m)
         .def(py::pickle(guidedDecodingConfigGetstate, guidedDecodingConfigSetstate));
 
     auto cacheTransceiverConfigGetstate = [](tle::CacheTransceiverConfig const& self)
-    { return py::make_tuple(self.getBackendType(), self.getMaxTokensInBuffer()); };
+    { return py::make_tuple(self.getBackendType(), self.getMaxTokensInBuffer(), self.getKvTransferTimeoutMs()); };
     auto cacheTransceiverConfigSetstate = [](py::tuple const& state)
     {
-        if (state.size() != 2)
+        if (state.size() != 3)
         {
             throw std::runtime_error("Invalid CacheTransceiverConfig state!");
         }
-        return tle::CacheTransceiverConfig(
-            state[0].cast<tle::CacheTransceiverConfig::BackendType>(), state[1].cast<std::optional<size_t>>());
+        // The pickled tuple holds getBackendType() as an optional, so state[0] is None for a config without a
+        // backend; cast it back as an optional instead of a bare enum so such a state can be restored.
+        return tle::CacheTransceiverConfig(state[0].cast<std::optional<tle::CacheTransceiverConfig::BackendType>>(),
+            state[1].cast<std::optional<size_t>>(), state[2].cast<std::optional<int>>());
     };
 
     py::enum_<tle::CacheTransceiverConfig::BackendType>(m, "CacheTransceiverBackendType")
@@ -446,12 +446,16 @@ void initConfigBindings(pybind11::module_& m)
             });
 
     py::class_<tle::CacheTransceiverConfig>(m, "CacheTransceiverConfig")
-        .def(py::init<std::optional<tle::CacheTransceiverConfig::BackendType>, std::optional<size_t>>(),
-            py::arg("backend") = std::nullopt, py::arg("max_tokens_in_buffer") = std::nullopt)
+        .def(py::init<std::optional<tle::CacheTransceiverConfig::BackendType>, std::optional<size_t>,
+                 std::optional<int>>(),
+            py::arg("backend") = std::nullopt, py::arg("max_tokens_in_buffer") = std::nullopt,
+            py::arg("kv_transfer_timeout_ms") = std::nullopt)
         .def_property(
             "backend", &tle::CacheTransceiverConfig::getBackendType, &tle::CacheTransceiverConfig::setBackendType)
         .def_property("max_tokens_in_buffer", &tle::CacheTransceiverConfig::getMaxTokensInBuffer,
             &tle::CacheTransceiverConfig::setMaxTokensInBuffer)
+        .def_property("kv_transfer_timeout_ms", &tle::CacheTransceiverConfig::getKvTransferTimeoutMs,
+            &tle::CacheTransceiverConfig::setKvTransferTimeoutMs)
         .def(py::pickle(cacheTransceiverConfigGetstate, cacheTransceiverConfigSetstate));
 
     auto executorConfigGetState = [](py::object const& self)
 
@@ -871,7 +871,7 @@ MoeLoadBalancer::MoeLoadBalancer(int epRank, int epSize, int layerUpdatesPerIter
         }
     }
 
-    mMultiThreadWorker.reset(new MultiThreadWorker(numCopyThreads));
+    mMultiThreadWorker.reset(new MultiThreadWorker(numCopyThreads, mCudaDeviceId));
 }
 
 MoeLoadBalancer::~MoeLoadBalancer()
@@ -1064,8 +1064,9 @@ void MoeLoadBalancer::waitCopyTaskDone(int64_t taskId)
     }
 }
 
-MultiThreadWorker::MultiThreadWorker(int numThreads)
+MultiThreadWorker::MultiThreadWorker(int numThreads, int cudaDeviceId)
     : mNumThreads(numThreads)
+    , mCudaDeviceId(cudaDeviceId)
     , mRunning(false)
     , mNextTaskId(0)
 {
@@ -1139,6 +1140,7 @@ void MultiThreadWorker::stop()
 
 void MultiThreadWorker::workerLoop(int rank)
 {
+    TLLM_CUDA_CHECK(cudaSetDevice(mCudaDeviceId));
     auto& topologyDetector = TopologyDetector::getInstance();
     topologyDetector.bindThreadByCurrentGpu(); // use relaxed mode
     while (true)
 
@@ -219,7 +219,7 @@ class SingleLayerMoeLoadBalancer
 class MultiThreadWorker
 {
 public:
-    explicit MultiThreadWorker(int numThreads);
+    explicit MultiThreadWorker(int numThreads, int cudaDeviceId);
     ~MultiThreadWorker();
 
     void start();
@@ -239,6 +239,7 @@ class MultiThreadWorker
     void workerLoop(int rank);
 
     int mNumThreads;
+    int mCudaDeviceId;
     std::vector<std::thread> mThreads;
     std::mutex mMutex;
     std::condition_variable mCondition;
 
@@ -16,6 +16,9 @@ cache_transceiver_config:
   backend: <str>
   # KV cache buffer size. Set it ≥ the maximum ISL (Input Sequence Length) for best performance.
   max_tokens_in_buffer: <int>
+  # KV cache transfer timeout in milliseconds.
+  # Requests whose KV cache is not sent/received within this time are cancelled and cleaned up.
+  kv_transfer_timeout_ms: <int>
 ```
 
 The following is an example, consisting of the `ctx_extra-llm-api-config.yaml` and `gen_extra-llm-api-config.yaml` files needed in the sections below.
 
@@ -387,8 +387,9 @@ def generate_python_stubs_linux(binding_type: str, venv_python: Path,
 
     try:
         if is_nanobind:
-            build_run(f"\"{venv_python}\" -m nanobind.stubgen -m bindings -O .",
-                      env=env_stub_gen)
+            build_run(
+                f"\"{venv_python}\" -m nanobind.stubgen -m bindings -r -O .",
+                env=env_stub_gen)
         else:
             build_run(
                 f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code",
@@ -575,9 +576,9 @@ def get_binding_type_from_cache():
         nanobind_dir = build_dir / "tensorrt_llm" / "nanobind"
         if nanobind_dir.exists():
             rmtree(nanobind_dir)
-        nanobind_stub_file = project_dir / "tensorrt_llm" / "bindings.pyi"
-        if nanobind_stub_file.exists():
-            nanobind_stub_file.unlink()
+        nanobind_stub_dir = project_dir / "tensorrt_llm" / "bindings"
+        if nanobind_stub_dir.exists():
+            rmtree(nanobind_stub_dir)
 
         pybind_dir = build_dir / "tensorrt_llm" / "pybind"
         if pybind_dir.exists():
 
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+"""Normalize tabs and multiple spaces to single spaces in files."""
+import argparse
+import re
+import sys
+
+# One or more consecutive spaces/tabs; each such run collapses to a single space.
+_BLANK_RUN = re.compile(r'[ \t]+')
+
+
+def normalize_whitespace(content: str) -> str:
+    """Return ``content`` with per-line whitespace normalized.
+
+    For every line, leading spaces and tabs are removed and each remaining
+    run of spaces and/or tabs is replaced by one space. Line terminators are
+    kept as they are; trailing blanks are collapsed to one space, not removed.
+    """
+    return ''.join(
+        _BLANK_RUN.sub(' ', line.lstrip(' \t'))
+        for line in content.splitlines(keepends=True))
+
+
+def main():
+    """Rewrite the given files in place; return 1 if any file changed, else 0."""
+    parser = argparse.ArgumentParser(
+        description='Normalize tabs and multiple spaces to single spaces')
+    parser.add_argument('filenames', nargs='*', help='Filenames to fix')
+    args = parser.parse_args()
+
+    retval = 0
+    for filename in args.filenames:
+        # newline='' disables newline translation so existing line endings
+        # (e.g. CRLF) survive the read/write round trip untouched.
+        with open(filename, 'r', encoding='utf-8', newline='') as f:
+            original_contents = f.read()
+
+        normalized_contents = normalize_whitespace(original_contents)
+
+        if original_contents != normalized_contents:
+            print(f'Fixing {filename}')
+            with open(filename, 'w', encoding='utf-8', newline='') as f:
+                f.write(normalized_contents)
+            retval = 1
+
+    return retval
+
+
+if __name__ == '__main__':
+    sys.exit(main())
Original file line number	Diff line number	Diff line change
`@@ -22,15 +22,17 @@ namespace tensorrt_llm::executor`
`22`	`22`	`{`
`23`	`23`
`24`	`24`	`CacheTransceiverConfig::CacheTransceiverConfig(`
`25`		`- std::optional<BackendType> backendType, std::optional<size_t> maxNumTokens)`
	`25`	`+ std::optional<BackendType> backendType, std::optional<size_t> maxNumTokens, std::optional<int> kvTransferTimeoutMs)`
`26`	`26`	`: mBackendType(backendType)`
`27`	`27`	`, mMaxTokensInBuffer(maxNumTokens)`
	`28`	`+ , mKvTransferTimeoutMs(kvTransferTimeoutMs)`
`28`	`29`	`{`
`29`	`30`	`}`
`30`	`31`
`31`	`32`	`bool CacheTransceiverConfig::operator==(CacheTransceiverConfig const& other) const`
`32`	`33`	`{`
`33`		`- return mMaxTokensInBuffer == other.mMaxTokensInBuffer && mBackendType == other.mBackendType;`
	`34`	`+ return mMaxTokensInBuffer == other.mMaxTokensInBuffer && mBackendType == other.mBackendType`
	`35`	`+ && mKvTransferTimeoutMs == other.mKvTransferTimeoutMs;`
`34`	`36`	`}`
`35`	`37`
`36`	`38`	`void CacheTransceiverConfig::setBackendType(std::optional<BackendType> backendType)`
`@@ -43,6 +45,15 @@ void CacheTransceiverConfig::setMaxTokensInBuffer(std::optional<size_t> maxToken`
`43`	`45`	`mMaxTokensInBuffer = maxTokensInBuffer;`
`44`	`46`	`}`
`45`	`47`
	`48`	`+void CacheTransceiverConfig::setKvTransferTimeoutMs(std::optional<int> kvTransferTimeoutMs)`
	`49`	`+{`
	`50`	`+ if (kvTransferTimeoutMs.has_value() && kvTransferTimeoutMs.value() <= 0)`
	`51`	`+ {`
	`52`	`+ TLLM_THROW("kvTransferTimeoutMs must be positive");`
	`53`	`+ }`
	`54`	`+ mKvTransferTimeoutMs = kvTransferTimeoutMs;`
	`55`	`+}`
	`56`	`+`
`46`	`57`	`std::optional<CacheTransceiverConfig::BackendType> CacheTransceiverConfig::getBackendType() const`
`47`	`58`	`{`
`48`	`59`	`return mBackendType;`
`@@ -53,4 +64,9 @@ std::optional<size_t> CacheTransceiverConfig::getMaxTokensInBuffer() const`
`53`	`64`	`return mMaxTokensInBuffer;`
`54`	`65`	`}`
`55`	`66`
	`67`	`+std::optional<int> CacheTransceiverConfig::getKvTransferTimeoutMs() const`
	`68`	`+{`
	`69`	`+ return mKvTransferTimeoutMs;`
	`70`	`+}`
	`71`	`+`
`56`	`72`	`} // namespace tensorrt_llm::executor`
Original file line number	Diff line number	Diff line change
`@@ -871,7 +871,7 @@ MoeLoadBalancer::MoeLoadBalancer(int epRank, int epSize, int layerUpdatesPerIter`
`871`	`871`	`}`
`872`	`872`	`}`
`873`	`873`
`874`		`- mMultiThreadWorker.reset(new MultiThreadWorker(numCopyThreads));`
	`874`	`+ mMultiThreadWorker.reset(new MultiThreadWorker(numCopyThreads, mCudaDeviceId));`
`875`	`875`	`}`
`876`	`876`
`877`	`877`	`MoeLoadBalancer::~MoeLoadBalancer()`
`@@ -1064,8 +1064,9 @@ void MoeLoadBalancer::waitCopyTaskDone(int64_t taskId)`
`1064`	`1064`	`}`
`1065`	`1065`	`}`
`1066`	`1066`
`1067`		`-MultiThreadWorker::MultiThreadWorker(int numThreads)`
	`1067`	`+MultiThreadWorker::MultiThreadWorker(int numThreads, int cudaDeviceId)`
`1068`	`1068`	`: mNumThreads(numThreads)`
	`1069`	`+ , mCudaDeviceId(cudaDeviceId)`
`1069`	`1070`	`, mRunning(false)`
`1070`	`1071`	`, mNextTaskId(0)`
`1071`	`1072`	`{`
`@@ -1139,6 +1140,7 @@ void MultiThreadWorker::stop()`
`1139`	`1140`
`1140`	`1141`	`void MultiThreadWorker::workerLoop(int rank)`
`1141`	`1142`	`{`
	`1143`	`+ TLLM_CUDA_CHECK(cudaSetDevice(mCudaDeviceId));`
`1142`	`1144`	`auto& topologyDetector = TopologyDetector::getInstance();`
`1143`	`1145`	`topologyDetector.bindThreadByCurrentGpu(); // use relaxed mode`
`1144`	`1146`	`while (true)`