Skip to content

Commit ad6c8a0

Browse files
2 parents b7d77eb + df689f8 commit ad6c8a0

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+580
-208
lines changed

.pre-commit-config.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,11 @@ repos:
8484
files: ".*/auto_deploy/.*"
8585
- repo: local
8686
hooks:
87+
- id: test lists format
88+
name: Check for tabs and multiple spaces in test_lists txt files
89+
entry: ./scripts/format_test_list.py
90+
language: script
91+
files: tests/integration/test_lists/.*\.txt$
8792
- id: DCO check
8893
name: Checks the commit message for a developer certificate of origin signature
8994
entry: ./scripts/dco_check.py

cpp/include/tensorrt_llm/executor/executor.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1456,13 +1456,15 @@ class CacheTransceiverConfig
14561456
UCX = 2,
14571457
NIXL = 3
14581458
};
1459-
explicit CacheTransceiverConfig(
1460-
std::optional<BackendType> backendType = std::nullopt, std::optional<size_t> maxNumTokens = std::nullopt);
1459+
explicit CacheTransceiverConfig(std::optional<BackendType> backendType = std::nullopt,
1460+
std::optional<size_t> maxNumTokens = std::nullopt, std::optional<int> kvTransferTimeoutMs = std::nullopt);
14611461

14621462
bool operator==(CacheTransceiverConfig const& other) const;
14631463
void setBackendType(std::optional<BackendType> backendType);
14641464
void setMaxTokensInBuffer(std::optional<size_t> maxTokensInBuffer);
1465+
void setKvTransferTimeoutMs(std::optional<int> kvTransferTimeoutMs);
14651466

1467+
[[nodiscard]] std::optional<int> getKvTransferTimeoutMs() const;
14661468
[[nodiscard]] std::optional<size_t> getMaxTokensInBuffer() const;
14671469
[[nodiscard]] std::optional<BackendType> getBackendType() const;
14681470

@@ -1472,6 +1474,7 @@ class CacheTransceiverConfig
14721474
/// kvCache tokens to be transferred for a single request is greater than this value, the performance of the cache
14731475
/// transfer may be degraded.
14741476
std::optional<size_t> mMaxTokensInBuffer;
1477+
std::optional<int> mKvTransferTimeoutMs;
14751478
};
14761479

14771480
/// @brief Configuration class for the model executor

cpp/tensorrt_llm/executor/cacheTransceiverConfig.cpp

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,17 @@ namespace tensorrt_llm::executor
2222
{
2323

2424
CacheTransceiverConfig::CacheTransceiverConfig(
25-
std::optional<BackendType> backendType, std::optional<size_t> maxNumTokens)
25+
std::optional<BackendType> backendType, std::optional<size_t> maxNumTokens, std::optional<int> kvTransferTimeoutMs)
2626
: mBackendType(backendType)
2727
, mMaxTokensInBuffer(maxNumTokens)
28+
, mKvTransferTimeoutMs(kvTransferTimeoutMs)
2829
{
2930
}
3031

3132
bool CacheTransceiverConfig::operator==(CacheTransceiverConfig const& other) const
3233
{
33-
return mMaxTokensInBuffer == other.mMaxTokensInBuffer && mBackendType == other.mBackendType;
34+
return mMaxTokensInBuffer == other.mMaxTokensInBuffer && mBackendType == other.mBackendType
35+
&& mKvTransferTimeoutMs == other.mKvTransferTimeoutMs;
3436
}
3537

3638
void CacheTransceiverConfig::setBackendType(std::optional<BackendType> backendType)
@@ -43,6 +45,15 @@ void CacheTransceiverConfig::setMaxTokensInBuffer(std::optional<size_t> maxToken
4345
mMaxTokensInBuffer = maxTokensInBuffer;
4446
}
4547

48+
void CacheTransceiverConfig::setKvTransferTimeoutMs(std::optional<int> kvTransferTimeoutMs)
49+
{
50+
if (kvTransferTimeoutMs.has_value() && kvTransferTimeoutMs.value() <= 0)
51+
{
52+
TLLM_THROW("kvTransferTimeoutMs must be positive");
53+
}
54+
mKvTransferTimeoutMs = kvTransferTimeoutMs;
55+
}
56+
4657
std::optional<CacheTransceiverConfig::BackendType> CacheTransceiverConfig::getBackendType() const
4758
{
4859
return mBackendType;
@@ -53,4 +64,9 @@ std::optional<size_t> CacheTransceiverConfig::getMaxTokensInBuffer() const
5364
return mMaxTokensInBuffer;
5465
}
5566

67+
std::optional<int> CacheTransceiverConfig::getKvTransferTimeoutMs() const
68+
{
69+
return mKvTransferTimeoutMs;
70+
}
71+
5672
} // namespace tensorrt_llm::executor

cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -433,15 +433,15 @@ void initConfigBindings(nb::module_& m)
433433
.def("__setstate__", guidedDecodingConfigSetstate);
434434

435435
auto cacheTransceiverConfigGetstate = [](tle::CacheTransceiverConfig const& self)
436-
{ return nb::make_tuple(self.getBackendType(), self.getMaxTokensInBuffer()); };
436+
{ return nb::make_tuple(self.getBackendType(), self.getMaxTokensInBuffer(), self.getKvTransferTimeoutMs()); };
437437
auto cacheTransceiverConfigSetstate = [](tle::CacheTransceiverConfig& self, nb::tuple const& state)
438438
{
439-
if (state.size() != 2)
439+
if (state.size() != 3)
440440
{
441441
throw std::runtime_error("Invalid CacheTransceiverConfig state!");
442442
}
443-
new (&self) tle::CacheTransceiverConfig(
444-
nb::cast<tle::CacheTransceiverConfig::BackendType>(state[0]), nb::cast<std::optional<size_t>>(state[1]));
443+
new (&self) tle::CacheTransceiverConfig(nb::cast<tle::CacheTransceiverConfig::BackendType>(state[0]),
444+
nb::cast<std::optional<size_t>>(state[1]), nb::cast<std::optional<int>>(state[2]));
445445
};
446446

447447
nb::enum_<tle::CacheTransceiverConfig::BackendType>(m, "CacheTransceiverBackendType")
@@ -464,12 +464,16 @@ void initConfigBindings(nb::module_& m)
464464
});
465465

466466
nb::class_<tle::CacheTransceiverConfig>(m, "CacheTransceiverConfig")
467-
.def(nb::init<std::optional<tle::CacheTransceiverConfig::BackendType>, std::optional<size_t>>(),
468-
nb::arg("backend") = std::nullopt, nb::arg("max_tokens_in_buffer") = std::nullopt)
467+
.def(nb::init<std::optional<tle::CacheTransceiverConfig::BackendType>, std::optional<size_t>,
468+
std::optional<int>>(),
469+
nb::arg("backend") = std::nullopt, nb::arg("max_tokens_in_buffer") = std::nullopt,
470+
nb::arg("kv_transfer_timeout_ms") = std::nullopt)
469471
.def_prop_rw(
470472
"backend", &tle::CacheTransceiverConfig::getBackendType, &tle::CacheTransceiverConfig::setBackendType)
471473
.def_prop_rw("max_tokens_in_buffer", &tle::CacheTransceiverConfig::getMaxTokensInBuffer,
472474
&tle::CacheTransceiverConfig::setMaxTokensInBuffer)
475+
.def_prop_rw("kv_transfer_timeout_ms", &tle::CacheTransceiverConfig::getKvTransferTimeoutMs,
476+
&tle::CacheTransceiverConfig::setKvTransferTimeoutMs)
473477
.def("__getstate__", cacheTransceiverConfigGetstate)
474478
.def("__setstate__", cacheTransceiverConfigSetstate);
475479

cpp/tensorrt_llm/pybind/executor/executorConfig.cpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -415,15 +415,15 @@ void initConfigBindings(pybind11::module_& m)
415415
.def(py::pickle(guidedDecodingConfigGetstate, guidedDecodingConfigSetstate));
416416

417417
auto cacheTransceiverConfigGetstate = [](tle::CacheTransceiverConfig const& self)
418-
{ return py::make_tuple(self.getBackendType(), self.getMaxTokensInBuffer()); };
418+
{ return py::make_tuple(self.getBackendType(), self.getMaxTokensInBuffer(), self.getKvTransferTimeoutMs()); };
419419
auto cacheTransceiverConfigSetstate = [](py::tuple const& state)
420420
{
421-
if (state.size() != 2)
421+
if (state.size() != 3)
422422
{
423423
throw std::runtime_error("Invalid CacheTransceiverConfig state!");
424424
}
425-
return tle::CacheTransceiverConfig(
426-
state[0].cast<tle::CacheTransceiverConfig::BackendType>(), state[1].cast<std::optional<size_t>>());
425+
return tle::CacheTransceiverConfig(state[0].cast<tle::CacheTransceiverConfig::BackendType>(),
426+
state[1].cast<std::optional<size_t>>(), state[2].cast<std::optional<int>>());
427427
};
428428

429429
py::enum_<tle::CacheTransceiverConfig::BackendType>(m, "CacheTransceiverBackendType")
@@ -446,12 +446,16 @@ void initConfigBindings(pybind11::module_& m)
446446
});
447447

448448
py::class_<tle::CacheTransceiverConfig>(m, "CacheTransceiverConfig")
449-
.def(py::init<std::optional<tle::CacheTransceiverConfig::BackendType>, std::optional<size_t>>(),
450-
py::arg("backend") = std::nullopt, py::arg("max_tokens_in_buffer") = std::nullopt)
449+
.def(py::init<std::optional<tle::CacheTransceiverConfig::BackendType>, std::optional<size_t>,
450+
std::optional<int>>(),
451+
py::arg("backend") = std::nullopt, py::arg("max_tokens_in_buffer") = std::nullopt,
452+
py::arg("kv_transfer_timeout_ms") = std::nullopt)
451453
.def_property(
452454
"backend", &tle::CacheTransceiverConfig::getBackendType, &tle::CacheTransceiverConfig::setBackendType)
453455
.def_property("max_tokens_in_buffer", &tle::CacheTransceiverConfig::getMaxTokensInBuffer,
454456
&tle::CacheTransceiverConfig::setMaxTokensInBuffer)
457+
.def_property("kv_transfer_timeout_ms", &tle::CacheTransceiverConfig::getKvTransferTimeoutMs,
458+
&tle::CacheTransceiverConfig::setKvTransferTimeoutMs)
455459
.def(py::pickle(cacheTransceiverConfigGetstate, cacheTransceiverConfigSetstate));
456460

457461
auto executorConfigGetState = [](py::object const& self)

cpp/tensorrt_llm/runtime/moeLoadBalancer/moeLoadBalancer.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -871,7 +871,7 @@ MoeLoadBalancer::MoeLoadBalancer(int epRank, int epSize, int layerUpdatesPerIter
871871
}
872872
}
873873

874-
mMultiThreadWorker.reset(new MultiThreadWorker(numCopyThreads));
874+
mMultiThreadWorker.reset(new MultiThreadWorker(numCopyThreads, mCudaDeviceId));
875875
}
876876

877877
MoeLoadBalancer::~MoeLoadBalancer()
@@ -1064,8 +1064,9 @@ void MoeLoadBalancer::waitCopyTaskDone(int64_t taskId)
10641064
}
10651065
}
10661066

1067-
MultiThreadWorker::MultiThreadWorker(int numThreads)
1067+
MultiThreadWorker::MultiThreadWorker(int numThreads, int cudaDeviceId)
10681068
: mNumThreads(numThreads)
1069+
, mCudaDeviceId(cudaDeviceId)
10691070
, mRunning(false)
10701071
, mNextTaskId(0)
10711072
{
@@ -1139,6 +1140,7 @@ void MultiThreadWorker::stop()
11391140

11401141
void MultiThreadWorker::workerLoop(int rank)
11411142
{
1143+
TLLM_CUDA_CHECK(cudaSetDevice(mCudaDeviceId));
11421144
auto& topologyDetector = TopologyDetector::getInstance();
11431145
topologyDetector.bindThreadByCurrentGpu(); // use relaxed mode
11441146
while (true)

cpp/tensorrt_llm/runtime/moeLoadBalancer/moeLoadBalancer.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ class SingleLayerMoeLoadBalancer
219219
class MultiThreadWorker
220220
{
221221
public:
222-
explicit MultiThreadWorker(int numThreads);
222+
explicit MultiThreadWorker(int numThreads, int cudaDeviceId);
223223
~MultiThreadWorker();
224224

225225
void start();
@@ -239,6 +239,7 @@ class MultiThreadWorker
239239
void workerLoop(int rank);
240240

241241
int mNumThreads;
242+
int mCudaDeviceId;
242243
std::vector<std::thread> mThreads;
243244
std::mutex mMutex;
244245
std::condition_variable mCondition;

examples/disaggregated/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ cache_transceiver_config:
1616
backend: <str>
1717
# KV cache buffer size. Set it ≥ the maximum ISL (Input Sequence Length) for best performance.
1818
max_tokens_in_buffer: <int>
19+
# KV cache transfer timeout in milliseconds
20+
# For requests, if they do not send/receive the KV cache in time they are cancelled and cleaned up
21+
kv_transfer_timeout_ms: <int>
1922
```
2023
2124
The following is an example, consisting of the `ctx_extra-llm-api-config.yaml` and `gen_extra-llm-api-config.yaml` files needed in the sections below.

scripts/build_wheel.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -387,8 +387,9 @@ def generate_python_stubs_linux(binding_type: str, venv_python: Path,
387387

388388
try:
389389
if is_nanobind:
390-
build_run(f"\"{venv_python}\" -m nanobind.stubgen -m bindings -O .",
391-
env=env_stub_gen)
390+
build_run(
391+
f"\"{venv_python}\" -m nanobind.stubgen -m bindings -r -O .",
392+
env=env_stub_gen)
392393
else:
393394
build_run(
394395
f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code",
@@ -575,9 +576,9 @@ def get_binding_type_from_cache():
575576
nanobind_dir = build_dir / "tensorrt_llm" / "nanobind"
576577
if nanobind_dir.exists():
577578
rmtree(nanobind_dir)
578-
nanobind_stub_file = project_dir / "tensorrt_llm" / "bindings.pyi"
579-
if nanobind_stub_file.exists():
580-
nanobind_stub_file.unlink()
579+
nanobind_stub_dir = project_dir / "tensorrt_llm" / "bindings"
580+
if nanobind_stub_dir.exists():
581+
rmtree(nanobind_stub_dir)
581582

582583
pybind_dir = build_dir / "tensorrt_llm" / "pybind"
583584
if pybind_dir.exists():

scripts/format_test_list.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#!/usr/bin/env python3
2+
"""Normalize tabs and multiple spaces to single spaces in files."""
3+
import argparse
4+
import re
5+
import sys
6+
7+
8+
def normalize_whitespace(content: str) -> str:
9+
"""Remove leading whitespace, replace tabs and multiple spaces with single spaces."""
10+
lines = content.splitlines(keepends=True)
11+
normalized_lines = []
12+
13+
for line in lines:
14+
# Remove leading whitespace and tabs
15+
line = line.lstrip(' \t')
16+
# Replace tabs with single space
17+
line = line.replace('\t', ' ')
18+
# Replace multiple spaces with single space
19+
line = re.sub(r' +', ' ', line)
20+
normalized_lines.append(line)
21+
22+
return ''.join(normalized_lines)
23+
24+
25+
def main():
26+
parser = argparse.ArgumentParser(
27+
description='Normalize tabs and multiple spaces to single spaces')
28+
parser.add_argument('filenames', nargs='*', help='Filenames to fix')
29+
args = parser.parse_args()
30+
31+
retval = 0
32+
for filename in args.filenames:
33+
with open(filename, 'r', encoding='utf-8') as f:
34+
original_contents = f.read()
35+
36+
normalized_contents = normalize_whitespace(original_contents)
37+
38+
if original_contents != normalized_contents:
39+
print(f'Fixing {filename}')
40+
with open(filename, 'w', encoding='utf-8') as f:
41+
f.write(normalized_contents)
42+
retval = 1
43+
44+
return retval
45+
46+
47+
if __name__ == '__main__':
48+
sys.exit(main())

0 commit comments

Comments
 (0)