
Commit 018f4ba

Merge branch 'main' into user/bo/add-more-nixl-tests-main
2 parents: 4e21fab + 4200fa4

1,498 files changed: +8,273 additions, -5,287 deletions


benchmarks/cpp/README.md

Lines changed: 2 additions & 2 deletions
````diff
@@ -336,15 +336,15 @@ cd cpp/build
 `disaggServerBenchmark` only supports `decoder-only` models.
 Here is the basic usage:
 ```
-export TRTLLM_USE_MPI_KVCACHE=1
+export TRTLLM_USE_UCX_KVCACHE=1
 mpirun -n ${proc} benchmarks/disaggServerBenchmark --context_engine_dirs ${context_engine_0},${context_engine_1}...,${context_engine_{m-1}} \
 --generation_engine_dirs ${generation_engine_0},${generation_engine_1}...,${generation_engine_{n-1}} --dataset ${dataset_path}
 ```
 This command will launch m context engines and n generation engines. You need to ensure `proc` is equal to the sum of the number of processes required for each engine plus 1. Since we use orchestrator mode for `disaggServerBenchmark` we need an additional process as the orchestrator. For example, if there are two context engines (one is TP2_PP1,another is TP1_PP1) and two generation engines(one is TP2_PP1,another is TP1_PP1), then the `proc` value should be set to 7.

 for example:
 ```
-export TRTLLM_USE_MPI_KVCACHE=1
+export TRTLLM_USE_UCX_KVCACHE=1
 mpirun -n 7 benchmarks/disaggServerBenchmark --context_engine_dirs ${llama_7b_tp2_pp1_dir},${llama_7b_tp1_pp1_dir} --generation_engine_dirs ${llama_7b_tp1_pp1_dir},${llama_7b_tp2_pp1_dir} --dataset ${dataset_path}

 # need 6 gpus and 7 processes to launch the benchmark.
````
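The `proc` value in the example above follows from the rule stated in the README: each engine contributes its tensor-parallel times pipeline-parallel number of ranks, and the orchestrator adds one more process. Below is a minimal sketch (not part of the benchmark code; the per-engine process count of `tp * pp` is an assumption consistent with the 6-GPU / 7-process example) that reproduces the calculation:

```cpp
// Minimal sketch: derive the `proc` value for the example above.
// Assumption: each TPx_PPy engine needs x * y MPI ranks, plus one orchestrator rank.
#include <iostream>
#include <utility>
#include <vector>

int main()
{
    // {tp, pp} for two context engines (TP2_PP1, TP1_PP1)
    // and two generation engines (TP2_PP1, TP1_PP1).
    std::vector<std::pair<int, int>> engines{{2, 1}, {1, 1}, {2, 1}, {1, 1}};

    int proc = 1; // one extra process for the orchestrator
    for (auto const& [tp, pp] : engines)
    {
        proc += tp * pp;
    }
    std::cout << "mpirun -n " << proc << " ..." << std::endl; // prints 7 for this setup
    return 0;
}
```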

cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h

Lines changed: 4 additions & 12 deletions
```diff
@@ -75,27 +75,19 @@ class CreateNewDecoderRequests : Algorithm
         std::vector<executor::LookaheadDecodingConfig>>
     operator()(runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
         executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests,
-        runtime::BufferManager const& bufferManager, nvinfer1::DataType logitsType, DecoderInputBuffers& inputBuffers,
-        runtime::decoder::DecoderState& decoderState, CudaStream const& runtimeStream, CudaStream const& decoderStream,
-        SizeType32 maxSequenceLength, SizeType32 beamWidth, OptionalRef<MedusaBuffers const> medusaBuffers) const;
+        nvinfer1::DataType logitsType, DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState,
+        CudaStream const& runtimeStream, CudaStream const& decoderStream, SizeType32 maxSequenceLength,
+        SizeType32 beamWidth, OptionalRef<MedusaBuffers const> medusaBuffers) const;

     [[nodiscard]] std::tuple<std::vector<runtime::ITensor::SharedConstPtr>,
         std::vector<executor::LookaheadDecodingConfig>>
     createDecoderRequests(RequestVector const& finishedContextRequests, TensorPtr const& inputIds,
         executor::DecodingConfig const& decodingConfig, runtime::decoder::DecoderState& decoderState,
-        runtime::BufferManager const& bufferManager, nvinfer1::DataType logitsType,
-        runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
+        nvinfer1::DataType logitsType, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
         runtime::CudaStream const& runtimeStream, runtime::CudaStream const& decoderStream,
         SizeType32 maxSequenceLength, OptionalRef<MedusaBuffers const> medusaBuffers) const;

 private:
-    //! @brief Initialize the decoder at `batchSlot` with a new `request`. Exposed only for static batching via
-    //! GptDecoderBatched::newBatch()
-    static void newRequest(SizeType32 batchSlot, runtime::decoder_batch::Request const& request,
-        SamplingConfig const& samplingConfig, runtime::ModelConfig const& modelConfig,
-        runtime::decoder::DecoderState& decoderState, CudaStream const& runtimeStream, CudaStream const& decoderStream,
-        SizeType32 maxSequenceLength);
-
     //! @brief Setups decoder internal tensors for new speculative decoding request
     static void newRequestSpeculativeDecoding(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
         SamplingConfig const& samplingConfig, runtime::ModelConfig const& modelConfig,
```

cpp/include/tensorrt_llm/runtime/decoderState.h

Lines changed: 5 additions & 0 deletions
```diff
@@ -173,6 +173,11 @@ class DecoderState
     //! @brief Workspace for beam search in streaming mode.
     [[nodiscard]] BeamSearchBuffers const& getBeamSearchBuffers() const;

+    //! @brief Set the beam width for a specific request in the batch.
+    //! @param batchIdx The index of the request in the batch.
+    //! @param beamWidth The beam width for the specified request.
+    void setBeamWidth(SizeType32 batchIdx, SizeType32 beamWidth);
+
     //! @brief Cache indirection input for beam search.
     [[nodiscard]] TensorPtr getCacheIndirectionInput() const;
```
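The diff only adds the declaration of `setBeamWidth`; how it is stored inside `DecoderState` is not shown here. As a rough illustration of the per-request shape the signature implies, here is a standalone mock (not the real TensorRT-LLM class; the member and bounds handling are invented for the sketch) that keeps one beam width per batch slot:

```cpp
// Standalone mock, not TensorRT-LLM code: mirrors only the setBeamWidth
// declaration added in the diff above, with hypothetical internal storage.
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <vector>

using SizeType32 = std::int32_t;

class MockDecoderState
{
public:
    explicit MockDecoderState(SizeType32 maxBatchSize)
        : mBeamWidths(static_cast<std::size_t>(maxBatchSize), 1) // default beam width 1 per slot
    {
    }

    //! Set the beam width for a specific request in the batch.
    void setBeamWidth(SizeType32 batchIdx, SizeType32 beamWidth)
    {
        if (batchIdx < 0 || batchIdx >= static_cast<SizeType32>(mBeamWidths.size()))
        {
            throw std::out_of_range("batchIdx out of range");
        }
        mBeamWidths[static_cast<std::size_t>(batchIdx)] = beamWidth;
    }

private:
    std::vector<SizeType32> mBeamWidths; // hypothetical per-slot storage
};

int main()
{
    MockDecoderState state(8);
    state.setBeamWidth(/*batchIdx=*/3, /*beamWidth=*/4);
    std::cout << "beam width set for slot 3" << std::endl;
    return 0;
}
```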

cpp/include/tensorrt_llm/runtime/request.h

Lines changed: 2 additions & 12 deletions
```diff
@@ -31,26 +31,16 @@ class Request
     using TensorPtr = ITensor::SharedPtr;
     using BufferPtr = IBuffer::SharedPtr;

-    explicit Request(TensorConstPtr ids, SizeType32 inputLen, std::optional<SizeType32> maxNewTokens = std::nullopt,
-        std::optional<SizeType32> endId = std::nullopt)
-        : ids{std::move(ids)}
-        , inputLen(inputLen)
-        , maxNewTokens{maxNewTokens}
-        , endId{endId}
+    explicit Request(SizeType32 inputLen)
+        : inputLen(inputLen)
     {
     }

     //! Mandatory parameters
-    TensorConstPtr ids;  // The input sequence of token ids, [inputSeqLen], on gpu
     SizeType32 inputLen; // Input length without draft tokens, increasing with generation steps

     // optional parameters
-    std::optional<SizeType32> maxNewTokens; // maximum number of tokens to generate for this request
-    std::optional<SizeType32> endId;        // end token id
     SizeType32 generatedTokensPerEngineStep{1}; //
-    TensorPtr embeddingBias; // [vocabSizePadded], on gpu
-    TensorPtr badWordsList;  // [2, badWordsLength] on gpu
-    TensorPtr stopWordsList; // [2, stopWordsLength] on gpu

     //! Optional parameters for speculative decoding
     BufferPtr draftTokens; // [generatedTokensPerEngineStep - 1] on gpu
```
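After this change the decoder-batch `Request` takes only the input length in its constructor; the remaining members shown in the diff are set directly on the object. A small standalone sketch follows (a simplified stand-in struct, not the real header; `BufferPtr` is replaced by a plain `shared_ptr` so the example compiles on its own):

```cpp
// Standalone sketch mirroring the trimmed Request above; types are simplified
// stand-ins, not the TensorRT-LLM runtime types.
#include <cstdint>
#include <iostream>
#include <memory>
#include <vector>

using SizeType32 = std::int32_t;
using BufferPtr = std::shared_ptr<std::vector<std::int32_t>>; // stand-in for IBuffer::SharedPtr

struct Request
{
    explicit Request(SizeType32 inputLen)
        : inputLen(inputLen)
    {
    }

    //! Mandatory parameter
    SizeType32 inputLen; // input length without draft tokens

    // Optional parameters
    SizeType32 generatedTokensPerEngineStep{1};
    BufferPtr draftTokens; // [generatedTokensPerEngineStep - 1] draft tokens
};

int main()
{
    Request req(/*inputLen=*/128); // ids, maxNewTokens, endId are no longer passed here
    req.generatedTokensPerEngineStep = 4;
    req.draftTokens = std::make_shared<std::vector<std::int32_t>>(3); // generatedTokensPerEngineStep - 1
    std::cout << "inputLen=" << req.inputLen << std::endl;
    return 0;
}
```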

cpp/kernels/fmha_v2/fmha_test.py

Lines changed: 17 additions & 0 deletions
```diff
@@ -183,6 +183,23 @@ def test_trtllm_context_mla_attention_fmha(dtype, s, input_layout):
             shell=True,
             check=True)

+    # For chunked prefill, we need to enable -save-softmax (dtype: bf16, sm90, layout: paged-kv or separate-q-k-v).
+    if dtype == "-bf16" and input_layout in [
+            "-paged-kv", "-separate-q-k-v"
+    ]:
+        # padding mask
+        subprocess.run(
+            f"bin/fmha.exe -v 0 -runs 1 -min-s 1024 -s {s} -b 8 -h 8 -d 192 -dv 128 {dtype} \
+            {epsilon} {input_layout} -save-softmax",
+            shell=True,
+            check=True)
+        # causal mask
+        subprocess.run(
+            f"bin/fmha.exe -v 0 -runs 1 -min-s 1024 -s {s} -b 8 -h 8 -d 192 -dv 128 {dtype} \
+            -causal-mask {epsilon} {input_layout} -save-softmax",
+            shell=True,
+            check=True)
+

 @pytest.mark.parametrize('dtype', ["-bf16", "-e4m3", "-e4m3 -bf16-output"],
                          ids=["bf16", "e4m3", "e4m3-bf16"])
```
