
Commit 018f4ba

Merge branch 'main' into user/bo/add-more-nixl-tests-main
2 parents: 4e21fab + 4200fa4

1,498 files changed: +8,273 additions, -5,287 deletions


benchmarks/cpp/README.md

Lines changed: 2 additions & 2 deletions
````diff
@@ -336,15 +336,15 @@ cd cpp/build
 `disaggServerBenchmark` only supports `decoder-only` models.
 Here is the basic usage:
 ```
-export TRTLLM_USE_MPI_KVCACHE=1
+export TRTLLM_USE_UCX_KVCACHE=1
 mpirun -n ${proc} benchmarks/disaggServerBenchmark --context_engine_dirs ${context_engine_0},${context_engine_1}...,${context_engine_{m-1}} \
 --generation_engine_dirs ${generation_engine_0},${generation_engine_1}...,${generation_engine_{n-1}} --dataset ${dataset_path}
 ```
 This command will launch m context engines and n generation engines. You need to ensure `proc` is equal to the sum of the number of processes required for each engine plus 1. Since we use orchestrator mode for `disaggServerBenchmark` we need an additional process as the orchestrator. For example, if there are two context engines (one is TP2_PP1,another is TP1_PP1) and two generation engines(one is TP2_PP1,another is TP1_PP1), then the `proc` value should be set to 7.

 for example:
 ```
-export TRTLLM_USE_MPI_KVCACHE=1
+export TRTLLM_USE_UCX_KVCACHE=1
 mpirun -n 7 benchmarks/disaggServerBenchmark --context_engine_dirs ${llama_7b_tp2_pp1_dir},${llama_7b_tp1_pp1_dir} --generation_engine_dirs ${llama_7b_tp1_pp1_dir},${llama_7b_tp2_pp1_dir} --dataset ${dataset_path}

 # need 6 gpus and 7 processes to launch the benchmark.
````
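The `proc` value in the example above follows from the rule stated in the README: each engine contributes its tensor-parallel times pipeline-parallel number of ranks, and the orchestrator adds one more process. Below is a minimal sketch (not part of the benchmark code; the per-engine process count of `tp * pp` is an assumption consistent with the 6-GPU / 7-process example) that reproduces the calculation:

```cpp
// Minimal sketch: derive the `proc` value for the example above.
// Assumption: each TPx_PPy engine needs x * y MPI ranks, plus one orchestrator rank.
#include <iostream>
#include <utility>
#include <vector>

int main()
{
    // {tp, pp} for two context engines (TP2_PP1, TP1_PP1)
    // and two generation engines (TP2_PP1, TP1_PP1).
    std::vector<std::pair<int, int>> engines{{2, 1}, {1, 1}, {2, 1}, {1, 1}};

    int proc = 1; // one extra process for the orchestrator
    for (auto const& [tp, pp] : engines)
    {
        proc += tp * pp;
    }
    std::cout << "mpirun -n " << proc << " ..." << std::endl; // prints 7 for this setup
    return 0;
}
```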

cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h

Lines changed: 4 additions & 12 deletions
```diff
@@ -75,27 +75,19 @@ class CreateNewDecoderRequests : Algorithm
         std::vector<executor::LookaheadDecodingConfig>>
     operator()(runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
         executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests,
-        runtime::BufferManager const& bufferManager, nvinfer1::DataType logitsType, DecoderInputBuffers& inputBuffers,
-        runtime::decoder::DecoderState& decoderState, CudaStream const& runtimeStream, CudaStream const& decoderStream,
-        SizeType32 maxSequenceLength, SizeType32 beamWidth, OptionalRef<MedusaBuffers const> medusaBuffers) const;
+        nvinfer1::DataType logitsType, DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState,
+        CudaStream const& runtimeStream, CudaStream const& decoderStream, SizeType32 maxSequenceLength,
+        SizeType32 beamWidth, OptionalRef<MedusaBuffers const> medusaBuffers) const;

     [[nodiscard]] std::tuple<std::vector<runtime::ITensor::SharedConstPtr>,
         std::vector<executor::LookaheadDecodingConfig>>
     createDecoderRequests(RequestVector const& finishedContextRequests, TensorPtr const& inputIds,
         executor::DecodingConfig const& decodingConfig, runtime::decoder::DecoderState& decoderState,
-        runtime::BufferManager const& bufferManager, nvinfer1::DataType logitsType,
-        runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
+        nvinfer1::DataType logitsType, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
         runtime::CudaStream const& runtimeStream, runtime::CudaStream const& decoderStream,
         SizeType32 maxSequenceLength, OptionalRef<MedusaBuffers const> medusaBuffers) const;

 private:
-    //! @brief Initialize the decoder at `batchSlot` with a new `request`. Exposed only for static batching via
-    //! GptDecoderBatched::newBatch()
-    static void newRequest(SizeType32 batchSlot, runtime::decoder_batch::Request const& request,
-        SamplingConfig const& samplingConfig, runtime::ModelConfig const& modelConfig,
-        runtime::decoder::DecoderState& decoderState, CudaStream const& runtimeStream, CudaStream const& decoderStream,
-        SizeType32 maxSequenceLength);
-
     //! @brief Setups decoder internal tensors for new speculative decoding request
     static void newRequestSpeculativeDecoding(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
         SamplingConfig const& samplingConfig, runtime::ModelConfig const& modelConfig,
```

cpp/include/tensorrt_llm/runtime/decoderState.h

Lines changed: 5 additions & 0 deletions
```diff
@@ -173,6 +173,11 @@ class DecoderState
     //! @brief Workspace for beam search in streaming mode.
     [[nodiscard]] BeamSearchBuffers const& getBeamSearchBuffers() const;

+    //! @brief Set the beam width for a specific request in the batch.
+    //! @param batchIdx The index of the request in the batch.
+    //! @param beamWidth The beam width for the specified request.
+    void setBeamWidth(SizeType32 batchIdx, SizeType32 beamWidth);
+
     //! @brief Cache indirection input for beam search.
     [[nodiscard]] TensorPtr getCacheIndirectionInput() const;
```
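The diff only adds the declaration of `setBeamWidth`; how it is stored inside `DecoderState` is not shown here. As a rough illustration of the per-request shape the signature implies, here is a standalone mock (not the real TensorRT-LLM class; the member and bounds handling are invented for the sketch) that keeps one beam width per batch slot:

```cpp
// Standalone mock, not TensorRT-LLM code: mirrors only the setBeamWidth
// declaration added in the diff above, with hypothetical internal storage.
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <vector>

using SizeType32 = std::int32_t;

class MockDecoderState
{
public:
    explicit MockDecoderState(SizeType32 maxBatchSize)
        : mBeamWidths(static_cast<std::size_t>(maxBatchSize), 1) // default beam width 1 per slot
    {
    }

    //! Set the beam width for a specific request in the batch.
    void setBeamWidth(SizeType32 batchIdx, SizeType32 beamWidth)
    {
        if (batchIdx < 0 || batchIdx >= static_cast<SizeType32>(mBeamWidths.size()))
        {
            throw std::out_of_range("batchIdx out of range");
        }
        mBeamWidths[static_cast<std::size_t>(batchIdx)] = beamWidth;
    }

private:
    std::vector<SizeType32> mBeamWidths; // hypothetical per-slot storage
};

int main()
{
    MockDecoderState state(8);
    state.setBeamWidth(/*batchIdx=*/3, /*beamWidth=*/4);
    std::cout << "beam width set for slot 3" << std::endl;
    return 0;
}
```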

cpp/include/tensorrt_llm/runtime/request.h

Lines changed: 2 additions & 12 deletions
```diff
@@ -31,26 +31,16 @@ class Request
     using TensorPtr = ITensor::SharedPtr;
     using BufferPtr = IBuffer::SharedPtr;

-    explicit Request(TensorConstPtr ids, SizeType32 inputLen, std::optional<SizeType32> maxNewTokens = std::nullopt,
-        std::optional<SizeType32> endId = std::nullopt)
-        : ids{std::move(ids)}
-        , inputLen(inputLen)
-        , maxNewTokens{maxNewTokens}
-        , endId{endId}
+    explicit Request(SizeType32 inputLen)
+        : inputLen(inputLen)
     {
     }

     //! Mandatory parameters
-    TensorConstPtr ids;  // The input sequence of token ids, [inputSeqLen], on gpu
     SizeType32 inputLen; // Input length without draft tokens, increasing with generation steps

     // optional parameters
-    std::optional<SizeType32> maxNewTokens; // maximum number of tokens to generate for this request
-    std::optional<SizeType32> endId;        // end token id
     SizeType32 generatedTokensPerEngineStep{1}; //
-    TensorPtr embeddingBias; // [vocabSizePadded], on gpu
-    TensorPtr badWordsList;  // [2, badWordsLength] on gpu
-    TensorPtr stopWordsList; // [2, stopWordsLength] on gpu

     //! Optional parameters for speculative decoding
     BufferPtr draftTokens; // [generatedTokensPerEngineStep - 1] on gpu
```
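After this change the decoder-batch `Request` takes only the input length in its constructor; the remaining members shown in the diff are set directly on the object. A small standalone sketch follows (a simplified stand-in struct, not the real header; `BufferPtr` is replaced by a plain `shared_ptr` so the example compiles on its own):

```cpp
// Standalone sketch mirroring the trimmed Request above; types are simplified
// stand-ins, not the TensorRT-LLM runtime types.
#include <cstdint>
#include <iostream>
#include <memory>
#include <vector>

using SizeType32 = std::int32_t;
using BufferPtr = std::shared_ptr<std::vector<std::int32_t>>; // stand-in for IBuffer::SharedPtr

struct Request
{
    explicit Request(SizeType32 inputLen)
        : inputLen(inputLen)
    {
    }

    //! Mandatory parameter
    SizeType32 inputLen; // input length without draft tokens

    // Optional parameters
    SizeType32 generatedTokensPerEngineStep{1};
    BufferPtr draftTokens; // [generatedTokensPerEngineStep - 1] draft tokens
};

int main()
{
    Request req(/*inputLen=*/128); // ids, maxNewTokens, endId are no longer passed here
    req.generatedTokensPerEngineStep = 4;
    req.draftTokens = std::make_shared<std::vector<std::int32_t>>(3); // generatedTokensPerEngineStep - 1
    std::cout << "inputLen=" << req.inputLen << std::endl;
    return 0;
}
```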

cpp/kernels/fmha_v2/fmha_test.py

Lines changed: 17 additions & 0 deletions
```diff
@@ -183,6 +183,23 @@ def test_trtllm_context_mla_attention_fmha(dtype, s, input_layout):
             shell=True,
             check=True)

+    # For chunked prefill, we need to enable -save-softmax (dtype: bf16, sm90, layout: paged-kv or separate-q-k-v).
+    if dtype == "-bf16" and input_layout in [
+            "-paged-kv", "-separate-q-k-v"
+    ]:
+        # padding mask
+        subprocess.run(
+            f"bin/fmha.exe -v 0 -runs 1 -min-s 1024 -s {s} -b 8 -h 8 -d 192 -dv 128 {dtype} \
+            {epsilon} {input_layout} -save-softmax",
+            shell=True,
+            check=True)
+        # causal mask
+        subprocess.run(
+            f"bin/fmha.exe -v 0 -runs 1 -min-s 1024 -s {s} -b 8 -h 8 -d 192 -dv 128 {dtype} \
+            -causal-mask {epsilon} {input_layout} -save-softmax",
+            shell=True,
+            check=True)
+

 @pytest.mark.parametrize('dtype', ["-bf16", "-e4m3", "-e4m3 -bf16-output"],
                          ids=["bf16", "e4m3", "e4m3-bf16"])
```
