Skip to content

Commit 918fedf

Browse files
authored
[None][refactor] Simplify finish reasons handling in DecoderState (#6524)
Signed-off-by: Robin Kobus <[email protected]>
1 parent 67a3fd8 commit 918fedf

File tree

6 files changed

+9
-55
lines changed

6 files changed

+9
-55
lines changed

cpp/include/tensorrt_llm/runtime/decoderState.h

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ class DecoderState
7171
//! @returns [batchSize], number of finished sequences per request, on gpu
7272
[[nodiscard]] TensorPtr getFinishedSum() const;
7373

74-
//! @returns [batchSize, beamWidth], FinishedState value, on gpu
74+
//! @returns [batchSize, beamWidth], finished states of type FinishedState, on gpu
7575
[[nodiscard]] TensorPtr getFinishReasons() const;
7676

7777
//! @returns [batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token
@@ -134,9 +134,6 @@ class DecoderState
134134
//! @returns [batchSize, maxAcceptedDraftTokensPerStep], accepted paths packed into continuous tensor, on gpu
135135
[[nodiscard]] TensorPtr getAcceptedPackedPaths() const;
136136

137-
//! @returns [maxTokensPerStep, batchSize, beamWidth], finished states of type FinishedState, on gpu
138-
[[nodiscard]] TensorPtr getFinishedSteps() const;
139-
140137
[[nodiscard]] SizeType32 getMaxBatchSize() const;
141138

142139
[[nodiscard]] SizeType32 getMaxBeamWidth() const;
@@ -221,10 +218,6 @@ class DecoderState
221218
//! @brief Stateful outputs for the decoder. Allocated for maxBatchSize slots.
222219
DecodingOutputPtr mJointDecodingOutput;
223220

224-
//! @brief [maxTokensPerStep, batchSize, beamWidth] finished states of type FinishedState for each generated token
225-
//! of maxTokensPerStep, on gpu
226-
TensorPtr mFinishedSteps;
227-
228221
//! @brief Workspace for beam search in streaming mode.
229222
std::unique_ptr<BeamSearchBuffers> mBeamSearchBuffers;
230223

cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -272,22 +272,8 @@ void CreateNewDecoderRequests::newRequest(SizeType32 batchSlot, runtime::decoder
272272
manager.setZero(*newTokensVec);
273273
}
274274

275-
// FIXME: we call setZero mMaxDecodingEngineTokens times for only 1 element
276-
for (SizeType32 ti = 0; ti < decoderState.getMaxDecodingEngineTokens(); ++ti)
277-
{
278-
TensorPtr const finishedStepsView = ITensor::slice(decoderState.getFinishedSteps(), ti, 1);
279-
finishedStepsView->squeeze(0);
280-
TensorPtr const finishedSteps = ITensor::slice(finishedStepsView, batchSlot, 1);
281-
if (ti < numDecodingEngineTokens)
282-
{
283-
manager.setZero(*finishedSteps);
284-
}
285-
else
286-
{
287-
runtime::kernels::invokeFill(
288-
*finishedSteps, tk::FinishedState::skipDecoding().toUnderlying(), decoderStream);
289-
}
290-
}
275+
TensorPtr const finishedStepsSlice = ITensor::slice(decoderState.getFinishReasons(), batchSlot, 1);
276+
manager.setZero(*finishedStepsSlice);
291277

292278
// cumLogProb is mandatory for beamWidth > 1
293279
if ((samplingConfig.cumLogProbs.has_value() && samplingConfig.cumLogProbs->at(0)) || beamWidth > 1)

cpp/tensorrt_llm/nanobind/runtime/bindings.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,6 @@ void initBindings(nb::module_& m)
255255
.def_prop_ro("next_draft_tokens_lengths", &tr::decoder::DecoderState::getNextDraftTokensLengths)
256256
.def_prop_ro("accepted_lengths_cum_sum", &tr::decoder::DecoderState::getAcceptedLengthsCumSum)
257257
.def_prop_ro("accepted_packed_paths", &tr::decoder::DecoderState::getAcceptedPackedPaths)
258-
.def_prop_ro("finished_steps", &tr::decoder::DecoderState::getFinishedSteps)
259258
.def_prop_ro("max_beam_width", &tr::decoder::DecoderState::getMaxBeamWidth)
260259
.def_prop_ro("max_sequence_length", &tr::decoder::DecoderState::getMaxSequenceLength)
261260
.def_prop_ro("max_decoding_decoder_tokens", &tr::decoder::DecoderState::getMaxDecodingDecoderTokens)

cpp/tensorrt_llm/pybind/runtime/bindings.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -349,7 +349,6 @@ void initBindings(pybind11::module_& m)
349349
.def_property_readonly("next_draft_tokens_lengths", &tr::decoder::DecoderState::getNextDraftTokensLengths)
350350
.def_property_readonly("accepted_lengths_cum_sum", &tr::decoder::DecoderState::getAcceptedLengthsCumSum)
351351
.def_property_readonly("accepted_packed_paths", &tr::decoder::DecoderState::getAcceptedPackedPaths)
352-
.def_property_readonly("finished_steps", &tr::decoder::DecoderState::getFinishedSteps)
353352
.def_property_readonly("max_beam_width", &tr::decoder::DecoderState::getMaxBeamWidth)
354353
.def_property_readonly("max_sequence_length", &tr::decoder::DecoderState::getMaxSequenceLength)
355354
.def_property_readonly("max_decoding_decoder_tokens", &tr::decoder::DecoderState::getMaxDecodingDecoderTokens)

cpp/tensorrt_llm/runtime/decoderState.cpp

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -89,14 +89,15 @@ void DecoderState::setupBuffers(nvinfer1::DataType dtype, BufferManager const& b
8989

9090
dOutput->lengths = bufferManager.emptyTensor(MemoryType::kGPU, nvSizeType);
9191

92-
// use batchSize many entries instead of the usual 1
9392
dOutput->finishedSum = bufferManager.emptyTensor(MemoryType::kGPU, nvSizeType);
9493
// we don't need dOutput->lengths because lengths are passed from outside
9594
dOutput->cumLogProbs = bufferManager.emptyTensor(MemoryType::kGPU, nvFloatType);
9695
dOutput->logProbs = bufferManager.emptyTensor(MemoryType::kGPU, nvFloatType);
9796
dOutput->beamHypotheses.empty(bufferManager);
97+
9898
dOutput->finishReasons
9999
= bufferManager.emptyTensor(MemoryType::kGPU, TRTDataType<tk::FinishedState::UnderlyingType>::value);
100+
dInput->finishReasons = dOutput->finishReasons;
100101

101102
dOutput->logProbsTiled = bufferManager.emptyTensor(MemoryType::kGPU, nvFloatType);
102103

@@ -106,8 +107,6 @@ void DecoderState::setupBuffers(nvinfer1::DataType dtype, BufferManager const& b
106107
dInput->badWordsLens = bufferManager.emptyTensor(MemoryType::kPINNEDPOOL, nvSizeType);
107108
dInput->embeddingBias = bufferManager.emptyTensor(MemoryType::kGPU, dtype);
108109

109-
mFinishedSteps = bufferManager.emptyTensor(MemoryType::kGPU, TRTDataType<tk::FinishedState::UnderlyingType>::value);
110-
111110
mBeamSearchBuffers = std::make_unique<BeamSearchBuffers>(bufferManager);
112111

113112
setupCacheIndirectionBuffers(bufferManager);
@@ -245,10 +244,6 @@ void DecoderState::reshapeBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWid
245244
auto& dOutput = *mJointDecodingOutput;
246245
dOutput.ids->reshape(maxTotalTokensShape);
247246

248-
auto const maxNewTokensShape = ITensor::makeShape({mMaxDecodingEngineTokens, mMaxBatchSize, mMaxBeamWidth});
249-
mFinishedSteps->reshape(maxNewTokensShape);
250-
bufferManager.setZero(*mFinishedSteps);
251-
252247
dOutput.finishReasons->reshape(maxBatchSizeXmaxBeamWidthShape);
253248
bufferManager.setZero(*dOutput.finishReasons);
254249

@@ -260,6 +255,7 @@ void DecoderState::reshapeBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWid
260255
dOutput.finishedSum->reshape(maxBatchSizeShape);
261256
bufferManager.setZero(*dOutput.finishedSum);
262257

258+
auto const maxNewTokensShape = ITensor::makeShape({mMaxDecodingEngineTokens, mMaxBatchSize, mMaxBeamWidth});
263259
dOutput.newTokensSteps->reshape(maxNewTokensShape);
264260
bufferManager.setZero(*dOutput.newTokensSteps);
265261

@@ -342,8 +338,6 @@ void DecoderState::reshapeSpeculativeDecodingBuffers(SpeculativeDecodingMode con
342338
mMaxDecodingEngineTokens);
343339

344340
auto const maxNewTokensShape = ITensor::makeShape({mMaxDecodingEngineTokens, mMaxBatchSize, mMaxBeamWidth});
345-
mFinishedSteps->reshape(maxNewTokensShape);
346-
bufferManager.setZero(*mFinishedSteps);
347341
dOutput.newTokensSteps->reshape(maxNewTokensShape);
348342
bufferManager.setZero(*dOutput.newTokensSteps);
349343

@@ -454,7 +448,6 @@ void DecoderState::disableLookahead(RequestVector const& genRequests)
454448

455449
auto const maxNewTokensShape = ITensor::makeShape({mMaxDecodingEngineTokens, mMaxBatchSize, mMaxBeamWidth});
456450
mJointDecodingOutput->newTokensSteps->reshape(maxNewTokensShape);
457-
mFinishedSteps->reshape(maxNewTokensShape);
458451

459452
for (auto const& llmReq : genRequests)
460453
{
@@ -562,11 +555,6 @@ TensorPtr DecoderState::getAcceptedPackedPaths() const
562555
return mJointDecodingOutput->speculativeDecodingOutputs->pathsOffsets;
563556
}
564557

565-
TensorPtr DecoderState::getFinishedSteps() const
566-
{
567-
return mFinishedSteps;
568-
}
569-
570558
SizeType32 DecoderState::getMaxBatchSize() const
571559
{
572560
return mMaxBatchSize;

cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -116,16 +116,6 @@ void prepareForward(decoder::DecoderState const& decoderState, SizeType32 step,
116116
dInput.batchSize = static_cast<SizeType32>(dInput.batchSlots->getSize());
117117
dInput.logitsVec = input.logits.at(step);
118118

119-
TensorPtr finishedStepsInput = ITensor::slice(decoderState.getFinishedSteps(), step, 1);
120-
TensorPtr finishedStepsOutput
121-
= ITensor::slice(decoderState.getFinishedSteps(), std::min(input.maxDecoderSteps - 1, step + 1), 1);
122-
finishedStepsInput->squeeze(0);
123-
finishedStepsOutput->squeeze(0);
124-
TensorPtr newTokensStepView
125-
= ITensor::slice(dOutput.newTokensSteps, step, decoderState.getMaxDecodingDecoderTokens());
126-
127-
dInput.finishReasons = finishedStepsInput;
128-
129119
if (speculativeDecodingMode.isDraftTokensExternal())
130120
{
131121
dInput.externalDraftTokensInputs->step = step;
@@ -136,14 +126,13 @@ void prepareForward(decoder::DecoderState const& decoderState, SizeType32 step,
136126
auto batchSlotsRange = BufferRange<SizeType32 const>(*dInput.batchSlots);
137127
for (auto batchSlot : batchSlotsRange)
138128
{
139-
TensorPtr finishedSteps = ITensor::slice(finishedStepsInput, batchSlot, 1);
140-
bufferManager.setZero(*finishedSteps);
129+
TensorPtr finishedStepsSlice = ITensor::slice(decoderState.getFinishReasons(), batchSlot, 1);
130+
bufferManager.setZero(*finishedStepsSlice);
141131
}
142132
}
143133
}
144134

145-
dOutput.newTokens = newTokensStepView;
146-
dOutput.finishReasons = finishedStepsOutput;
135+
dOutput.newTokens = ITensor::slice(dOutput.newTokensSteps, step, decoderState.getMaxDecodingDecoderTokens());
147136

148137
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
149138
}

0 commit comments

Comments (0)