@@ -89,14 +89,15 @@ void DecoderState::setupBuffers(nvinfer1::DataType dtype, BufferManager const& b
 
     dOutput->lengths = bufferManager.emptyTensor(MemoryType::kGPU, nvSizeType);
 
-    // use batchSize many entries instead of the usual 1
     dOutput->finishedSum = bufferManager.emptyTensor(MemoryType::kGPU, nvSizeType);
     // we don't need dOutput->lengths because lengths are passed from outside
     dOutput->cumLogProbs = bufferManager.emptyTensor(MemoryType::kGPU, nvFloatType);
     dOutput->logProbs = bufferManager.emptyTensor(MemoryType::kGPU, nvFloatType);
     dOutput->beamHypotheses.empty(bufferManager);
+
     dOutput->finishReasons
         = bufferManager.emptyTensor(MemoryType::kGPU, TRTDataType<tk::FinishedState::UnderlyingType>::value);
+    dInput->finishReasons = dOutput->finishReasons;
 
     dOutput->logProbsTiled = bufferManager.emptyTensor(MemoryType::kGPU, nvFloatType);
 
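
The added assignment above is the heart of this change: rather than keeping a separate per-step mFinishedSteps tensor, the decoder input now aliases the decoder output's finishReasons buffer. A minimal standalone sketch of why the aliasing works, with plain std::shared_ptr standing in for TensorPtr (names are illustrative, not from this diff):

#include <cassert>
#include <memory>
#include <vector>

int main()
{
    // TensorPtr is a shared handle to storage: two views, one allocation.
    using TensorPtr = std::shared_ptr<std::vector<int>>;

    TensorPtr outputFinishReasons = std::make_shared<std::vector<int>>(8, 0);
    TensorPtr inputFinishReasons = outputFinishReasons; // mirrors the added assignment

    (*outputFinishReasons)[3] = 1;         // decoder marks slot 3 as finished
    assert((*inputFinishReasons)[3] == 1); // the input view sees it without any copy
    return 0;
}

Because both views share one allocation, a finish state written at step t is already visible to the decoder input at step t + 1, so no per-step snapshot tensor is needed.
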
@@ -106,8 +107,6 @@ void DecoderState::setupBuffers(nvinfer1::DataType dtype, BufferManager const& b
     dInput->badWordsLens = bufferManager.emptyTensor(MemoryType::kPINNEDPOOL, nvSizeType);
     dInput->embeddingBias = bufferManager.emptyTensor(MemoryType::kGPU, dtype);
 
-    mFinishedSteps = bufferManager.emptyTensor(MemoryType::kGPU, TRTDataType<tk::FinishedState::UnderlyingType>::value);
-
     mBeamSearchBuffers = std::make_unique<BeamSearchBuffers>(bufferManager);
 
     setupCacheIndirectionBuffers(bufferManager);
@@ -245,10 +244,6 @@ void DecoderState::reshapeBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWid
     auto& dOutput = *mJointDecodingOutput;
     dOutput.ids->reshape(maxTotalTokensShape);
 
-    auto const maxNewTokensShape = ITensor::makeShape({mMaxDecodingEngineTokens, mMaxBatchSize, mMaxBeamWidth});
-    mFinishedSteps->reshape(maxNewTokensShape);
-    bufferManager.setZero(*mFinishedSteps);
-
     dOutput.finishReasons->reshape(maxBatchSizeXmaxBeamWidthShape);
     bufferManager.setZero(*dOutput.finishReasons);
 
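
Note that reshapeBuffers still reshapes finishReasons to [maxBatchSize, maxBeamWidth] and zeroes it; the setZero is what resets every slot to "not finished" between runs. A standalone sketch of that invariant, with a hypothetical encoding (the real tk::FinishedState lives in the decoding kernels and is not part of this diff):

#include <cassert>
#include <cstdint>

// Hypothetical stand-in for tk::FinishedState: assumes the underlying type is
// byte-sized and that an all-zero value means "not finished yet".
enum class FinishReason : std::uint8_t
{
    kNotFinished = 0, // what bufferManager.setZero leaves in every element
    kFinished = 1,    // illustrative only
};

int main()
{
    std::uint8_t raw = 0; // one element of the freshly zeroed finishReasons buffer
    assert(static_cast<FinishReason>(raw) == FinishReason::kNotFinished);
    return 0;
}
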
@@ -260,6 +255,7 @@ void DecoderState::reshapeBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWid
     dOutput.finishedSum->reshape(maxBatchSizeShape);
     bufferManager.setZero(*dOutput.finishedSum);
 
+    auto const maxNewTokensShape = ITensor::makeShape({mMaxDecodingEngineTokens, mMaxBatchSize, mMaxBeamWidth});
     dOutput.newTokensSteps->reshape(maxNewTokensShape);
     bufferManager.setZero(*dOutput.newTokensSteps);
 
@@ -342,8 +338,6 @@ void DecoderState::reshapeSpeculativeDecodingBuffers(SpeculativeDecodingMode con
         mMaxDecodingEngineTokens);
 
     auto const maxNewTokensShape = ITensor::makeShape({mMaxDecodingEngineTokens, mMaxBatchSize, mMaxBeamWidth});
-    mFinishedSteps->reshape(maxNewTokensShape);
-    bufferManager.setZero(*mFinishedSteps);
     dOutput.newTokensSteps->reshape(maxNewTokensShape);
     bufferManager.setZero(*dOutput.newTokensSteps);
 
@@ -454,7 +448,6 @@ void DecoderState::disableLookahead(RequestVector const& genRequests)
 
     auto const maxNewTokensShape = ITensor::makeShape({mMaxDecodingEngineTokens, mMaxBatchSize, mMaxBeamWidth});
     mJointDecodingOutput->newTokensSteps->reshape(maxNewTokensShape);
-    mFinishedSteps->reshape(maxNewTokensShape);
 
     for (auto const& llmReq : genRequests)
     {
@@ -562,11 +555,6 @@ TensorPtr DecoderState::getAcceptedPackedPaths() const
     return mJointDecodingOutput->speculativeDecodingOutputs->pathsOffsets;
 }
 
-TensorPtr DecoderState::getFinishedSteps() const
-{
-    return mFinishedSteps;
-}
-
 SizeType32 DecoderState::getMaxBatchSize() const
 {
     return mMaxBatchSize;
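
With the getter gone, mFinishedSteps has no remaining readers and the member itself can be dropped; callers are expected to consult the shared finishReasons buffer instead. As a rough sense of scale, the removed tensor was shaped [mMaxDecodingEngineTokens, mMaxBatchSize, mMaxBeamWidth] while the surviving one is [mMaxBatchSize, mMaxBeamWidth]. A standalone back-of-the-envelope check under assumed sizes (the real values come from the model configuration, not this diff):

#include <cstdint>
#include <iostream>

int main()
{
    // Assumed example sizes, purely for illustration.
    std::int64_t const maxDecodingEngineTokens = 64;
    std::int64_t const maxBatchSize = 256;
    std::int64_t const maxBeamWidth = 4;
    std::int64_t const elemBytes = 1; // finished flags assumed byte-sized

    auto const removed = maxDecodingEngineTokens * maxBatchSize * maxBeamWidth * elemBytes;
    auto const kept = maxBatchSize * maxBeamWidth * elemBytes;
    std::cout << "per-step tensor: " << removed << " B, shared buffer: " << kept
              << " B, saved: " << (removed - kept) << " B\n";
    return 0;
}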