Skip to content

Commit 1a21dba

Browse files
committed
Initial iteration for supporting block hash transfer
Signed-off-by: Iman Tabrizian <[email protected]>
1 parent d1c5f80 commit 1a21dba

File tree

13 files changed

+304
-217
lines changed

13 files changed

+304
-217
lines changed

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -566,6 +566,8 @@ class WindowBlockManager
566566

567567
void storeNewBlock(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest);
568568

569+
void pinBlocks(GenerationRequest& sequence);
570+
569571
//! \brief Release blocks of the sequence.
570572
void releaseBlocks(GenerationRequest& sequence);
571573

@@ -883,6 +885,8 @@ class BlockManager
883885

884886
void schedulingReleaseBlocks(LlmRequest::RequestIdType requestId);
885887

888+
void pinBlocks(GenerationRequest& sequence);
889+
886890
void releaseLastBlock(GenerationRequest& sequence, SizeType32 windowSize);
887891

888892
void setOffsets(kernels::KVCacheIndex* offsetsPtr, nvinfer1::Dims const& offsetsShape, SizeType32 beamIdx,
@@ -1067,6 +1071,16 @@ class BlockManager
10671071
return mWindowBlockManagers.at(windowSize).getBlockById(blockId);
10681072
}
10691073

1074+
[[nodiscard]] WindowBlockManager::BlockMapIterRange getBlocksByHash(size_t hash, SizeType32 windowSize) const
1075+
{
1076+
return mWindowBlockManagers.at(windowSize).getBlocksByHash(hash);
1077+
}
1078+
1079+
[[nodiscard]] BlockPtr const& getBlockFromReuseTreeByHash(size_t hash, SizeType32 windowSize) const
1080+
{
1081+
return mWindowBlockManagers.at(windowSize).getBlockFromReuseTreeByHash(hash);
1082+
}
1083+
10701084
[[nodiscard]] SizeType32 getNumPrimaryBlocks() const
10711085
{
10721086
return sumWindows([](auto const& manager) { return manager.getNumPrimaryBlocks(); });
@@ -1201,6 +1215,8 @@ class BaseKVCacheManager
12011215
[[nodiscard]] virtual SizeType32 getRemainingBlocksToCompletion(LlmRequest const& req, SizeType32 windowSize) const
12021216
= 0;
12031217

1218+
virtual void pinBlocks(LlmRequest::RequestIdType requestId) = 0;
1219+
12041220
/// @brief Increase size for request at seqSlotIdx. Allocate new KV cache block(s) if needed.
12051221
virtual void addToken(LlmRequest::RequestIdType requestId) = 0;
12061222

@@ -1588,6 +1604,8 @@ class KVCacheManager : public BaseKVCacheManager
15881604
[[nodiscard]] static SizeType32 calculateMaxBlockRequirements(SizeType32 inputLength, SizeType32 outputLength,
15891605
SizeType32 sinkTokenLength, SizeType32 windowSize, SizeType32 beamWidth, SizeType32 tokensPerBlock);
15901606

1607+
void pinBlocks(LlmRequest::RequestIdType requestId);
1608+
15911609
/// @brief Calculates the number of kv-cache blocks that a sequence will require, for a single beam.
15921610
///
15931611
/// @param sequenceLength The total length of the sequence (input and output).

cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,19 @@ class BlockRange
8080
mBlockIds = std::move(blockIds);
8181
}
8282

83+
void setBlockIdsFromHashes(std::vector<size_t> blockHashes)
84+
{
85+
TLLM_CHECK(mManager);
86+
std::vector<SizeType32> blockIds;
87+
blockIds.reserve(blockHashes.size());
88+
auto& blockManager = mManager->getBlockManager();
89+
for (auto hash : blockHashes)
90+
{
91+
blockIds.emplace_back(blockManager.getBlockByHash(hash, mWindowSize)->getId());
92+
}
93+
mBlockIds = std::move(blockIds);
94+
}
95+
8396
[[nodiscard]] std::vector<size_t> getBlockHashes() const
8497
{
8598
TLLM_CHECK(mManager);

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1831,16 +1831,6 @@ class GenericLlmRequest
18311831
}
18321832
}
18331833

1834-
void setRequestedBlockHashes(std::vector<size_t> hashes)
1835-
{
1836-
mRequestedBlockHashes = std::move(hashes);
1837-
}
1838-
1839-
[[nodiscard]] std::vector<size_t> const& getRequestedBlockHashes() const
1840-
{
1841-
return mRequestedBlockHashes;
1842-
}
1843-
18441834
void setIsDummyRequest(bool isDummyRequest)
18451835
{
18461836
mIsDummyRequest = isDummyRequest;

cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -39,23 +39,20 @@
3939
namespace tensorrt_llm::batch_manager::kv_cache_manager
4040
{
4141

42-
BlockRange getBlockRangeForSending(BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest)
42+
BlockRange getBlockRangeForSending(
43+
BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest, std::vector<size_t> const& blockHashes)
4344
{
44-
size_t requestBlockNum = llmRequest.getRequestedBlockHashes().size();
45+
size_t requestBlockNum = blockHashes.size();
4546
constexpr SizeType32 beam{0};
4647
auto blockRange = BlockRange::fromAllBlockIds(*cacheManager, llmRequest.mRequestId, beam);
4748
auto poolNum = cacheManager->getBlockManager().getNumPools();
48-
if (poolNum > 1 || common::getEnvDisableSelectiveCacheTransfer())
49+
if (poolNum > 1 || !cacheManager->isEnableBlockReuse())
4950
{
50-
// disable selective cache transfer for poolNum > 1
5151
return blockRange;
5252
}
5353
if (requestBlockNum < blockRange.size() && requestBlockNum > 0)
5454
{
55-
// handle block reuse, the prefix blocks are reused
56-
// TODO(zhengd): pass the hashes directly instead of from llmRequest; use hash instead of block num
57-
auto const& ids = blockRange.getBlockIds();
58-
blockRange.setBlockIds({ids.end() - requestBlockNum, ids.end()});
55+
blockRange.setBlockIds(blockHashes);
5956
}
6057
return blockRange;
6158
}
@@ -64,12 +61,15 @@ BlockRange getBlockRangeForReceiving(BaseKVCacheManager* cacheManager, LlmReques
6461
{
6562

6663
auto poolNum = cacheManager->getBlockManager().getNumPools();
67-
if (poolNum > 1 || common::getEnvDisableSelectiveCacheTransfer())
64+
if (poolNum == 1 && cacheManager->isEnableBlockReuse())
65+
{
66+
return BlockRange::fromNewlyAllocatedBlockIds(*cacheManager, llmRequest.mRequestId);
67+
}
68+
else
6869
{
6970
constexpr SizeType32 beam{0};
7071
return BlockRange::fromAllBlockIds(*cacheManager, llmRequest.mRequestId, beam);
7172
}
72-
return BlockRange::fromNewlyAllocatedBlockIds(*cacheManager, llmRequest.mRequestId);
7373
}
7474

7575
bool CacheFormatter::needSendCache(
@@ -155,13 +155,14 @@ void CacheFormatter::format(TransferSession& session)
155155
auto const& selfConfig = session.getSelfState().getCacheState().value();
156156
auto const& destConfig = session.getOtherState().getCacheState().value();
157157
auto const selfIdx = session.getSelfState().getCommState().value().getSelfIdx();
158+
auto& blockHashes = session.getBlockHashes();
158159
auto& bufferManager = session.getBufferManager();
159160
if (!needSendCache(selfConfig, destConfig, selfIdx))
160161
{
161162
return;
162163
}
163164
auto& blockManager = mCacheManager->getBlockManager();
164-
auto blockRange = getBlockRangeForSending(mCacheManager, llmRequest);
165+
auto blockRange = getBlockRangeForSending(mCacheManager, llmRequest, blockHashes);
165166

166167
auto const numPools = blockManager.getNumPools();
167168
// TODO(oargov): are we sure the other side has the same number of pools? this might not hold for pp_size>1...

0 commit comments

Comments (0)