feat: Add support for KVCache transfer from KVCache reuse path #6348
@@ -115,6 +115,8 @@ struct BlockKey
    // Each extra key is a pair of (mm_hash, start_offset_in_block)
    std::vector<MmKey> extraKeys;

    size_t hash{0};

    BlockKey() = default;

    explicit BlockKey(VecTokens const& tokens, std::optional<LoraTaskIdType> loraTaskId = std::nullopt)

@@ -127,6 +129,11 @@ struct BlockKey
        }
    }

    explicit BlockKey(size_t hash)
        : hash{hash}
    {
    }

    explicit BlockKey(bool usesExtraIds, std::optional<LoraTaskIdType> loraTaskId, VecUniqueTokens uniqueTokens,
        std::vector<MmKey> extraKeys = {})
        : usesExtraIds{usesExtraIds}

@@ -164,6 +171,10 @@ struct BlockKeyHasher
    std::size_t operator()(BlockKey const& blockKey, std::size_t parentHash = 0) const noexcept
    {
        if (blockKey.hash != 0)
        {
            return blockKey.hash;
        }
        return hash(blockKey, parentHash);
    }
};
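A minimal sketch of how the new hash short-circuit behaves (illustrative only; `BlockKey`, `BlockKeyHasher`, and `VecTokens` are the types declared in this header, and the hash value is made up): a key constructed from a precomputed hash is returned as-is by the hasher, while a token-based key still goes through the full `hash(blockKey, parentHash)` computation.

```cpp
// Illustrative sketch, assuming this header is included and that VecTokens is a
// vector of token ids; the hash value below is made up for the example.
using namespace tensorrt_llm::batch_manager::kv_cache_manager;

void blockKeyHashSketch()
{
    BlockKey remoteKey{size_t{0x5D1F}};       // key carrying a hash received from a peer
    BlockKey localKey{VecTokens{1, 2, 3, 4}}; // hash member stays 0

    BlockKeyHasher hasher;
    auto h1 = hasher(remoteKey); // short-circuits and returns 0x5D1F unchanged
    auto h2 = hasher(localKey);  // falls through to hash(localKey, /*parentHash=*/0)
    (void) h1;
    (void) h2;
}
```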
@@ -566,6 +577,8 @@ class WindowBlockManager
    void storeNewBlock(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest);

    void pinBlocks(GenerationRequest& sequence);
Comment on lines +580 to +581

🛠️ Refactor suggestion: Add documentation for the new `pinBlocks` method.

Add documentation before the method declaration:

+    //! \brief Pin blocks associated with a sequence to prevent eviction.
+    //! \param sequence The generation request whose blocks should be pinned.
+    //! \details This method marks blocks as pinned in the KV cache to ensure they
+    //!          remain available for reuse across multiple requests.
     void pinBlocks(GenerationRequest& sequence);
    //! \brief Release blocks of the sequence.
    void releaseBlocks(GenerationRequest& sequence);

@@ -737,6 +750,9 @@ class WindowBlockManager
        return 0;
    }

    [[nodiscard]] std::optional<std::shared_ptr<KVCacheBlock>> findBlocksInReuseTreeByHashes(
        std::vector<size_t> const& hashes) const;

private:
    //! \brief Add single block to beam of sequence and mAllocatedBlocksPerSeq.
    void addBlockToBeam(BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx);

@@ -883,6 +899,8 @@ class BlockManager
    void schedulingReleaseBlocks(LlmRequest::RequestIdType requestId);

    void pinBlocks(GenerationRequest& sequence);

    void releaseLastBlock(GenerationRequest& sequence, SizeType32 windowSize);

    void setOffsets(kernels::KVCacheIndex* offsetsPtr, nvinfer1::Dims const& offsetsShape, SizeType32 beamIdx,

@@ -1067,6 +1085,12 @@ class BlockManager
        return mWindowBlockManagers.at(windowSize).getBlockById(blockId);
    }

    [[nodiscard]] std::optional<std::shared_ptr<KVCacheBlock>> findBlocksInReuseTreeByHashes(
        std::vector<size_t> const& hashes, SizeType32 windowSize) const
    {
        return mWindowBlockManagers.at(windowSize).findBlocksInReuseTreeByHashes(hashes);
    }

    [[nodiscard]] SizeType32 getNumPrimaryBlocks() const
    {
        return sumWindows([](auto const& manager) { return manager.getNumPrimaryBlocks(); });

@@ -1201,6 +1225,8 @@ class BaseKVCacheManager
    [[nodiscard]] virtual SizeType32 getRemainingBlocksToCompletion(LlmRequest const& req, SizeType32 windowSize) const
        = 0;

    virtual void pinBlocks(LlmRequest::RequestIdType requestId) = 0;
Comment on lines +1228 to +1229

🛠️ Refactor suggestion: Document the pure virtual `pinBlocks` method.

Add documentation before the declaration:

+    //! \brief Pin blocks associated with a request to prevent eviction.
+    //! \param requestId The ID of the request whose blocks should be pinned.
+    //! \details Implementations should mark all blocks associated with the request
+    //!          as pinned to ensure they remain available for potential reuse.
     virtual void pinBlocks(LlmRequest::RequestIdType requestId) = 0;
    /// @brief Increase size for request at seqSlotIdx. Allocate new KV cache block(s) if needed.
    virtual void addToken(LlmRequest::RequestIdType requestId) = 0;

@@ -1338,6 +1364,10 @@ class BaseKVCacheManager
    [[nodiscard]] virtual SizeType32 getMaxCapacityBatchSize(SizeType32 inputLength, SizeType32 outputLength) const = 0;

    [[nodiscard]] virtual CacheType getCacheType() const = 0;

    [[nodiscard]] virtual std::optional<std::shared_ptr<KVCacheBlock>> findBlocksInReuseTreeByHashes(
        std::vector<size_t> const& hashes, SizeType32 windowSize) const
        = 0;
};

class KVCacheManager : public BaseKVCacheManager

@@ -1588,6 +1618,8 @@ class KVCacheManager : public BaseKVCacheManager
    [[nodiscard]] static SizeType32 calculateMaxBlockRequirements(SizeType32 inputLength, SizeType32 outputLength,
        SizeType32 sinkTokenLength, SizeType32 windowSize, SizeType32 beamWidth, SizeType32 tokensPerBlock);

    void pinBlocks(LlmRequest::RequestIdType requestId);
Comment on lines +1621 to +1622

Missing `override` on the `pinBlocks` declaration. The method overrides the pure virtual `BaseKVCacheManager::pinBlocks`, so mark it `override`:

-    void pinBlocks(LlmRequest::RequestIdType requestId);
+    void pinBlocks(LlmRequest::RequestIdType requestId) override;
    /// @brief Calculates the number of kv-cache blocks that a sequence will require, for a single beam.
    ///
    /// @param sequenceLength The total length of the sequence (input and output).

@@ -1625,6 +1657,12 @@ class KVCacheManager : public BaseKVCacheManager
        mBlockManager.flushIterationEvents();
    }

    std::optional<std::shared_ptr<KVCacheBlock>> findBlocksInReuseTreeByHashes(
        std::vector<size_t> const& hashes, SizeType32 windowSize) const override
    {
        return mBlockManager.findBlocksInReuseTreeByHashes(hashes, windowSize);
    }

    /// @brief Finds the maximum attention window that can be used on a sequence, given some kv-cache block capacity.
    ///
    /// @param inputLength The number of input tokens in the sequence.
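A hedged usage sketch of the new lookup (illustrative only; `manager` is assumed to be a `BaseKVCacheManager&`, and `peerBlockHashes` / `windowSize` are hypothetical names for values supplied by the transfer layer): both the optional and the contained pointer should be checked before use, as the review comments in this PR also point out.

```cpp
// Illustrative sketch; not part of the PR.
auto found = manager.findBlocksInReuseTreeByHashes(peerBlockHashes, windowSize);
if (found && *found)                       // check the optional and the pointer before use
{
    auto lastBlock = *found;               // deepest block matching the hash chain
    auto prev = lastBlock->getPrevBlock(); // earlier blocks reachable via getPrevBlock()
    (void) prev;
}
```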
kvCacheUtils.h
@@ -48,6 +48,28 @@ class BlockRange
        return BlockRange(cacheManager, blockIds, requestId);
    }

    static BlockRange fromReuseTree(BaseKVCacheManager const& cacheManager, std::vector<size_t> const& allBlockHashes,
        std::vector<size_t> const& requestedBlockHashes)
    {
        auto const windowSize = firstWindowSize(cacheManager);
        auto lastBlock = *cacheManager.findBlocksInReuseTreeByHashes(allBlockHashes, windowSize);
        // TODO: handle the case where the last block is not found
        TLLM_CHECK_WITH_INFO(lastBlock, "Couldn't find the requested block in the reuse tree");
        // Assume the last block is the requested block
        std::vector<SizeType32> blockIds;
Comment on lines +51 to +59

Dereferencing the result before checking leads to UB when not found. findBlocksInReuseTreeByHashes(...) is dereferenced before verifying that it contains a value and a non-null pointer. Check first, then dereference. Apply this diff:

-    auto const windowSize = firstWindowSize(cacheManager);
-    auto lastBlock = *cacheManager.findBlocksInReuseTreeByHashes(allBlockHashes, windowSize);
-    // TODO: handle the case where the last block is not found
-    TLLM_CHECK_WITH_INFO(lastBlock, "Couldn't find the requested block in the reuse tree");
+    auto const windowSize = firstWindowSize(cacheManager);
+    auto lastBlockOpt = cacheManager.findBlocksInReuseTreeByHashes(allBlockHashes, windowSize);
+    TLLM_CHECK_WITH_INFO(lastBlockOpt, "Couldn't find the requested block in the reuse tree");
+    auto lastBlock = *lastBlockOpt;
+    TLLM_CHECK_WITH_INFO(lastBlock, "Couldn't find the requested block in the reuse tree");
        for (auto const& hash : requestedBlockHashes)
        {
            if (lastBlock->getHash() != hash)
            {
                return BlockRange(cacheManager, {}, 0);
            }
            blockIds.emplace_back(lastBlock->getBlockId());
            lastBlock = lastBlock->getPrevBlock();
            TLLM_CHECK_WITH_INFO(lastBlock, "Last block is not found");
        }
        return BlockRange(cacheManager, blockIds, 0);
    }
Comment on lines +60 to +71

🛠️ Refactor suggestion: Null check must precede the dereference; avoid returning an "empty" range with a bogus requestId.

Apply this loop fix and fail fast on mismatch:

-    for (auto const& hash : requestedBlockHashes)
+    for (auto const& hash : requestedBlockHashes)
     {
-        if (lastBlock->getHash() != hash)
-        {
-            return BlockRange(cacheManager, {}, 0);
-        }
-        blockIds.emplace_back(lastBlock->getBlockId());
-        lastBlock = lastBlock->getPrevBlock();
-        TLLM_CHECK_WITH_INFO(lastBlock, "Last block is not found");
+        TLLM_CHECK_WITH_INFO(lastBlock, "Requested hash sequence exceeds reuse chain length");
+        if (lastBlock->getHash() != hash)
+        {
+            TLLM_THROW("Requested block hash mismatch in reuse tree (expected %zu, got %zu).",
+                hash, lastBlock->getHash());
+        }
+        blockIds.emplace_back(lastBlock->getBlockId());
+        lastBlock = lastBlock->getPrevBlock();
     }
-    return BlockRange(cacheManager, blockIds, 0);
+    return BlockRange(cacheManager, blockIds, 0);

Follow-up: to make this robust, pass the real requestId into fromReuseTree; using 0 is fragile. See the callsite changes suggested in cacheFormatter.cpp. If you prefer a minimal change now, keep requestId=0 only when numPools==1 and ensure callers never trigger updatePoolIdx() with a window size change.
    BlockRange(runtime::ITensor::SharedPtr pool, std::vector<SizeType32> const& blockIds) // Only used in tests
        : mManager{nullptr}
        , mPool{std::move(pool)}
cacheFormatter.cpp
@@ -39,37 +39,33 @@
namespace tensorrt_llm::batch_manager::kv_cache_manager
{

BlockRange getBlockRangeForSending(BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest)
BlockRange getBlockRangeForSending(BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest,
    std::vector<size_t> const& allBlockHashes, std::vector<size_t> const& requestedBlockHashes)
{
    size_t requestBlockNum = llmRequest.getRequestedBlockHashes().size();
    size_t requestBlockNum = requestedBlockHashes.size();
    constexpr SizeType32 beam{0};
    auto blockRange = BlockRange::fromAllBlockIds(*cacheManager, llmRequest.mRequestId, beam);
    auto poolNum = cacheManager->getBlockManager().getNumPools();
    if (poolNum > 1 || common::getEnvDisableSelectiveCacheTransfer())
    if (poolNum > 1 || !cacheManager->isEnableBlockReuse())
    {
        // disable selective cache transfer for poolNum > 1
        auto blockRange = BlockRange::fromAllBlockIds(*cacheManager, llmRequest.mRequestId, beam);
        return blockRange;
    }
    if (requestBlockNum < blockRange.size() && requestBlockNum > 0)
    {
        // handle block reuse, the prefix blocks are reused
        // TODO(zhengd): pass the hashes directly instead of from llmRequest; use hash instead of block num
        auto const& ids = blockRange.getBlockIds();
        blockRange.setBlockIds({ids.end() - requestBlockNum, ids.end()});
    }
    return blockRange;
    return BlockRange::fromReuseTree(*cacheManager, allBlockHashes, requestedBlockHashes);
}
Comment on lines +42 to 54

🛠️ Refactor suggestion: Remove the unused variable and pass the requestId into the reuse-tree pathway.

Apply this diff:

-BlockRange getBlockRangeForSending(BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest,
-    std::vector<size_t> const& allBlockHashes, std::vector<size_t> const& requestedBlockHashes)
+BlockRange getBlockRangeForSending(BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest,
+    std::vector<size_t> const& allBlockHashes, std::vector<size_t> const& requestedBlockHashes)
 {
-    size_t requestBlockNum = requestedBlockHashes.size();
     constexpr SizeType32 beam{0};
     auto poolNum = cacheManager->getBlockManager().getNumPools();
     if (poolNum > 1 || !cacheManager->isEnableBlockReuse())
     {
         auto blockRange = BlockRange::fromAllBlockIds(*cacheManager, llmRequest.mRequestId, beam);
         return blockRange;
     }
-    return BlockRange::fromReuseTree(*cacheManager, allBlockHashes, requestedBlockHashes);
+    return BlockRange::fromReuseTree(*cacheManager, llmRequest.mRequestId, allBlockHashes, requestedBlockHashes);
 }

Note: This assumes adding an overload of BlockRange::fromReuseTree that accepts requestId. See the kvCacheUtils.h comment for the corresponding constructor change.
BlockRange getBlockRangeForReceiving(BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest)
{
    auto poolNum = cacheManager->getBlockManager().getNumPools();
    if (poolNum > 1 || common::getEnvDisableSelectiveCacheTransfer())
    if (poolNum == 1 && cacheManager->isEnableBlockReuse())
    {
        return BlockRange::fromNewlyAllocatedBlockIds(*cacheManager, llmRequest.mRequestId);
    }
    else
    {
        constexpr SizeType32 beam{0};
        return BlockRange::fromAllBlockIds(*cacheManager, llmRequest.mRequestId, beam);
    }
    return BlockRange::fromNewlyAllocatedBlockIds(*cacheManager, llmRequest.mRequestId);
}
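Taken together, the sender and receiver paths above select block ranges as follows. This condensed sketch only illustrates the decision logic, not a drop-in replacement; it assumes the same helpers and arguments are in scope and uses the fromReuseTree signature as written in this PR (before the review's suggested requestId overload):

```cpp
// Condensed view of the selection logic (sketch only).
bool const selective = cacheManager->getBlockManager().getNumPools() == 1
    && cacheManager->isEnableBlockReuse();
constexpr SizeType32 beam{0};

// Sender: transfer only the blocks matched through the reuse tree when selective,
// otherwise fall back to every block of the request.
auto sendRange = selective
    ? BlockRange::fromReuseTree(*cacheManager, allBlockHashes, requestedBlockHashes)
    : BlockRange::fromAllBlockIds(*cacheManager, llmRequest.mRequestId, beam);

// Receiver: write into newly allocated blocks when selective,
// otherwise receive into every block of the request.
auto recvRange = selective
    ? BlockRange::fromNewlyAllocatedBlockIds(*cacheManager, llmRequest.mRequestId)
    : BlockRange::fromAllBlockIds(*cacheManager, llmRequest.mRequestId, beam);
```

In the selective case the transfer appears to stay limited to the blocks the receiver actually requested, rather than resending the whole sequence.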
bool CacheFormatter::needSendCache(

@@ -155,13 +151,17 @@ void CacheFormatter::format(TransferSession& session)
    auto const& selfConfig = session.getSelfState().getCacheState().value();
    auto const& destConfig = session.getOtherState().getCacheState().value();
    auto const selfIdx = session.getSelfState().getCommState().value().getSelfIdx();
    auto& allBlockHashes = session.getAllBlockHashes();
    auto& requestedBlockHashes = session.getRequestedBlockHashes();
    TLLM_CHECK_WITH_INFO(allBlockHashes.size() >= requestedBlockHashes.size(),
        "allBlockHashes must be greater than or equal to requestedBlockHashes");
    auto& bufferManager = session.getBufferManager();
    if (!needSendCache(selfConfig, destConfig, selfIdx))
    {
        return;
    }
    auto& blockManager = mCacheManager->getBlockManager();
    auto blockRange = getBlockRangeForSending(mCacheManager, llmRequest);
    auto blockRange = getBlockRangeForSending(mCacheManager, llmRequest, allBlockHashes, requestedBlockHashes);

    auto const numPools = blockManager.getNumPools();
    // TODO(oargov): are we sure the other side has the same number of pools? this might not hold for pp_size>1...
🛠️ Refactor suggestion: Document the purpose of the hash-only constructor.

The constructor that takes only a hash value needs documentation explaining when it should be used. Add documentation before the constructor.
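A plausible sketch of what such documentation could look like (the wording here is illustrative, not the reviewer's suggested text):

```cpp
//! \brief Construct a BlockKey that carries only a precomputed hash.
//! \param hash Hash value computed elsewhere, e.g. received from a peer during
//!        KV cache transfer. BlockKeyHasher returns this value directly instead
//!        of hashing tokens.
explicit BlockKey(size_t hash)
    : hash{hash}
{
}
```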