Skip to content

[None][chore] Dead code elimination: we no longer record/fetch through WindowBlockManager::mContextBlocksByHash #6249

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into the base branch on
Aug 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 7 additions & 38 deletions cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -536,8 +536,7 @@ class WindowBlockManager
SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool,
SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream,
bool onboardBlocks, CacheType cacheType, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
std::shared_ptr<KVCacheEventManager> eventManager, bool enableHashKey, bool enablePartialReuse,
bool copyOnPartialReuse);
std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse);

~WindowBlockManager();

Expand Down Expand Up @@ -633,11 +632,6 @@ class WindowBlockManager
return mAllBlocksById.at(blockId);
}

//! \brief Look up all context blocks recorded under \p hash.
//! \param hash Block hash as produced by the block's setHash().
//! \return Iterator range over matching entries; the container behaves as a
//!         multimap (equal_range), so several distinct blocks may share a hash.
[[nodiscard]] BlockMapIterRange getBlocksByHash(size_t hash) const
{
return mContextBlocksByHash.equal_range(hash);
}

[[nodiscard]] SizeType32 getTokensPerBlock() const noexcept
{
return mTokensPerBlock;
Expand Down Expand Up @@ -723,10 +717,6 @@ class WindowBlockManager
//! \param blockIds Id of each block.
void storeBlocks(std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds);

void addBlockToHashMap(BlockPtr const& block);

void removeBlockFromHashMap(BlockPtr const& block);

[[nodiscard]] bool verifyQueueIntegrity();

// Only needed when sliding window attention + paged context fmha are used together.
Expand Down Expand Up @@ -808,8 +798,6 @@ class WindowBlockManager
SizeType32 mTokensPerBlock;
// List of all blocks by idx
std::vector<BlockPtr> mAllBlocksById;
// List of all context blocks by hash
BlockMap mContextBlocksByHash;
// Dummy block acting as root for BlockToken searches
BlockPtr mCachedBlocksRoot;
// KV cache type (self or cross)
Expand Down Expand Up @@ -841,8 +829,6 @@ class WindowBlockManager
double mReusedTokens;
// Total number of input tokens
double mTotalInputTokens;
// Whether or not to maintain a hashmap of blocks.
bool mEnableHashKey;
// Whether blocks that are partially matched should be reused.
bool mEnablePartialReuse;
// Whether partially matched blocks that are already in use should be copied and reused.
Expand All @@ -863,8 +849,8 @@ class BlockManager
std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
SizeType32 sinkBubbleLength, bool onboardBlocks, CacheType cacheType = CacheType::kSELF,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false,
bool enablePartialReuse = true, bool copyOnPartialReuse = true);
std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
bool copyOnPartialReuse = true);

BlockManager(BlockManager const&) = delete;
BlockManager& operator=(BlockManager const&) = delete;
Expand Down Expand Up @@ -1081,11 +1067,6 @@ class BlockManager
return mWindowBlockManagers.at(windowSize).getBlockById(blockId);
}

//! \brief Look up context blocks by hash within a specific attention window.
//! \param hash Block hash to search for.
//! \param windowSize Attention window whose WindowBlockManager is queried.
//! \return Iterator range over matching blocks for that window.
//! \note mWindowBlockManagers.at() throws std::out_of_range for an unknown window size.
[[nodiscard]] WindowBlockManager::BlockMapIterRange getBlocksByHash(size_t hash, SizeType32 windowSize) const
{
return mWindowBlockManagers.at(windowSize).getBlocksByHash(hash);
}

[[nodiscard]] SizeType32 getNumPrimaryBlocks() const
{
return sumWindows([](auto const& manager) { return manager.getNumPrimaryBlocks(); });
Expand All @@ -1096,16 +1077,6 @@ class BlockManager
return getPool(poolIdx).containsBlockScales;
}

//! \brief Register \p block in the hash lookup table of the manager for \p windowSize.
//! \param block Block to record (shared pointer; ownership is not transferred).
//! \param windowSize Attention window selecting the target WindowBlockManager.
//! \note mWindowBlockManagers.at() throws std::out_of_range for an unknown window size.
void addBlockToHashMap(BlockPtr const& block, SizeType32 windowSize)
{
mWindowBlockManagers.at(windowSize).addBlockToHashMap(block);
}

//! \brief Remove \p block from the hash lookup table of the manager for \p windowSize.
//! \param block Block to remove, matched by identity within its hash bucket.
//! \param windowSize Attention window selecting the target WindowBlockManager.
//! \note mWindowBlockManagers.at() throws std::out_of_range for an unknown window size.
void removeBlockFromHashMap(BlockPtr const& block, SizeType32 windowSize)
{
mWindowBlockManagers.at(windowSize).removeBlockFromHashMap(block);
}

//! \brief Store context blocks
void storeContextBlocks(GenerationRequest& sequence, LlmRequest const& llmRequest);

Expand Down Expand Up @@ -1385,8 +1356,8 @@ class KVCacheManager : public BaseKVCacheManager
SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional<SizeType32> maxSequenceLength,
bool enableBlockReuse = false, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false,
bool enablePartialReuse = true, bool copyOnpartialReuse = true);
std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
bool copyOnpartialReuse = true);

KVCacheManager(std::vector<SizeType32> const& numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
BlocksPerWindow const& blocksPerWindow, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
Expand All @@ -1405,8 +1376,8 @@ class KVCacheManager : public BaseKVCacheManager
SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional<SizeType32> maxSequenceLength,
bool enableBlockReuse = true, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false,
bool enablePartialReuse = true, bool copyOnpartialReuse = true);
std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
bool copyOnpartialReuse = true);

KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
BlocksPerWindow const& blocksPerWindow, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
Expand Down Expand Up @@ -1692,8 +1663,6 @@ class KVCacheManager : public BaseKVCacheManager
std::unordered_map<LlmRequest::RequestIdType, GenerationRequest> mSequences;
// Whether to cache KV pages for reuse
bool mEnableBlockReuse;
// Whether enable finding blocks by their hash, ignored when reuse enabled
bool mEnableHashKey;
// Mutex to protect access to mSequences
mutable std::mutex mSequencesMtx;
// buffers for static tensors, will be created after allocating pools
Expand Down
100 changes: 8 additions & 92 deletions cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -504,8 +504,7 @@ BlockManager::BlockManager(std::vector<SizeType32> const& numKvHeadsPerLayer, Si
std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
SizeType32 sinkBubbleLength, bool onboardBlocks, CacheType cacheType,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
std::shared_ptr<KVCacheEventManager> eventManager, bool enableHashKey, bool enablePartialReuse,
bool copyOnPartialReuse)
std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse)
: mNumLayers{static_cast<SizeType32>(numKvHeadsPerLayer.size())}
, mTokensPerBlock{tokensPerBlock}
, mEventManager{std::move(eventManager)}
Expand All @@ -530,7 +529,7 @@ BlockManager::BlockManager(std::vector<SizeType32> const& numKvHeadsPerLayer, Si
TLLM_CHECK(allottedPrimaryBlocks > 0); // You can't have a model with negative primary blocks...
mWindowBlockManagers.try_emplace(windowSize, dtype, windowSize, layersWithWindowSize, numKvHeadsPerLayer,
sizePerHead, tokensPerBlock, allottedPrimaryBlocks, allottedSecondaryBlocks, maxNumSequences, stream,
onboardBlocks, cacheType, secondaryOffloadMinPriority, mEventManager, enableHashKey, enablePartialReuse,
onboardBlocks, cacheType, secondaryOffloadMinPriority, mEventManager, enablePartialReuse,
copyOnPartialReuse);
}

Expand Down Expand Up @@ -573,8 +572,7 @@ WindowBlockManager::WindowBlockManager(nvinfer1::DataType dtype, SizeType32 wind
SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool,
SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream, bool onboardBlocks, CacheType cacheType,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
std::shared_ptr<KVCacheEventManager> eventManager, bool enableHashKey, bool enablePartialReuse,
bool copyOnPartialReuse)
std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse)
: mDataType{dtype}
, mWindowSize{windowSize}
, mNumPrimaryBlocks{blocksInPrimaryPool}
Expand All @@ -596,7 +594,6 @@ WindowBlockManager::WindowBlockManager(nvinfer1::DataType dtype, SizeType32 wind
, mLogPrefix{tensorrt_llm::common::fmtstr("BlockManager[windowSize=%u]", mWindowSize)}
, mReusedTokens{0.0}
, mTotalInputTokens{0.0}
, mEnableHashKey{enableHashKey}
, mEnablePartialReuse{enablePartialReuse}
, mCopyOnPartialReuse{copyOnPartialReuse}
{
Expand Down Expand Up @@ -920,50 +917,6 @@ void BlockManager::setOffsets(tk::KVCacheIndex* offsetsPtr, nvinfer1::Dims const
mWindowBlockManagers.at(windowSize).setOffsets(offsetsPtr, offsetsShape, beamIdx, blockIdx, blockId);
}

//! \brief Record \p block in mContextBlocksByHash so it can later be found by its hash.
//! \param block Block to record; its current hash (block->getHash()) is used as the key.
//! \details No-op unless hash-key lookup is enabled (mEnableHashKey). If the block is
//! already present under its hash the call is idempotent: it only logs and returns,
//! so the map never holds duplicate entries for the same block.
void WindowBlockManager::addBlockToHashMap(BlockPtr const& block)
{
    if (!mEnableHashKey)
    {
        return;
    }
    auto range = mContextBlocksByHash.equal_range(block->getHash());
    for (auto it = range.first; it != range.second; ++it)
    {
        if (it->second == block)
        {
            // TODO: change to assert when reused block is added only once
            // Fix: the format string was missing the %zu specifier for the third
            // argument (map size), leaving an unconsumed vararg.
            TLLM_LOG_TRACE("Block %d by %zx exists, block n = %zu", block->getBlockId(), block->getHash(),
                mContextBlocksByHash.size());
            return;
        }
    }
    TLLM_LOG_TRACE(
        "Add block %d by %zx, block n = %zu", block->getBlockId(), block->getHash(), mContextBlocksByHash.size());
    // Fix: std::move on a const reference degrades to a copy anyway; the cast was
    // misleading, so the shared pointer is copied explicitly.
    mContextBlocksByHash.emplace(block->getHash(), block);
}

//! \brief Erase \p block from mContextBlocksByHash, if it was recorded there.
//! \param block Block to remove, matched by identity inside its hash bucket.
//! \details Returns immediately when the map is empty (hash lookup unused) or the
//! block carries no unique tokens. A block that is expected in the map but not
//! found is logged at debug level rather than treated as an error.
void WindowBlockManager::removeBlockFromHashMap(BlockPtr const& block)
{
    // Hash lookup disabled (map never populated) or block holds no tokens: nothing to do.
    if (mContextBlocksByHash.empty() || block->getBlockKey().uniqueTokens.empty())
    {
        return;
    }
    auto const [bucketBegin, bucketEnd] = mContextBlocksByHash.equal_range(block->getHash());
    TLLM_LOG_TRACE(
        "Remove block %d by %zx, block n = %zu", block->getBlockId(), block->getHash(), mContextBlocksByHash.size());
    for (auto entry = bucketBegin; entry != bucketEnd; ++entry)
    {
        if (entry->second == block)
        {
            mContextBlocksByHash.erase(entry);
            return;
        }
    }
    // TODO: should be unreachable
    TLLM_LOG_DEBUG("Trying to remove block %d by %zx that is not in hash map", block->getBlockId(), block->getHash());
}

void BlockManager::onboardBlock(BlockPtr const& offloadBlock, SizeType32 windowSize)
{
mWindowBlockManagers.at(windowSize).onboardBlock(offloadBlock);
Expand Down Expand Up @@ -1104,7 +1057,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
matchingBlock, perBlockRetentions[bi].retentionPriority, perBlockRetentions[bi].durationMs);
TLLM_LOG_DEBUG("%s::loadOrAllocateBlocks - Reused partially filled block %d", mLogPrefix.c_str(),
matchingBlockId);
addBlockToHashMap(matchingBlock);
}
searchRoot = nullptr; // no matching needed for following blocks
}
Expand All @@ -1114,7 +1066,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
mEvictionPolicy->claimBlock(
matchingBlock, perBlockRetentions[bi].retentionPriority, perBlockRetentions[bi].durationMs);
TLLM_LOG_DEBUG("%s::loadOrAllocateBlocks - Matched full block %d", mLogPrefix.c_str(), matchingBlockId);
addBlockToHashMap(matchingBlock);
searchRoot = matchingBlock;
}
onboardBlock(matchingBlock);
Expand Down Expand Up @@ -1145,7 +1096,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
++blockItr;
}
freeBlock->setHash();
addBlockToHashMap(freeBlock);
++mMissedBlocks;
}
}
Expand All @@ -1169,7 +1119,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
++blockItr;
}
freeBlock->setHash();
addBlockToHashMap(freeBlock);
TLLM_LOG_DEBUG("%s::loadOrAllocateBlocks - Beam %d. Allocated non-shared block %d for bi %d",
mLogPrefix.c_str(), beamIdx, freeBlock->getBlockId(), bi);
}
Expand Down Expand Up @@ -1369,9 +1318,7 @@ void WindowBlockManager::storeBlocks(
if (oldHash != newHash)
{
TLLM_LOG_DEBUG("#%d block hash %zx -> %zx", block->getBlockId(), oldHash, newHash);
removeBlockFromHashMap(block);
block->setHash(newHash);
addBlockToHashMap(block);
}
searchRoot = block;
}
Expand Down Expand Up @@ -1408,7 +1355,6 @@ void WindowBlockManager::replaceSharedBlock(GenerationRequest& sequence, SizeTyp
if (!block->hasRefs())
{
mEvictionPolicy->releaseBlock(block);
removeBlockFromHashMap(block);
}
}

Expand Down Expand Up @@ -1473,7 +1419,6 @@ void WindowBlockManager::releaseLastBlock(GenerationRequest& sequence)
if (!block->hasRefs())
{
mEvictionPolicy->releaseBlock(block, true);
removeBlockFromHashMap(block);
}
// Remove block from allocated blocks
allocatedBlocks.pop_back();
Expand Down Expand Up @@ -1616,7 +1561,6 @@ void WindowBlockManager::releaseBlocks(GenerationRequest& sequence)
if (!block->hasRefs())
{
mEvictionPolicy->releaseBlock(block);
removeBlockFromHashMap(block);
}
}
// Remove stored block ids in sequence
Expand Down Expand Up @@ -1654,8 +1598,7 @@ KVCacheManager::KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, Size
: KVCacheManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksPerWindow,
maxNumSequences, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype, sinkTokenLength,
std::make_shared<runtime::CudaStream>(reinterpret_cast<cudaStream_t>(stream)), maxSequenceLength,
enableBlockReuse, onboardBlocks, cacheType, std::nullopt, nullptr, false, enablePartialReuse,
copyOnPartialReuse)
enableBlockReuse, onboardBlocks, cacheType, std::nullopt, nullptr, enablePartialReuse, copyOnPartialReuse)
{
}

Expand All @@ -1682,8 +1625,7 @@ KVCacheManager::KVCacheManager(std::vector<SizeType32> const& numKvHeadsPerLayer
SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional<runtime::SizeType32> maxSequenceLength,
bool enableBlockReuse, bool onboardBlocks, CacheType cacheType,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
std::shared_ptr<KVCacheEventManager> eventManager, bool enableHashKey, bool enablePartialReuse,
bool copyOnPartialReuse)
std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse)
: mMaxBeamWidth(maxBeamWidth)
, mDataType(dtype)
, mMaxAttentionWindow(*std::max_element(maxAttentionWindowVec.begin(), maxAttentionWindowVec.end()))
Expand All @@ -1693,10 +1635,9 @@ KVCacheManager::KVCacheManager(std::vector<SizeType32> const& numKvHeadsPerLayer
, mBlockManager(numKvHeadsPerLayer, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
std::move(stream), maxSequenceLength, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype,
mSinkBubbleLength, onboardBlocks, cacheType, secondaryOffloadMinPriority, std::move(eventManager),
enableHashKey, enablePartialReuse, copyOnPartialReuse)
enablePartialReuse, copyOnPartialReuse)
// disable block reuse for sink bubble since chopVectorIntoBlocks does not match KV cache blocks in this case
, mEnableBlockReuse{mSinkBubbleLength > 0 ? false : enableBlockReuse}
, mEnableHashKey{enableHashKey}
{
TLLM_CHECK_DEBUG(std::find(maxAttentionWindowVec.begin(), maxAttentionWindowVec.end(), mMaxAttentionWindow)
!= maxAttentionWindowVec.end());
Expand All @@ -1716,12 +1657,11 @@ KVCacheManager::KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, Size
SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional<runtime::SizeType32> maxSequenceLength,
bool enableBlockReuse, bool onboardBlocks, CacheType cacheType,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
std::shared_ptr<KVCacheEventManager> eventManager, bool enableHashKey, bool enablePartialReuse,
bool copyOnPartialReuse)
std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse)
: KVCacheManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksPerWindow,
maxNumSequences, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype, sinkTokenLength,
std::move(stream), maxSequenceLength, enableBlockReuse, onboardBlocks, cacheType, secondaryOffloadMinPriority,
std::move(eventManager), enableHashKey, enablePartialReuse, copyOnPartialReuse)
std::move(eventManager), enablePartialReuse, copyOnPartialReuse)
{
}

Expand Down Expand Up @@ -2085,30 +2025,6 @@ void KVCacheManager::addSequence(
llmRequest->mRequestId);
}
mBlockManager.addSequence(sequence, numContextBlocks, unsharedBlockIdx, windowSize);
if (mEnableHashKey && llmRequest.has_value() && beamWidth == 1)
{
constexpr SizeType32 beamIdx = 0;
auto const& blockIds = sequence.getCacheBlockIds(windowSize).at(beamIdx);
auto const& uniqueTokens = llmRequest->getUniqueTokens(beamIdx);
auto blockedUniqueTokens = chopVectorIntoBlocks<UniqueToken>(
uniqueTokens, uniqueTokens.size() - 1, getTokensPerBlock(), true);
auto blockKeys = buildBlockKeys(blockedUniqueTokens, *llmRequest);
auto tokensPerBlock = static_cast<size_t>(getTokensPerBlock());
for (size_t i = 0; i < blockIds.size(); i++)
{
auto const& block = mBlockManager.getBlockById(blockIds[i], windowSize);
if (i < blockKeys.size())
{
block->setBlockKey(blockKeys[i], blockKeys[i].uniqueTokens.size() == tokensPerBlock);
}
else
{
block->setBlockKey({}, false);
}
block->setHash();
mBlockManager.addBlockToHashMap(block, windowSize);
}
}
}
cacheBlockOffsets(sequence, windowSize);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -693,7 +693,7 @@ std::unique_ptr<kv_cache_manager::KVCacheManager> TrtGptModelInflightBatching::c
kvCacheConfig.getEventBufferMaxSize() > 0
? std::make_unique<kv_cache_manager::KVCacheEventManager>(kvCacheConfig.getEventBufferMaxSize())
: nullptr,
false, kvCacheConfig.getEnablePartialReuse(), kvCacheConfig.getCopyOnPartialReuse());
kvCacheConfig.getEnablePartialReuse(), kvCacheConfig.getCopyOnPartialReuse());

reshapeKvTensors(kvCacheManager->getOffsetTableDimensions());

Expand Down
Loading