@@ -504,8 +504,7 @@ BlockManager::BlockManager(std::vector<SizeType32> const& numKvHeadsPerLayer, Si
504504 std::optional<TempAttentionWindowInputs> const & tempAttentionWindowInputs, nvinfer1::DataType dtype,
505505 SizeType32 sinkBubbleLength, bool onboardBlocks, CacheType cacheType,
506506 std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
507- std::shared_ptr<KVCacheEventManager> eventManager, bool enableHashKey, bool enablePartialReuse,
508- bool copyOnPartialReuse)
507+ std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse)
509508 : mNumLayers {static_cast <SizeType32>(numKvHeadsPerLayer.size ())}
510509 , mTokensPerBlock {tokensPerBlock}
511510 , mEventManager {std::move (eventManager)}
@@ -530,7 +529,7 @@ BlockManager::BlockManager(std::vector<SizeType32> const& numKvHeadsPerLayer, Si
530529 TLLM_CHECK (allottedPrimaryBlocks > 0 ); // You can't have a model with negative primary blocks...
531530 mWindowBlockManagers .try_emplace (windowSize, dtype, windowSize, layersWithWindowSize, numKvHeadsPerLayer,
532531 sizePerHead, tokensPerBlock, allottedPrimaryBlocks, allottedSecondaryBlocks, maxNumSequences, stream,
533- onboardBlocks, cacheType, secondaryOffloadMinPriority, mEventManager , enableHashKey, enablePartialReuse,
532+ onboardBlocks, cacheType, secondaryOffloadMinPriority, mEventManager , enablePartialReuse,
534533 copyOnPartialReuse);
535534 }
536535
@@ -573,8 +572,7 @@ WindowBlockManager::WindowBlockManager(nvinfer1::DataType dtype, SizeType32 wind
573572 SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool,
574573 SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream, bool onboardBlocks, CacheType cacheType,
575574 std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
576- std::shared_ptr<KVCacheEventManager> eventManager, bool enableHashKey, bool enablePartialReuse,
577- bool copyOnPartialReuse)
575+ std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse)
578576 : mDataType {dtype}
579577 , mWindowSize {windowSize}
580578 , mNumPrimaryBlocks {blocksInPrimaryPool}
@@ -596,7 +594,6 @@ WindowBlockManager::WindowBlockManager(nvinfer1::DataType dtype, SizeType32 wind
596594 , mLogPrefix {tensorrt_llm::common::fmtstr (" BlockManager[windowSize=%u]" , mWindowSize )}
597595 , mReusedTokens {0.0 }
598596 , mTotalInputTokens {0.0 }
599- , mEnableHashKey {enableHashKey}
600597 , mEnablePartialReuse {enablePartialReuse}
601598 , mCopyOnPartialReuse {copyOnPartialReuse}
602599{
@@ -920,50 +917,6 @@ void BlockManager::setOffsets(tk::KVCacheIndex* offsetsPtr, nvinfer1::Dims const
920917 mWindowBlockManagers .at (windowSize).setOffsets (offsetsPtr, offsetsShape, beamIdx, blockIdx, blockId);
921918}
922919
923- void WindowBlockManager::addBlockToHashMap (BlockPtr const & block)
924- {
925- if (!mEnableHashKey )
926- {
927- return ;
928- }
929- auto range = mContextBlocksByHash .equal_range (block->getHash ());
930- for (auto it = range.first ; it != range.second ; ++it)
931- {
932- if (it->second == block)
933- {
934- // TODO: change to assert when reused block is added only once
935- TLLM_LOG_TRACE (
936- " Block %d by %zx exists" , block->getBlockId (), block->getHash (), mContextBlocksByHash .size ());
937- return ;
938- }
939- }
940- TLLM_LOG_TRACE (
941- " Add block %d by %zx, block n = %zu" , block->getBlockId (), block->getHash (), mContextBlocksByHash .size ());
942- mContextBlocksByHash .emplace (block->getHash (), std::move (block));
943- }
944-
945- void WindowBlockManager::removeBlockFromHashMap (BlockPtr const & block)
946- {
947- if (mContextBlocksByHash .empty () || block->getBlockKey ().uniqueTokens .empty ())
948- {
949- // Hash key not enabled / Empty block
950- return ;
951- }
952- auto range = mContextBlocksByHash .equal_range (block->getHash ());
953- TLLM_LOG_TRACE (
954- " Remove block %d by %zx, block n = %zu" , block->getBlockId (), block->getHash (), mContextBlocksByHash .size ());
955- for (auto it = range.first ; it != range.second ; ++it)
956- {
957- if (it->second == block)
958- {
959- mContextBlocksByHash .erase (it);
960- return ;
961- }
962- }
963- // TODO: should be unreachable
964- TLLM_LOG_DEBUG (" Trying to remove block %d by %zx that is not in hash map" , block->getBlockId (), block->getHash ());
965- }
966-
967920void BlockManager::onboardBlock (BlockPtr const & offloadBlock, SizeType32 windowSize)
968921{
969922 mWindowBlockManagers .at (windowSize).onboardBlock (offloadBlock);
@@ -1104,7 +1057,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
11041057 matchingBlock, perBlockRetentions[bi].retentionPriority , perBlockRetentions[bi].durationMs );
11051058 TLLM_LOG_DEBUG (" %s::loadOrAllocateBlocks - Reused partially filled block %d" , mLogPrefix .c_str (),
11061059 matchingBlockId);
1107- addBlockToHashMap (matchingBlock);
11081060 }
11091061 searchRoot = nullptr ; // no matching needed for following blocks
11101062 }
@@ -1114,7 +1066,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
11141066 mEvictionPolicy ->claimBlock (
11151067 matchingBlock, perBlockRetentions[bi].retentionPriority , perBlockRetentions[bi].durationMs );
11161068 TLLM_LOG_DEBUG (" %s::loadOrAllocateBlocks - Matched full block %d" , mLogPrefix .c_str (), matchingBlockId);
1117- addBlockToHashMap (matchingBlock);
11181069 searchRoot = matchingBlock;
11191070 }
11201071 onboardBlock (matchingBlock);
@@ -1145,7 +1096,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
11451096 ++blockItr;
11461097 }
11471098 freeBlock->setHash ();
1148- addBlockToHashMap (freeBlock);
11491099 ++mMissedBlocks ;
11501100 }
11511101 }
@@ -1169,7 +1119,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
11691119 ++blockItr;
11701120 }
11711121 freeBlock->setHash ();
1172- addBlockToHashMap (freeBlock);
11731122 TLLM_LOG_DEBUG (" %s::loadOrAllocateBlocks - Beam %d. Allocated non-shared block %d for bi %d" ,
11741123 mLogPrefix .c_str (), beamIdx, freeBlock->getBlockId (), bi);
11751124 }
@@ -1369,9 +1318,7 @@ void WindowBlockManager::storeBlocks(
13691318 if (oldHash != newHash)
13701319 {
13711320 TLLM_LOG_DEBUG (" #%d block hash %zx -> %zx" , block->getBlockId (), oldHash, newHash);
1372- removeBlockFromHashMap (block);
13731321 block->setHash (newHash);
1374- addBlockToHashMap (block);
13751322 }
13761323 searchRoot = block;
13771324 }
@@ -1408,7 +1355,6 @@ void WindowBlockManager::replaceSharedBlock(GenerationRequest& sequence, SizeTyp
14081355 if (!block->hasRefs ())
14091356 {
14101357 mEvictionPolicy ->releaseBlock (block);
1411- removeBlockFromHashMap (block);
14121358 }
14131359 }
14141360
@@ -1473,7 +1419,6 @@ void WindowBlockManager::releaseLastBlock(GenerationRequest& sequence)
14731419 if (!block->hasRefs ())
14741420 {
14751421 mEvictionPolicy ->releaseBlock (block, true );
1476- removeBlockFromHashMap (block);
14771422 }
14781423 // Remove block from allocated blocks
14791424 allocatedBlocks.pop_back ();
@@ -1616,7 +1561,6 @@ void WindowBlockManager::releaseBlocks(GenerationRequest& sequence)
16161561 if (!block->hasRefs ())
16171562 {
16181563 mEvictionPolicy ->releaseBlock (block);
1619- removeBlockFromHashMap (block);
16201564 }
16211565 }
16221566 // Remove stored block ids in sequence
@@ -1654,8 +1598,7 @@ KVCacheManager::KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, Size
16541598 : KVCacheManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksPerWindow,
16551599 maxNumSequences, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype, sinkTokenLength,
16561600 std::make_shared<runtime::CudaStream>(reinterpret_cast <cudaStream_t>(stream)), maxSequenceLength,
1657- enableBlockReuse, onboardBlocks, cacheType, std::nullopt , nullptr , false , enablePartialReuse,
1658- copyOnPartialReuse)
1601+ enableBlockReuse, onboardBlocks, cacheType, std::nullopt , nullptr , enablePartialReuse, copyOnPartialReuse)
16591602{
16601603}
16611604
@@ -1682,8 +1625,7 @@ KVCacheManager::KVCacheManager(std::vector<SizeType32> const& numKvHeadsPerLayer
16821625 SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional<runtime::SizeType32> maxSequenceLength,
16831626 bool enableBlockReuse, bool onboardBlocks, CacheType cacheType,
16841627 std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
1685- std::shared_ptr<KVCacheEventManager> eventManager, bool enableHashKey, bool enablePartialReuse,
1686- bool copyOnPartialReuse)
1628+ std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse)
16871629 : mMaxBeamWidth (maxBeamWidth)
16881630 , mDataType (dtype)
16891631 , mMaxAttentionWindow (*std::max_element (maxAttentionWindowVec.begin(), maxAttentionWindowVec.end()))
@@ -1693,10 +1635,9 @@ KVCacheManager::KVCacheManager(std::vector<SizeType32> const& numKvHeadsPerLayer
16931635 , mBlockManager(numKvHeadsPerLayer, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
16941636 std::move (stream), maxSequenceLength, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype,
16951637 mSinkBubbleLength, onboardBlocks, cacheType, secondaryOffloadMinPriority, std::move(eventManager),
1696- enableHashKey, enablePartialReuse, copyOnPartialReuse)
1638+ enablePartialReuse, copyOnPartialReuse)
16971639 // disable block reuse for sink bubble since chopVectorIntoBlocks does not match KV cache blocks in this case
16981640 , mEnableBlockReuse{mSinkBubbleLength > 0 ? false : enableBlockReuse}
1699- , mEnableHashKey {enableHashKey}
17001641{
17011642 TLLM_CHECK_DEBUG (std::find (maxAttentionWindowVec.begin (), maxAttentionWindowVec.end (), mMaxAttentionWindow )
17021643 != maxAttentionWindowVec.end ());
@@ -1716,12 +1657,11 @@ KVCacheManager::KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, Size
17161657 SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional<runtime::SizeType32> maxSequenceLength,
17171658 bool enableBlockReuse, bool onboardBlocks, CacheType cacheType,
17181659 std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
1719- std::shared_ptr<KVCacheEventManager> eventManager, bool enableHashKey, bool enablePartialReuse,
1720- bool copyOnPartialReuse)
1660+ std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse)
17211661 : KVCacheManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksPerWindow,
17221662 maxNumSequences, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype, sinkTokenLength,
17231663 std::move(stream), maxSequenceLength, enableBlockReuse, onboardBlocks, cacheType, secondaryOffloadMinPriority,
1724- std::move(eventManager), enableHashKey, enablePartialReuse, copyOnPartialReuse)
1664+ std::move(eventManager), enablePartialReuse, copyOnPartialReuse)
17251665{
17261666}
17271667
@@ -2085,30 +2025,6 @@ void KVCacheManager::addSequence(
20852025 llmRequest->mRequestId );
20862026 }
20872027 mBlockManager .addSequence (sequence, numContextBlocks, unsharedBlockIdx, windowSize);
2088- if (mEnableHashKey && llmRequest.has_value () && beamWidth == 1 )
2089- {
2090- constexpr SizeType32 beamIdx = 0 ;
2091- auto const & blockIds = sequence.getCacheBlockIds (windowSize).at (beamIdx);
2092- auto const & uniqueTokens = llmRequest->getUniqueTokens (beamIdx);
2093- auto blockedUniqueTokens = chopVectorIntoBlocks<UniqueToken>(
2094- uniqueTokens, uniqueTokens.size () - 1 , getTokensPerBlock (), true );
2095- auto blockKeys = buildBlockKeys (blockedUniqueTokens, *llmRequest);
2096- auto tokensPerBlock = static_cast <size_t >(getTokensPerBlock ());
2097- for (size_t i = 0 ; i < blockIds.size (); i++)
2098- {
2099- auto const & block = mBlockManager .getBlockById (blockIds[i], windowSize);
2100- if (i < blockKeys.size ())
2101- {
2102- block->setBlockKey (blockKeys[i], blockKeys[i].uniqueTokens .size () == tokensPerBlock);
2103- }
2104- else
2105- {
2106- block->setBlockKey ({}, false );
2107- }
2108- block->setHash ();
2109- mBlockManager .addBlockToHashMap (block, windowSize);
2110- }
2111- }
21122028 }
21132029 cacheBlockOffsets (sequence, windowSize);
21142030 }
0 commit comments