@@ -504,8 +504,7 @@ BlockManager::BlockManager(std::vector<SizeType32> const& numKvHeadsPerLayer, Si
504
504
std::optional<TempAttentionWindowInputs> const & tempAttentionWindowInputs, nvinfer1::DataType dtype,
505
505
SizeType32 sinkBubbleLength, bool onboardBlocks, CacheType cacheType,
506
506
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
507
- std::shared_ptr<KVCacheEventManager> eventManager, bool enableHashKey, bool enablePartialReuse,
508
- bool copyOnPartialReuse)
507
+ std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse)
509
508
: mNumLayers {static_cast <SizeType32>(numKvHeadsPerLayer.size ())}
510
509
, mTokensPerBlock {tokensPerBlock}
511
510
, mEventManager {std::move (eventManager)}
@@ -530,7 +529,7 @@ BlockManager::BlockManager(std::vector<SizeType32> const& numKvHeadsPerLayer, Si
530
529
TLLM_CHECK (allottedPrimaryBlocks > 0 ); // You can't have a model with negative primary blocks...
531
530
mWindowBlockManagers .try_emplace (windowSize, dtype, windowSize, layersWithWindowSize, numKvHeadsPerLayer,
532
531
sizePerHead, tokensPerBlock, allottedPrimaryBlocks, allottedSecondaryBlocks, maxNumSequences, stream,
533
- onboardBlocks, cacheType, secondaryOffloadMinPriority, mEventManager , enableHashKey, enablePartialReuse,
532
+ onboardBlocks, cacheType, secondaryOffloadMinPriority, mEventManager , enablePartialReuse,
534
533
copyOnPartialReuse);
535
534
}
536
535
@@ -573,8 +572,7 @@ WindowBlockManager::WindowBlockManager(nvinfer1::DataType dtype, SizeType32 wind
573
572
SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool,
574
573
SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream, bool onboardBlocks, CacheType cacheType,
575
574
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
576
- std::shared_ptr<KVCacheEventManager> eventManager, bool enableHashKey, bool enablePartialReuse,
577
- bool copyOnPartialReuse)
575
+ std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse)
578
576
: mDataType {dtype}
579
577
, mWindowSize {windowSize}
580
578
, mNumPrimaryBlocks {blocksInPrimaryPool}
@@ -596,7 +594,6 @@ WindowBlockManager::WindowBlockManager(nvinfer1::DataType dtype, SizeType32 wind
596
594
, mLogPrefix {tensorrt_llm::common::fmtstr (" BlockManager[windowSize=%u]" , mWindowSize )}
597
595
, mReusedTokens {0.0 }
598
596
, mTotalInputTokens {0.0 }
599
- , mEnableHashKey {enableHashKey}
600
597
, mEnablePartialReuse {enablePartialReuse}
601
598
, mCopyOnPartialReuse {copyOnPartialReuse}
602
599
{
@@ -920,50 +917,6 @@ void BlockManager::setOffsets(tk::KVCacheIndex* offsetsPtr, nvinfer1::Dims const
920
917
mWindowBlockManagers .at (windowSize).setOffsets (offsetsPtr, offsetsShape, beamIdx, blockIdx, blockId);
921
918
}
922
919
923
- void WindowBlockManager::addBlockToHashMap (BlockPtr const & block)
924
- {
925
- if (!mEnableHashKey )
926
- {
927
- return ;
928
- }
929
- auto range = mContextBlocksByHash .equal_range (block->getHash ());
930
- for (auto it = range.first ; it != range.second ; ++it)
931
- {
932
- if (it->second == block)
933
- {
934
- // TODO: change to assert when reused block is added only once
935
- TLLM_LOG_TRACE (
936
- " Block %d by %zx exists" , block->getBlockId (), block->getHash (), mContextBlocksByHash .size ());
937
- return ;
938
- }
939
- }
940
- TLLM_LOG_TRACE (
941
- " Add block %d by %zx, block n = %zu" , block->getBlockId (), block->getHash (), mContextBlocksByHash .size ());
942
- mContextBlocksByHash .emplace (block->getHash (), std::move (block));
943
- }
944
-
945
- void WindowBlockManager::removeBlockFromHashMap (BlockPtr const & block)
946
- {
947
- if (mContextBlocksByHash .empty () || block->getBlockKey ().uniqueTokens .empty ())
948
- {
949
- // Hash key not enabled / Empty block
950
- return ;
951
- }
952
- auto range = mContextBlocksByHash .equal_range (block->getHash ());
953
- TLLM_LOG_TRACE (
954
- " Remove block %d by %zx, block n = %zu" , block->getBlockId (), block->getHash (), mContextBlocksByHash .size ());
955
- for (auto it = range.first ; it != range.second ; ++it)
956
- {
957
- if (it->second == block)
958
- {
959
- mContextBlocksByHash .erase (it);
960
- return ;
961
- }
962
- }
963
- // TODO: should be unreachable
964
- TLLM_LOG_DEBUG (" Trying to remove block %d by %zx that is not in hash map" , block->getBlockId (), block->getHash ());
965
- }
966
-
967
920
void BlockManager::onboardBlock (BlockPtr const & offloadBlock, SizeType32 windowSize)
968
921
{
969
922
mWindowBlockManagers .at (windowSize).onboardBlock (offloadBlock);
@@ -1104,7 +1057,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
1104
1057
matchingBlock, perBlockRetentions[bi].retentionPriority , perBlockRetentions[bi].durationMs );
1105
1058
TLLM_LOG_DEBUG (" %s::loadOrAllocateBlocks - Reused partially filled block %d" , mLogPrefix .c_str (),
1106
1059
matchingBlockId);
1107
- addBlockToHashMap (matchingBlock);
1108
1060
}
1109
1061
searchRoot = nullptr ; // no matching needed for following blocks
1110
1062
}
@@ -1114,7 +1066,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
1114
1066
mEvictionPolicy ->claimBlock (
1115
1067
matchingBlock, perBlockRetentions[bi].retentionPriority , perBlockRetentions[bi].durationMs );
1116
1068
TLLM_LOG_DEBUG (" %s::loadOrAllocateBlocks - Matched full block %d" , mLogPrefix .c_str (), matchingBlockId);
1117
- addBlockToHashMap (matchingBlock);
1118
1069
searchRoot = matchingBlock;
1119
1070
}
1120
1071
onboardBlock (matchingBlock);
@@ -1145,7 +1096,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
1145
1096
++blockItr;
1146
1097
}
1147
1098
freeBlock->setHash ();
1148
- addBlockToHashMap (freeBlock);
1149
1099
++mMissedBlocks ;
1150
1100
}
1151
1101
}
@@ -1169,7 +1119,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
1169
1119
++blockItr;
1170
1120
}
1171
1121
freeBlock->setHash ();
1172
- addBlockToHashMap (freeBlock);
1173
1122
TLLM_LOG_DEBUG (" %s::loadOrAllocateBlocks - Beam %d. Allocated non-shared block %d for bi %d" ,
1174
1123
mLogPrefix .c_str (), beamIdx, freeBlock->getBlockId (), bi);
1175
1124
}
@@ -1369,9 +1318,7 @@ void WindowBlockManager::storeBlocks(
1369
1318
if (oldHash != newHash)
1370
1319
{
1371
1320
TLLM_LOG_DEBUG (" #%d block hash %zx -> %zx" , block->getBlockId (), oldHash, newHash);
1372
- removeBlockFromHashMap (block);
1373
1321
block->setHash (newHash);
1374
- addBlockToHashMap (block);
1375
1322
}
1376
1323
searchRoot = block;
1377
1324
}
@@ -1408,7 +1355,6 @@ void WindowBlockManager::replaceSharedBlock(GenerationRequest& sequence, SizeTyp
1408
1355
if (!block->hasRefs ())
1409
1356
{
1410
1357
mEvictionPolicy ->releaseBlock (block);
1411
- removeBlockFromHashMap (block);
1412
1358
}
1413
1359
}
1414
1360
@@ -1473,7 +1419,6 @@ void WindowBlockManager::releaseLastBlock(GenerationRequest& sequence)
1473
1419
if (!block->hasRefs ())
1474
1420
{
1475
1421
mEvictionPolicy ->releaseBlock (block, true );
1476
- removeBlockFromHashMap (block);
1477
1422
}
1478
1423
// Remove block from allocated blocks
1479
1424
allocatedBlocks.pop_back ();
@@ -1616,7 +1561,6 @@ void WindowBlockManager::releaseBlocks(GenerationRequest& sequence)
1616
1561
if (!block->hasRefs ())
1617
1562
{
1618
1563
mEvictionPolicy ->releaseBlock (block);
1619
- removeBlockFromHashMap (block);
1620
1564
}
1621
1565
}
1622
1566
// Remove stored block ids in sequence
@@ -1682,8 +1626,7 @@ KVCacheManager::KVCacheManager(std::vector<SizeType32> const& numKvHeadsPerLayer
1682
1626
SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional<runtime::SizeType32> maxSequenceLength,
1683
1627
bool enableBlockReuse, bool onboardBlocks, CacheType cacheType,
1684
1628
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
1685
- std::shared_ptr<KVCacheEventManager> eventManager, bool enableHashKey, bool enablePartialReuse,
1686
- bool copyOnPartialReuse)
1629
+ std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse)
1687
1630
: mMaxBeamWidth (maxBeamWidth)
1688
1631
, mDataType (dtype)
1689
1632
, mMaxAttentionWindow (*std::max_element (maxAttentionWindowVec.begin(), maxAttentionWindowVec.end()))
@@ -1693,10 +1636,9 @@ KVCacheManager::KVCacheManager(std::vector<SizeType32> const& numKvHeadsPerLayer
1693
1636
, mBlockManager(numKvHeadsPerLayer, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
1694
1637
std::move (stream), maxSequenceLength, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype,
1695
1638
mSinkBubbleLength, onboardBlocks, cacheType, secondaryOffloadMinPriority, std::move(eventManager),
1696
- enableHashKey, enablePartialReuse, copyOnPartialReuse)
1639
+ enablePartialReuse, copyOnPartialReuse)
1697
1640
// disable block reuse for sink bubble since chopVectorIntoBlocks does not match KV cache blocks in this case
1698
1641
, mEnableBlockReuse{mSinkBubbleLength > 0 ? false : enableBlockReuse}
1699
- , mEnableHashKey {enableHashKey}
1700
1642
{
1701
1643
TLLM_CHECK_DEBUG (std::find (maxAttentionWindowVec.begin (), maxAttentionWindowVec.end (), mMaxAttentionWindow )
1702
1644
!= maxAttentionWindowVec.end ());
@@ -1716,12 +1658,11 @@ KVCacheManager::KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, Size
1716
1658
SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional<runtime::SizeType32> maxSequenceLength,
1717
1659
bool enableBlockReuse, bool onboardBlocks, CacheType cacheType,
1718
1660
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
1719
- std::shared_ptr<KVCacheEventManager> eventManager, bool enableHashKey, bool enablePartialReuse,
1720
- bool copyOnPartialReuse)
1661
+ std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse)
1721
1662
: KVCacheManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksPerWindow,
1722
1663
maxNumSequences, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype, sinkTokenLength,
1723
1664
std::move(stream), maxSequenceLength, enableBlockReuse, onboardBlocks, cacheType, secondaryOffloadMinPriority,
1724
- std::move(eventManager), enableHashKey, enablePartialReuse, copyOnPartialReuse)
1665
+ std::move(eventManager), enablePartialReuse, copyOnPartialReuse)
1725
1666
{
1726
1667
}
1727
1668
@@ -2085,30 +2026,6 @@ void KVCacheManager::addSequence(
2085
2026
llmRequest->mRequestId );
2086
2027
}
2087
2028
mBlockManager .addSequence (sequence, numContextBlocks, unsharedBlockIdx, windowSize);
2088
- if (mEnableHashKey && llmRequest.has_value () && beamWidth == 1 )
2089
- {
2090
- constexpr SizeType32 beamIdx = 0 ;
2091
- auto const & blockIds = sequence.getCacheBlockIds (windowSize).at (beamIdx);
2092
- auto const & uniqueTokens = llmRequest->getUniqueTokens (beamIdx);
2093
- auto blockedUniqueTokens = chopVectorIntoBlocks<UniqueToken>(
2094
- uniqueTokens, uniqueTokens.size () - 1 , getTokensPerBlock (), true );
2095
- auto blockKeys = buildBlockKeys (blockedUniqueTokens, *llmRequest);
2096
- auto tokensPerBlock = static_cast <size_t >(getTokensPerBlock ());
2097
- for (size_t i = 0 ; i < blockIds.size (); i++)
2098
- {
2099
- auto const & block = mBlockManager .getBlockById (blockIds[i], windowSize);
2100
- if (i < blockKeys.size ())
2101
- {
2102
- block->setBlockKey (blockKeys[i], blockKeys[i].uniqueTokens .size () == tokensPerBlock);
2103
- }
2104
- else
2105
- {
2106
- block->setBlockKey ({}, false );
2107
- }
2108
- block->setHash ();
2109
- mBlockManager .addBlockToHashMap (block, windowSize);
2110
- }
2111
- }
2112
2029
}
2113
2030
cacheBlockOffsets (sequence, windowSize);
2114
2031
}
0 commit comments