@@ -3053,189 +3053,6 @@ TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest)
3053
3053
assertBlocks (seq3, {4 }, {6 });
3054
3054
}
3055
3055
3056
- namespace
3057
- {
3058
- KVCacheManager setupKvCacheManagerForHashTest (bool enableBlockReuse)
3059
- {
3060
- auto constexpr numLayers = 2 ;
3061
- auto constexpr numHeads = 2 ;
3062
- auto constexpr sizePerHead = 64 ;
3063
- auto constexpr tokensPerBlock = 4 ;
3064
- auto constexpr maxNumSequences = 8 ;
3065
- auto constexpr maxBeamWidth = 1 ;
3066
- auto constexpr sinkTokenLength = 0 ;
3067
- auto const stream = std::make_shared<tr::CudaStream>();
3068
-
3069
- auto constexpr maxBlocksPerSeq = 8 ;
3070
- auto constexpr maxNumTokens = tokensPerBlock * maxBlocksPerSeq;
3071
- auto constexpr maxAttentionWindow = maxNumTokens;
3072
-
3073
- auto constexpr blocksInPrimaryPool = 16 ;
3074
- auto constexpr blocksInSecondaryPool = 0 ;
3075
-
3076
- auto constexpr onboardBlocks = true ;
3077
-
3078
- auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}};
3079
-
3080
- return KVCacheManager (std::vector<SizeType32>(numLayers, numHeads), sizePerHead, tokensPerBlock, blocksPerWindow,
3081
- maxNumSequences, maxBeamWidth, std::vector<BlockManager::SizeType32>{maxAttentionWindow}, std::nullopt,
3082
- nvinfer1::DataType::kHALF , sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks,
3083
- CacheType::kSELF , std::nullopt, nullptr ,
3084
- /* enableHashKey*/ true );
3085
- }
3086
-
3087
- std::vector<size_t > getHashAndRetrieveBlocksByHashTest (
3088
- BlockManager const & blockManager, std::vector<KVCacheBlock::IdType> const & blockIds, SizeType32 windowSize)
3089
- {
3090
- std::vector<size_t > blockHashes;
3091
- for (auto blockId : blockIds)
3092
- {
3093
- blockHashes.emplace_back (blockManager.getBlockById (blockId, windowSize)->getHash ());
3094
- }
3095
- std::vector<BlockPtr> blockPtrs;
3096
- for (auto hash : blockHashes)
3097
- {
3098
- auto range = blockManager.getBlocksByHash (hash, windowSize);
3099
- BlockPtr const prevBlock = blockPtrs.empty () ? nullptr : blockPtrs.back ();
3100
- BlockPtr thisBlock = nullptr ;
3101
- for (auto it = range.first ; it != range.second ; ++it)
3102
- {
3103
- if (it->second ->getPrevBlockInSeq () == prevBlock)
3104
- {
3105
- thisBlock = it->second ;
3106
- break ;
3107
- }
3108
- }
3109
- EXPECT_NE (thisBlock, nullptr );
3110
- blockPtrs.emplace_back (thisBlock);
3111
- }
3112
- EXPECT_EQ (blockHashes.size (), blockPtrs.size ());
3113
- for (size_t i = 0 ; i < blockHashes.size (); i++)
3114
- {
3115
- EXPECT_EQ (blockManager.getBlockById (blockIds[i], windowSize), blockPtrs[i]);
3116
- }
3117
- return blockHashes;
3118
- }
3119
- } // namespace
3120
-
3121
- TEST_F (KVCacheManagerTest, KVCacheManagerHashKeyTest)
3122
- {
3123
- auto kvCacheManager = setupKvCacheManagerForHashTest (false );
3124
-
3125
- auto const & blockManager = kvCacheManager.getBlockManager ();
3126
-
3127
- SizeType32 constexpr maxNewTokens = 4 ;
3128
-
3129
- // prepare tokens with token[i] = 1000 + i
3130
- TokenIdType constexpr firstToken = 1000 ;
3131
-
3132
- auto constexpr beamWidth = 1 ;
3133
- tr::SamplingConfig const samplingConfig{beamWidth};
3134
- bool constexpr isStreaming{false };
3135
-
3136
- SizeType32 requestId = 0 ;
3137
- int inputLength = 16 ;
3138
- auto inputTokens = std::make_shared<VecTokens>(inputLength);
3139
- std::iota (inputTokens->begin (), inputTokens->end (), firstToken);
3140
- auto llmRequest = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming);
3141
- auto constexpr beamIdx = 0 ;
3142
-
3143
- // /////////////////////////////////////////////////////////////////////////
3144
- // add a request and then remove it without reuse
3145
- kvCacheManager.addSequence (requestId, inputLength, beamWidth, llmRequest);
3146
- GenerationRequest const & seq = kvCacheManager.getSequence (requestId);
3147
- EXPECT_EQ (llmRequest->getContextCurrentPosition (), 0 );
3148
-
3149
- auto const onlyWindowSize = theOnlyWindowSize (kvCacheManager);
3150
-
3151
- auto & blockIds = seq.getCacheBlockIds (onlyWindowSize).at (beamIdx);
3152
- EXPECT_THAT (blockIds, ::testing::ElementsAreArray ({0 , 1 , 2 , 3 }));
3153
-
3154
- // get blocks by hash and try to retrieve them by hash
3155
- auto blockHashes = getHashAndRetrieveBlocksByHashTest (blockManager, blockIds, onlyWindowSize);
3156
-
3157
- EXPECT_NO_THROW (kvCacheManager.removeSequence (requestId, llmRequest));
3158
-
3159
- // blocks are all removed
3160
- for (auto hash : blockHashes)
3161
- {
3162
- auto range = blockManager.getBlocksByHash (hash, onlyWindowSize);
3163
- EXPECT_EQ (range.first , range.second );
3164
- }
3165
- EXPECT_EQ (blockManager.getNumAllocatedBlocks (), 0 );
3166
- }
3167
-
3168
- TEST_F (KVCacheManagerTest, KVCacheManagerHashKeyWithReuseTest)
3169
- {
3170
- auto kvCacheManager = setupKvCacheManagerForHashTest (true );
3171
-
3172
- auto const & blockManager = kvCacheManager.getBlockManager ();
3173
-
3174
- SizeType32 constexpr maxNewTokens = 4 ;
3175
-
3176
- // prepare tokens with token[i] = 1000 + i
3177
- TokenIdType constexpr firstToken = 1000 ;
3178
-
3179
- auto constexpr beamWidth = 1 ;
3180
- tr::SamplingConfig const samplingConfig{beamWidth};
3181
- bool constexpr isStreaming{false };
3182
-
3183
- SizeType32 requestId = 0 ;
3184
- int inputLength = 16 ;
3185
- auto inputTokens = std::make_shared<VecTokens>(inputLength);
3186
- std::iota (inputTokens->begin (), inputTokens->end (), firstToken);
3187
- auto llmRequest = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming);
3188
- auto constexpr beamIdx = 0 ;
3189
-
3190
- // /////////////////////////////////////////////////////////////////////////
3191
- // add a request and then remove it with reuse
3192
- kvCacheManager.addSequence (requestId, inputLength, beamWidth, llmRequest);
3193
- GenerationRequest const & seq0 = kvCacheManager.getSequence (requestId);
3194
- EXPECT_EQ (llmRequest->getContextCurrentPosition (), 0 );
3195
-
3196
- EXPECT_EQ (blockManager.getNumPools (), 1 );
3197
- auto const onlyWindowSize = theOnlyWindowSize (kvCacheManager);
3198
-
3199
- auto & blockIds0 = seq0.getCacheBlockIds (onlyWindowSize).at (beamIdx);
3200
- EXPECT_THAT (blockIds0, ::testing::ElementsAreArray ({0 , 1 , 2 , 3 }));
3201
-
3202
- // get blocks by hash and try to retrieve them by hash
3203
- auto blockHashes = getHashAndRetrieveBlocksByHashTest (blockManager, blockIds0, onlyWindowSize);
3204
-
3205
- EXPECT_NO_THROW (kvCacheManager.removeSequence (requestId, llmRequest));
3206
-
3207
- // TODO: Make reused blocks accessible by hash, after sequence removed. Test here.
3208
-
3209
- // /////////////////////////////////////////////////////////////////////////
3210
- // add a new request with same prefix
3211
- requestId = 1 ;
3212
- inputLength = 20 ;
3213
- inputTokens->resize (inputLength);
3214
- std::iota (inputTokens->begin (), inputTokens->end (), firstToken);
3215
- llmRequest = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming);
3216
- kvCacheManager.addSequence (requestId, inputLength, beamWidth, llmRequest);
3217
- GenerationRequest const & seq1 = kvCacheManager.getSequence (requestId);
3218
- EXPECT_EQ (llmRequest->getContextCurrentPosition (), 15 );
3219
- auto & blockIds1 = seq1.getCacheBlockIds (onlyWindowSize).at (beamIdx);
3220
- EXPECT_THAT (blockIds1, ::testing::ElementsAreArray ({0 , 1 , 2 , 3 , 4 }));
3221
-
3222
- std::ignore = getHashAndRetrieveBlocksByHashTest (blockManager, blockIds1, onlyWindowSize);
3223
-
3224
- // blocks are reused, so reused blocks are still accessible by previous hashes
3225
- for (size_t i = 0 ; i < 4 ; i++)
3226
- {
3227
- auto range = blockManager.getBlocksByHash (blockHashes[i], onlyWindowSize);
3228
- EXPECT_NE (range.first , range.second );
3229
- }
3230
- // evicted block is not accessible
3231
- {
3232
- size_t i = 4 ;
3233
- auto range = blockManager.getBlocksByHash (blockHashes[i], onlyWindowSize);
3234
- EXPECT_EQ (range.first , range.second );
3235
- }
3236
- EXPECT_EQ (blockManager.getNumAllocatedBlocks (), 5 );
3237
- }
3238
-
3239
3056
TEST_F (KVCacheManagerTest, KVCacheManagerEventStream)
3240
3057
{
3241
3058
auto constexpr numLayers = 12 ;
0 commit comments