@@ -47,9 +47,7 @@ HierarchyBlockManagerPool::HierarchyBlockManagerPool(
4747 }
4848
4949 load_block_transfer_infos_.resize (host_block_managers_.size ());
50- offload_block_transfer_infos_.resize (host_block_managers_.size ());
51- saved_host_blocks_.resize (host_block_managers_.size ());
52- saved_device_blocks_.resize (host_block_managers_.size ());
50+ offload_block_pair_queues_.resize (host_block_managers_.size ());
5351}
5452
5553void HierarchyBlockManagerPool::deallocate (Sequence* sequence) {
@@ -68,10 +66,6 @@ void HierarchyBlockManagerPool::deallocate(Sequence* sequence) {
6866 size_t cached_block_num =
6967 sequence->host_kv_state ().kv_cache_tokens_num () / options_.block_size ();
7068
71- if (host_blocks->size () > 0 ) {
72- host_block_managers_[dp_rank]->cache (sequence->tokens (), *host_blocks);
73- }
74-
7569 size_t needed_block_num =
7670 sequence->num_tokens () / options_.block_size () - host_blocks->size ();
7771
@@ -88,14 +82,9 @@ void HierarchyBlockManagerPool::deallocate(Sequence* sequence) {
8882 }
8983
9084 host_blocks->at (i).set_hash_value (blocks->at (i).get_immutable_hash_value ());
91- saved_host_blocks_[dp_rank].emplace_back (std::move (host_blocks->at (i)));
92- saved_device_blocks_[dp_rank].emplace_back (std::move (blocks->at (i)));
93- offload_block_transfer_infos_[dp_rank].emplace_back (BlockTransferInfo (
94- saved_device_blocks_[dp_rank].back ().id (),
95- saved_host_blocks_[dp_rank].back ().id (),
96- saved_host_blocks_[dp_rank].back ().get_immutable_hash_value (),
97- saved_host_blocks_[dp_rank].back ().get_hash_value_len (),
98- TransferType::D2G));
85+ auto block_pair = std::make_shared<OffloadBlockPair>(
86+ std::move (blocks->at (i)), std::move (host_blocks->at (i)));
87+ offload_block_pair_queues_[dp_rank].enqueue (std::move (block_pair));
9988 }
10089 host_block_managers_[dp_rank]->cache (
10190 *sequence->host_kv_state ().mutable_kv_blocks ());
@@ -235,36 +224,50 @@ void HierarchyBlockManagerPool::transfer_blocks(
235224 }
236225
237226 // offload blocks from device to host and kvcache store
238- for (int i = 0 ; i < offload_block_transfer_infos_.size (); i++) {
239- if (!offload_block_transfer_infos_[i].empty ()) {
240- folly::collectAll (std::move (engine_->transfer_kv_blocks (
241- i, std::move (offload_block_transfer_infos_[i]))))
227+ for (int i = 0 ; i < offload_block_pair_queues_.size (); i++) {
228+ std::vector<BlockTransferInfo> transfer_infos;
229+ std::vector<Block> src_blocks;
230+ std::vector<Block> dst_blocks;
231+
232+ std::shared_ptr<OffloadBlockPair> block_pair;
233+ while (offload_block_pair_queues_[i].try_dequeue (block_pair)) {
234+ src_blocks.emplace_back (std::move (block_pair->src ));
235+ dst_blocks.emplace_back (std::move (block_pair->dst ));
236+ transfer_infos.emplace_back (
237+ BlockTransferInfo (src_blocks.back ().id (),
238+ dst_blocks.back ().id (),
239+ dst_blocks.back ().get_immutable_hash_value (),
240+ TransferType::D2G));
241+ block_pair.reset ();
242+ }
243+
244+ if (!transfer_infos.empty ()) {
245+ folly::collectAll (
246+ std::move (engine_->transfer_kv_blocks (i, std::move (transfer_infos))))
242247 .via (folly::getGlobalCPUExecutor ())
243- .thenValue ([host_blocks = std::move (saved_host_blocks_[i] ),
244- device_blocks = std::move (saved_device_blocks_[i] ),
245- host_block_mgr_ptr = host_block_managers_ [i].get (),
246- device_block_mgr_ptr = block_managers_ [i].get ()](
247- std::vector<folly::Try<uint32_t >>&& results) {
248+ .thenValue ([device_blocks = std::move (src_blocks ),
249+ host_blocks = std::move (dst_blocks ),
250+ device_block_mgr_ptr = block_managers_ [i].get (),
251+ host_block_mgr_ptr = host_block_managers_ [i].get ()](
252+ std::vector<folly::Try<uint32_t >>&& results) mutable {
248253 for (auto && result : results) {
249254 if (result.value () != host_blocks.size ()) {
250255 LOG (FATAL) << " Offload copy fail, expected "
251256 << host_blocks.size () << " , got " << result.value ();
252257 }
253258 }
259+
260+ device_block_mgr_ptr->deallocate ({device_blocks});
261+ device_blocks.clear ();
262+
254263 host_block_mgr_ptr->cache (host_blocks);
255264 host_block_mgr_ptr->deallocate ({host_blocks});
256- device_block_mgr_ptr->deallocate ({device_blocks});
265+ host_blocks.clear ();
266+
257267 return 0 ;
258268 });
259269 }
260270 }
261-
262- offload_block_transfer_infos_.clear ();
263- saved_host_blocks_.clear ();
264- saved_device_blocks_.clear ();
265- offload_block_transfer_infos_.resize (host_block_managers_.size ());
266- saved_host_blocks_.resize (host_block_managers_.size ());
267- saved_device_blocks_.resize (host_block_managers_.size ());
268271}
269272
270273void HierarchyBlockManagerPool::get_merged_kvcache_event (
0 commit comments