bugfix: resolve multi-threaded data race in KV cache event reporting.

Kang-Meng · Kang-Meng · commit 08cdf97d30a8 · 2025-12-16T18:16:59.000+08:00
diff --git a/xllm/core/framework/block/block_manager.h b/xllm/core/framework/block/block_manager.h
@@ -66,7 +66,6 @@ class BlockManager {
 
   // get merged all dp rank KVCacheEvent
   virtual void get_merged_kvcache_event(KvCacheEvent* event) const = 0;
-  virtual float get_gpu_cache_usage_perc() const = 0;
 
   virtual size_t num_blocks_in_prefix_cache() const = 0;
   virtual size_t num_free_blocks() const = 0;
diff --git a/xllm/core/framework/block/block_manager_impl.cpp b/xllm/core/framework/block/block_manager_impl.cpp
@@ -70,9 +70,7 @@ void BlockManagerImpl::deallocate(const Slice<Block>& blocks) {
     for (const auto& block : blocks) {
       // the block is not shared by other sequence
       if (block.is_valid() && block.ref_count() <= 2) {
-        auto origin_num_used_blocks =
-            num_used_blocks_.fetch_sub(1, std::memory_order_relaxed);
-        if (origin_num_used_blocks < 0) {
+        if (num_used_blocks_ == 0) {
           LOG(ERROR) << "num_used_blocks_==0 cannot fetch_sub for id:"
                      << block.id()
                      << ", total block size: " << num_total_blocks();
@@ -86,6 +84,7 @@ void BlockManagerImpl::deallocate(const Slice<Block>& blocks) {
           }
           LOG(FATAL) << error_msg;
         }
+        num_used_blocks_.fetch_sub(1, std::memory_order_relaxed);
       }
     }
   } else {
diff --git a/xllm/core/framework/block/block_manager_impl.h b/xllm/core/framework/block/block_manager_impl.h
@@ -76,10 +76,6 @@ class BlockManagerImpl : public BlockManager {
     }
   }
 
-  float get_gpu_cache_usage_perc() const override {
-    return 1 - static_cast<float>(num_free_blocks_) / num_total_blocks();
-  }
-
   // call BlockManager to free block used by Block.
   void free(int32_t block_id) override;
 
diff --git a/xllm/core/framework/block/block_manager_pool.cpp b/xllm/core/framework/block/block_manager_pool.cpp
@@ -30,7 +30,9 @@ BlockManagerPool::BlockManagerPool(const Options& options, int32_t dp_size)
       .block_size(options_.block_size())
       .enable_prefix_cache(options_.enable_prefix_cache())
       .enable_disagg_pd(options_.enable_disagg_pd())
-      .enable_cache_upload(options_.enable_cache_upload());
+      .enable_cache_upload(options_.host_num_blocks() > 0
+                               ? false
+                               : options_.enable_cache_upload());
 
   for (int32_t i = 0; i < dp_size; ++i) {
     if (options.enable_disagg_pd() || options_.enable_kvcache_store()) {
@@ -221,7 +223,7 @@ void BlockManagerPool::get_merged_kvcache_event(KvCacheEvent* event) const {
 float BlockManagerPool::get_gpu_cache_usage_perc() const {
   float perc = 0.0;
   for (int32_t i = 0; i < block_managers_.size(); ++i) {
-    perc += block_managers_[i]->get_gpu_cache_usage_perc();
+    perc += block_managers_[i]->kv_cache_utilization();
   }
   return perc / block_managers_.size();
 }
diff --git a/xllm/core/framework/block/hierarchy_block_manager_pool.cpp b/xllm/core/framework/block/hierarchy_block_manager_pool.cpp
@@ -34,7 +34,7 @@ HierarchyBlockManagerPool::HierarchyBlockManagerPool(
       .enable_prefix_cache(options_.enable_prefix_cache())
       .enable_disagg_pd(options_.enable_disagg_pd())
       .num_blocks(options_.host_num_blocks())
-      .enable_cache_upload(false);
+      .enable_cache_upload(options_.enable_cache_upload());
 
   for (int32_t i = 0; i < dp_size; ++i) {
     if (options.enable_disagg_pd() || options_.enable_kvcache_store()) {
@@ -69,13 +69,11 @@ void HierarchyBlockManagerPool::deallocate(Sequence* sequence) {
   size_t needed_block_num =
       sequence->num_tokens() / options_.block_size() - host_blocks->size();
 
-  if (needed_block_num == 0) {
-    return;
+  if (needed_block_num != 0) {
+    sequence->host_kv_state().add_kv_blocks(
+        host_block_managers_[dp_rank]->allocate(needed_block_num));
   }
 
-  sequence->host_kv_state().add_kv_blocks(
-      host_block_managers_[dp_rank]->allocate(needed_block_num));
-
   for (size_t i = cached_block_num; i < host_blocks->size(); i++) {
     if (blocks->at(i).ref_count() != 2) {
       continue;
@@ -86,8 +84,7 @@ void HierarchyBlockManagerPool::deallocate(Sequence* sequence) {
         std::move(blocks->at(i)), std::move(host_blocks->at(i)));
     offload_block_pair_queues_[dp_rank].enqueue(std::move(block_pair));
   }
-  host_block_managers_[dp_rank]->cache(
-      *sequence->host_kv_state().mutable_kv_blocks());
+
   host_block_managers_[dp_rank]->deallocate(
       sequence->host_kv_state().kv_blocks());
 
diff --git a/xllm/core/framework/prefix_cache/prefix_cache.cpp b/xllm/core/framework/prefix_cache/prefix_cache.cpp
@@ -124,6 +124,11 @@ size_t PrefixCache::insert(const Slice<int32_t>& token_ids,
   return insert(token_ids, blocks, &insert_keys);
 }
 
+size_t PrefixCache::insert(const std::vector<Block>& blocks) {
+  std::vector<Murmur3Key> insert_keys;
+  return insert(blocks, &insert_keys);
+}
+
 size_t PrefixCache::evict(size_t n_blocks) {
   std::vector<Murmur3Key> evict_keys;
   return evict(n_blocks, &evict_keys);
@@ -192,11 +197,13 @@ size_t PrefixCache::insert(const Slice<int32_t>& token_ids,
   return n_tokens;
 }
 
-size_t PrefixCache::insert(const std::vector<Block>& blocks) {
+size_t PrefixCache::insert(const std::vector<Block>& blocks,
+                           std::vector<Murmur3Key>* insert_keys) {
   const int64_t now = absl::ToUnixMicros(absl::Now());
   DNodeList node_list;
   Murmur3Key token_hash_key;
 
+  insert_keys->reserve(blocks.size());
   for (size_t i = 0; i < blocks.size(); i++) {
     if (!blocks[i].is_valid()) {
       continue;
@@ -220,6 +227,8 @@ size_t PrefixCache::insert(const std::vector<Block>& blocks) {
       cached_blocks_.emplace(std::make_pair(token_hash_key, new_node));
 
       num_blocks_++;
+
+      insert_keys->emplace_back(token_hash_key.data);
     }
   }
 
diff --git a/xllm/core/framework/prefix_cache/prefix_cache.h b/xllm/core/framework/prefix_cache/prefix_cache.h
@@ -66,8 +66,13 @@ class PrefixCache {
       const Slice<int32_t>& token_ids,
       const Slice<Block>& existed_shared_blocks = {});
 
+  // insert the token ids and blocks into the prefix tree
+  // and set the hash key on the corresponding blocks
+  // return the number of newly inserted tokens
   virtual size_t insert(const Slice<int32_t>& token_ids,
                         std::vector<Block>& blocks);
+
+  // insert the blocks with hash key into the prefix tree
   virtual size_t insert(const std::vector<Block>& blocks);
 
   // evict blocks hold by the prefix cache
@@ -98,6 +103,10 @@ class PrefixCache {
   size_t insert(const Slice<int32_t>& token_ids,
                 std::vector<Block>& blocks,
                 std::vector<Murmur3Key>* insert_keys);
+
+  size_t insert(const std::vector<Block>& blocks,
+                std::vector<Murmur3Key>* insert_keys);
+
   size_t evict(size_t n_blocks, std::vector<Murmur3Key>* evict_keys);
 
   struct Node {
diff --git a/xllm/core/framework/prefix_cache/prefix_cache_with_upload.cpp b/xllm/core/framework/prefix_cache/prefix_cache_with_upload.cpp
@@ -35,47 +35,54 @@ size_t PrefixCacheWithUpload::insert(const Slice<int32_t>& token_ids,
                                      std::vector<Block>& blocks) {
   std::vector<Murmur3Key> insert_keys;
   auto n_tokens = PrefixCache::insert(token_ids, blocks, &insert_keys);
+  save_event_async(true, insert_keys);
+  return n_tokens;
+}
 
-  threadpool_.schedule([insert_keys = std::move(insert_keys), this]() {
-    auto front_ptr = this->db_kvcache_events_.get_front_value();
-    if (!front_ptr) {
-      LOG(INFO) << "Front DoubleBufferKvCacheEvent is nullptr!";
-      return;
-    }
-    if (!this->exited_.load()) {
-      for (const auto& hash_id : insert_keys) {
-        front_ptr->removed_cache.erase(hash_id);
-        front_ptr->stored_cache.insert(hash_id);
-      }
-    }
-  });
-
+size_t PrefixCacheWithUpload::insert(const std::vector<Block>& blocks) {
+  std::vector<Murmur3Key> insert_keys;
+  auto n_tokens = PrefixCache::insert(blocks, &insert_keys);
+  save_event_async(true, insert_keys);
   return n_tokens;
 }
 
 size_t PrefixCacheWithUpload::evict(size_t n_blocks) {
   std::vector<Murmur3Key> evict_keys;
   auto evict_count = PrefixCache::evict(n_blocks, &evict_keys);
+  save_event_async(false, evict_keys);
+  return evict_count;
+}
 
-  threadpool_.schedule([evict_keys = std::move(evict_keys), this]() {
+void PrefixCacheWithUpload::save_event_async(const bool is_insert,
+                                             std::vector<Murmur3Key>& keys) {
+  threadpool_.schedule([this, is_insert = is_insert, keys = std::move(keys)]() {
+    std::lock_guard<std::mutex> lock(this->mutex_);
     auto front_ptr = this->db_kvcache_events_.get_front_value();
     if (!front_ptr) {
       LOG(INFO) << "Front DoubleBufferKvCacheEvent is nullptr!";
       return;
     }
     if (!this->exited_.load()) {
-      for (const auto& hash_id : evict_keys) {
-        front_ptr->removed_cache.insert(hash_id);
-        front_ptr->stored_cache.erase(hash_id);
+      if (is_insert) {
+        for (const auto& key : keys) {
+          front_ptr->removed_cache.erase(key);
+          front_ptr->stored_cache.insert(key);
+        }
+      } else {
+        for (const auto& key : keys) {
+          front_ptr->removed_cache.insert(key);
+          front_ptr->stored_cache.erase(key);
+        }
       }
     }
   });
-
-  return evict_count;
 }
 
 KvCacheEvent* PrefixCacheWithUpload::get_upload_kvcache_events() {
-  db_kvcache_events_.swap();
+  {
+    std::lock_guard<std::mutex> lock(this->mutex_);
+    db_kvcache_events_.swap();
+  }
   if (!exited_.load()) {
     return db_kvcache_events_.get_back_value();
   } else {
diff --git a/xllm/core/framework/prefix_cache/prefix_cache_with_upload.h b/xllm/core/framework/prefix_cache/prefix_cache_with_upload.h
@@ -18,15 +18,22 @@ class PrefixCacheWithUpload final : public PrefixCache {
   size_t insert(const Slice<int32_t>& token_ids,
                 std::vector<Block>& blocks) override;
 
+  // insert the blocks with hash key into the prefix tree
+  size_t insert(const std::vector<Block>& blocks) override;
+
   // evict blocks hold by the prefix cache
   // return the actual number of evicted blocks
   size_t evict(size_t n_blocks) override;
 
   virtual KvCacheEvent* get_upload_kvcache_events() override;
 
+ private:
+  void save_event_async(const bool is_insert, std::vector<Murmur3Key>& keys);
+
  private:
   ThreadPool threadpool_;
 
+  std::mutex mutex_;
   DoubleBuffer<KvCacheEvent> db_kvcache_events_;
 };
 
diff --git a/xllm/core/util/hash_util.h b/xllm/core/util/hash_util.h
@@ -43,7 +43,7 @@ struct Murmur3Key {
     std::memcpy(data, input_data, MURMUR_HASH3_VALUE_LEN);
   }
 
-  std::string debug_string() {
+  std::string debug_string() const {
     std::string rt;
     for (int i = 0; i < MURMUR_HASH3_VALUE_LEN; i++) {
       rt += std::to_string(int64_t(data[i])) + " ";
diff --git a/xllm/xllm.cpp b/xllm/xllm.cpp
@@ -200,18 +200,19 @@ int run() {
       .enable_schedule_overlap(FLAGS_enable_schedule_overlap)
       .kv_cache_transfer_mode(FLAGS_kv_cache_transfer_mode)
       .etcd_addr(FLAGS_etcd_addr)
-      .enable_service_routing(FLAGS_enable_service_routing)
+      .enable_service_routing(FLAGS_enable_service_routing ||
+                              FLAGS_enable_disagg_pd)
       .tool_call_parser(FLAGS_tool_call_parser)
       .reasoning_parser(FLAGS_reasoning_parser)
       .priority_strategy(FLAGS_priority_strategy)
       .enable_online_preempt_offline(FLAGS_enable_online_preempt_offline)
-      .enable_cache_upload(FLAGS_enable_prefix_cache &&
-                           FLAGS_enable_service_routing &&
-                           FLAGS_enable_cache_upload)
+      .enable_cache_upload(
+          (FLAGS_enable_service_routing || FLAGS_enable_disagg_pd) &&
+          FLAGS_enable_prefix_cache && FLAGS_enable_cache_upload)
       .host_blocks_factor(FLAGS_host_blocks_factor)
       .enable_kvcache_store(FLAGS_enable_kvcache_store &&
                             FLAGS_enable_prefix_cache &&
-                            (FLAGS_host_blocks_factor > 0.0))
+                            (FLAGS_host_blocks_factor > 1.0))
       .prefetch_timeout(FLAGS_prefetch_timeout)
       .prefetch_bacth_size(FLAGS_prefetch_bacth_size)
       .layers_wise_copy_batchs(FLAGS_layers_wise_copy_batchs)

Original file line number	Diff line number	Diff line change
`@@ -70,9 +70,7 @@ void BlockManagerImpl::deallocate(const Slice<Block>& blocks) {`
`70`	`70`	`for (const auto& block : blocks) {`
`71`	`71`	`// the block is not shared by other sequence`
`72`	`72`	`if (block.is_valid() && block.ref_count() <= 2) {`
`73`		`- auto origin_num_used_blocks =`
`74`		`- num_used_blocks_.fetch_sub(1, std::memory_order_relaxed);`
`75`		`- if (origin_num_used_blocks < 0) {`
	`73`	`+ if (num_used_blocks_ == 0) {`
`76`	`74`	`LOG(ERROR) << "num_used_blocks_==0 cannot fetch_sub for id:"`
`77`	`75`	`<< block.id()`
`78`	`76`	`<< ", total block size: " << num_total_blocks();`
`@@ -86,6 +84,7 @@ void BlockManagerImpl::deallocate(const Slice<Block>& blocks) {`
`86`	`84`	`}`
`87`	`85`	`LOG(FATAL) << error_msg;`
`88`	`86`	`}`
	`87`	`+ num_used_blocks_.fetch_sub(1, std::memory_order_relaxed);`
`89`	`88`	`}`
`90`	`89`	`}`
`91`	`90`	`} else {`
Original file line number	Diff line number	Diff line change
`@@ -76,10 +76,6 @@ class BlockManagerImpl : public BlockManager {`
`76`	`76`	`}`
`77`	`77`	`}`
`78`	`78`
`79`		`- float get_gpu_cache_usage_perc() const override {`
`80`		`- return 1 - static_cast<float>(num_free_blocks_) / num_total_blocks();`
`81`		`- }`
`82`		`-`
`83`	`79`	`// call BlockManager to free block used by Block.`
`84`	`80`	`void free(int32_t block_id) override;`
`85`	`81`
Original file line number	Diff line number	Diff line change
`@@ -43,7 +43,7 @@ struct Murmur3Key {`
`43`	`43`	`std::memcpy(data, input_data, MURMUR_HASH3_VALUE_LEN);`
`44`	`44`	`}`
`45`	`45`
`46`		`- std::string debug_string() {`
	`46`	`+ std::string debug_string() const {`
`47`	`47`	`std::string rt;`
`48`	`48`	`for (int i = 0; i < MURMUR_HASH3_VALUE_LEN; i++) {`
`49`	`49`	`rt += std::to_string(int64_t(data[i])) + " ";`