[ET-VK] 7/n Split dispatches between multiple command buffers. Split execute dispatch into multiple commands based on dispatch count.

trivedivivek · trivedivivek · commit 463b6327660f · 2025-08-04T12:41:51.000-07:00
Pull Request resolved: #12530 This diff splits the execute dispatch into multiple command buffers based on the dispatch count. This allows for concurrent CPU and GPU execution. The modifications involve adding a counter `encoded_node_count` to track the number of encoded nodes and submitting a new command buffer to the GPU once `execute_initial_threshold_node_count` nodes (64 by default) have been encoded, and again after every further `execute_threshold_node_count` nodes (128 by default). ghstack-source-id: 300616853 @exported-using-ghexport Differential Revision: [D78360039](https://our.internmc.facebook.com/intern/diff/D78360039/)
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -151,6 +151,10 @@ ComputeGraph::ComputeGraph(GraphConfig config)
     config_.prepack_threshold_nbytes = 10 * MB;
     config_.prepack_initial_threshold_nbytes = 10 * MB;
   }
+  if (config_.execute_threshold_node_count == 0) {
+    config_.execute_threshold_node_count = 128;
+    config_.execute_initial_threshold_node_count = 64;
+  }
 }
 
 ComputeGraph::~ComputeGraph() {
@@ -852,15 +856,38 @@ void ComputeGraph::execute() {
     context_->set_cmd(/*reusable = */ true);
 
     context_->cmd_reset_querypool();
+    uint32_t encoded_node_count = 0;
 
     for (std::unique_ptr<ExecuteNode>& node : execute_nodes_) {
       node->encode(this);
+      encoded_node_count++;
+
+      // The threshold is reached when the node count first reaches
+      // execute_initial_threshold_node_count, and again each time it exceeds
+      // that value by a multiple of execute_threshold_node_count.
+      const bool reached_threshold =
+          encoded_node_count >= config_.execute_initial_threshold_node_count &&
+          ((encoded_node_count - config_.execute_initial_threshold_node_count) %
+               config_.execute_threshold_node_count ==
+           0);
+
+      // Create a new command buffer when the threshold is reached
+      if (reached_threshold) {
+        context_->submit_cmd_to_gpu(VK_NULL_HANDLE, false);
+        deferred_cmd_list_.emplace_back(std::move(context_->extract_cmd()));
+        context_->set_cmd(true);
+      }
     }
 
+    vkapi::VulkanFence fence = context_->fences().get_fence();
+    context_->submit_cmd_to_gpu(fence.get_submit_handle(), false);
+    fence.wait();
+    context_->fences().return_fence(fence);
     deferred_cmd_list_.emplace_back(std::move(context_->extract_cmd()));
+  } else {
+    submit_deferred_cmds_and_wait();
   }
 
-  submit_deferred_cmds_and_wait();
   execute_count_++;
 }
 
diff --git a/backends/vulkan/runtime/graph/GraphConfig.h b/backends/vulkan/runtime/graph/GraphConfig.h
@@ -50,6 +50,17 @@ struct GraphConfig final {
   // by taking more advantage of parallelism between the CPU and GPU.
   size_t prepack_initial_threshold_nbytes = 0;
 
+  // During execute, once this node count is reached, submit the current
+  // command buffer for execution. This allows the work to be distributed over
+  // multiple command buffer submissions, which can improve execution
+  // performance.
+  size_t execute_threshold_node_count = 0;
+  // Execute node count used for the first command buffer submission during
+  // execute. This can be set to be lower than execute_threshold_node_count to
+  // submit a command buffer for execution earlier which can improve performance
+  // by taking more advantage of parallelism between the CPU and GPU.
+  size_t execute_initial_threshold_node_count = 0;
+
   vkapi::Adapter* external_adapter;
 
   // Generate a default graph config with pre-configured settings