[ET-VK] 5/n Split dispatches between multiple command buffers. Add support for deferring command buffers in the `encode_execute` function and submitting all deferred command buffers in the `execute` function.

trivedivivek · trivedivivek · commit 5aa78c757df7 · 2025-07-16T12:29:58.000-07:00
Pull Request resolved: #12527

This diff adds changes to store command buffers submitted with `final_use` set to false. Storing these buffers is necessary for the `execute()` function. Since `encode_execute()` is typically called once but `execute()` can be called multiple times, the `submit_all_non_final_cmds` function is added so that all command buffers recorded with `final_use = False` can be submitted multiple times in `execute()`.

#### Key Changes

* Added a flag `execute_pending_first_submission` to the `ComputeGraph` class to track whether execute nodes have been freshly encoded and need to be submitted first.
* Added a new function `submit_all_non_final_cmds` to the `Context` class, which submits all non-final command buffers to the GPU.
* Modified the `submit_cmd_to_gpu` function to add the submitted command buffer to the `non_final_cmds_` list if it is not marked as final use.
* Updated the `execute` function in `ComputeGraph` to submit all non-final command buffers before executing the graph.

ghstack-source-id: 296562130
@exported-using-ghexport

Differential Revision: [D78360038](https://our.internmc.facebook.com/intern/diff/D78360038/)
diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp
@@ -217,7 +217,7 @@ void Context::submit_cmd_to_gpu(VkFence fence_handle, const bool final_use) {
 }
 
 void Context::flush() {
-  VK_CHECK(vkQueueWaitIdle(queue()));
+  VK_CHECK(vkQueueWaitIdle(queue().handle));
 
   command_pool_.flush();
   descriptor_pool_.flush();
diff --git a/backends/vulkan/runtime/api/Context.h b/backends/vulkan/runtime/api/Context.h
@@ -90,8 +90,8 @@ class Context final {
     return device_;
   }
 
-  inline VkQueue queue() {
-    return queue_.handle;
+  inline vkapi::Adapter::Queue& queue() {
+    return queue_;
   }
 
   // Device Caches
@@ -230,6 +230,10 @@ class Context final {
       VkFence fence_handle = VK_NULL_HANDLE,
       const bool final_use = false);
 
+  vkapi::CommandBuffer& extract_cmd() {
+    return cmd_;
+  }
+
   void flush();
 
 #ifdef VULKAN_DEBUG
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -158,6 +158,7 @@ ComputeGraph::~ComputeGraph() {
 
   prepack_nodes_.clear();
   execute_nodes_.clear();
+  deferred_cmd_list_.clear();
 
   context_->flush();
 }
@@ -767,6 +768,30 @@ void ComputeGraph::submit_current_cmd_and_wait(const bool final_use) {
   context_->fences().return_fence(fence);
 }
 
+void ComputeGraph::submit_deferred_cmds() {
+  VkSemaphore prev_semaphore = VK_NULL_HANDLE;
+  vkapi::VulkanFence fence = context_->fences().get_fence();
+
+  for (uint32_t i = 0; i < deferred_cmd_list_.size(); i++) {
+    auto& cmd = deferred_cmd_list_[i];
+    VkSemaphore wait_semaphore = prev_semaphore;
+    VkSemaphore signal_semaphore = cmd.get_signal_semaphore();
+    prev_semaphore = signal_semaphore;
+
+    if (cmd) {
+      cmd.end();
+      context_->adapter_ptr()->submit_cmd(
+          context_->queue(),
+          cmd.get_submit_handle(false),
+          i == (deferred_cmd_list_.size() - 1) ? fence.get_submit_handle() : VK_NULL_HANDLE,
+          wait_semaphore,
+          signal_semaphore);
+    }
+  }
+  fence.wait();
+  context_->fences().return_fence(fence);
+}
+
 void ComputeGraph::prepack() {
   int i = 0;
   bool submitted = false;
@@ -805,6 +830,7 @@ void ComputeGraph::prepack() {
 }
 
 void ComputeGraph::encode_execute() {
+  deferred_cmd_list_.clear();
   context_->flush();
   context_->set_cmd(/*reusable = */ true);
 
@@ -813,13 +839,12 @@ void ComputeGraph::encode_execute() {
   for (std::unique_ptr<ExecuteNode>& node : execute_nodes_) {
     node->encode(this);
   }
+
+  deferred_cmd_list_.emplace_back(std::move(context_->extract_cmd()));
 }
 
 void ComputeGraph::execute() {
-  vkapi::VulkanFence fence = context_->fences().get_fence();
-  context_->submit_cmd_to_gpu(fence.get_submit_handle());
-  fence.wait();
-  context_->fences().return_fence(fence);
+  submit_deferred_cmds();
   execute_count_++;
 }
 
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -193,6 +193,9 @@ class ComputeGraph final {
   // Utility constexpr to express byte quantities
   constexpr static size_t MB = 1024 * 1024;
 
+  // List of command buffers deferred for submission
+  std::vector<vkapi::CommandBuffer> deferred_cmd_list_;
+
  protected:
   size_t values_in_use_ = 0;
   size_t execute_count_ = 0;
@@ -851,6 +854,11 @@ class ComputeGraph final {
    */
   void submit_current_cmd_and_wait(const bool final_use = false);
 
+  /*
+   * Submits all the commands gathered in deferred_cmd_list_ to the GPU.
+   */
+  void submit_deferred_cmds();
+
  public:
   //
   // Graph Prepacking

Original file line number	Diff line number	Diff line change
`@@ -217,7 +217,7 @@ void Context::submit_cmd_to_gpu(VkFence fence_handle, const bool final_use) {`
`217`	`217`	`}`
`218`	`218`
`219`	`219`	`void Context::flush() {`
`220`		`- VK_CHECK(vkQueueWaitIdle(queue()));`
	`220`	`+ VK_CHECK(vkQueueWaitIdle(queue().handle));`
`221`	`221`
`222`	`222`	`command_pool_.flush();`
`223`	`223`	`descriptor_pool_.flush();`