
Commit bfd39e9

[ET-VK] Split up prepack command buffer
Differential Revision: D78275586
Pull Request resolved: #12442
1 parent 13643ce

7 files changed (+111, -3 lines)

7 files changed

+111
-3
lines changed

backends/vulkan/runtime/VulkanBackend.cpp

Lines changed: 1 addition & 2 deletions
@@ -507,8 +507,7 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
     compute_graph->prepare();
     compute_graph->prepare_pipelines();
 
-    compute_graph->encode_prepack();
-    compute_graph->prepack();
+    compute_graph->run_prepack();
 
     // If dynamic shapes are not expected, then the command buffer only needs to
     // be encoded once. Otherwise, wait until the first inference to encode the
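With this change the backend drives prepacking through a single entry point. A minimal sketch of the resulting load-time sequence, assuming a compute_graph pointer already populated from the serialized graph (the exact surrounding backend code is in VulkanBackend.cpp and is not reproduced here):

    // Sketch only; error handling and the dynamic-shape logic referenced in the
    // diff context above are omitted.
    compute_graph->prepare();            // allocate resources for graph values
    compute_graph->prepare_pipelines();  // build the compute pipelines
    // Encodes the weight-transfer (prepack) nodes and submits them to the GPU,
    // splitting the work across multiple command buffers once the staged data
    // exceeds the configured thresholds (see ComputeGraph::run_prepack below).
    compute_graph->run_prepack();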

backends/vulkan/runtime/graph/ComputeGraph.cpp

Lines changed: 51 additions & 1 deletion
@@ -145,7 +145,12 @@ ComputeGraph::ComputeGraph(GraphConfig config)
   execute_descriptor_counts_.descriptor_combined_sampler_count = 0;
   execute_descriptor_counts_.descriptor_storage_image_count = 0;
 
-  context_->set_cmd(/*reusable = */ true);
+  // If certain graph config variables are not specified, then set them
+  // automatically.
+  if (config_.prepack_threshold_nbytes == 0) {
+    config_.prepack_threshold_nbytes = 10 * MB;
+    config_.prepack_initial_threshold_nbytes = 10 * MB;
+  }
 }
 
 ComputeGraph::~ComputeGraph() {
@@ -431,6 +436,7 @@ ValueRef ComputeGraph::add_tensorref(
   ValueRef idx(static_cast<int>(values_.size()));
   check_no_active_value_ptrs();
   values_.emplace_back(TensorRef(sizes, dtype, data));
+  total_constant_nbytes_ += values_.back().toConstTensorRef().nbytes();
   return idx;
 }
 
@@ -750,6 +756,19 @@ void ComputeGraph::prepare_pipelines() {
       vkapi::ComputePipelineCache::Hasher>();
 }
 
+void ComputeGraph::submit_current_cmd(const bool final_use) {
+  context_->submit_cmd_to_gpu(VK_NULL_HANDLE, final_use);
+}
+
+void ComputeGraph::submit_current_cmd_and_wait(const bool final_use) {
+  vkapi::VulkanFence fence = context_->fences().get_fence();
+  context_->submit_cmd_to_gpu(fence.get_submit_handle(), final_use);
+  fence.wait();
+  context_->fences().return_fence(fence);
+
+  context_->flush();
+}
+
 void ComputeGraph::encode_prepack() {
   for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
     node->encode(this);
@@ -766,6 +785,37 @@ void ComputeGraph::prepack() const {
   context_->flush();
 }
 
+void ComputeGraph::run_prepack() {
+  int i = 0;
+  bool submitted = false;
+  const bool reduce_peak_memory = total_constant_nbytes_ > 500 * MB;
+  // int count = 0;
+  context_->set_cmd();
+  for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
+    // Do not trigger on the first or last prepack node.
+    const bool not_terminal = i != 0 && i != (prepack_nodes_.size() - 1);
+    size_t threshold = submitted ? config_.prepack_threshold_nbytes
+                                 : config_.prepack_initial_threshold_nbytes;
+    if (not_terminal && staging_nbytes_in_cmd_ > threshold) {
+      // If reducing peak memory usage, wait for the current command buffer to
+      // finish executing and flush to recycle the staging memory. This will
+      // reduce peak memory usage, but will slightly increase load latency.
+      // Otherwise, just submit the current command buffer for execution and
+      // proceed. This results in lower load latency at the cost of higher peak
+      // memory usage.
+      reduce_peak_memory ? submit_current_cmd_and_wait() : submit_current_cmd();
+      staging_nbytes_in_cmd_ = 0;
+      context_->set_cmd();
+      submitted = true;
+    }
+
+    node->encode(this);
+    i++;
+  }
+  submit_current_cmd_and_wait(/*final_use=*/true);
+  staging_nbytes_in_cmd_ = 0;
+}
+
 void ComputeGraph::encode_execute() {
   context_->flush();
   context_->set_cmd(/*reusable = */ true);
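The submission-splitting policy above can be illustrated in isolation. Below is a minimal, self-contained sketch (not part of the commit) that stands in plain byte counts for PrepackNode staging buffers and mirrors the loop in run_prepack() with the 10 MB defaults set in the constructor; with forty hypothetical 1 MB weights it reports four command buffer submissions (three intermediate plus the final one):

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  constexpr size_t MB = 1024 * 1024;
  const size_t threshold = 10 * MB;          // prepack_threshold_nbytes default
  const size_t initial_threshold = 10 * MB;  // prepack_initial_threshold_nbytes default

  // Hypothetical staging sizes (in bytes) of consecutive prepack nodes.
  const std::vector<size_t> staging_sizes(40, 1 * MB);

  size_t staged = 0;  // mirrors staging_nbytes_in_cmd_
  bool submitted = false;
  int submissions = 0;
  for (size_t i = 0; i < staging_sizes.size(); ++i) {
    // Never split on the first or last node, matching run_prepack().
    const bool not_terminal = i != 0 && i != staging_sizes.size() - 1;
    const size_t limit = submitted ? threshold : initial_threshold;
    if (not_terminal && staged > limit) {
      ++submissions;  // submit_current_cmd() or submit_current_cmd_and_wait()
      staged = 0;     // reset, as run_prepack() does after each submission
      submitted = true;
    }
    staged += staging_sizes[i];  // update_staging_nbytes_in_cmd() in the real code
  }
  ++submissions;  // the final submit_current_cmd_and_wait(/*final_use=*/true)
  std::printf("%d command buffer submissions\n", submissions);
  return 0;
}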

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 36 additions & 0 deletions
@@ -190,10 +190,20 @@ class ComputeGraph final {
       vkapi::ComputePipelineCache::Hasher>
       pipeline_descriptors_;
 
+  // Utility constexpr to express byte quantities
+  constexpr static size_t MB = 1024 * 1024;
+
 protected:
   size_t values_in_use_ = 0;
   size_t execute_count_ = 0;
 
+  // Total number of bytes needed to store model weights
+  size_t total_constant_nbytes_ = 0;
+
+  // Represents the amount of staging buffer data that will be copied if the
+  // current Context's command buffer is submitted now.
+  size_t staging_nbytes_in_cmd_ = 0;
+
 public:
   //
   // Accessors
@@ -812,13 +822,39 @@ class ComputeGraph final {
   copy_into_staging(const ValueRef idx, const void* data, const size_t numel);
   void copy_from_staging(const ValueRef idx, void* data, const size_t numel);
 
+ protected:
+  // Command Buffer Management
+
+  /*
+   * Submits the current command buffer in the Context to the GPU for execution.
+   */
+  void submit_current_cmd(const bool final_use = false);
+
+  /*
+   * Submits the current command buffer in the Context to the GPU for execution,
+   * and wait for it to complete before returning. This function will also flush
+   * the Context after execution.
+   */
+  void submit_current_cmd_and_wait(const bool final_use = false);
+
+ public:
   //
   // Graph Prepacking
   //
 
+  inline void update_staging_nbytes_in_cmd(const size_t staging_bytes) {
+    staging_nbytes_in_cmd_ += staging_bytes;
+  }
+
   void encode_prepack();
   void prepack() const;
 
+  /*
+   * Executes prepacking operations to transfer model weight data from the CPU
+   * to GPU.
+   */
+  void run_prepack();
+
   //
   // Graph Execution
   //

backends/vulkan/runtime/graph/GraphConfig.h

Lines changed: 14 additions & 0 deletions
@@ -36,6 +36,20 @@ struct GraphConfig final {
   // Whether or not the ComputeGraph should expect input shapes to be dynamic
   bool expect_dynamic_shapes;
 
+  // Execution properties that determine specifics re: how command buffer
+  // submission is handled, etc. 0 means this field is not set.
+
+  // During prepacking, once this threshold is reached, submit the current
+  // command buffer for execution. This allows the work to be distributed over
+  // multiple command buffer submissions, which can improve model load
+  // performance and prevent crashes when loading large models.
+  size_t prepack_threshold_nbytes = 0;
+  // Threshold used for the first command buffer submission during prepacking.
+  // This can be set lower than prepack_threshold_nbytes so that a command
+  // buffer is submitted for execution earlier, which can improve performance
+  // by taking more advantage of parallelism between the CPU and GPU.
+  size_t prepack_initial_threshold_nbytes = 0;
+
   vkapi::Adapter* external_adapter;
 
   // Generate a default graph config with pre-configured settings
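Since both new fields default to 0 and are backfilled to 10 MB in the ComputeGraph constructor, callers only need to touch them to tune submission behavior. A hedged sketch of doing so, assuming the usual pattern of building a ComputeGraph from a GraphConfig (the include path, the helper name, and the 2 MB / 6 MB values are illustrative, not part of this change):

#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>

namespace vkcompute {

void load_with_custom_prepack_thresholds() {
  GraphConfig config;
  // Submit the first command buffer after ~2 MB of staged weights so GPU work
  // starts early, then after every ~6 MB once a submission has already happened.
  config.prepack_initial_threshold_nbytes = 2u * 1024u * 1024u;
  config.prepack_threshold_nbytes = 6u * 1024u * 1024u;

  ComputeGraph graph(config);
  // ... add values and ops to the graph here ...
  graph.prepare();
  graph.prepare_pipelines();
  graph.run_prepack();  // splits weight uploads according to the thresholds above
}

} // namespace vkcompute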

backends/vulkan/runtime/graph/containers/Constant.h

Lines changed: 4 additions & 0 deletions
@@ -28,6 +28,10 @@ struct TensorRef final {
       const std::vector<int64_t>& t_sizes,
       vkapi::ScalarType t_dtype,
       const void* const t_data);
+
+  inline size_t nbytes() const {
+    return utils::multiply_integers(sizes) * vkapi::element_size(dtype);
+  }
 };
 
 } // namespace vkcompute

backends/vulkan/runtime/graph/ops/PrepackNode.cpp

Lines changed: 1 addition & 0 deletions
@@ -62,6 +62,7 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
   TensorRefPtr tref = graph->get_tref(tref_);
   size_t numel = utils::multiply_integers(tref->sizes);
   api::StagingBuffer staging(graph->context(), tref->dtype, numel);
+  graph->update_staging_nbytes_in_cmd(staging.buffer().mem_size_as_size_t());
   size_t nbytes = numel * vkapi::element_size(tref->dtype);
   staging.copy_from(tref->data, nbytes);
   return staging;

backends/vulkan/runtime/vk_api/memory/Buffer.h

Lines changed: 4 additions & 0 deletions
@@ -138,6 +138,10 @@ class VulkanBuffer final {
     return buffer_properties_.size;
   }
 
+  inline size_t mem_size_as_size_t() const {
+    return utils::safe_downcast<size_t>(mem_size());
+  }
+
   inline bool has_memory() const {
     return (memory_.allocation != VK_NULL_HANDLE);
   }
