From 50fac330f464f06b27bddf874fe4d534c4dd3603 Mon Sep 17 00:00:00 2001
From: Stephen Jia
Date: Tue, 15 Jul 2025 15:42:17 -0700
Subject: [PATCH] [ET-VK] Split up prepack command buffer

Pull Request resolved: https://github.com/pytorch/executorch/pull/12442

## Changes

* Introduce a `run_prepack()` API that combines the functionality of `encode_prepack()` and `prepack()`, but submits prepacking shaders incrementally rather than all at once.
* Introduce graph config options to control command buffer submission behaviour during prepacking.

Note that the current default values for the prepack submission thresholds were determined through experimentation; I will leave determining optimal values for specific devices as a later exercise. The goal of this diff is simply to introduce this mechanism and to fix the Llama model loading crash on the Samsung S24 (described below). A usage sketch is shown below.
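To make the new options concrete, here is a minimal sketch of how a delegate could opt into earlier and more frequent submissions. The 4 MB / 24 MB values and the default-constructed config are assumptions for illustration only, not tuned recommendations; leaving both fields at 0 keeps the 10 MB defaults applied in the `ComputeGraph` constructor.

```cpp
#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>

using namespace vkcompute;

GraphConfig config{}; // in practice, start from the backend's usual config
// Assumed example values: submit the first command buffer after ~4 MB of
// staging data to start GPU work early, then every ~24 MB thereafter.
config.prepack_initial_threshold_nbytes = 4 * 1024 * 1024;
config.prepack_threshold_nbytes = 24 * 1024 * 1024;

ComputeGraph graph(config);
// ... register tensor refs and prepack nodes as usual ...
graph.prepare();
graph.prepare_pipelines();
graph.run_prepack(); // replaces the encode_prepack() + prepack() pair
```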
## Context

Currently, ET-VK encodes all prepacking shaders and then performs prepacking by submitting a single command buffer. This approach has some drawbacks:

* CPU/GPU parallelism is decreased, since the command buffer is submitted only after all commands have been encoded.
* There can be performance issues at the Vulkan API level when processing a single "large" command buffer.

Splitting prepacking across multiple command buffers avoids both of these issues and improves performance.
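As a rough worked example (model size assumed for illustration): with the default 10 MB threshold, a model carrying 1 GB of constant data is prepacked over roughly 1024 MB / 10 MB ≈ 100 command buffer submissions instead of one. Since 1 GB also exceeds the 500 MB `reduce_peak_memory` cutoff, each of those submissions additionally waits on a fence and flushes the context so that staging memory can be recycled.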
## Llama 3.2 1B crash on Samsung S24

I have also noticed that when running large models (e.g. Llama 3.2 1B) on the Samsung S24 with ET-VK, the device's display will crash (the screen goes black and becomes unresponsive), and sometimes the device will shut down entirely. Fortunately, this change fixes that behaviour as well, in addition to providing a significant boost to model load time for Llama models (from 9 s to 3 s).

## Performance Impact

* Improves model load time, especially for larger models.

## Future Work

* Deprecate the `encode_prepack()` + `prepack()` pattern in favor of the `run_prepack()` pattern.

ghstack-source-id: 296437695
@exported-using-ghexport

Differential Revision: [D78275586](https://our.internmc.facebook.com/intern/diff/D78275586/)
---
 backends/vulkan/runtime/VulkanBackend.cpp     |  3 +-
 .../vulkan/runtime/graph/ComputeGraph.cpp     | 52 ++++++++++++++++++-
 backends/vulkan/runtime/graph/ComputeGraph.h  | 36 +++++++++++++
 backends/vulkan/runtime/graph/GraphConfig.h   | 14 +++++
 .../runtime/graph/containers/Constant.h       |  4 ++
 .../vulkan/runtime/graph/ops/PrepackNode.cpp  |  1 +
 .../vulkan/runtime/vk_api/memory/Buffer.h     |  4 ++
 7 files changed, 111 insertions(+), 3 deletions(-)

diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp
index 28e7574537c..594c00854a2 100644
--- a/backends/vulkan/runtime/VulkanBackend.cpp
+++ b/backends/vulkan/runtime/VulkanBackend.cpp
@@ -507,8 +507,7 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
     compute_graph->prepare();
     compute_graph->prepare_pipelines();
 
-    compute_graph->encode_prepack();
-    compute_graph->prepack();
+    compute_graph->run_prepack();
 
     // If dynamic shapes are not expected, then the command buffer only needs to
     // be encoded once. Otherwise, wait until the first inference to encode the
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index cb14a41e98a..cafe2f5e502 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -145,7 +145,12 @@ ComputeGraph::ComputeGraph(GraphConfig config)
   execute_descriptor_counts_.descriptor_combined_sampler_count = 0;
   execute_descriptor_counts_.descriptor_storage_image_count = 0;
 
-  context_->set_cmd(/*reusable = */ true);
+  // If certain graph config variables are not specified, then set them
+  // automatically.
+  if (config_.prepack_threshold_nbytes == 0) {
+    config_.prepack_threshold_nbytes = 10 * MB;
+    config_.prepack_initial_threshold_nbytes = 10 * MB;
+  }
 }
 
 ComputeGraph::~ComputeGraph() {
@@ -431,6 +436,7 @@ ValueRef ComputeGraph::add_tensorref(
   ValueRef idx(static_cast<int>(values_.size()));
   check_no_active_value_ptrs();
   values_.emplace_back(TensorRef(sizes, dtype, data));
+  total_constant_nbytes_ += values_.back().toConstTensorRef().nbytes();
   return idx;
 }
 
@@ -750,6 +756,19 @@ void ComputeGraph::prepare_pipelines() {
       vkapi::ComputePipelineCache::Hasher>();
 }
 
+void ComputeGraph::submit_current_cmd(const bool final_use) {
+  context_->submit_cmd_to_gpu(VK_NULL_HANDLE, final_use);
+}
+
+void ComputeGraph::submit_current_cmd_and_wait(const bool final_use) {
+  vkapi::VulkanFence fence = context_->fences().get_fence();
+  context_->submit_cmd_to_gpu(fence.get_submit_handle(), final_use);
+  fence.wait();
+  context_->fences().return_fence(fence);
+
+  context_->flush();
+}
+
 void ComputeGraph::encode_prepack() {
   for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
     node->encode(this);
@@ -766,6 +785,37 @@ void ComputeGraph::prepack() const {
   context_->flush();
 }
 
+void ComputeGraph::run_prepack() {
+  int i = 0;
+  bool submitted = false;
+  const bool reduce_peak_memory = total_constant_nbytes_ > 500 * MB;
+  // int count = 0;
+  context_->set_cmd();
+  for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
+    // Do not trigger on the first or last prepack node.
+    const bool not_terminal = i != 0 && i != (prepack_nodes_.size() - 1);
+    size_t threshold = submitted ? config_.prepack_threshold_nbytes
+                                 : config_.prepack_initial_threshold_nbytes;
+    if (not_terminal && staging_nbytes_in_cmd_ > threshold) {
+      // If reducing peak memory usage, wait for the current command buffer to
+      // finish executing and flush to recycle the staging memory. This will
+      // reduce peak memory usage, but will slightly increase load latency.
+      // Otherwise, just submit the current command buffer for execution and
+      // proceed. This results in lower load latency at the cost of higher peak
+      // memory usage.
+      reduce_peak_memory ? submit_current_cmd_and_wait() : submit_current_cmd();
+      staging_nbytes_in_cmd_ = 0;
+      context_->set_cmd();
+      submitted = true;
+    }
+
+    node->encode(this);
+    i++;
+  }
+  submit_current_cmd_and_wait(/*final_use=*/true);
+  staging_nbytes_in_cmd_ = 0;
+}
+
 void ComputeGraph::encode_execute() {
   context_->flush();
   context_->set_cmd(/*reusable = */ true);
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
index 78135a434e5..a8405bb312d 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.h
+++ b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -190,10 +190,20 @@ class ComputeGraph final {
       vkapi::ComputePipelineCache::Hasher>
       pipeline_descriptors_;
 
+  // Utility constexpr to express byte quantities
+  constexpr static size_t MB = 1024 * 1024;
+
 protected:
   size_t values_in_use_ = 0;
   size_t execute_count_ = 0;
 
+  // Total number of bytes needed to store model weights
+  size_t total_constant_nbytes_ = 0;
+
+  // Represents the amount of staging buffer data that will be copied if the
+  // current Context's command buffer is submitted now.
+  size_t staging_nbytes_in_cmd_ = 0;
+
 public:
   //
   // Accessors
@@ -812,13 +822,39 @@
   copy_into_staging(const ValueRef idx, const void* data, const size_t numel);
   void copy_from_staging(const ValueRef idx, void* data, const size_t numel);
 
+ protected:
+  // Command Buffer Management
+
+  /*
+   * Submits the current command buffer in the Context to the GPU for execution.
+   */
+  void submit_current_cmd(const bool final_use = false);
+
+  /*
+   * Submits the current command buffer in the Context to the GPU for execution,
+   * and waits for it to complete before returning. This function will also
+   * flush the Context after execution.
+   */
+  void submit_current_cmd_and_wait(const bool final_use = false);
+
+ public:
   //
   // Graph Prepacking
   //
 
+  inline void update_staging_nbytes_in_cmd(const size_t staging_bytes) {
+    staging_nbytes_in_cmd_ += staging_bytes;
+  }
+
   void encode_prepack();
   void prepack() const;
 
+  /*
+   * Executes prepacking operations to transfer model weight data from the CPU
+   * to the GPU.
+   */
+  void run_prepack();
+
   //
   // Graph Execution
   //
diff --git a/backends/vulkan/runtime/graph/GraphConfig.h b/backends/vulkan/runtime/graph/GraphConfig.h
index 753ce8362af..33c7ae73e62 100644
--- a/backends/vulkan/runtime/graph/GraphConfig.h
+++ b/backends/vulkan/runtime/graph/GraphConfig.h
@@ -36,6 +36,20 @@ struct GraphConfig final {
   // Whether or not the ComputeGraph should expect input shapes to be dynamic
   bool expect_dynamic_shapes;
 
+  // Execution properties that control how command buffer submission is
+  // handled. A value of 0 means the field is unset and a default will be used.
+
+  // During prepacking, once this threshold is reached, submit the current
+  // command buffer for execution. This allows the work to be distributed over
+  // multiple command buffer submissions, which can improve model load
+  // performance and prevent crashes when loading large models.
+  size_t prepack_threshold_nbytes = 0;
+  // Threshold used for the first command buffer submission during prepacking.
+  // This can be set lower than prepack_threshold_nbytes to submit a command
+  // buffer for execution earlier, which can improve performance by taking
+  // more advantage of parallelism between the CPU and GPU.
+  size_t prepack_initial_threshold_nbytes = 0;
+
   vkapi::Adapter* external_adapter;
 
   // Generate a default graph config with pre-configured settings
diff --git a/backends/vulkan/runtime/graph/containers/Constant.h b/backends/vulkan/runtime/graph/containers/Constant.h
index 9aa3716e28d..aaa92360a9e 100644
--- a/backends/vulkan/runtime/graph/containers/Constant.h
+++ b/backends/vulkan/runtime/graph/containers/Constant.h
@@ -28,6 +28,10 @@ struct TensorRef final {
       const std::vector<int64_t>& t_sizes,
       vkapi::ScalarType t_dtype,
       const void* const t_data);
+
+  inline size_t nbytes() const {
+    return utils::multiply_integers(sizes) * vkapi::element_size(dtype);
+  }
 };
 
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
index bdbecc866ab..05729172420 100644
--- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
+++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
@@ -62,6 +62,7 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
   TensorRefPtr tref = graph->get_tref(tref_);
   size_t numel = utils::multiply_integers(tref->sizes);
   api::StagingBuffer staging(graph->context(), tref->dtype, numel);
+  graph->update_staging_nbytes_in_cmd(staging.buffer().mem_size_as_size_t());
   size_t nbytes = numel * vkapi::element_size(tref->dtype);
   staging.copy_from(tref->data, nbytes);
   return staging;
diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.h b/backends/vulkan/runtime/vk_api/memory/Buffer.h
index 0ef9f7e95e4..e1b441397b4 100644
--- a/backends/vulkan/runtime/vk_api/memory/Buffer.h
+++ b/backends/vulkan/runtime/vk_api/memory/Buffer.h
@@ -138,6 +138,10 @@
     return buffer_properties_.size;
   }
 
+  inline size_t mem_size_as_size_t() const {
+    return utils::safe_downcast<size_t>(mem_size());
+  }
+
   inline bool has_memory() const {
     return (memory_.allocation != VK_NULL_HANDLE);
   }