Skip to content

Commit 16632e5

Browse files
committed
[ET-VK] Split up prepack command buffer
Pull Request resolved: #12442 ## Changes * Introduce `run_prepack()` API which combines the functionality of `encode_prepack()` and `prepack()`, but submits prepacking shaders incrementally rather than all at once. * Introduce graph config options to control command buffer submission behaviour during prepacking. Note that the current default values for the prepack submission thresholds were determined through experimentation. I will leave determining optimal values for specific devices as a later exercise. The goal of this diff is simply to introduce this mechanism to fix the Llama model loading crash on Samsung S24 (described below). ## Context Currently, ET-VK will encode all prepacking shaders, and then perform prepacking by submitting only one command buffer. However, this approach has some drawbacks: * CPU/GPU parallelism is decreased, since the command buffer is submitted only after all commands have been encoded. * There can be performance issues at the Vulkan API level when processing a single "large" command buffer. By splitting up prepacking to occur over multiple command buffers, performance can be improved by avoiding both the aforementioned issues. ## Llama 3.2 1B crash on Samsung S24 I have also noticed that when running large models (e.g. Llama 3.2 1B) on the Samsung S24 with ET-VK, the device's display will crash (causing the screen to go black and become unresponsive), and sometimes the device will shut down entirely. Fortunately, this change also fixes this behaviour, in addition to providing a significant performance boost to model load time for Llama models (from 9s to 3s). ## Performance Impact * Improves model load time, especially on larger models. ## Future Work * Deprecate the `encode_prepack()` + `prepack()` pattern in favor of the `run_prepack()` pattern ghstack-source-id: 296122477 @exported-using-ghexport Differential Revision: [D78275586](https://our.internmc.facebook.com/intern/diff/D78275586/)
1 parent dd4488d commit 16632e5

File tree

6 files changed

+85
-2
lines changed

6 files changed

+85
-2
lines changed

backends/vulkan/runtime/VulkanBackend.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -507,8 +507,7 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
507507
compute_graph->prepare();
508508
compute_graph->prepare_pipelines();
509509

510-
compute_graph->encode_prepack();
511-
compute_graph->prepack();
510+
compute_graph->run_prepack();
512511

513512
// If dynamic shapes are not expected, then the command buffer only needs to
514513
// be encoded once. Otherwise, wait until the first inference to encode the

backends/vulkan/runtime/graph/ComputeGraph.cpp

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,15 @@ ComputeGraph::ComputeGraph(GraphConfig config)
145145
execute_descriptor_counts_.descriptor_combined_sampler_count = 0;
146146
execute_descriptor_counts_.descriptor_storage_image_count = 0;
147147

148+
#define MB (1024.0 * 1024.0)
149+
// If certain graph config variables are not specified, then set them
150+
// automatically.
151+
if (config_.prepack_threshold_nbytes == 0) {
152+
config_.prepack_threshold_nbytes = 20 * MB;
153+
config_.prepack_initial_threshold_nbytes = 20 * MB;
154+
}
155+
#undef MB
156+
148157
context_->set_cmd(/*reusable = */ true);
149158
}
150159

@@ -750,6 +759,15 @@ void ComputeGraph::prepare_pipelines() {
750759
vkapi::ComputePipelineCache::Hasher>();
751760
}
752761

762+
void ComputeGraph::submit_current_cmd_and_wait(const bool final_use) {
763+
vkapi::VulkanFence fence = context_->fences().get_fence();
764+
context_->submit_cmd_to_gpu(fence.get_submit_handle(), final_use);
765+
fence.wait();
766+
context_->fences().return_fence(fence);
767+
768+
context_->flush();
769+
}
770+
753771
void ComputeGraph::encode_prepack() {
754772
for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
755773
node->encode(this);
@@ -766,6 +784,28 @@ void ComputeGraph::prepack() const {
766784
context_->flush();
767785
}
768786

787+
void ComputeGraph::run_prepack() {
788+
int i = 0;
789+
bool submitted = false;
790+
for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
791+
// Do not trigger on the first or last prepack node.
792+
const bool not_terminal = i != 0 && i != (prepack_nodes_.size() - 1);
793+
size_t threshold = submitted ? config_.prepack_threshold_nbytes
794+
: config_.prepack_initial_threshold_nbytes;
795+
if (not_terminal && staging_nbytes_in_cmd_ > threshold) {
796+
submit_current_cmd_and_wait(/*final_use=*/true);
797+
staging_nbytes_in_cmd_ = 0;
798+
context_->set_cmd();
799+
submitted = true;
800+
}
801+
802+
node->encode(this);
803+
i++;
804+
}
805+
submit_current_cmd_and_wait(/*final_use=*/true);
806+
staging_nbytes_in_cmd_ = 0;
807+
}
808+
769809
void ComputeGraph::encode_execute() {
770810
context_->flush();
771811
context_->set_cmd(/*reusable = */ true);

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,10 @@ class ComputeGraph final {
194194
size_t values_in_use_ = 0;
195195
size_t execute_count_ = 0;
196196

197+
// Represents the amount of staging buffer data that will be copied if the
198+
// current Context's command buffer is submitted now.
199+
size_t staging_nbytes_in_cmd_ = 0;
200+
197201
public:
198202
//
199203
// Accessors
@@ -812,13 +816,34 @@ class ComputeGraph final {
812816
copy_into_staging(const ValueRef idx, const void* data, const size_t numel);
813817
void copy_from_staging(const ValueRef idx, void* data, const size_t numel);
814818

819+
protected:
820+
// Command Buffer Management
821+
822+
/*
823+
* Submits the current command buffer in the Context to the GPU for execution,
824+
* and waits for it to complete before returning. This function will also flush
825+
* the Context after execution.
826+
*/
827+
void submit_current_cmd_and_wait(const bool final_use = false);
828+
829+
public:
815830
//
816831
// Graph Prepacking
817832
//
818833

834+
inline void update_staging_nbytes_in_cmd(const size_t staging_bytes) {
835+
staging_nbytes_in_cmd_ += staging_bytes;
836+
}
837+
819838
void encode_prepack();
820839
void prepack() const;
821840

841+
/*
842+
* Executes prepacking operations to transfer model weight data from the CPU
843+
* to GPU.
844+
*/
845+
void run_prepack();
846+
822847
//
823848
// Graph Execution
824849
//

backends/vulkan/runtime/graph/GraphConfig.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,20 @@ struct GraphConfig final {
3636
// Whether or not the ComputeGraph should expect input shapes to be dynamic
3737
bool expect_dynamic_shapes;
3838

39+
// Execution properties that determine specifics re: how command buffer
40+
// submission is handled, etc. 0 means this field is not set.
41+
42+
// During prepacking, once this threshold is reached, submit the current
43+
// command buffer for execution. This allows the work to be distributed over
44+
// multiple command buffer submissions, which can improve model load
45+
// performance and prevent crashes when loading large models.
46+
size_t prepack_threshold_nbytes = 0;
47+
// Threshold used for the first command buffer submission during prepacking.
48+
// This can be set to be lower than prepack_threshold_nbytes to
49+
// submit a command buffer for execution earlier, which can improve performance
50+
// by taking more advantage of parallelism between the CPU and GPU.
51+
size_t prepack_initial_threshold_nbytes = 0;
52+
3953
vkapi::Adapter* external_adapter;
4054

4155
// Generate a default graph config with pre-configured settings

backends/vulkan/runtime/graph/ops/PrepackNode.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
6262
TensorRefPtr tref = graph->get_tref(tref_);
6363
size_t numel = utils::multiply_integers(tref->sizes);
6464
api::StagingBuffer staging(graph->context(), tref->dtype, numel);
65+
graph->update_staging_nbytes_in_cmd(staging.buffer().mem_size_as_size_t());
6566
size_t nbytes = numel * vkapi::element_size(tref->dtype);
6667
staging.copy_from(tref->data, nbytes);
6768
return staging;

backends/vulkan/runtime/vk_api/memory/Buffer.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,10 @@ class VulkanBuffer final {
138138
return buffer_properties_.size;
139139
}
140140

141+
inline size_t mem_size_as_size_t() const {
142+
return utils::safe_downcast<size_t>(mem_size());
143+
}
144+
141145
inline bool has_memory() const {
142146
return (memory_.allocation != VK_NULL_HANDLE);
143147
}

0 commit comments

Comments
 (0)