From 5892e42ca79bc75b2e9bcd51808a9ac5c3b47d3c Mon Sep 17 00:00:00 2001
From: Stephen Jia
Date: Mon, 14 Jul 2025 08:33:11 -0700
Subject: [PATCH] [ET-VK] Split up prepack command buffer

## Changes

* Introduce the `run_prepack()` API, which combines the functionality of `encode_prepack()` and `prepack()` but submits prepacking shaders incrementally rather than all at once.
* Introduce graph config options to control command buffer submission behaviour during prepacking.

Note that the current default values for the prepack submission thresholds were determined through experimentation; I will leave determining optimal values for specific devices as a later exercise. The goal of this diff is simply to introduce this mechanism to fix the Llama model loading crash on the Samsung S24 (described below).

## Context

Currently, ET-VK encodes all prepacking shaders and then performs prepacking by submitting a single command buffer. This approach has some drawbacks:

* CPU/GPU parallelism is decreased, since the command buffer is submitted only after all commands have been encoded.
* There can be performance issues at the Vulkan API level when processing a single "large" command buffer.

By splitting prepacking across multiple command buffers, performance can be improved by avoiding both of these issues.

## Llama 3.2 1B crash on Samsung S24

I have also noticed that when running large models (e.g. Llama 3.2 1B) on the Samsung S24 with ET-VK, the device's display will crash (the screen goes black and becomes unresponsive), and sometimes the device will shut down entirely. Fortunately, this change also fixes that behaviour, in addition to providing a significant performance boost to model load time for Llama models (from 9s to 3s).

## Performance Impact

* Improves model load time, especially for larger models.

## Future Work

* Deprecate the `encode_prepack()` + `prepack()` pattern in favor of the `run_prepack()` pattern.

Differential Revision: [D78275586](https://our.internmc.facebook.com/intern/diff/D78275586/)

[ghstack-poisoned]
---
 backends/vulkan/runtime/VulkanBackend.cpp     |  3 +-
 backends/vulkan/runtime/api/Context.h         | 10 ++++
 .../vulkan/runtime/graph/ComputeGraph.cpp     | 47 +++++++++++++++++--
 backends/vulkan/runtime/graph/ComputeGraph.h  | 38 +++++++++++++--
 backends/vulkan/runtime/graph/GraphConfig.h   | 14 ++++++
 .../vulkan/runtime/graph/ops/PrepackNode.cpp  |  1 +
 .../vulkan/runtime/vk_api/memory/Buffer.h     |  4 ++
 extension/llm/runner/stats.h                  | 29 ++++++------
 8 files changed, 120 insertions(+), 26 deletions(-)

diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp
index 7077a9df59c..f25a020a60f 100644
--- a/backends/vulkan/runtime/VulkanBackend.cpp
+++ b/backends/vulkan/runtime/VulkanBackend.cpp
@@ -503,8 +503,7 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
     compute_graph->prepare();
     compute_graph->prepare_pipelines();
 
-    compute_graph->encode_prepack();
-    compute_graph->prepack();
+    compute_graph->run_prepack();
 
     // If dynamic shapes are not expected, then the command buffer only needs to
     // be encoded once. Otherwise, wait until the first inference to encode the
diff --git a/backends/vulkan/runtime/api/Context.h b/backends/vulkan/runtime/api/Context.h
index e55ddcca141..0c2046da315 100644
--- a/backends/vulkan/runtime/api/Context.h
+++ b/backends/vulkan/runtime/api/Context.h
@@ -92,6 +92,16 @@ class Context final {
     return queue_.handle;
   }
 
+  // Device Metadata
+
+  inline bool device_is_adreno() const {
+    return adapter_p_->device_type() == vkapi::DeviceType::ADRENO;
+  }
+
+  inline bool device_name_contains(const char* substr) const {
+    return adapter_p_->device_name().find(substr) != std::string::npos;
+  }
+
   // Device Caches
 
   inline vkapi::ShaderLayoutCache& shader_layout_cache() {
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index cb14a41e98a..c25fa28be6f 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -15,6 +15,8 @@
 
 #include
 
+#include
+
 namespace vkcompute {
 
 //
@@ -145,6 +147,15 @@ ComputeGraph::ComputeGraph(GraphConfig config)
   execute_descriptor_counts_.descriptor_combined_sampler_count = 0;
   execute_descriptor_counts_.descriptor_storage_image_count = 0;
 
+#define MB (1024.0 * 1024.0)
+  // If certain graph config variables are not specified, then set them
+  // automatically.
+  if (config_.prepack_threshold_nbytes == 0) {
+    config_.prepack_threshold_nbytes = 20 * MB;
+    config_.prepack_initial_threshold_nbytes = 20 * MB;
+  }
+#undef MB
+
   context_->set_cmd(/*reusable = */ true);
 }
 
@@ -212,11 +223,6 @@ utils::GPUMemoryLayout ComputeGraph::suggested_memory_layout(
   return utils::kChannelsPacked;
 }
 
-bool ComputeGraph::device_name_contains(const char* substr) {
-  return context_->adapter_ptr()->device_name().find(substr) !=
-      std::string::npos;
-}
-
 void ComputeGraph::check_no_active_value_ptrs() {
   VK_CHECK_COND(
       values_in_use_ == 0,
@@ -750,6 +756,15 @@ void ComputeGraph::prepare_pipelines() {
       vkapi::ComputePipelineCache::Hasher>();
 }
 
+void ComputeGraph::submit_current_cmd_and_wait(const bool final_use) {
+  vkapi::VulkanFence fence = context_->fences().get_fence();
+  context_->submit_cmd_to_gpu(fence.get_submit_handle(), final_use);
+  fence.wait();
+  context_->fences().return_fence(fence);
+
+  context_->flush();
+}
+
 void ComputeGraph::encode_prepack() {
   for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
     node->encode(this);
@@ -766,6 +781,28 @@ void ComputeGraph::prepack() const {
   context_->flush();
 }
 
+void ComputeGraph::run_prepack() {
+  int i = 0;
+  bool submitted = false;
+  for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
+    // Do not trigger on the first or last prepack node.
+    const bool not_terminal = i != 0 && i != (prepack_nodes_.size() - 1);
+    size_t threshold = submitted ? config_.prepack_threshold_nbytes
+        : config_.prepack_initial_threshold_nbytes;
+    if (not_terminal && staging_nbytes_in_cmd_ > threshold) {
+      submit_current_cmd_and_wait(/*final_use=*/true);
+      staging_nbytes_in_cmd_ = 0;
+      context_->set_cmd();
+      submitted = true;
+    }
+
+    node->encode(this);
+    i++;
+  }
+  submit_current_cmd_and_wait(/*final_use=*/true);
+  staging_nbytes_in_cmd_ = 0;
+}
+
 void ComputeGraph::encode_execute() {
   context_->flush();
   context_->set_cmd(/*reusable = */ true);
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
index 78135a434e5..b78c1b3218d 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.h
+++ b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -194,6 +194,10 @@ class ComputeGraph final {
   size_t values_in_use_ = 0;
   size_t execute_count_ = 0;
 
+  // Represents the amount of staging buffer data that will be copied if the
+  // current Context's command buffer is submitted now.
+  size_t staging_nbytes_in_cmd_ = 0;
+
  public:
   //
   // Accessors
@@ -512,14 +516,17 @@ class ComputeGraph final {
   utils::GPUMemoryLayout suggested_memory_layout(
       const std::vector<int64_t>& sizes);
 
-  inline bool device_is_adreno() {
-    return context_->adapter_ptr()->device_type() == vkapi::DeviceType::ADRENO;
+  inline bool device_is_adreno() const {
+    return context_->device_is_adreno();
   }
-  const std::string& device_name() {
-    return context()->adapter_ptr()->device_name();
+
+  const std::string& device_name() const {
+    return context_->adapter_ptr()->device_name();
   }
 
-  bool device_name_contains(const char* substr);
+  inline bool device_name_contains(const char* substr) const {
+    return context_->device_name_contains(substr);
+  }
 
   //
   // Graph Building
@@ -812,13 +819,34 @@ class ComputeGraph final {
   copy_into_staging(const ValueRef idx, const void* data, const size_t numel);
   void copy_from_staging(const ValueRef idx, void* data, const size_t numel);
 
+ protected:
+  // Command Buffer Management
+
+  /*
+   * Submits the current command buffer in the Context to the GPU for execution,
+   * and waits for it to complete before returning. This function will also flush
+   * the Context after execution.
+   */
+  void submit_current_cmd_and_wait(const bool final_use = false);
+
+ public:
   //
   // Graph Prepacking
   //
 
+  inline void update_staging_nbytes_in_cmd(const size_t staging_bytes) {
+    staging_nbytes_in_cmd_ += staging_bytes;
+  }
+
   void encode_prepack();
   void prepack() const;
 
+  /*
+   * Executes prepacking operations to transfer model weight data from the CPU
+   * to the GPU.
+   */
+  void run_prepack();
+
   //
   // Graph Execution
   //
diff --git a/backends/vulkan/runtime/graph/GraphConfig.h b/backends/vulkan/runtime/graph/GraphConfig.h
index 753ce8362af..33c7ae73e62 100644
--- a/backends/vulkan/runtime/graph/GraphConfig.h
+++ b/backends/vulkan/runtime/graph/GraphConfig.h
@@ -36,6 +36,20 @@ struct GraphConfig final {
   // Whether or not the ComputeGraph should expect input shapes to be dynamic
   bool expect_dynamic_shapes;
 
+  // Execution properties that determine specifics re: how command buffer
+  // submission is handled, etc. 0 means this field is not set.
+
+  // During prepacking, once this threshold is reached, submit the current
+  // command buffer for execution. This allows the work to be distributed over
+  // multiple command buffer submissions, which can improve model load
+  // performance and prevent crashes when loading large models.
+  size_t prepack_threshold_nbytes = 0;
+  // Threshold used for the first command buffer submission during prepacking.
+  // This can be set lower than prepack_threshold_nbytes to submit the first
+  // command buffer for execution earlier, which can improve performance by
+  // taking better advantage of parallelism between the CPU and GPU.
+  size_t prepack_initial_threshold_nbytes = 0;
+
   vkapi::Adapter* external_adapter;
 
   // Generate a default graph config with pre-configured settings
diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
index bdbecc866ab..05729172420 100644
--- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
+++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
@@ -62,6 +62,7 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
   TensorRefPtr tref = graph->get_tref(tref_);
   size_t numel = utils::multiply_integers(tref->sizes);
   api::StagingBuffer staging(graph->context(), tref->dtype, numel);
+  graph->update_staging_nbytes_in_cmd(staging.buffer().mem_size_as_size_t());
   size_t nbytes = numel * vkapi::element_size(tref->dtype);
   staging.copy_from(tref->data, nbytes);
   return staging;
diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.h b/backends/vulkan/runtime/vk_api/memory/Buffer.h
index 0ef9f7e95e4..e1b441397b4 100644
--- a/backends/vulkan/runtime/vk_api/memory/Buffer.h
+++ b/backends/vulkan/runtime/vk_api/memory/Buffer.h
@@ -138,6 +138,10 @@ class VulkanBuffer final {
     return buffer_properties_.size;
   }
 
+  inline size_t mem_size_as_size_t() const {
+    return utils::safe_downcast<size_t>(mem_size());
+  }
+
   inline bool has_memory() const {
     return (memory_.allocation != VK_NULL_HANDLE);
   }
diff --git a/extension/llm/runner/stats.h b/extension/llm/runner/stats.h
index 19766329ed3..8357afa9b0d 100644
--- a/extension/llm/runner/stats.h
+++ b/extension/llm/runner/stats.h
@@ -100,62 +100,63 @@ inline std::string stats_to_json_string(const Stats& stats) {
 inline void print_report(const Stats& stats) {
   printf("PyTorchObserver %s\n", stats_to_json_string(stats).c_str());
+  printf("\n");
 
-  ET_LOG(
-      Info,
+  printf(
       "\tPrompt Tokens: %" PRIu64 " Generated Tokens: %" PRIu64,
       stats.num_prompt_tokens,
       stats.num_generated_tokens);
+  printf("\n");
 
-  ET_LOG(
-      Info,
+  printf(
       "\tModel Load Time:\t\t%f (seconds)",
       ((double)(stats.model_load_end_ms - stats.model_load_start_ms) /
        stats.SCALING_FACTOR_UNITS_PER_SECOND));
+  printf("\n");
 
   double inference_time_ms =
       (double)(stats.inference_end_ms - stats.inference_start_ms);
-  ET_LOG(
-      Info,
+  printf(
      "\tTotal inference time:\t\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
       inference_time_ms / stats.SCALING_FACTOR_UNITS_PER_SECOND,
       (stats.num_generated_tokens) /
           (double)(stats.inference_end_ms - stats.inference_start_ms) *
           stats.SCALING_FACTOR_UNITS_PER_SECOND);
+  printf("\n");
 
   double prompt_eval_time =
       (double)(stats.prompt_eval_end_ms - stats.inference_start_ms);
-  ET_LOG(
-      Info,
+  printf(
       "\t\tPrompt evaluation:\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
       prompt_eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND,
       (stats.num_prompt_tokens) / prompt_eval_time *
           stats.SCALING_FACTOR_UNITS_PER_SECOND);
+  printf("\n");
 
   double eval_time =
       (double)(stats.inference_end_ms - stats.prompt_eval_end_ms);
-  ET_LOG(
-      Info,
+  printf(
       "\t\tGenerated %" PRIu64
       " tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
       stats.num_generated_tokens,
       eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND,
       stats.num_generated_tokens / eval_time *
          stats.SCALING_FACTOR_UNITS_PER_SECOND);
+  printf("\n");
 
   // Time to first token is measured from the start of inference, excluding
   // model load time.
-  ET_LOG(
-      Info,
+  printf(
       "\tTime to first generated token:\t%f (seconds)",
       ((double)(stats.first_token_ms - stats.inference_start_ms) /
        stats.SCALING_FACTOR_UNITS_PER_SECOND));
+  printf("\n");
 
-  ET_LOG(
-      Info,
+  printf(
       "\tSampling time over %" PRIu64 " tokens:\t%f (seconds)",
       stats.num_prompt_tokens + stats.num_generated_tokens,
       (double)stats.aggregate_sampling_time_ms /
           stats.SCALING_FACTOR_UNITS_PER_SECOND);
+  printf("\n");
 }
 
 } // namespace llm
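
Reviewer note (not part of the patch): below is a minimal sketch of how a caller might tune the new thresholds and drive the `run_prepack()` flow introduced above. The include path and the default-constructed `GraphConfig` are assumptions about the integration, and the 20 MB / 10 MB values are illustrative placeholders, not tuned defaults.

```cpp
// Illustrative sketch only. Assumes an ExecuTorch-style include path and that
// GraphConfig can be default-constructed; adjust both to the actual setup.
#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>

using namespace vkcompute;

void load_model_with_incremental_prepack() {
  GraphConfig config; // assumed default construction
  // Submit a prepack command buffer once ~20 MB of staging data has been
  // encoded; use a smaller threshold for the very first submission so the GPU
  // starts working earlier. Values here are placeholders for illustration.
  config.prepack_threshold_nbytes = 20u * 1024u * 1024u;
  config.prepack_initial_threshold_nbytes = 10u * 1024u * 1024u;

  ComputeGraph graph(config);
  // ... build the graph here: adding weights queues PrepackNodes that
  // run_prepack() will later encode and submit incrementally ...

  graph.prepare();
  graph.prepare_pipelines();
  // Replaces the old encode_prepack() + prepack() pair; submits prepacking
  // work in chunks based on the thresholds configured above.
  graph.run_prepack();
}
```

If both thresholds are left at 0, the `ComputeGraph` constructor in this patch falls back to the 20 MB default shown in the `ComputeGraph.cpp` hunk.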