From bb3c625b1f9f588ed5c027677013f181b869ca73 Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Mon, 14 Jul 2025 08:33:08 -0700 Subject: [PATCH 1/8] [ET-VK] Fix caching mechanism to account for included files ## Changes * Update the `gen_vulkan_spv.py` script to account for changes to included files when deciding whether to perform a re-compilation ## Additional Changes Also applied some misc fixes: * Allow the Python preprocessor to handle unescaped `\` used to extend macros to multiple lines. Fixed this by replacing these with `\\` before running the preprocessor. * Fixed an issue where a build would unexpectedly pass when trying to recompile a failing build multiple times. This would cause the caching mechanism to use an old cached build artifact since no changes were detected. Fixed this by removing any existing cached artifacts on unsuccessful build. ## Context The `gen_vulkan_spv.py` script which handles GLSL shader codegen and GLSL -> SPIR-V compilation for the ExecuTorch Vulkan backend has a caching mechanism to only recompile modified files for the purpose of developer efficiency. However, this mechanism currently doesn't consider whether included files have been changed. Therefore, if an include file is modified without first modifying the source file which uses it, the changes to the include file will not be captured. 
Differential Revision: [D78275585](https://our.internmc.facebook.com/intern/diff/D78275585/) [ghstack-poisoned] --- backends/vulkan/runtime/gen_vulkan_spv.py | 203 ++++++++++++++++------ 1 file changed, 149 insertions(+), 54 deletions(-) diff --git a/backends/vulkan/runtime/gen_vulkan_spv.py b/backends/vulkan/runtime/gen_vulkan_spv.py index 7b6b116fb4b..d42d7ab33be 100644 --- a/backends/vulkan/runtime/gen_vulkan_spv.py +++ b/backends/vulkan/runtime/gen_vulkan_spv.py @@ -545,6 +545,9 @@ def escape(line: str) -> str: def preprocess( input_text: str, variables: Dict[str, Any], input_path: str = "codegen" ) -> str: + # Workaround to handle source files using \ to extend mecros to a new line + input_text = re.sub(r"\\$", r"\\\\", input_text, flags=re.MULTILINE) + input_lines = input_text.splitlines() python_lines = [] @@ -654,8 +657,8 @@ def addSrcAndYamlFiles(self, src_dir_paths: List[str]) -> None: for src_path in src_dir_paths: # Collect glsl source files src_files_list = glob.glob( - os.path.join(src_path, "**", "*.glsl*"), recursive=True - ) + os.path.join(src_path, "**", "*.[gh]lsl*"), recursive=True + ) + glob.glob(os.path.join(src_path, "**", "*.h"), recursive=True) for file in src_files_list: if len(file) > 1: self.src_files[extract_filename(file, keep_ext=False)] = file @@ -851,47 +854,150 @@ def generateSPV( # noqa: C901 cache_dir: Optional[str] = None, force_rebuild: bool = False, ) -> Dict[str, str]: - output_file_map = {} + # The key of this dictionary is the full path to a generated source file. The + # value is a tuple that contains 3 entries: + # + # 1. A bool indicationg if the file has changed since the last compilation; this + # is determined by comparing against the cached version. + # 2. List of other source files included by the generated file. 
+ gen_file_meta: Dict[str, Tuple[bool, List[str], str]] = {} + + # Return value of the function mapping the abspath of compiled SPIR-V binaries + # to the abspath of the generated GLSL file they were compiled from. + spv_to_glsl_map: Dict[str, str] = {} + + # Convert output_dir to absolute path + assert os.path.exists(output_dir) + output_dir = os.path.abspath(output_dir) + + if cache_dir is not None: + assert os.path.exists(cache_dir) + + def get_glsl_includes(glsl_text): + """ + Parse GLSL text content and return a list of included files. + + Args: + glsl_text: String containing the GLSL file content to analyze + + Returns: + List of included file names (e.g., ["random.h"]) + """ + includes = [] + for line in glsl_text.splitlines(): + # Look for #include directives with quoted filenames + # Matches: #include "filename.h" or #include + include_match = re.match( + r'^\s*#include\s+[<"]([^>"]+)[>"]', line.strip() + ) + if include_match: + includes.append(include_match.group(1)) + + return includes + + def file_has_changed(gen_file_path, cached_file_path): + # If the file does not exist in the cache, then return True + if not os.path.exists(cached_file_path): + return True + current_checksum = self.get_md5_checksum(gen_file_path) + cached_checksum = self.get_md5_checksum(cached_file_path) + return current_checksum != cached_checksum + + def any_sources_changed(gen_file_path, output_dir): + """ + Given the path to a generated source file, check the gen_file_meta dict to + determine if the ANY of the source files contributing to the compilation of + this file were changed since the last successful compilation. 
+ """ + gen_file_changed, includes_list = gen_file_meta[gen_file_path] + any_changed = gen_file_changed + for included_file in includes_list: + included_file_path = os.path.join(output_dir, included_file) + any_changed = any_changed or any_sources_changed( + included_file_path, output_dir + ) + + return any_changed - def generate_src_file(shader_paths_pair): - # Extract components from the input tuple - # name of .glsl, .glslh, or .h to be generated + def generate_src_file(shader_paths_pair) -> Tuple[bool, List[str]]: + """ + Given an input tuple containing the following items: + (src_file_name, (template_file_path, codegen_params)) + + This function generates src_file_name by processing + template_file_path with the Python preprocessor using the + parameters specified by codegen_params. + + Then, it returns a tuple containing: + 1. The path of the generated source file + 2. A bool indicating if the generated source file has changed since the last + compilation. + 3. A list of files included by the generated source file + """ + # name of .glsl, .glslh, or .h file to be generated src_file_name = shader_paths_pair[0] # path of template file used for codegen - src_file_fullpath = shader_paths_pair[1][0] + template_file_path = shader_paths_pair[1][0] # args to be used for codegen codegen_params = shader_paths_pair[1][1] # Assume that generated files will have the same file extension as the # source template file. 
- src_file_ext = extract_extension(src_file_fullpath) - out_file_ext = src_file_ext + out_file_ext = extract_extension(template_file_path) # Construct generated file name gen_out_path = os.path.join(output_dir, f"{src_file_name}.{out_file_ext}") + # Construct path of cached generated file + cached_gen_out_path = os.path.join( + cache_dir, f"{src_file_name}.{out_file_ext}" + ) # Execute codegen to generate the output file - with codecs.open(src_file_fullpath, "r", encoding="utf-8") as input_file: + with codecs.open(template_file_path, "r", encoding="utf-8") as input_file: input_text = input_file.read() input_text = self.maybe_replace_u16vecn(input_text) output_text = preprocess(input_text, codegen_params) + included_files = get_glsl_includes(output_text) + with codecs.open(gen_out_path, "w", encoding="utf-8") as output_file: output_file.write(output_text) - def compile_spirv(shader_paths_pair): - # Extract components from the input tuple - # name of generated .glsl, .glslh, or .h + file_changed = ( + file_has_changed(gen_out_path, cached_gen_out_path) or force_rebuild + ) + + # Save the generated file to cache so it can be used for future checks + if cache_dir is not None and file_changed: + shutil.copyfile(gen_out_path, cached_gen_out_path) + + return gen_out_path, file_changed, included_files + + def compile_spirv(shader_paths_pair) -> Tuple[str, str]: + """ + Given an input tuple containing the following items: + (src_file_name, (template_file_path, codegen_params)) + + Infer the path of the GLSL source file generated by generate_src_file and + compile a SPIR-V binary from it. Returns the path of the compiled SPIR-V + binary and the path of the source file used to compile it. 
+ + This function also utilizes a caching mechanism; if generate_src_file + reported that the source file was unchanged since the last successful + compilation, AND if the SPIR-V from the last successful compilation was + stored in the cache, then directly use the cached SPIR-V without triggering + a re-compilation. + """ + # name of generated .glsl, .glslh, or .h from generate_src_file src_file_name = shader_paths_pair[0] # path of template file used for codegen - src_file_fullpath = shader_paths_pair[1][0] + template_file_path = shader_paths_pair[1][0] # args used for codegen codegen_params = shader_paths_pair[1][1] # Assume that generated files will have the same file extension as the # source template file. - src_file_ext = extract_extension(src_file_fullpath) - out_file_ext = src_file_ext + out_file_ext = extract_extension(template_file_path) # Infer name of generated file (created by generate_src_file) gen_out_path = os.path.join(output_dir, f"{src_file_name}.{out_file_ext}") @@ -900,32 +1006,21 @@ def compile_spirv(shader_paths_pair): if out_file_ext != "glsl": return (None, gen_out_path) - # Construct name of SPIR-V file to be compiled, if needed + # Validate that the source file actually exists + assert os.path.exists(gen_out_path) and gen_out_path in gen_file_meta + + # Construct name of SPIR-V file to be compiled spv_out_path = os.path.join(output_dir, f"{src_file_name}.spv") if cache_dir is not None: # Construct the file names of cached SPIR-V file to check if they exist # in the cache. 
- cached_gen_out_path = os.path.join( - cache_dir, f"{src_file_name}.{out_file_ext}" - ) cached_spv_out_path = os.path.join(cache_dir, f"{src_file_name}.spv") - # Only use cached artifacts if all of the expected artifacts are present - if ( - not force_rebuild - and os.path.exists(cached_gen_out_path) - and os.path.exists(cached_spv_out_path) - ): - current_checksum = self.get_md5_checksum(gen_out_path) - cached_checksum = self.get_md5_checksum(cached_gen_out_path) - # If the cached generated GLSL file is the same as the current GLSL - # generated file, then assume that the generated GLSL and SPIR-V - # will not have changed. In that case, just copy over the GLSL and - # SPIR-V files from the cache and return. - if current_checksum == cached_checksum: - shutil.copyfile(cached_spv_out_path, spv_out_path) - return (spv_out_path, gen_out_path) + can_use_cached = not any_sources_changed(gen_out_path, output_dir) + if can_use_cached and os.path.exists(cached_spv_out_path): + shutil.copyfile(cached_spv_out_path, spv_out_path) + return (spv_out_path, gen_out_path) vk_version = codegen_params.get("VK_VERSION", "1.1") # Only proceed if a GLSL compiler was specified @@ -938,10 +1033,8 @@ def compile_spirv(shader_paths_pair): spv_out_path, "--target-env=vulkan{}".format(vk_version), "-Werror", - ] + [ - arg - for src_dir_path in self.src_dir_paths - for arg in ["-I", src_dir_path] + "-I", + output_dir, ] cmd = cmd_base + self.glslc_flags @@ -955,17 +1048,24 @@ def compile_spirv(shader_paths_pair): try: subprocess.run(cmd_no_opt, check=True, capture_output=True) except subprocess.CalledProcessError as e_no_opt: + # Delete any existing cached SPIR-V file if it exists + if os.path.exists(cached_spv_out_path): + os.remove(cached_spv_out_path) + raise RuntimeError( f"{err_msg_base} {e_no_opt.stderr}" ) from e_no_opt else: + # Delete any existing cached SPIR-V file if it exists + if os.path.exists(cached_spv_out_path): + os.remove(cached_spv_out_path) + raise 
RuntimeError(f"{err_msg_base} {e.stderr}") from e - # If compilation was successful, store the source GLSL file and the - # compiled SPIR-V file in the cache for future comparison. + # If compilation was successful, store the compiled SPIR-V file in the + # cache for future use. if cache_dir is not None: - shutil.copyfile(gen_out_path, cached_gen_out_path) shutil.copyfile(spv_out_path, cached_spv_out_path) return (spv_out_path, gen_out_path) @@ -973,25 +1073,20 @@ def compile_spirv(shader_paths_pair): # Run codegen serially to ensure that all .glsl, .glslh, and .h files are up to # date before compilation for generated_file_tuple in self.output_file_map.items(): - generate_src_file(generated_file_tuple) + gen_out_path, file_changed, include_list = generate_src_file( + generated_file_tuple + ) + gen_file_meta[gen_out_path] = (file_changed, include_list) # Parallelize SPIR-V compilation to optimize build time with ThreadPool(os.cpu_count()) as pool: for spv_out_path, glsl_out_path in pool.map( compile_spirv, self.output_file_map.items() ): - output_file_map[spv_out_path] = glsl_out_path - - # Save all source GLSL files to the cache. Only do this at the very end since - # multiple variants may use the same source file. 
- if cache_dir is not None: - for _, src_file_fullpath in self.src_files.items(): - cached_src_file = os.path.join( - cache_dir, os.path.basename(src_file_fullpath) + ".t" - ) - shutil.copyfile(src_file_fullpath, cached_src_file) + print(spv_to_glsl_map) + spv_to_glsl_map[spv_out_path] = glsl_out_path - return output_file_map + return spv_to_glsl_map ############################################## From 5892e42ca79bc75b2e9bcd51808a9ac5c3b47d3c Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Mon, 14 Jul 2025 08:33:11 -0700 Subject: [PATCH 2/8] [ET-VK] Split up prepack command buffer ## Changes * Introduce `run_prepack()` API which combines the functionality of `encode_prepack()` and `prepack()`, but submits prepacking shaders incrementally rather than all at once. * Introduce graph config options to control command buffer submission behaviour during prepacking. Note that the current default values for the prepack submission thresholds were determined through experimentation. I will leave determining optimal values for specific devices as a later exercise. The goal of this diff is simply to introduce this mechanism to fix the Llama model loading crash on Samsung S24 (described below). ## Context Currently, ET-VK will encode all prepacking shaders, and then perform prepacking by submitting only one command buffer. However, this approach has some drawbacks: * CPU/GPU parallelism is decreased, since the command buffer is submitted only after all commands have been encoded. * There can be performance issues at the Vulkan API level when processing a single "large" command buffer. By splitting up prepacking to occur over multiple command buffers, performance can be improved by avoiding both the aforementioned issues. ## Llama 3.2 1B crash on Samsung S24 I have also noticed that running large models (i.e. 
Llama 3.2 1B) on the Samsung S24 with ET-VK, the device's display will crash (causing the screen to go black and become unresponsive), and sometimes the device will shut down entirely. Fortunately, this change also fixes this behaviour, in addition to providing a significant performance boost to model load time for Llama models (from 9s to 3s). ## Performance Impact * Improves model load time, especially on larger models. ## Future Work * Deprecate the `encode_prepack()` + `prepack()` pattern in favor of the `run_prepack()` pattern Differential Revision: [D78275586](https://our.internmc.facebook.com/intern/diff/D78275586/) [ghstack-poisoned] --- backends/vulkan/runtime/VulkanBackend.cpp | 3 +- backends/vulkan/runtime/api/Context.h | 10 ++++ .../vulkan/runtime/graph/ComputeGraph.cpp | 47 +++++++++++++++++-- backends/vulkan/runtime/graph/ComputeGraph.h | 38 +++++++++++++-- backends/vulkan/runtime/graph/GraphConfig.h | 14 ++++++ .../vulkan/runtime/graph/ops/PrepackNode.cpp | 1 + .../vulkan/runtime/vk_api/memory/Buffer.h | 4 ++ extension/llm/runner/stats.h | 29 ++++++------ 8 files changed, 120 insertions(+), 26 deletions(-) diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp index 7077a9df59c..f25a020a60f 100644 --- a/backends/vulkan/runtime/VulkanBackend.cpp +++ b/backends/vulkan/runtime/VulkanBackend.cpp @@ -503,8 +503,7 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface { compute_graph->prepare(); compute_graph->prepare_pipelines(); - compute_graph->encode_prepack(); - compute_graph->prepack(); + compute_graph->run_prepack(); // If dynamic shapes are not expected, then the command buffer only needs to // be encoded once. 
Otherwise, wait until the first inference to encode the diff --git a/backends/vulkan/runtime/api/Context.h b/backends/vulkan/runtime/api/Context.h index e55ddcca141..0c2046da315 100644 --- a/backends/vulkan/runtime/api/Context.h +++ b/backends/vulkan/runtime/api/Context.h @@ -92,6 +92,16 @@ class Context final { return queue_.handle; } + // Device Metadata + + inline bool device_is_adreno() const { + return adapter_p_->device_type() == vkapi::DeviceType::ADRENO; + } + + inline bool device_name_contains(const char* substr) const { + return adapter_p_->device_name().find(substr) != std::string::npos; + } + // Device Caches inline vkapi::ShaderLayoutCache& shader_layout_cache() { diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index cb14a41e98a..c25fa28be6f 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -15,6 +15,8 @@ #include +#include + namespace vkcompute { // @@ -145,6 +147,15 @@ ComputeGraph::ComputeGraph(GraphConfig config) execute_descriptor_counts_.descriptor_combined_sampler_count = 0; execute_descriptor_counts_.descriptor_storage_image_count = 0; +#define MB (1024.0 * 1024.0) + // If certain graph config variables are not specified, then set them + // automatically. 
+ if (config_.prepack_threshold_nbytes == 0) { + config_.prepack_threshold_nbytes = 20 * MB; + config_.prepack_initial_threshold_nbytes = 20 * MB; + } +#undef MB + context_->set_cmd(/*reusable = */ true); } @@ -212,11 +223,6 @@ utils::GPUMemoryLayout ComputeGraph::suggested_memory_layout( return utils::kChannelsPacked; } -bool ComputeGraph::device_name_contains(const char* substr) { - return context_->adapter_ptr()->device_name().find(substr) != - std::string::npos; -} - void ComputeGraph::check_no_active_value_ptrs() { VK_CHECK_COND( values_in_use_ == 0, @@ -750,6 +756,15 @@ void ComputeGraph::prepare_pipelines() { vkapi::ComputePipelineCache::Hasher>(); } +void ComputeGraph::submit_current_cmd_and_wait(const bool final_use) { + vkapi::VulkanFence fence = context_->fences().get_fence(); + context_->submit_cmd_to_gpu(fence.get_submit_handle(), final_use); + fence.wait(); + context_->fences().return_fence(fence); + + context_->flush(); +} + void ComputeGraph::encode_prepack() { for (std::unique_ptr& node : prepack_nodes_) { node->encode(this); @@ -766,6 +781,28 @@ void ComputeGraph::prepack() const { context_->flush(); } +void ComputeGraph::run_prepack() { + int i = 0; + bool submitted = false; + for (std::unique_ptr& node : prepack_nodes_) { + // Do not trigger on the first or last prepack node. + const bool not_terminal = i != 0 && i != (prepack_nodes_.size() - 1); + size_t threshold = submitted ? 
config_.prepack_threshold_nbytes + : config_.prepack_initial_threshold_nbytes; + if (not_terminal && staging_nbytes_in_cmd_ > threshold) { + submit_current_cmd_and_wait(/*final_use=*/true); + staging_nbytes_in_cmd_ = 0; + context_->set_cmd(); + submitted = true; + } + + node->encode(this); + i++; + } + submit_current_cmd_and_wait(/*final_use=*/true); + staging_nbytes_in_cmd_ = 0; +} + void ComputeGraph::encode_execute() { context_->flush(); context_->set_cmd(/*reusable = */ true); diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 78135a434e5..b78c1b3218d 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -194,6 +194,10 @@ class ComputeGraph final { size_t values_in_use_ = 0; size_t execute_count_ = 0; + // Represents the amount of staging buffer data that will be copied if the + // current Context's command buffer is submitted now. + size_t staging_nbytes_in_cmd_ = 0; + public: // // Accessors @@ -512,14 +516,17 @@ class ComputeGraph final { utils::GPUMemoryLayout suggested_memory_layout( const std::vector& sizes); - inline bool device_is_adreno() { - return context_->adapter_ptr()->device_type() == vkapi::DeviceType::ADRENO; + inline bool device_is_adreno() const { + return context_->device_is_adreno(); } - const std::string& device_name() { - return context()->adapter_ptr()->device_name(); + + const std::string& device_name() const { + return context_->adapter_ptr()->device_name(); } - bool device_name_contains(const char* substr); + inline bool device_name_contains(const char* substr) const { + return context_->device_name_contains(substr); + } // // Graph Building @@ -812,13 +819,34 @@ class ComputeGraph final { copy_into_staging(const ValueRef idx, const void* data, const size_t numel); void copy_from_staging(const ValueRef idx, void* data, const size_t numel); + protected: + // Command Buffer Management + + /* + * Submits the current 
command buffer in the Context to the GPU for execution, + * and wait for it to complete before returning. This function will also flush + * the Context after execution. + */ + void submit_current_cmd_and_wait(const bool final_use = false); + + public: // // Graph Prepacking // + inline void update_staging_nbytes_in_cmd(const size_t staging_bytes) { + staging_nbytes_in_cmd_ += staging_bytes; + } + void encode_prepack(); void prepack() const; + /* + * Executes prepacking operations to transfer model weight data from the CPU + * to GPU. + */ + void run_prepack(); + // // Graph Execution // diff --git a/backends/vulkan/runtime/graph/GraphConfig.h b/backends/vulkan/runtime/graph/GraphConfig.h index 753ce8362af..33c7ae73e62 100644 --- a/backends/vulkan/runtime/graph/GraphConfig.h +++ b/backends/vulkan/runtime/graph/GraphConfig.h @@ -36,6 +36,20 @@ struct GraphConfig final { // Whether or not the ComputeGraph should expect input shapes to be dynamic bool expect_dynamic_shapes; + // Execution properties that determine specifics re: how command buffer + // submission is handled, etc. 0 means this field is not set. + + // During prepacking, once this threshold is reached, submit the current + // command buffer for execution. This allows the work to be distributed over + // multiple command buffer submissions, which can improve model load + // performance and prevent crashes when loading large models. + size_t prepack_threshold_nbytes = 0; + // Threshold used for the first command buffer submission during prepacking. + // This can be set to be lower than prepack_submission_threshold_nbytes to + // submit a command buffer for execution earlier which can improve performance + // by taking more advantage of parallelism between the CPU and GPU. 
+ size_t prepack_initial_threshold_nbytes = 0; + vkapi::Adapter* external_adapter; // Generate a default graph config with pre-configured settings diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp index bdbecc866ab..05729172420 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -62,6 +62,7 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { TensorRefPtr tref = graph->get_tref(tref_); size_t numel = utils::multiply_integers(tref->sizes); api::StagingBuffer staging(graph->context(), tref->dtype, numel); + graph->update_staging_nbytes_in_cmd(staging.buffer().mem_size_as_size_t()); size_t nbytes = numel * vkapi::element_size(tref->dtype); staging.copy_from(tref->data, nbytes); return staging; diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.h b/backends/vulkan/runtime/vk_api/memory/Buffer.h index 0ef9f7e95e4..e1b441397b4 100644 --- a/backends/vulkan/runtime/vk_api/memory/Buffer.h +++ b/backends/vulkan/runtime/vk_api/memory/Buffer.h @@ -138,6 +138,10 @@ class VulkanBuffer final { return buffer_properties_.size; } + inline size_t mem_size_as_size_t() const { + return utils::safe_downcast(mem_size()); + } + inline bool has_memory() const { return (memory_.allocation != VK_NULL_HANDLE); } diff --git a/extension/llm/runner/stats.h b/extension/llm/runner/stats.h index 19766329ed3..8357afa9b0d 100644 --- a/extension/llm/runner/stats.h +++ b/extension/llm/runner/stats.h @@ -100,62 +100,63 @@ inline std::string stats_to_json_string(const Stats& stats) { inline void print_report(const Stats& stats) { printf("PyTorchObserver %s\n", stats_to_json_string(stats).c_str()); + printf("\n"); - ET_LOG( - Info, + printf( "\tPrompt Tokens: %" PRIu64 " Generated Tokens: %" PRIu64, stats.num_prompt_tokens, stats.num_generated_tokens); + printf("\n"); - ET_LOG( - Info, + printf( "\tModel Load Time:\t\t%f (seconds)", 
((double)(stats.model_load_end_ms - stats.model_load_start_ms) / stats.SCALING_FACTOR_UNITS_PER_SECOND)); + printf("\n"); double inference_time_ms = (double)(stats.inference_end_ms - stats.inference_start_ms); - ET_LOG( - Info, + printf( "\tTotal inference time:\t\t%f (seconds)\t\t Rate: \t%f (tokens/second)", inference_time_ms / stats.SCALING_FACTOR_UNITS_PER_SECOND, (stats.num_generated_tokens) / (double)(stats.inference_end_ms - stats.inference_start_ms) * stats.SCALING_FACTOR_UNITS_PER_SECOND); + printf("\n"); double prompt_eval_time = (double)(stats.prompt_eval_end_ms - stats.inference_start_ms); - ET_LOG( - Info, + printf( "\t\tPrompt evaluation:\t%f (seconds)\t\t Rate: \t%f (tokens/second)", prompt_eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND, (stats.num_prompt_tokens) / prompt_eval_time * stats.SCALING_FACTOR_UNITS_PER_SECOND); + printf("\n"); double eval_time = (double)(stats.inference_end_ms - stats.prompt_eval_end_ms); - ET_LOG( - Info, + printf( "\t\tGenerated %" PRIu64 " tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)", stats.num_generated_tokens, eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND, stats.num_generated_tokens / eval_time * stats.SCALING_FACTOR_UNITS_PER_SECOND); + printf("\n"); // Time to first token is measured from the start of inference, excluding // model load time. 
- ET_LOG( - Info, + printf( "\tTime to first generated token:\t%f (seconds)", ((double)(stats.first_token_ms - stats.inference_start_ms) / stats.SCALING_FACTOR_UNITS_PER_SECOND)); + printf("\n"); - ET_LOG( - Info, + printf( "\tSampling time over %" PRIu64 " tokens:\t%f (seconds)", stats.num_prompt_tokens + stats.num_generated_tokens, (double)stats.aggregate_sampling_time_ms / stats.SCALING_FACTOR_UNITS_PER_SECOND); + printf("\n"); } } // namespace llm From 6bf4695adeb8b7e998c54ddf9d5ec30daa441d97 Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Tue, 15 Jul 2025 14:06:37 -0700 Subject: [PATCH 3/8] [ET-VK] 1/n Split dispatches between multiple command buffers. Add semaphore support in command buffer. execute stage ## Context This following diffs aims to improve the performance of the Executorch Vulkan backend by adding a mechanism to issue multiple command buffers in prepack and execute function, so GPU work is issues while CPU is still working on issuing new work. ## This Diff ### Summary This diff is the first in a series of diffs that aim to split dispatches between multiple command buffers and add semaphore support in the command buffer. The changes in this diff include: * Adding a `VkSemaphore` parameter to the `CommandBuffer` constructor in `vk_api/Command.cpp` and `vk_api/Command.h` to support signaling when the command buffer has completed execution. * Modifying the `CommandBuffer` constructor in `vk_api/Command.h` and `vk_api/Command.cpp` to include the `VkSemaphore` parameter. * Updating the `CommandBuffer` object in `api/Context.cpp` to include the `VkSemaphore` parameter. 
Differential Revision: [D78282194](https://our.internmc.facebook.com/intern/diff/D78282194/) [ghstack-poisoned] --- backends/vulkan/runtime/api/Context.cpp | 2 +- backends/vulkan/runtime/vk_api/Command.cpp | 29 ++++++++++++++++++++-- backends/vulkan/runtime/vk_api/Command.h | 14 ++++++++++- 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp index 1308be6c93a..6a80b912a9a 100644 --- a/backends/vulkan/runtime/api/Context.cpp +++ b/backends/vulkan/runtime/api/Context.cpp @@ -38,7 +38,7 @@ Context::Context(vkapi::Adapter* adapter, const ContextConfig& config) querypool_(config_.query_pool_config, nullptr), // Command buffer submission cmd_mutex_{}, - cmd_(VK_NULL_HANDLE, 0u), + cmd_(VK_NULL_HANDLE, VK_NULL_HANDLE, 0u), submit_count_{0u}, // Memory Management buffer_clearlist_mutex_{}, diff --git a/backends/vulkan/runtime/vk_api/Command.cpp b/backends/vulkan/runtime/vk_api/Command.cpp index 3a5041f9500..4e0a915fe98 100644 --- a/backends/vulkan/runtime/vk_api/Command.cpp +++ b/backends/vulkan/runtime/vk_api/Command.cpp @@ -20,28 +20,34 @@ namespace vkapi { CommandBuffer::CommandBuffer( VkCommandBuffer handle, + VkSemaphore semaphore, const VkCommandBufferUsageFlags flags) : handle_(handle), + signal_semaphore_(semaphore), flags_(flags), state_(CommandBuffer::State::NEW), bound_{} {} CommandBuffer::CommandBuffer(CommandBuffer&& other) noexcept : handle_(other.handle_), + signal_semaphore_(other.signal_semaphore_), flags_(other.flags_), - state_(CommandBuffer::State::INVALID), + state_(other.state_), bound_(other.bound_) { other.handle_ = VK_NULL_HANDLE; + other.signal_semaphore_ = VK_NULL_HANDLE; other.bound_.reset(); } CommandBuffer& CommandBuffer::operator=(CommandBuffer&& other) noexcept { handle_ = other.handle_; + signal_semaphore_ = other.signal_semaphore_; flags_ = other.flags_; state_ = other.state_; bound_ = other.bound_; other.handle_ = VK_NULL_HANDLE; + 
other.signal_semaphore_ = VK_NULL_HANDLE; other.bound_.reset(); other.state_ = CommandBuffer::State::INVALID; @@ -304,6 +310,12 @@ CommandPool::~CommandPool() { if (pool_ == VK_NULL_HANDLE) { return; } + for (auto& semaphore : semaphores_) { + if (semaphore != VK_NULL_HANDLE) { + vkDestroySemaphore(device_, semaphore, nullptr); + } + } + vkDestroyCommandPool(device_, pool_, nullptr); } @@ -314,6 +326,7 @@ CommandBuffer CommandPool::get_new_cmd(bool reusable) { allocate_new_batch(config_.cmd_pool_batch_size); VkCommandBuffer handle = buffers_[in_use_]; + VkSemaphore semaphore = semaphores_[in_use_]; VkCommandBufferUsageFlags cmd_flags = 0u; if (!reusable) { @@ -321,7 +334,7 @@ CommandBuffer CommandPool::get_new_cmd(bool reusable) { } in_use_++; - return CommandBuffer(handle, cmd_flags); + return CommandBuffer(handle, semaphore, cmd_flags); } void CommandPool::flush() { @@ -337,6 +350,7 @@ void CommandPool::allocate_new_batch(const uint32_t count) { } buffers_.resize(buffers_.size() + count); + semaphores_.resize(buffers_.size() + count); const VkCommandBufferAllocateInfo allocate_info{ VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, // sType @@ -348,6 +362,17 @@ void CommandPool::allocate_new_batch(const uint32_t count) { VK_CHECK(vkAllocateCommandBuffers( device_, &allocate_info, buffers_.data() + in_use_)); + + const VkSemaphoreCreateInfo semaphoreCreateInfo = { + VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, nullptr, 0}; + + for (uint32_t i = 0; i < count; i++) { + VK_CHECK(vkCreateSemaphore( + device_, + &semaphoreCreateInfo, + nullptr, + semaphores_.data() + in_use_ + i)); + } } } // namespace vkapi diff --git a/backends/vulkan/runtime/vk_api/Command.h b/backends/vulkan/runtime/vk_api/Command.h index ff1e5934a5c..d6d3fe05a34 100644 --- a/backends/vulkan/runtime/vk_api/Command.h +++ b/backends/vulkan/runtime/vk_api/Command.h @@ -26,7 +26,10 @@ namespace vkapi { class CommandBuffer final { public: - explicit CommandBuffer(VkCommandBuffer, const 
VkCommandBufferUsageFlags); + explicit CommandBuffer( + VkCommandBuffer, + VkSemaphore, + const VkCommandBufferUsageFlags); CommandBuffer(const CommandBuffer&) = delete; CommandBuffer& operator=(const CommandBuffer&) = delete; @@ -70,6 +73,8 @@ class CommandBuffer final { private: VkCommandBuffer handle_; + // Semaphore to signal when the command buffer has completed execution + VkSemaphore signal_semaphore_; VkCommandBufferUsageFlags flags_; State state_; Bound bound_; @@ -81,6 +86,7 @@ class CommandBuffer final { inline void invalidate() { handle_ = VK_NULL_HANDLE; + signal_semaphore_ = VK_NULL_HANDLE; bound_.reset(); } @@ -100,6 +106,10 @@ class CommandBuffer final { VkCommandBuffer get_submit_handle(const bool final_use = false); + VkSemaphore get_signal_semaphore() const { + return signal_semaphore_; + } + inline operator bool() const { return handle_ != VK_NULL_HANDLE; } @@ -130,6 +140,8 @@ class CommandPool final { // New Buffers std::mutex mutex_; std::vector buffers_; + // Semaphores corresponding to the command buffers + std::vector semaphores_; size_t in_use_; public: From 261d821b24acb3c5d2566e3e5ebaafca3cdb1213 Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Tue, 15 Jul 2025 14:55:35 -0700 Subject: [PATCH 4/8] [ET-VK] 2/n Split dispatches between multiple command buffers. Add semaphore support to Adapter::submit_cmd. This diff adds semaphore support to `Adapter::submit_cmd` by modifying its function signature to include `wait_semaphore` and `signal_semaphore` parameters. The updated function now takes into account the new semaphore parameters and correctly configures the pipeline stages using `VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT`. 
Differential Revision: [D78360041](https://our.internmc.facebook.com/intern/diff/D78360041/) [ghstack-poisoned] --- backends/vulkan/runtime/vk_api/Adapter.cpp | 17 +++++++++++------ backends/vulkan/runtime/vk_api/Adapter.h | 8 ++++++-- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/backends/vulkan/runtime/vk_api/Adapter.cpp b/backends/vulkan/runtime/vk_api/Adapter.cpp index 038a66159fb..e08491c656b 100644 --- a/backends/vulkan/runtime/vk_api/Adapter.cpp +++ b/backends/vulkan/runtime/vk_api/Adapter.cpp @@ -307,17 +307,22 @@ void Adapter::return_queue(Adapter::Queue& compute_queue) { void Adapter::submit_cmd( const Adapter::Queue& device_queue, VkCommandBuffer cmd, - VkFence fence) { + VkFence fence, + VkSemaphore wait_semaphore, + VkSemaphore signal_semaphore) { + const VkPipelineStageFlags flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + const bool set_wait_semaphore = wait_semaphore != VK_NULL_HANDLE; + const bool set_signal_semaphore = signal_semaphore != VK_NULL_HANDLE; const VkSubmitInfo submit_info{ VK_STRUCTURE_TYPE_SUBMIT_INFO, // sType nullptr, // pNext - 0u, // waitSemaphoreCount - nullptr, // pWaitSemaphores - nullptr, // pWaitDstStageMask + set_wait_semaphore ? 1u : 0u, // waitSemaphoreCount + set_wait_semaphore ? &wait_semaphore : nullptr, // pWaitSemaphores + &flags, // pWaitDstStageMask 1u, // commandBufferCount &cmd, // pCommandBuffers - 0u, // signalSemaphoreCount - nullptr, // pSignalSemaphores + set_signal_semaphore ? 1u : 0u, // signalSemaphoreCount + set_signal_semaphore ? 
&signal_semaphore : nullptr, // pSignalSemaphores }; std::lock_guard queue_lock( diff --git a/backends/vulkan/runtime/vk_api/Adapter.h b/backends/vulkan/runtime/vk_api/Adapter.h index d242e2d3ac1..aa4c659c6d8 100644 --- a/backends/vulkan/runtime/vk_api/Adapter.h +++ b/backends/vulkan/runtime/vk_api/Adapter.h @@ -242,8 +242,12 @@ class Adapter final { // Command Buffer Submission - void - submit_cmd(const Queue&, VkCommandBuffer, VkFence fence = VK_NULL_HANDLE); + void submit_cmd( + const Queue&, + VkCommandBuffer, + VkFence fence = VK_NULL_HANDLE, + VkSemaphore wait_semaphore = VK_NULL_HANDLE, + VkSemaphore signal_semaphore = VK_NULL_HANDLE); std::string stringize() const; friend std::ostream& operator<<(std::ostream&, const Adapter&); From 33d3e29ca325484a25dcf865393f91d919c0a2ac Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Tue, 15 Jul 2025 15:12:36 -0700 Subject: [PATCH 5/8] Update on "[ET-VK] 2/n Split dispatches between multiple command buffers. Add semaphore support to Adapter::submit_cmd." This diff adds semaphore support to `Adapter::submit_cmd` by modifying its function signature to include `wait_semaphore` and `signal_semaphore` parameters. The updated function now takes into account the new semaphore parameters and correctly configures the pipeline stages using `VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT`. Differential Revision: [D78360041](https://our.internmc.facebook.com/intern/diff/D78360041/) [ghstack-poisoned] From ec8cc511736ffc701934abd2199d6f4099d0d389 Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Tue, 15 Jul 2025 15:12:37 -0700 Subject: [PATCH 6/8] [ET-VK] 3/n Split dispatches between multiple command buffers. Track previous semaphore in context. This diff is the third part of a series of diffs aiming to split dispatches between multiple command buffers. In this diff, we are tracking the previous semaphore in the context. 
A new member variable `prev_semaphore_` was added to the `Context` class. This variable is used to store the semaphore of the previously submitted command buffer. Differential Revision: [D78360037](https://our.internmc.facebook.com/intern/diff/D78360037/) [ghstack-poisoned] --- backends/vulkan/runtime/api/Context.cpp | 16 +++++++++++++++- backends/vulkan/runtime/api/Context.h | 2 ++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp index 6a80b912a9a..64d940d44fb 100644 --- a/backends/vulkan/runtime/api/Context.cpp +++ b/backends/vulkan/runtime/api/Context.cpp @@ -39,6 +39,7 @@ Context::Context(vkapi::Adapter* adapter, const ContextConfig& config) // Command buffer submission cmd_mutex_{}, cmd_(VK_NULL_HANDLE, VK_NULL_HANDLE, 0u), + prev_semaphore_(VK_NULL_HANDLE), submit_count_{0u}, // Memory Management buffer_clearlist_mutex_{}, @@ -195,10 +196,21 @@ void Context::register_blit( } void Context::submit_cmd_to_gpu(VkFence fence_handle, const bool final_use) { + // Wait semaphore would be previous command buffer's signal semaphore + VkSemaphore wait_semaphore = prev_semaphore_; + // Signal semaphore for the current command buffer + VkSemaphore signal_semaphore = cmd_.get_signal_semaphore(); + // Next command buffer would wait on this command buffer's signal semaphore + prev_semaphore_ = signal_semaphore; + if (cmd_) { cmd_.end(); adapter_p_->submit_cmd( - queue_, cmd_.get_submit_handle(final_use), fence_handle); + queue_, + cmd_.get_submit_handle(final_use), + fence_handle, + wait_semaphore, + signal_semaphore); submit_count_ = 0u; } @@ -214,6 +226,8 @@ void Context::flush() { if (cmd_) { cmd_.invalidate(); } + // Reset previous command buffer semaphore + prev_semaphore_ = VK_NULL_HANDLE; std::lock_guard bufferlist_lock(buffer_clearlist_mutex_); std::lock_guard imagelist_lock(image_clearlist_mutex_); diff --git a/backends/vulkan/runtime/api/Context.h 
b/backends/vulkan/runtime/api/Context.h index e55ddcca141..9d8e7c92255 100644 --- a/backends/vulkan/runtime/api/Context.h +++ b/backends/vulkan/runtime/api/Context.h @@ -68,6 +68,8 @@ class Context final { // Command buffers submission std::mutex cmd_mutex_; vkapi::CommandBuffer cmd_; + // Semaphore for the previously submitted command buffer, if any + VkSemaphore prev_semaphore_; uint32_t submit_count_; // Memory Management std::mutex buffer_clearlist_mutex_; From 4eb6326b6e194a726c8e7629a6db04d14746d012 Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 16 Jul 2025 12:29:55 -0700 Subject: [PATCH 7/8] Update base for Update on "[ET-VK] 3/n Split dispatches between multiple command buffers. Track previous semaphore in context." This diff is the third part of a series of diffs aiming to split dispatches between multiple command buffers. In this diff, we are tracking the previous semaphore in the context. A new member variable `prev_semaphore_` was added to the `Context` class. This variable is used to store the semaphore of the previously submitted command buffer. Differential Revision: [D78360037](https://our.internmc.facebook.com/intern/diff/D78360037/) [ghstack-poisoned] From c78bcfcbeaa9018dddf2612776411fcd25bbe5ba Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 16 Jul 2025 17:00:15 -0700 Subject: [PATCH 8/8] Update base for Update on "[ET-VK] 3/n Split dispatches between multiple command buffers. Track previous semaphore in context." This diff is the third part of a series of diffs aiming to split dispatches between multiple command buffers. In this diff, we are tracking the previous semaphore in the context. A new member variable `prev_semaphore_` was added to the `Context` class. This variable is used to store the semaphore of the previously submitted command buffer. 
Differential Revision: [D78360037](https://our.internmc.facebook.com/intern/diff/D78360037/) [ghstack-poisoned]