Skip to content

Commit 463b632

Browse files
committed
[ET-VK] 7/n Split dispatches between multiple command buffers. Split execute dispatch into multiple commands based on dispatch count.
Pull Request resolved: #12530 This diff splits the execute dispatch into multiple commands based on the dispatch count. This allows for concurrent CPU and GPU execution. The modifications involve adding a counter `encoded_node_count` to track the number of encoded nodes and submitting a new command buffer to the GPU every 64 nodes. ghstack-source-id: 300616853 @exported-using-ghexport Differential Revision: [D78360039](https://our.internmc.facebook.com/intern/diff/D78360039/)
1 parent 6485e4f commit 463b632

File tree

2 files changed

+39
-1
lines changed

2 files changed

+39
-1
lines changed

backends/vulkan/runtime/graph/ComputeGraph.cpp

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,10 @@ ComputeGraph::ComputeGraph(GraphConfig config)
151151
config_.prepack_threshold_nbytes = 10 * MB;
152152
config_.prepack_initial_threshold_nbytes = 10 * MB;
153153
}
154+
if (config_.execute_threshold_node_count == 0) {
155+
config_.execute_threshold_node_count = 128;
156+
config_.execute_initial_threshold_node_count = 64;
157+
}
154158
}
155159

156160
ComputeGraph::~ComputeGraph() {
@@ -852,15 +856,38 @@ void ComputeGraph::execute() {
852856
context_->set_cmd(/*reusable = */ true);
853857

854858
context_->cmd_reset_querypool();
859+
uint32_t encoded_node_count = 0;
855860

856861
for (std::unique_ptr<ExecuteNode>& node : execute_nodes_) {
857862
node->encode(this);
863+
encoded_node_count++;
864+
865+
// Threshold is reached when the node count reaches
866+
// execute_initial_threshold_node_count, or exceeds it by a multiple of
867+
// execute_threshold_node_count.
868+
const bool reached_threshold =
869+
encoded_node_count >= config_.execute_initial_threshold_node_count &&
870+
((encoded_node_count - config_.execute_initial_threshold_node_count) %
871+
config_.execute_threshold_node_count ==
872+
0);
873+
874+
// Create a new command buffer when threshold is reached
875+
if (reached_threshold) {
876+
context_->submit_cmd_to_gpu(VK_NULL_HANDLE, false);
877+
deferred_cmd_list_.emplace_back(std::move(context_->extract_cmd()));
878+
context_->set_cmd(true);
879+
}
858880
}
859881

882+
vkapi::VulkanFence fence = context_->fences().get_fence();
883+
context_->submit_cmd_to_gpu(fence.get_submit_handle(), false);
884+
fence.wait();
885+
context_->fences().return_fence(fence);
860886
deferred_cmd_list_.emplace_back(std::move(context_->extract_cmd()));
887+
} else {
888+
submit_deferred_cmds_and_wait();
861889
}
862890

863-
submit_deferred_cmds_and_wait();
864891
execute_count_++;
865892
}
866893

backends/vulkan/runtime/graph/GraphConfig.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,17 @@ struct GraphConfig final {
5050
// by taking more advantage of parallelism between the CPU and GPU.
5151
size_t prepack_initial_threshold_nbytes = 0;
5252

53+
// During execute, once this node count is reached, submit the current
54+
// command buffer for execution. This allows the work to be distributed over
55+
// multiple command buffer submissions, which can improve execution
56+
// performance.
57+
size_t execute_threshold_node_count = 0;
58+
// Execute node count used for the first command buffer submission during
59+
// execute. This can be set to be lower than execute_threshold_node_count to
60+
// submit a command buffer for execution earlier which can improve performance
61+
// by taking more advantage of parallelism between the CPU and GPU.
62+
size_t execute_initial_threshold_node_count = 0;
63+
5364
vkapi::Adapter* external_adapter;
5465

5566
// Generate a default graph config with pre-configured settings

0 commit comments

Comments
 (0)