[ET-VK][ez] Fix Vulkan Validation layer errors due to consecutive command buffer encoding

SS-JIA · SS-JIA · commit b7bc28f2f29e · 2025-06-05T11:02:51.000-07:00
## Changes * In `VulkanBackend.cpp` do not call `encode_execute()` during model load if the model compile spec specifies `requires_dynamic_shapes` as true * In test files, do not call `encode_execute()` if `propagate_resize()` is subsequently called. ## Motivation Recently, it was discovered that a command buffer re-encode was required to update push constant values. This means that for dynamic shapes to work correctly, `encode_execute()` must be called after updating tensor sizes. As a result, `propagate_resize()` now calls `encode_execute()` internally. This results in scenarios where `encode_execute()` is called once during model load, then again right before the first inference during `propagate_resize()`, without actually executing the command buffer in-between. This causes Validation layer errors like ``` UNASSIGNED-CoreValidation-DrawState-InvalidImageLayout(ERROR / SPEC): msgNum: 1303270965 - Validation Error: [ UNASSIGNED-CoreValidation-DrawState-InvalidImageLayout ] Object 0: handle = 0x24086224ec0, type = VK_OBJECT_TYPE_COMMAND_BUFFER; Object 1: handle = 0x88d2b500000000e2, type = VK_OBJECT_TYPE_IMAGE; | MessageID = 0x4dae5635 | vkQueueSubmit(): pSubmits[0].pCommandBuffers[0] command buffer VkCommandBuffer 0x24086224ec0[] expects VkImage 0x88d2b500000000e2[] (subresource: aspectMask VK_IMAGE_ASPECT_COLOR_BIT array layer 0, mip level 0) to be in layout VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL--instead, current layout is VK_IMAGE_LAYOUT_UNDEFINED. Objects: 2 [0] 0x24086224ec0, type: 6, name: NULL [1] 0x88d2b500000000e2, type: 10, name: NULL UNASSIGNED-CoreValidation-DrawState-InvalidImageLayout(ERROR / SPEC): msgNum: 1303270965 - Validation Error: [ UNASSIGNED-CoreValidation-DrawState-InvalidImageLayout ] Object 0: handle = 0x24086224ec0, type = VK_OBJECT_TYPE_COMMAND_BUFFER; Object 1: handle = 0x6caffc00000000e3, type = VK_OBJECT_TYPE_IMAGE; | MessageID = 0x4dae5635 | vkQueueSubmit(): pSubmits[0].pCommandBuffers[0] command buffer VkCommandBuffer 0x24086224ec0[] expects VkImage 0x6caffc00000000e3[] (subresource: aspectMask VK_IMAGE_ASPECT_COLOR_BIT array layer 0, mip level 0) to be in layout VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL--instead, current layout is VK_IMAGE_LAYOUT_UNDEFINED. Objects: 2 [0] 0x24086224ec0, type: 6, name: NULL [1] 0x6caffc00000000e3, type: 10, name: NULL ``` because the last access information of image/buffer resources are inaccurate during the second command buffer encoding, since the first command buffer never executed. ## Perf Impact * Performance improvement for first inference of dynamic shape models if actual tensor sizes are much smaller than maximum possible sizes * No impact for non-dynamic shape models Differential Revision: [D76047203](https://our.internmc.facebook.com/intern/diff/D76047203/) ghstack-source-id: 288459856 Pull Request resolved: #11401
diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp
@@ -30,6 +30,8 @@
 #include <type_traits>
 #include <vector>
 
+#include <iostream>
+
 namespace executorch {
 namespace backends {
 namespace vulkan {
@@ -140,6 +142,14 @@ GraphConfig get_graph_config(ArrayRef<CompileSpec>& compile_specs) {
 
       config.set_memory_layout_override(memory_layout);
     }
+    if (strcmp(spec.key, "require_dynamic_shapes") == 0) {
+      ET_CHECK_MSG(value_size == sizeof(uint8_t), "Unexpected value size!");
+      bool value = getBool(value_data);
+
+      if (value) {
+        config.expect_dynamic_shapes = true;
+      }
+    }
   }
 #ifdef ET_EVENT_TRACER_ENABLED
   config.enable_querypool = true;
@@ -500,9 +510,12 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
     compute_graph->encode_prepack();
     compute_graph->prepack();
 
-    // TODO(ssjia): remove this once we can batch compile compute pipelines
-    // during prepare().
-    compute_graph->encode_execute();
+    // If dynamic shapes are not expected, then the command buffer only needs to
+    // be encoded once. Otherwise, wait until the first inference to encode the
+    // the command buffer, when actual input shapes are known.
+    if (!compute_graph->graphconfig().expect_dynamic_shapes) {
+      compute_graph->encode_execute();
+    }
 
     return Error::Ok;
   }
@@ -574,7 +587,9 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
     // constants are updated and DynamicDispatchNode can update the compute
     // shader, global workgroup size, and local workgroup size to perform the
     // model inference.
-    if (should_propagate_resize) {
+    if (should_propagate_resize ||
+        (compute_graph->graphconfig().expect_dynamic_shapes &&
+         compute_graph->execute_count() == 0u)) {
       compute_graph->propagate_resize();
     }
 
diff --git a/backends/vulkan/runtime/VulkanDelegateHeader.cpp b/backends/vulkan/runtime/VulkanDelegateHeader.cpp
@@ -60,6 +60,10 @@ uint32_t getUInt16LE(const uint8_t* data) {
   return (uint32_t)data[0] | ((uint32_t)data[1] << 8);
 }
 
+bool getBool(const uint8_t* data) {
+  return data[0] != 0;
+}
+
 bool VulkanDelegateHeader::is_valid() const {
   if (header_size < kExpectedSize) {
     return false;
diff --git a/backends/vulkan/runtime/VulkanDelegateHeader.h b/backends/vulkan/runtime/VulkanDelegateHeader.h
@@ -19,6 +19,9 @@ uint64_t getUInt64LE(const uint8_t* data);
 uint32_t getUInt32LE(const uint8_t* data);
 uint32_t getUInt16LE(const uint8_t* data);
 
+// Bool is serialized as a single byte
+bool getBool(const uint8_t* data);
+
 struct VulkanDelegateHeader {
   bool is_valid() const;
 
diff --git a/backends/vulkan/runtime/graph/GraphConfig.cpp b/backends/vulkan/runtime/graph/GraphConfig.cpp
@@ -63,6 +63,8 @@ GraphConfig::GraphConfig() {
 
   enable_local_wg_size_override = false;
   local_wg_size_override = {};
+
+  expect_dynamic_shapes = false;
 }
 
 void GraphConfig::set_storage_type_override(utils::StorageType storage_type) {
diff --git a/backends/vulkan/runtime/graph/GraphConfig.h b/backends/vulkan/runtime/graph/GraphConfig.h
@@ -33,6 +33,9 @@ struct GraphConfig final {
   bool enable_local_wg_size_override;
   utils::uvec3 local_wg_size_override;
 
+  // Whether or not the ComputeGraph should expect input shapes to be dynamic
+  bool expect_dynamic_shapes;
+
   // Generate a default graph config with pre-configured settings
   explicit GraphConfig();
 
diff --git a/backends/vulkan/test/op_tests/utils/gen_computegraph.py b/backends/vulkan/test/op_tests/utils/gen_computegraph.py
@@ -618,7 +618,8 @@ def gen_graph_build_code(self, include_declarations: bool = True) -> str:
         graph_build += f"{self.graph}{self.dot}prepare();\n"
         graph_build += f"{self.graph}{self.dot}encode_prepack();\n"
         graph_build += f"{self.graph}{self.dot}prepack();\n"
-        graph_build += f"{self.graph}{self.dot}encode_execute();\n"
+        if not self.include_io:
+            graph_build += f"{self.graph}{self.dot}encode_execute();\n"
 
         graph_build += "\n"
         return graph_build
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -1536,7 +1536,6 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
   EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
 
   graph.prepare();
-  graph.encode_execute();
 
   // +3: shared memory allocations for tensors
   expected_vma_allocation_count += 3;
@@ -1743,7 +1742,6 @@ TEST(VulkanComputeGraphTest, test_large_graph) {
   out.staging = graph.set_output_tensor(out.value);
 
   graph.prepare();
-  graph.encode_execute();
 
   auto build_end_time = std::chrono::system_clock::now();
 
@@ -1820,7 +1818,6 @@ void test_clone(
   out.staging = graph.set_output_tensor(out.value);
 
   graph.prepare();
-  graph.encode_execute();
 
   fill_vtensor(graph, a, 0.0f, /*iota = */ true);
 
@@ -2955,7 +2952,6 @@ void test_grid_priors(
   graph.prepare();
   graph.encode_prepack();
   graph.prepack();
-  graph.encode_execute();
 
   vTensorPtr t_in = graph.get_tensor(in.value);
   vTensorPtr t_out = graph.get_tensor(out.value);
@@ -3058,7 +3054,6 @@ void test_transpose_view_mm(
   graph.prepare();
   graph.encode_prepack();
   graph.prepack();
-  graph.encode_execute();
 
   for (int i = 1; i < 4; i++) {
     float val_mat1 = i;
@@ -3125,7 +3120,6 @@ void test_to_copy() {
   graph.prepare();
   graph.encode_prepack();
   graph.prepack();
-  graph.encode_execute();
   graph.propagate_resize();
   graph.execute();
 

Original file line number	Diff line number	Diff line change
`@@ -63,6 +63,8 @@ GraphConfig::GraphConfig() {`
`63`	`63`
`64`	`64`	`enable_local_wg_size_override = false;`
`65`	`65`	`local_wg_size_override = {};`
	`66`	`+`
	`67`	`+ expect_dynamic_shapes = false;`
`66`	`68`	`}`
`67`	`69`
`68`	`70`	`void GraphConfig::set_storage_type_override(utils::StorageType storage_type) {`