Optimize quantized max pool 2d (pytorch#115690)

shubhraprakash1 · pytorchmergebot · commit 1b8599283fe4 · 2023-12-15T00:45:37.000Z
Summary:
We do not need to dequantize and quantize again for this op.

With this optimization, cunet-enc ops:
  vulkan.quantized_max_pool2d_quint8{48, 36, 2} 207532
  vulkan.quantized_max_pool2d_quint8{24, 18, 4} 78832
  vulkan.quantized_max_pool2d_quint8{12, 9, 8} 49296

Without optimization:
  vulkan.quantized_max_pool2d_quint8{48, 36, 2} 234416
  vulkan.quantized_max_pool2d_quint8{24, 18, 4} 94380
  vulkan.quantized_max_pool2d_quint8{12, 9, 8} 58760

Test Plan:
Ensure all vulkan quantize tests pass:

buck2 run --target-platforms ovr_config//platform/macos:arm64-fbsource //xplat/caffe2:pt_vulkan_quantized_api_test_binAppleMac\#macosx-arm64 -c pt.vulkan_full_precision=1 --show-output
Running main() from third-party/googletest/1.11.0/googletest/googletest/src/gtest_main.cc
[==========] Running 78 tests from 1 test suite.
[----------] Global test environment set-up.
[----------] 78 tests from VulkanAPITest
...
[==========] 78 tests from 1 test suite ran. (1519 ms total)
[ PASSED ] 78 tests.

buck2 run --target-platforms ovr_config//platform/macos:arm64-fbsource //xplat/caffe2:pt_vulkan_api_test_binAppleMac\#macosx-arm64 -c pt.vulkan_full_precision=1 --show-output
Running main() from third-party/googletest/1.11.0/googletest/googletest/src/gtest_main.cc
[==========] Running 395 tests from 1 test suite.
[----------] Global test environment set-up.
[----------] 395 tests from VulkanAPITest
...
[ SKIPPED ] VulkanAPITest.querypool_flushed_shader_log (0 ms)
[----------] 395 tests from VulkanAPITest (6515 ms total)
[----------] Global test environment tear-down
[==========] 395 tests from 1 test suite ran. (6515 ms total)
[ PASSED ] 394 tests.
[ SKIPPED ] 1 test, listed below:
[ SKIPPED ] VulkanAPITest.querypool_flushed_shader_log

YOU HAVE 5 DISABLED TESTS

Reviewed By: yipjustin, copyrightly
Differential Revision: D50998619
Pull Request resolved: pytorch#115690
Approved by: https://github.com/SS-JIA
diff --git a/aten/src/ATen/native/vulkan/glsl/quantized_max_pool2d_qint8.glsl b/aten/src/ATen/native/vulkan/glsl/quantized_max_pool2d_qint8.glsl
@@ -14,8 +14,6 @@ layout(set = 0, binding = 2)         uniform PRECISION restrict           Block
   ivec2 stride;
   ivec2 padding;
   ivec2 dilate;
-  vec2 scale;
-  ivec2 zero_point;
 } uBlock;
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
@@ -37,13 +35,11 @@ void main() {
       for (int x = start.x; x < end.x; x += uBlock.dilate.x) {
         if ((x >= 0 && x < uBlock.kernel.z) && (y >= 0 && y < uBlock.kernel.w)) {
           vec4 outtexy = texelFetch(uInput, ivec3(x, y, pos.z), 0);
-          outtexy = uBlock.scale.x * (outtexy - uBlock.zero_point.x);
           outtex = max(outtexy, outtex);
         }
       }
     }
 
-    outtex = roundEven(outtex / uBlock.scale.x) + uBlock.zero_point.x;
     ivec4 store = ivec4(outtex);
     imageStore(uOutput, pos, store);
   }
diff --git a/aten/src/ATen/native/vulkan/glsl/quantized_max_pool2d_quint8.glsl b/aten/src/ATen/native/vulkan/glsl/quantized_max_pool2d_quint8.glsl
@@ -14,8 +14,6 @@ layout(set = 0, binding = 2)         uniform PRECISION restrict           Block
   ivec2 stride;
   ivec2 padding;
   ivec2 dilate;
-  vec2 scale;
-  ivec2 zero_point;
 } uBlock;
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
@@ -37,13 +35,11 @@ void main() {
       for (int x = start.x; x < end.x; x += uBlock.dilate.x) {
         if ((x >= 0 && x < uBlock.kernel.z) && (y >= 0 && y < uBlock.kernel.w)) {
           vec4 outtexy = texelFetch(uInput, ivec3(x, y, pos.z), 0);
-          outtexy = uBlock.scale.x * (outtexy - uBlock.zero_point.x);
           outtex = max(outtexy, outtex);
         }
       }
     }
 
-    outtex = roundEven(outtex / uBlock.scale.x) + uBlock.zero_point.x;
     uvec4 store = uvec4(outtex);
     imageStore(uOutput, pos, store);
   }
diff --git a/aten/src/ATen/native/vulkan/ops/Pool.cpp b/aten/src/ATen/native/vulkan/ops/Pool.cpp
@@ -168,83 +168,37 @@ Tensor pool2d(
   }
 
   api::UniformParamsBuffer params;
-  if (v_self.is_quantized()) {
-    const struct Block final {
-      uvec3 extents;
-      int32_t range;
-      ivec4 kernel;
-      ivec2 stride;
-      ivec2 padding;
-      ivec2 dilation;
-      vec2 scale;
-      ivec2 zero_point;
-    } block{
-        v_output.extents(),
-        safe_downcast<int32_t>(
-            kernel[Layout::Parameter::width] *
-            kernel[Layout::Parameter::height]),
-        {
-            safe_downcast<int32_t>(kernel[Layout::Parameter::width]),
-            safe_downcast<int32_t>(kernel[Layout::Parameter::height]),
-            safe_downcast<int32_t>(self_arg.size(Layout::Activation4D::width)),
-            safe_downcast<int32_t>(self_arg.size(Layout::Activation4D::height)),
-        },
-        {
-            safe_downcast<int32_t>(stride[Layout::Parameter::width]),
-            safe_downcast<int32_t>(stride[Layout::Parameter::height]),
-        },
-        {
-            safe_downcast<int32_t>(padding[Layout::Parameter::width]),
-            safe_downcast<int32_t>(padding[Layout::Parameter::height]),
-        },
-        {
-            safe_downcast<int32_t>(dilation[Layout::Parameter::width]),
-            safe_downcast<int32_t>(dilation[Layout::Parameter::height]),
-        },
-        {
-            safe_downcast<float>(v_self.get_scale()),
-            0.0f,
-        },
-        {
-            safe_downcast<int32_t>(v_self.get_zero_point()),
-            0u,
-        },
-    };
-    params = api::UniformParamsBuffer(context, block);
-  } else {
-    const struct Block final {
-      uvec3 extents;
-      int32_t range;
-      ivec4 kernel;
-      ivec2 stride;
-      ivec2 padding;
-      ivec2 dilation;
-    } block{
-        v_output.extents(),
-        safe_downcast<int32_t>(
-            kernel[Layout::Parameter::width] *
-            kernel[Layout::Parameter::height]),
-        {
-            safe_downcast<int32_t>(kernel[Layout::Parameter::width]),
-            safe_downcast<int32_t>(kernel[Layout::Parameter::height]),
-            safe_downcast<int32_t>(self_arg.size(Layout::Activation4D::width)),
-            safe_downcast<int32_t>(self_arg.size(Layout::Activation4D::height)),
-        },
-        {
-            safe_downcast<int32_t>(stride[Layout::Parameter::width]),
-            safe_downcast<int32_t>(stride[Layout::Parameter::height]),
-        },
-        {
-            safe_downcast<int32_t>(padding[Layout::Parameter::width]),
-            safe_downcast<int32_t>(padding[Layout::Parameter::height]),
-        },
-        {
-            safe_downcast<int32_t>(dilation[Layout::Parameter::width]),
-            safe_downcast<int32_t>(dilation[Layout::Parameter::height]),
-        },
-    };
-    params = api::UniformParamsBuffer(context, block);
-  }
+  const struct Block final {
+    uvec3 extents;
+    int32_t range;
+    ivec4 kernel;
+    ivec2 stride;
+    ivec2 padding;
+    ivec2 dilation;
+  } block{
+      v_output.extents(),
+      safe_downcast<int32_t>(
+          kernel[Layout::Parameter::width] * kernel[Layout::Parameter::height]),
+      {
+          safe_downcast<int32_t>(kernel[Layout::Parameter::width]),
+          safe_downcast<int32_t>(kernel[Layout::Parameter::height]),
+          safe_downcast<int32_t>(self_arg.size(Layout::Activation4D::width)),
+          safe_downcast<int32_t>(self_arg.size(Layout::Activation4D::height)),
+      },
+      {
+          safe_downcast<int32_t>(stride[Layout::Parameter::width]),
+          safe_downcast<int32_t>(stride[Layout::Parameter::height]),
+      },
+      {
+          safe_downcast<int32_t>(padding[Layout::Parameter::width]),
+          safe_downcast<int32_t>(padding[Layout::Parameter::height]),
+      },
+      {
+          safe_downcast<int32_t>(dilation[Layout::Parameter::width]),
+          safe_downcast<int32_t>(dilation[Layout::Parameter::height]),
+      },
+  };
+  params = api::UniformParamsBuffer(context, block);
 
   api::PipelineBarrier pipeline_barrier{};
 

Original file line number	Diff line number	Diff line change
`@@ -14,8 +14,6 @@ layout(set = 0, binding = 2) uniform PRECISION restrict Block`
`14`	`14`	`ivec2 stride;`
`15`	`15`	`ivec2 padding;`
`16`	`16`	`ivec2 dilate;`
`17`		`- vec2 scale;`
`18`		`- ivec2 zero_point;`
`19`	`17`	`} uBlock;`
`20`	`18`
`21`	`19`	`layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;`
`@@ -37,13 +35,11 @@ void main() {`
`37`	`35`	`for (int x = start.x; x < end.x; x += uBlock.dilate.x) {`
`38`	`36`	`if ((x >= 0 && x < uBlock.kernel.z) && (y >= 0 && y < uBlock.kernel.w)) {`
`39`	`37`	`vec4 outtexy = texelFetch(uInput, ivec3(x, y, pos.z), 0);`
`40`		`- outtexy = uBlock.scale.x * (outtexy - uBlock.zero_point.x);`
`41`	`38`	`outtex = max(outtexy, outtex);`
`42`	`39`	`}`
`43`	`40`	`}`
`44`	`41`	`}`
`45`	`42`
`46`		`- outtex = roundEven(outtex / uBlock.scale.x) + uBlock.zero_point.x;`
`47`	`43`	`ivec4 store = ivec4(outtex);`
`48`	`44`	`imageStore(uOutput, pos, store);`
`49`	`45`	`}`