
Commit 6ac176f

Cortex_m backend: Add IO quantizers + tests of non rescaling ops
A number of ops only handle shape/meta-data without changing the dynamic range. In these cases, no rescaling needs to be performed and the int8 portable_ops kernel can be used directly. A new test is added to ensure this behaviour, as well as a test showing that operators which do change the dynamic range (SUB) are not supported.

To support quantization of graphs with no-rescale ops at the beginning/end of the graph, two new quantizers, InputQuantizer and OutputQuantizer, are introduced. By explicitly stating the dtype of the input/output, no-rescale ops inherit dtypes from them as with any other op.

This change exposes the issue of mixing dtypes within the graph, which adds back xfails for the broadcasted add and mul tests. This can be fixed in a future patch after pytorch#15300 is resolved.

Signed-off-by: Adrian Lundell <[email protected]>
Change-Id: I8f79b86b633f9ad8d9f183c914754b0ee2f7a87c
1 parent 1812c81 commit 6ac176f
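To illustrate the intended flow, here is a minimal sketch (not part of the commit) of how the two new quantizers could be exercised on their own: the graph input and output are annotated with an explicit int8 per-tensor config, and the no-rescale ops in between can then inherit that dtype. INT8_PER_TENSOR_CONFIG and the annotate() signature are taken from the diff below; the import path of the quantizers, the toy module, and the use of torch.export here are assumptions.

    import torch

    # Assumed import paths: the config import matches the diff below; the
    # quantizer module path is inferred from backends/cortex_m/quantizer/quantizer.py.
    from executorch.backends.cortex_m.quantizer.quantization_configs import (
        INT8_PER_TENSOR_CONFIG,
    )
    from executorch.backends.cortex_m.quantizer.quantizer import (
        InputQuantizer,
        OutputQuantizer,
    )


    class ReshapeOnly(torch.nn.Module):
        # Hypothetical model containing only shape/meta-data ops: nothing here
        # changes the dynamic range, so no rescaling is needed anywhere.
        def forward(self, x):
            return x.reshape(1, -1)


    example_inputs = (torch.randn(1, 3, 4, 5),)
    graph_module = torch.export.export(ReshapeOnly(), example_inputs).module()

    # Explicitly state the dtype of the graph input and output; the no-rescale
    # ops in between inherit it like any other op.
    InputQuantizer(INT8_PER_TENSOR_CONFIG).annotate(graph_module)
    OutputQuantizer(INT8_PER_TENSOR_CONFIG).annotate(graph_module)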

File tree

4 files changed, +225 -0 lines changed

backends/cortex_m/quantizer/quantizer.py

Lines changed: 60 additions & 0 deletions
@@ -15,6 +15,9 @@
     INT8_BINARY_OPS_OPERATOR_CONFIG,
     INT8_LINEAR_OPERATOR_CONFIG,
 )
+from executorch.backends.cortex_m.quantizer.quantization_configs import (
+    INT8_PER_TENSOR_CONFIG,
+)
 from torch._ops import OpOverload
 from torch.fx import GraphModule, Node
 from torchao.quantization.pt2e.quantizer import (

@@ -49,6 +52,8 @@ def __init__(self) -> None:
                 INT8_BINARY_OPS_OPERATOR_CONFIG, filter_fn=self.broadcasting_filter
             ),
             OperatorConfigQuantizer(INT8_LINEAR_OPERATOR_CONFIG),
+            InputQuantizer(INT8_PER_TENSOR_CONFIG),
+            OutputQuantizer(INT8_PER_TENSOR_CONFIG),
         ]
         super().__init__(quantizers)

@@ -196,3 +201,58 @@ def annotate(self, model: GraphModule) -> None:
 
     def validate(self, model: GraphModule) -> bool:
         return True
+
+
+class InputQuantizer(Quantizer):
+    """
+    Quantizes only the input activations of the graph.
+    """
+
+    def __init__(
+        self,
+        quantization_config: QuantizationConfig,
+        filter_fn: Callable[[Node], bool] = lambda node: False,
+    ) -> None:
+        self.quantization_config = quantization_config
+        self.filter_fn = filter_fn
+
+    def annotate(self, model: GraphModule) -> None:
+        for node in model.graph.nodes:
+            is_placeholder = node.op == "placeholder"
+            is_filtered = self.filter_fn(node)
+            if is_placeholder and not is_filtered:
+                node.meta[Q_ANNOTATION_KEY] = QuantizationAnnotation(
+                    {}, self.quantization_config.output_activation
+                )
+
+    def validate(self, model: GraphModule) -> bool:
+        return True
+
+
+class OutputQuantizer(Quantizer):
+    """
+    Quantizes only the output activations of the graph.
+    """
+
+    def __init__(
+        self,
+        quantization_config: QuantizationConfig,
+        filter_fn: Callable[[Node], bool] = lambda node: False,
+    ) -> None:
+        self.quantization_config = quantization_config
+        self.filter_fn = filter_fn
+
+    def annotate(self, model: GraphModule) -> None:
+        output_node = model.graph.output_node()
+        input_qspec_map = {
+            n: self.quantization_config.input_activation
+            for n in output_node.all_input_nodes
+            if not self.filter_fn(n)
+        }
+        output_qspec = self.quantization_config.output_activation
+        output_node.meta[Q_ANNOTATION_KEY] = QuantizationAnnotation(
+            input_qspec_map, output_qspec
+        )
+
+    def validate(self, model: GraphModule) -> bool:
+        return True
Lines changed: 156 additions & 0 deletions
@@ -0,0 +1,156 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import pytest
+import torch
+from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
+from executorch.backends.arm.test.common import parametrize
+from executorch.backends.cortex_m.test.tester import (
+    CortexMTester,
+    McuTestCase,
+    ramp_tensor,
+)
+from executorch.exir.dialects._ops import ops as exir_ops
+
+
+class CortexMInheritAllOps(torch.nn.Module):
+    ops_before_transforms = {
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 3,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3,
+    }
+
+    ops_after_transforms = {
+        "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
+    }
+
+    def forward(self, x):
+        # x shape: (1, 3, 4, 5)
+        x = x + x
+        x = torch.ops.aten.squeeze.default(x)  # Remove dim 0: (3, 4, 5)
+        x = torch.ops.aten.unsqueeze.default(x, 0)  # Add dim at 0: (1, 3, 4, 5)
+        x = torch.ops.aten.squeeze_copy.default(x)  # (3, 4, 5)
+        x = torch.ops.aten.unsqueeze_copy.default(x, 0)  # (1, 3, 4, 5)
+        x = torch.ops.aten.squeeze.dims(x, [0])  # (3, 4, 5)
+        x = torch.ops.aten.squeeze_copy.dim(
+            x, 0
+        )  # Remove first dim if size 1, otherwise same
+        x = torch.ops.aten.squeeze.dim(x, 0)  # Same
+        x = torch.ops.aten.unbind.int(x, 0)[0]  # Unbind and take first: (4, 5)
+        x = torch.ops.aten.reshape.default(x, (1, 4, 5, 1))  # (1, 4, 5, 1)
+        x = torch.ops.aten.repeat.default(x, [1, 1, 1, 2])  # (1, 4, 5, 2)
+        x = torch.ops.aten.view.default(x, (1, 4, 10))  # (1, 4, 10)
+        target_shape = torch.zeros(1, 4, 10)
+        x = torch.ops.aten.view_as.default(x, target_shape)  # (1, 4, 10)
+        x = torch.ops.aten.view_copy.default(x, (1, 2, 20))  # (1, 2, 20)
+        x = torch.ops.aten.unflatten.int(x, 2, [4, 5])  # (1, 2, 4, 5)
+        x = torch.ops.aten.flatten.using_ints(x, 1, 3)  # (1, 40)
+        x = torch.ops.aten.repeat_interleave.self_int(x, 2, 1)  # (1, 80)
+        x = torch.ops.aten.expand_copy.default(x, (2, 80))  # (2, 80)
+        x = torch.ops.aten.expand.default(x, (2, 80))  # (2, 80)
+        x = torch.ops.aten.tile.default(x, [1, 1])  # (2, 80)
+        x = torch.ops.aten.split.Tensor(x, 40, 1)[0]  # (2, 40)
+        x = torch.ops.aten.split_with_sizes.default(x, [20, 20], 1)[0]  # (2, 20)
+        x = torch.ops.aten.split_copy.Tensor(x, 10, 1)[0]  # (2, 10)
+        x = torch.ops.aten.chunk.default(x, 2, 1)[0]  # (2, 5)
+        x = torch.ops.aten.pad.default(x, [1, 1, 0, 0], "constant", 0.0)  # (2, 7)
+        x = torch.ops.aten.select.int(x, 1, 0)  # (2,)
+        x = torch.ops.aten.select_copy.int(x, 0, 0)  # scalar -> need to reshape
+        x = torch.ops.aten.unsqueeze.default(x, 0)  # (1,)
+        x = torch.ops.aten.unsqueeze.default(x, 1)  # (1, 1)
+        x = torch.ops.aten.slice.Tensor(x, 0, 0, 1)  # (1, 1)
+        x = torch.ops.aten.slice_copy.Tensor(x, 1, 0, 1)  # (1, 1)
+        x = torch.ops.aten.reshape.default(x, (1, 1))  # Ensure shape for transpose
+        x = torch.ops.aten.transpose.int(x, 0, 1)  # (1, 1)
+        x = torch.ops.aten.transpose_copy.int(x, 0, 1)  # (1, 1)
+        x = torch.ops.aten.t_copy.default(x)  # (1, 1)
+        x = torch.ops.aten.contiguous.default(x)  # (1, 1)
+        x = torch.ops.aten.permute.default(x, [1, 0])  # (1, 1)
+        x = torch.ops.aten.permute_copy.default(x, [0, 1])  # (1, 1)
+        x = torch.ops.aten.flip.default(x, [0])  # (1, 1)
+        y = torch.zeros_like(x)
+        x = torch.ops.aten.copy_.default(y, x)  # (1, 1)
+        x = torch.ops.aten.clone.default(x)  # (1, 1)
+        return x
+
+
+class CortexMOnlyInheritOps(torch.nn.Module):
+    ops_before_transforms = {
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2,
+    }
+
+    ops_after_transforms = {
+        "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
+    }
+
+    def forward(self, x):
+        return torch.permute(torch.clone(x), (0, 1, 3, 2))
+
+
+class CortexMQuantizeNonSupportedSub(torch.nn.Module):
+    ops_before_transforms = {}
+
+    ops_after_transforms = {}
+
+    def forward(self, x, y):
+        return y - x
+
+
+test_cases = {
+    "all_ops": McuTestCase(
+        CortexMInheritAllOps(),
+        (ramp_tensor(0, 10, (1, 3, 4, 5)),),
+    ),
+    "only_inherit_ops": McuTestCase(
+        CortexMOnlyInheritOps(),
+        (ramp_tensor(0, 10, (1, 3, 4, 5)),),
+    ),
+}
+
+
+@parametrize("test_case", test_cases)
+def test_inherit_int8_dtype(test_case):
+    """
+    Test that ops which do not change dynamic range are able to use int8 portable kernels.
+    """
+    tester = CortexMTester(test_case.model, test_case.example_inputs)
+    tester.test_dialect(
+        test_case.model.ops_before_transforms, test_case.model.ops_after_transforms
+    )
+
+    # Check that all nodes in the graph are in int8
+    artifact = tester.get_artifact()
+    for node in artifact.exported_program().module().graph.nodes:
+        if node.op != "call_function":
+            continue
+        if node.target == exir_ops.edge.cortex_m.dequantize_per_tensor.default:
+            continue
+
+        assert get_first_fake_tensor(node).dtype == torch.int8, f"{node.name}"
+
+
+test_cases = {
+    "sub": McuTestCase(
+        CortexMQuantizeNonSupportedSub(),
+        (ramp_tensor(0, 10, (1, 3, 4, 5)), ramp_tensor(0, 1, (1, 3, 4, 5))),
+    ),
+}
+
+
+@pytest.mark.xfail(
+    reason="Non-handled ops which change dynamic range are currently not supported."
+)
+@parametrize("test_case", test_cases)
+def test_quantize_unsupported_op(test_case):
+    """
+    Test an op which does change dynamic range and which is not supported by CMSIS-NN. Currently not supported.
+    """
+    tester = CortexMTester(test_case.model, test_case.example_inputs)
+    tester.test_dialect(
+        test_case.model.ops_before_transforms, test_case.model.ops_after_transforms
+    )

backends/cortex_m/test/ops/test_add.py

Lines changed: 6 additions & 0 deletions
@@ -163,6 +163,9 @@ class CortexMAlphaAdd(ModelAlpha):
         "Expecting kwargs for aten op IR to be empty - alpha arg not supported.",
         AssertionError,
     ),
+    "broadcast_1": "Mixed fp/quantized ops currently not supported.",
+    "broadcast_2": "Mixed fp/quantized ops currently not supported.",
+    "broadcast_3": "Mixed fp/quantized ops currently not supported.",
 }
 
 
@@ -187,6 +190,9 @@ def test_dialect_add(test_case):
         "Expecting kwargs for aten op IR to be empty - alpha arg not supported.",
         AssertionError,
     ),
+    "broadcast_1": "Mixed fp/quantized ops currently not supported.",
+    "broadcast_2": "Mixed fp/quantized ops currently not supported.",
+    "broadcast_3": "Mixed fp/quantized ops currently not supported.",
 }
 
 
backends/cortex_m/test/ops/test_mul.py

Lines changed: 3 additions & 0 deletions
@@ -127,6 +127,9 @@ class CortexMTensorMulBroadCast(Model):
 xfail_cases = {
     "self_scalar": "lift_constant_tensor_pass assumes fake tensors for scalars",
     "scalar_scalar": "lift_constant_tensor_pass assumes fake tensors for scalars",
+    "broadcast_1": "Mixed fp/quantized ops currently not supported.",
+    "broadcast_2": "Mixed fp/quantized ops currently not supported.",
+    "broadcast_3": "Mixed fp/quantized ops currently not supported.",
 }
 
 