pytorch · psiddh · Mar 3, 2026 · Jan 30, 2026
@@ -71,7 +71,7 @@ def _get_batch_size_from_conv(self, conv_node: torch.fx.Node):
 
     def _get_linear_replacement(self, node):
         """
-         Let
+        Let
         - yi be the output activations (y1, ... yn)
         - xj be the input activations (x1, ... xm)
         - wij be the weights (w11, ... wnm)

@@ -36,11 +36,12 @@
 from executorch.backends.arm.util._factory import create_partitioner, create_quantizer
 
 from executorch.backends.arm.vgf import VgfCompileSpec
+from executorch.backends.cortex_m.passes.cortex_m_pass_manager import CortexMPassManager
 
-# To use Cortex-M backend
 from executorch.backends.cortex_m.passes.replace_quant_nodes_pass import (
     ReplaceQuantNodesPass,
 )
+from executorch.backends.cortex_m.quantizer.quantizer import CortexMQuantizer
 
 from executorch.devtools import generate_etrecord
 from executorch.devtools.backend_debug import get_delegation_info
@@ -396,6 +397,7 @@ def forward(self, x):
     "TOSA-1.0+INT",
     "TOSA-1.0+FP",
     "TOSA-1.0+INT+int16",
+    "cortex-m55+int8",
 ]
 
 
@@ -528,7 +530,7 @@ def get_args():
         required=False,
         default="ethos-u55-128",
         choices=TARGETS,
-        help=f"For ArmBackend delegated models, pick the target, and therefore the instruction set generated. valid targets are {TARGETS}",
+        help=f"Target backend. For delegated models: Ethos-U/VGF/TOSA variants. For non-delegated: cortex-m55+int8 (CMSIS-NN portable kernels). Valid targets: {TARGETS}",
     )
     parser.add_argument(
         "-e",
@@ -795,6 +797,75 @@ def to_edge_TOSA_delegate(
     return model_quant, edge
 
 
+def to_edge_cortex_m(
+    exported_program: ExportedProgram,
+    args,
+    model: GraphModule,
+    example_inputs: Tuple[torch.Tensor],
+):
+    """Cortex-M/CMSIS-NN compilation path with no delegation."""
+    logging.info("Using Cortex-M/CMSIS-NN compilation path (no delegation)")
+
+    def _to_channels_last(x):
+        if isinstance(x, torch.Tensor):
+            if x.dim() == 4 and not x.is_contiguous(memory_format=torch.channels_last):
+                logging.warning(
+                    "Converting input tensor with shape %s to channels_last",
+                    list(x.shape),
+                )
+                return x.to(memory_format=torch.channels_last)
+            return x
+        elif isinstance(x, tuple):
+            return tuple(_to_channels_last(t) for t in x)
+        return x
+
+    if not args.quantize:
+        logging.warning(
+            "Quantization is DISABLED. Cortex-M typically requires quantization."
+        )
+    else:
+        model = model.to(memory_format=torch.channels_last)
+        example_inputs = tuple(_to_channels_last(x) for x in example_inputs)
+
+        quantizer = CortexMQuantizer()
+        prepared = prepare_pt2e(model, quantizer)
+
+        dataset = get_calibration_data(
+            args.model_name, example_inputs, args.evaluate, args.evaluate_config
+        )
+
+        if isinstance(dataset, DataLoader):
+            for sample, _ in dataset:
+                prepared(_to_channels_last(sample))
+        else:
+            prepared(*tuple(_to_channels_last(x) for x in dataset))
+
+        model_quant = convert_pt2e(prepared)
+
+        exported_program = torch.export.export(
+            model_quant, example_inputs, strict=args.strict_export
+        )
+
+    edge = to_edge_transform_and_lower(
+        exported_program,
+        compile_config=EdgeCompileConfig(
+            preserve_ops=[
+                torch.ops.aten.linear.default,
+                torch.ops.aten.hardsigmoid.default,
+                torch.ops.aten.hardsigmoid_.default,
+                torch.ops.aten.hardswish.default,
+                torch.ops.aten.hardswish_.default,
+            ],
+            _check_ir_validity=False,
+        ),
+    )
+
+    pass_manager = CortexMPassManager(edge.exported_program())
+    edge._edge_programs["forward"] = pass_manager.transform()
+
+    return model_quant if args.quantize else None, edge
+
+
 def to_edge_no_delegate(
     exported_program: ExportedProgram,
     args,
@@ -873,7 +944,24 @@ def to_edge_no_delegate(
 
     # Quantize if required
     model_quant = None
-    if args.delegate:
+    if args.target == "cortex-m55+int8":
+        # Cortex-M path: CMSIS-NN portable kernels, no delegation
+        if getattr(args, "evaluate", False):
+            logging.error(
+                "--evaluate is not supported for target 'cortex-m55+int8' "
+                "because this path does not use a TOSA delegate."
+            )
+            sys.exit(1)
+        if args.delegate:
+            logging.warning(
+                "--delegate is ignored for target 'cortex-m55+int8' "
+                "(this target does not use delegated ops)."
+            )
+            args.delegate = False
+        model_quant, edge = to_edge_cortex_m(
+            exported_program, args, model, example_inputs
+        )
+    elif args.delegate:
         model_quant, edge = to_edge_TOSA_delegate(
             exported_program, args, model, example_inputs
         )