Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .lintrunner.toml
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,8 @@ exclude_patterns = [
'**/*.gif',
'extension/llm/tokenizers',
'extension/llm/tokenizers/**',
'kernels/portable',
'kernels/quantized',
'examples/cuda',
# File contains @generated
'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h',
Expand Down
7 changes: 7 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,13 @@ endif()

add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable/cpu/util)

# Custom ops AOT needs Torch. Find it at root scope before processing the
# portable subdirectory so that imported targets (e.g. MKL::MKL) are created at
# root scope rather than in a child directory scope.
if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT)
find_package_torch()
endif()

if(EXECUTORCH_BUILD_PORTABLE_OPS)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable)
endif()
Expand Down
15 changes: 1 addition & 14 deletions backends/cadence/aot/ref_implementations.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@

# pyre-strict

from pathlib import Path
from typing import Callable, Optional, Protocol, TypeVar

import executorch.kernels.quantized # noqa: F401
import torch
import torch.nn as nn
import torch.nn.functional as F
Expand All @@ -18,19 +18,6 @@

m = Library("cadence", "IMPL", "CompositeExplicitAutograd")

try:
torch.ops.load_library("//executorch/kernels/quantized:custom_ops_generated_lib")
except (OSError, RuntimeError):

custom_libs: list[Path] = list(
Path(__file__)
.parent.parent.parent.resolve()
.glob("**/kernels/quantized/**/*custom_ops_generated_lib.*")
)
if custom_libs:
torch.ops.load_library(str(custom_libs[0]))
del Path

# Registry to track all ops with reference implementations
_REGISTERED_REF_IMPLEMENTATIONS: set[str] = set()

Expand Down
4 changes: 1 addition & 3 deletions exir/backend/test/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from typing import Dict, List

import executorch.exir as exir
import executorch.kernels.quantized # noqa: F401
import torch
from executorch.exir import to_edge
from executorch.exir.backend.backend_api import LoweredBackendModule, to_backend
Expand Down Expand Up @@ -987,9 +988,6 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
_ = to_backend(ep.exported_program, BadPartitioner())

def test_quantized_with_delegate(self) -> None:
torch.ops.load_library(
"//executorch/kernels/quantized:custom_ops_generated_lib"
)
qconfig_mapping = get_default_qconfig_mapping("qnnpack")
in_size = 2
input_size = 3
Expand Down
4 changes: 1 addition & 3 deletions exir/backend/test/test_backends_lifted.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from typing import Dict, List

import executorch.exir as exir
import executorch.kernels.quantized # noqa: F401
import torch
from executorch.exir import to_edge
from executorch.exir.backend.backend_api import LoweredBackendModule, to_backend
Expand Down Expand Up @@ -991,9 +992,6 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
_ = ep.to_backend(BadPartitioner())

def test_quantized_with_delegate(self) -> None:
torch.ops.load_library(
"//executorch/kernels/quantized:custom_ops_generated_lib"
)
qconfig_mapping = get_default_qconfig_mapping("qnnpack")
in_size = 2
input_size = 3
Expand Down
17 changes: 1 addition & 16 deletions exir/tests/test_memory_planning.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from typing import Any, Callable, List, Optional, Tuple, Type

import executorch.exir as exir
import executorch.kernels.portable # noqa: F401
import torch
from executorch.exir import ExecutorchBackendConfig, to_edge
from executorch.exir.capture._capture import patch_forward
Expand Down Expand Up @@ -59,22 +60,6 @@
from torch.nn import functional as F
from torch.utils import _pytree as pytree

try:
torch.ops.load_library("//executorch/kernels/portable:custom_ops_generated_lib")
except (OSError, RuntimeError):
# When running outside of Buck (e.g., CMake/pip), find the shared library
# by globbing relative to the kernels/portable directory.
from pathlib import Path

_libs = list(
Path(__file__)
.parent.parent.parent.resolve()
.glob("**/kernels/portable/**/*custom_ops_generated_lib.*")
)
if _libs:
torch.ops.load_library(str(_libs[0]))
del Path


def swap_modules(
module: torch.nn.Module,
Expand Down
4 changes: 1 addition & 3 deletions exir/tests/test_quantization.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import unittest

import executorch.kernels.quantized # noqa: F401
import torch
import torchvision

Expand All @@ -32,9 +33,6 @@
prepare_pt2e,
)

# load executorch out variant ops
torch.ops.load_library("//executorch/kernels/quantized:custom_ops_generated_lib")


class TestQuantization(unittest.TestCase):
"""prepare_pt2e and convert_pt2e are OSS APIs, the rest are all meta-only
Expand Down
37 changes: 37 additions & 0 deletions kernels/portable/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,40 @@ install(
PUBLIC_HEADER
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/executorch/kernels/portable/
)

# Build the portable custom ops AOT library for registering custom ops into
# PyTorch. Requires find_package(Torch), which must be called at root scope
# before this subdirectory is processed. Not targeting ARM_BAREMETAL as aot_lib
# depends on incompatible libraries
if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT AND NOT EXECUTORCH_BUILD_ARM_BAREMETAL)
# Kernel schema file for the custom ops registered by this library.
set(_custom_ops_yaml "${CMAKE_CURRENT_SOURCE_DIR}/custom_ops.yaml")
# Selective build: only the allclose root ops are registered into PyTorch
# through this AOT library.
set(_portable_custom_ops "aten::allclose.out" "aten::allclose.Tensor")
gen_selected_ops(
LIB_NAME "portable_custom_ops_aot_lib" ROOT_OPS ${_portable_custom_ops}
)
# Generate the registration glue from the YAML schema for the selected ops.
generate_bindings_for_kernels(
LIB_NAME "portable_custom_ops_aot_lib" CUSTOM_OPS_YAML
"${_custom_ops_yaml}"
)
# Kernel implementations compiled into the AOT library. tensor_util_aten.cpp
# is needed because the AOT build runs against at::Tensor rather than the
# ExecuTorch runtime tensor.
set(_portable_custom_ops_sources
"${CMAKE_CURRENT_SOURCE_DIR}/cpu/op_allclose.cpp"
"${EXECUTORCH_ROOT}/runtime/core/exec_aten/util/tensor_util_aten.cpp"
)
gen_custom_ops_aot_lib(
LIB_NAME "portable_custom_ops_aot_lib" KERNEL_SOURCES
"${_portable_custom_ops_sources}"
)
target_include_directories(
portable_custom_ops_aot_lib PRIVATE ${_common_include_directories}
)

# Point the runtime search path at the pybindings extension directory so the
# library can resolve its dependencies when loaded from an installed wheel.
# @loader_path is the macOS equivalent of ELF's $ORIGIN.
if(APPLE)
set(RPATH "@loader_path/../../extensions/pybindings")
else()
set(RPATH "$ORIGIN/../../extensions/pybindings")
endif()
set_target_properties(
portable_custom_ops_aot_lib PROPERTIES BUILD_RPATH ${RPATH} INSTALL_RPATH
${RPATH}
)
endif()
23 changes: 23 additions & 0 deletions kernels/portable/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

try:
    from pathlib import Path

    # Locate the AOT custom-ops shared library that the CMake build places
    # somewhere under this package directory (extension varies by platform:
    # .so / .dylib / .pyd), hence the wildcard suffix in the glob.
    libs = list(
        Path(__file__).parent.resolve().glob("**/*portable_custom_ops_aot_lib.*")
    )
    del Path
    # More than one match would make the choice of library ambiguous; zero
    # matches means the AOT lib was not built. Either way, fall through to
    # the except branch below.
    assert len(libs) == 1, f"Expected 1 library but got {len(libs)}"
    import torch as _torch

    # Pass a str for consistency with the other torch.ops.load_library call
    # sites in this repo (torch also accepts path-like objects).
    _torch.ops.load_library(str(libs[0]))
    del _torch
except Exception:  # noqa: BLE001
    # Best-effort load: the library being absent (e.g. a build without
    # EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) must not break importing this
    # package. Catch Exception rather than a bare `except:` so that
    # SystemExit/KeyboardInterrupt still propagate.
    import logging

    logging.info("portable_custom_ops_aot_lib is not loaded")
    del logging
3 changes: 2 additions & 1 deletion kernels/quantized/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Kernel library for quantized operators. Please this file formatted by running:
# Kernel library for quantized operators. Please keep this file formatted by
# running:
# ~~~
# cmake-format -i CMakeLists.txt
# ~~~
Expand Down
1 change: 1 addition & 0 deletions src/executorch/kernels/portable
3 changes: 1 addition & 2 deletions test/end2end/register_scratch_meta_fns.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,14 @@
"""
from typing import Dict, Optional

import executorch.kernels.portable # noqa: F401
import torch
from executorch.exir.operator.manip import (
attach_get_scratch_metas_fn,
ScratchTensorMetadata,
)
from executorch.exir.tensor import TensorSpec

torch.ops.load_library("//executorch/kernels/portable:custom_ops_generated_lib")


@attach_get_scratch_metas_fn(torch.ops.aten.linear.out)
def linear_out_get_scratch_metas(
Expand Down
4 changes: 4 additions & 0 deletions tools/cmake/preset/default.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ define_overridable_option(
EXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT
"Build the optimized ops library for AOT export usage" BOOL OFF
)
define_overridable_option(
EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT
"Build the portable custom ops library for AOT export usage" BOOL OFF
)
define_overridable_option(
EXECUTORCH_BUILD_EXTENSION_ASR_RUNNER "Build the ASR runner extension" BOOL
OFF
Expand Down
1 change: 1 addition & 0 deletions tools/cmake/preset/pybind.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
set_overridable_option(EXECUTORCH_BUILD_PYBIND ON)
set_overridable_option(EXECUTORCH_BUILD_KERNELS_QUANTIZED ON)
set_overridable_option(EXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT ON)
set_overridable_option(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT ON)
# Enable logging even when in release mode. We are building for desktop, where
# saving a few kB is less important than showing useful error information to
# users.
Expand Down
Loading