
Commit bd19d6d

desertfire authored and pytorchmergebot committed
[AOTI] Use torchgen to generate C shim functions (pytorch#120513)
Summary: The current C shim layer manually implements a C interface for a handful of ops. That approach obviously doesn't scale if we want to cover all aten ops. This new torchgen script automatically generates C shim interfaces for the CPU and CUDA backends. The interface follows the same parameter-passing rules as the current C shim layer:

* Use plain C data types to pass parameters
* Use AtenTensorHandle to pass at::Tensor
* Use a pointer type to pass an optional parameter
* Use pointer+length to pass a list
* Use device_type+device_index to pass a device
* When a parameter is a pointer of pointer, e.g. AtenTensorHandle**, the script generates either a list of optional values or an optional list of values

https://gist.github.com/desertfire/83701532b126c6d34dae6ba68a1b074a is an example of the generated torch/csrc/inductor/aoti_torch/generated/c_shim_cuda.cpp file. The current version doesn't generate C shim wrappers for every aten op, and on the other hand probably generates more wrappers than needed, but it should serve as a good basis.

This PR by itself doesn't change AOTI codegen and thus won't introduce any FC breakage. The actual wrapper codegen changes will come in a follow-up PR gated by a version-control flag to avoid FC breakage.

Differential Revision: [D54258087](https://our.internmc.facebook.com/intern/diff/D54258087)

Pull Request resolved: pytorch#120513
Approved by: https://github.com/jansel
1 parent ffe45a8 commit bd19d6d
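
To make the parameter-passing rules concrete, below is a minimal, self-contained sketch of the kind of C declarations such a shim exposes. Everything in it (the type aliases, the function names, and the chosen argument lists) is a hypothetical stand-in, not the actual generated interface; the real output is the gist linked above.

```cpp
// Illustrative sketch only -- all names are hypothetical stand-ins chosen to
// show how the parameter-passing rules map onto a C ABI.
#include <cstdint>

struct AtenTensorOpaque;                     // opaque stand-in for at::Tensor
using AtenTensorHandle = AtenTensorOpaque*;  // tensors cross the ABI as handles
using AOTITorchError = int32_t;              // plain C error-code return type

extern "C" {

// Tensor arguments -> AtenTensorHandle; Scalar arguments -> plain C doubles;
// the result tensor comes back through an out-pointer handle.
AOTITorchError aoti_torch_cpu_addmm_example(
    AtenTensorHandle self,
    AtenTensorHandle mat1,
    AtenTensorHandle mat2,
    double beta,
    double alpha,
    AtenTensorHandle* ret0);

// int[] -> pointer + length; float? -> nullable pointer (nullptr means nullopt);
// Device -> device_type + device_index as plain 32-bit ints.
AOTITorchError aoti_torch_cpu_strided_op_example(
    AtenTensorHandle self,
    const int64_t* stride,
    int64_t stride_len_,
    double* maybe_scale,
    int32_t device_type,
    int32_t device_index,
    AtenTensorHandle* ret0);

// Tensor?[] -> pointer of pointer + length, i.e. a list of optional tensors
// where a null inner pointer means None at that position.
AOTITorchError aoti_torch_cpu_index_example(
    AtenTensorHandle self,
    const AtenTensorHandle** indices,
    int64_t indices_len_,
    AtenTensorHandle* ret0);

}  // extern "C"
```

The pointer-of-pointer case is the one called out in the summary: depending on the op's schema, the generator treats AtenTensorHandle** either as a list of optional values or as an optional list of values.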


6 files changed (+611, -8 lines)


.gitignore (+1)

@@ -86,6 +86,7 @@ torch/csrc/api/include/torch/version.h
 torch/csrc/cudnn/cuDNN.cpp
 torch/csrc/generated
 torch/csrc/generic/TensorMethods.cpp
+torch/csrc/inductor/aoti_torch/generated/*
 torch/csrc/jit/generated/*
 torch/csrc/jit/fuser/config.h
 torch/csrc/nn/THCUNN.cpp

caffe2/CMakeLists.txt (+8)

@@ -368,6 +368,7 @@ if(NOT INTERN_DISABLE_AUTOGRAD AND NOT BUILD_LITE_INTERPRETER)
     "${TORCH_SRC_DIR}/csrc/autograd/generated/TraceType_4.cpp"
     "${TORCH_SRC_DIR}/csrc/autograd/generated/ADInplaceOrViewType_0.cpp"
     "${TORCH_SRC_DIR}/csrc/autograd/generated/ADInplaceOrViewType_1.cpp"
+    "${TORCH_SRC_DIR}/csrc/inductor/aoti_torch/generated/c_shim_cpu.cpp"
   )
   if(BUILD_LAZY_TS_BACKEND)
     list(APPEND GENERATED_CXX_TORCH
@@ -422,12 +423,17 @@ set(GENERATED_TESTING_PYTHON
   "${TORCH_SRC_DIR}/testing/_internal/generated/annotated_fn_args.py"
 )
 
+set(GENERATED_CXX_TORCH_CUDA
+  "${TORCH_SRC_DIR}/csrc/inductor/aoti_torch/generated/c_shim_cuda.cpp"
+)
+
 set(TORCH_GENERATED_CODE
   ${GENERATED_CXX_TORCH}
   ${GENERATED_H_TORCH}
   ${GENERATED_CXX_PYTHON}
   ${GENERATED_H_PYTHON}
   ${GENERATED_TESTING_PYTHON}
+  ${GENERATED_CXX_TORCH_CUDA}
 )
 
 set(GEN_PER_OPERATOR_FLAG)
@@ -970,6 +976,7 @@ endif()
 # Compile exposed libraries.
 if(USE_ROCM)
   set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE)
+  list(APPEND Caffe2_HIP_SRCS ${GENERATED_CXX_TORCH_CUDA})
   hip_add_library(torch_hip ${Caffe2_HIP_SRCS})
   if(USE_FLASH_ATTENTION)
     target_link_libraries(torch_hip PRIVATE __caffe2_oort)
@@ -988,6 +995,7 @@ if(USE_ROCM)
   endif()
 elseif(USE_CUDA)
   set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE)
+  list(APPEND Caffe2_GPU_SRCS ${GENERATED_CXX_TORCH_CUDA})
   if(CUDA_SEPARABLE_COMPILATION)
     # Separate compilation fails when kernels using `thrust::sort_by_key`
     # are linked with the rest of CUDA code. Workaround by linking them separately.

setup.py (+1)

@@ -1250,6 +1250,7 @@ def main():
        "include/torch/csrc/inductor/aoti_runtime/*.h",
        "include/torch/csrc/inductor/aoti_torch/*.h",
        "include/torch/csrc/inductor/aoti_torch/c/*.h",
+       "include/torch/csrc/inductor/aoti_torch/generated/*.h",
        "include/torch/csrc/jit/*.h",
        "include/torch/csrc/jit/backends/*.h",
        "include/torch/csrc/jit/generated/*.h",

torch/csrc/inductor/aoti_torch/utils.h (+105)

@@ -1,7 +1,13 @@
 #pragma once
 
+#include <ATen/Tensor.h>
+#include <ATen/core/List.h>
+#include <c10/core/DeviceType.h>
+#include <c10/core/SymIntArrayRef.h>
+#include <c10/util/ArrayRef.h>
 #include <c10/util/Logging.h>
 #include <c10/util/Optional.h>
+#include <c10/util/OptionalArrayRef.h>
 #include <torch/csrc/inductor/aoti_torch/c/shim.h>
 #include <torch/csrc/inductor/aoti_torch/tensor_converter.h>
 
@@ -18,6 +24,8 @@
   return AOTI_TORCH_SUCCESS;
 
 namespace torch::aot_inductor {
+
+// utility functions to convert a pointer to an optional value
 template <class T>
 inline c10::optional<T> pointer_to_optional(T* ptr) {
   return ptr ? c10::make_optional(*ptr) : c10::nullopt;
@@ -34,4 +42,101 @@ inline c10::optional<at::Tensor> pointer_to_optional(AtenTensorHandle* ptr)
              : c10::nullopt;
 }
 
+template <>
+inline c10::optional<at::Tensor> pointer_to_optional(
+    const AtenTensorHandle* ptr) {
+  return ptr ? c10::make_optional(*tensor_handle_to_tensor_pointer(*ptr))
+             : c10::nullopt;
+}
+
+inline c10::optional<c10::Device> pointer_to_optional_device(
+    int32_t* device_type,
+    int32_t device_index) {
+  return device_type ? c10::make_optional(c10::Device(
+                           static_cast<c10::DeviceType>(*device_type),
+                           static_cast<c10::DeviceIndex>(device_index)))
+                     : c10::nullopt;
+}
+
+// utility functions to convert a pointer to a list
+template <typename T>
+struct is_optional : std::false_type {};
+template <typename T>
+struct is_optional<c10::optional<T>> : std::true_type {};
+
+template <class T>
+inline c10::ArrayRef<T> pointer_to_list(T* ptr, int64_t len) {
+  return c10::ArrayRef<T>(ptr, len);
+}
+
+template <
+    class T,
+    class U,
+    typename = std::enable_if_t<!std::is_same_v<T, U>>,
+    typename = std::enable_if_t<!is_optional<T>::value>>
+inline std::vector<T> pointer_to_list(U* ptr, int64_t len) {
+  // std::vector<T> will be implicitly converted to c10::ArrayRef<T> at the call
+  // site
+  std::vector<T> result;
+  result.reserve(len);
+  for (int64_t i = 0; i < len; i++) {
+    result.emplace_back(T(ptr[i]));
+  }
+  return result;
+}
+
+template <class T, class U, typename = std::enable_if_t<is_optional<T>::value>>
+inline std::vector<T> pointer_to_list(U** ptr, int64_t len) {
+  // Here U** denotes a list of optional arguments
+  // std::vector<T> will be implicitly converted to c10::ArrayRef<T> at the call
+  // site
+  std::vector<T> result;
+  result.reserve(len);
+  for (int64_t i = 0; i < len; i++) {
+    result.emplace_back(pointer_to_optional(ptr[i]));
+  }
+  return result;
+}
+
+template <>
+inline std::vector<at::Tensor> pointer_to_list(
+    const AtenTensorHandle* ptr,
+    int64_t len) {
+  std::vector<at::Tensor> result;
+  result.reserve(len);
+  for (int64_t i = 0; i < len; i++) {
+    result.emplace_back(*tensor_handle_to_tensor_pointer(ptr[i]));
+  }
+  return result;
+}
+
+template <>
+inline std::vector<c10::optional<at::Tensor>> pointer_to_list(
+    const AtenTensorHandle** ptr,
+    int64_t len) {
+  std::vector<c10::optional<at::Tensor>> result;
+  result.reserve(len);
+  for (int64_t i = 0; i < len; i++) {
+    result.emplace_back(pointer_to_optional<at::Tensor>(ptr[i]));
+  }
+  return result;
+}
+
+template <int N>
+inline std::array<bool, N> pointer_to_list(const int32_t* ptr) {
+  std::array<bool, N> result;
+  std::copy(ptr, ptr + N, result.begin());
+  return result;
+}
+
+// utility functions to convert a pointer to a list of optional values
+template <class T, class U>
+inline c10::optional<c10::ArrayRef<T>> pointer_to_optional_list(
+    U** ptr,
+    int64_t len) {
+  return ptr
+      ? c10::make_optional<c10::ArrayRef<T>>(pointer_to_list<T>(*ptr, len))
+      : c10::nullopt;
+}
+
 } // namespace torch::aot_inductor
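
As a usage note, the sketch below shows how the new helpers translate the C-side calling convention back into c10 types, assuming it is compiled inside the PyTorch source tree where this header is available. The demo function and its values are illustrative and not part of the commit.

```cpp
// Illustrative usage of the conversion helpers in utils.h (not commit code):
// nullable pointers become c10::optional, and pointer+length pairs become
// c10::ArrayRef views.
#include <cstdint>
#include <torch/csrc/inductor/aoti_torch/utils.h>

namespace shim_helper_demo {

using namespace torch::aot_inductor;

inline void demo() {
  // Optional parameter: a null pointer means c10::nullopt.
  double alpha = 2.0;
  c10::optional<double> maybe_alpha = pointer_to_optional(&alpha);
  c10::optional<double> no_alpha = pointer_to_optional<double>(nullptr);

  // List parameter: pointer + length becomes a non-owning ArrayRef view.
  int64_t sizes[] = {2, 3, 4};
  c10::ArrayRef<int64_t> sizes_ref = pointer_to_list(sizes, 3);

  // Optional list parameter: pointer of pointer, null outer pointer == nullopt.
  int64_t* sizes_ptr = sizes;
  c10::optional<c10::ArrayRef<int64_t>> maybe_sizes =
      pointer_to_optional_list<int64_t>(&sizes_ptr, 3);

  // Optional device: device_type pointer plus a device index; the integer 1 is
  // assumed here to correspond to c10::DeviceType::CUDA.
  int32_t cuda_type = 1;
  c10::optional<c10::Device> dev = pointer_to_optional_device(&cuda_type, 0);

  (void)maybe_alpha; (void)no_alpha; (void)sizes_ref; (void)maybe_sizes; (void)dev;
}

} // namespace shim_helper_demo
```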

torchgen/gen.py (+65, -8)

@@ -44,6 +44,11 @@
     with_native_function,
     with_native_function_and_indices,
 )
+from torchgen.gen_aoti_c_shim import (
+    gen_aoti_c_shim,
+    gen_static_dispatch_backend_call_signature,
+    get_backend_index_for_aoti,
+)
 from torchgen.gen_functionalization_type import (
     gen_functionalization_definition,
     gen_functionalization_registration,
@@ -416,14 +421,7 @@ def generate_static_dispatch_backend_call(
     f: NativeFunction,
     backend_index: BackendIndex,
 ) -> str:
-    cpp_sigs = CppSignatureGroup.from_native_function(
-        f, method=False, fallback_binding=False
-    )
-    if sig.symint and f.func.has_symint():
-        cpp_sig = cpp_sigs.symint_signature
-    else:
-        cpp_sig = cpp_sigs.signature
-    assert cpp_sig is not None
+    cpp_sig = gen_static_dispatch_backend_call_signature(sig, f)
     name = cpp_sig.name()
     exprs = translate_args(sig, cpp_sig)
     backend_metadata = backend_index.get_kernel(f)
@@ -2181,6 +2179,7 @@ def gen_source_files(
     selector: SelectiveBuilder,
     static_dispatch_idx: List[BackendIndex],
     backend_indices: Dict[DispatchKey, BackendIndex],
+    aoti_fm: FileManager,
     core_fm: FileManager,
     cpu_fm: FileManager,
     cpu_vec_fm: FileManager,
@@ -2350,6 +2349,60 @@ def operator_headers() -> List[str]:
        else:
            raise AssertionError(f"unrecognized {dispatch_key} for ufunc")
 
+        if dispatch_key in (DispatchKey.CPU, DispatchKey.CUDA):
+
+            def get_header(
+                f: NativeFunction,
+            ) -> Optional[str]:
+                backend_index = get_backend_index_for_aoti(
+                    f, dispatch_key, backend_indices
+                )
+                return (
+                    None
+                    if backend_index is None
+                    else f"#include <ATen/ops/{f.root_name}_{backend_index.dispatch_key.lower()}_dispatch.h>"
+                )
+
+            def headers_for_aoti() -> str:
+                headers = []
+                for g in grouped_native_functions:
+                    if isinstance(g, NativeFunctionsGroup):
+                        for f in g.functions():
+                            # some variants are registered in the backend, but some are registered as CompositeExplicitAutograd
+                            header = get_header(f)
+                            if header is not None:
+                                headers.append(header)
+                    else:
+                        header = get_header(g)
+                        if header is not None:
+                            headers.append(header)
+                return "\n".join(sorted(set(headers)))
+
+            extra_headers = (
+                extra_cuda_headers if is_cuda_dispatch_key(dispatch_key) else ""
+            )
+
+            aoti_fm.write(
+                f"c_shim_{dispatch_key.lower()}.h",
+                lambda: gen_aoti_c_shim(
+                    native_functions,
+                    dispatch_key,
+                    backend_indices,
+                    header=True,
+                    includes="",
+                ),
+            )
+            aoti_fm.write(
+                f"c_shim_{dispatch_key.lower()}.cpp",
+                lambda: gen_aoti_c_shim(
+                    native_functions,
+                    dispatch_key,
+                    backend_indices,
+                    header=False,
+                    includes=headers_for_aoti() + "\n" + extra_headers,
+                ),
+            )
+
        del fm
 
    # BackendSelect is generated specially
@@ -2783,6 +2836,9 @@ def main() -> None:
    cpu_vec_fm = make_file_manager(options=options)
    cuda_fm = make_file_manager(options=options)
    ops_fm = make_file_manager(options=options, install_dir=ops_install_dir)
+   aoti_fm = make_file_manager(
+       options=options, install_dir="torch/csrc/inductor/aoti_torch/generated"
+   )
 
    # Only a limited set of dispatch keys get CPUFunctions.h headers generated
    # for them; this is the set
@@ -2825,6 +2881,7 @@ def main() -> None:
        selector=selector,
        static_dispatch_idx=static_dispatch_idx,
        backend_indices=backend_indices,
+       aoti_fm=aoti_fm,
        core_fm=core_fm,
        cpu_fm=cpu_fm,
        cpu_vec_fm=cpu_vec_fm,
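
For orientation, the two aoti_fm.write() calls above emit a header/source pair per backend: c_shim_{backend}.h (header=True) holds declarations, and c_shim_{backend}.cpp (header=False) holds definitions whose #includes come from headers_for_aoti(). The sketch below mimics that shape for a single hypothetical op; the op, the stand-in backend kernel, and the try/catch error handling are assumptions made so the snippet is self-contained, while the real generated files (see the gist linked in the summary) use the shim's own utilities and the per-operator ATen dispatch headers instead.

```cpp
// Hypothetical sketch of the generated header/source shape; every name below
// is a local stand-in declared here so the snippet compiles on its own.
#include <cstdint>
#include <exception>

struct AtenTensorOpaque;
using AtenTensorHandle = AtenTensorOpaque*;
using AOTITorchError = int32_t;
constexpr AOTITorchError AOTI_TORCH_SUCCESS = 0;
constexpr AOTITorchError AOTI_TORCH_FAILURE = 1;

// Stand-in for the backend kernel a real wrapper dispatches to, normally pulled
// in via an "#include <ATen/ops/<op>_<backend>_dispatch.h>" line collected by
// headers_for_aoti().
inline void backend_fill_kernel(AtenTensorHandle /*self*/, double /*value*/) {}

// What c_shim_cpu.h would carry (header=True): the declaration only.
extern "C" AOTITorchError aoti_torch_cpu_fill_example(
    AtenTensorHandle self,
    double value);

// What c_shim_cpu.cpp would carry (header=False): the definition, which
// converts C ABI arguments, calls the backend kernel, and turns any C++
// exception into a plain error code (the real shim uses its own macro for this).
extern "C" AOTITorchError aoti_torch_cpu_fill_example(
    AtenTensorHandle self,
    double value) {
  try {
    backend_fill_kernel(self, value);  // real code converts handle -> at::Tensor first
    return AOTI_TORCH_SUCCESS;
  } catch (const std::exception&) {
    return AOTI_TORCH_FAILURE;
  }
}
```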
