Add custom kernels AddSharedInput, MulSharedInput (#734)

xadupre · web-flow · commit 1e8c1211a5e4 · 2024-06-05T10:42:22.000+02:00
* Add custom kernel AddSharedInput, MulSharedInput

* fix compilation

* compilation issue

* fix unit test
diff --git a/operators/cuda/add_mul.h b/operators/cuda/add_mul.h
@@ -0,0 +1,46 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#include "ocos.h"
+#include "add_mul_impl.cuh"
+#include "ortx_common.h"
+
+namespace contrib {
+
+/**
+ * Custom CUDA op producing two element-wise results that share their first operand:
+ * (A op B) and (A op C), where op is addition when `addition` is true and
+ * multiplication otherwise.
+ *
+ * @tparam T         element type of all inputs and outputs
+ * @tparam addition  true selects addition, false selects multiplication
+ */
+template <typename T, bool addition>
+struct AddOrMulSharedInput {
+  // No attribute is read from the model; a default-constructed status is returned.
+  template <typename TDict>
+  OrtxStatus OnModelAttach(const TDict& /*dict*/) {
+    return {};
+  }
+
+  /**
+   * Allocates both outputs and launches the shared-input kernel on the context's CUDA stream.
+   *
+   * @param ctx        kernel context providing the CUDA stream
+   * @param tensor_a   operand shared by both results
+   * @param tensor_b   second operand of the first result
+   * @param tensor_c   second operand of the second result
+   * @param output_ab  receives A op B
+   * @param output_ac  receives A op C
+   * @return a default-constructed OrtxStatus on every path
+   */
+  OrtxStatus Compute(Ort::Custom::CUDAKernelContext* ctx,
+                     const ortc::Tensor<T>& tensor_a,
+                     const ortc::Tensor<T>& tensor_b,
+                     const ortc::Tensor<T>& tensor_c,
+                     ortc::Tensor<T>& output_ab,
+                     ortc::Tensor<T>& output_ac) const {
+    const T* input_data_a = tensor_a.Data();
+    const T* input_data_b = tensor_b.Data();
+    const T* input_data_c = tensor_c.Data();
+
+    auto length_a = tensor_a.NumberOfElement();
+    auto length_b = tensor_b.NumberOfElement();
+    auto length_c = tensor_c.NumberOfElement();
+
+    // Each output takes the shape of whichever of its two operands holds more elements
+    // (the second operand wins a tie).
+    T* output_data_ab = output_ab.Allocate(length_a <= length_b ? tensor_b.Shape() : tensor_a.Shape());
+    // The A-op-C result lives in its own tensor and must be allocated through output_ac.
+    T* output_data_ac = output_ac.Allocate(length_a <= length_c ? tensor_c.Shape() : tensor_a.Shape());
+
+    // Nothing to launch when any input has no data buffer.
+    if (nullptr == input_data_a || nullptr == input_data_b || nullptr == input_data_c) {
+      return {};
+    }
+    // NOTE(review): the cudaError_t returned by the launcher is discarded here.
+    LaunchAddOrMulSharedInputKernel<T>(reinterpret_cast<cudaStream_t>(ctx->GetCudaStream()),
+                                       input_data_a, input_data_b, input_data_c,
+                                       output_data_ab, output_data_ac,
+                                       length_a, length_b, length_c,
+                                       addition);
+    return {};
+  }
+};
+
+}  // namespace contrib
diff --git a/operators/cuda/add_mul_impl.cu b/operators/cuda/add_mul_impl.cu
@@ -0,0 +1,121 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "device_prop.cuh"
+#include "utils.cuh"
+#include "add_mul_impl.cuh"
+#include "cuda_type.h"
+
+#ifndef CUDA_LONG
+#define CUDA_LONG int32_t
+#endif
+
+using namespace Ort::Custom;
+
+// float overload: stores a + b through `ab` and a + c through `ac`.
+__device__ __forceinline__ void _add3_op(float* ab, float* ac, const float a, const float b, const float c) {
+  ab[0] = a + b;
+  ac[0] = a + c;
+}
+
+// half overload: stores a + b through `ab` and a + c through `ac`.
+// When __CUDA_ARCH__ is at least 700 the half operator+ is used directly; otherwise the
+// operands are converted to float, added, and converted back to half.
+__device__ __forceinline__ void _add3_op(half* ab, half* ac, const half a, const half b, const half c) {
+#if __CUDA_ARCH__ >= 700
+  ab[0] = a + b;
+  ac[0] = a + c;
+#else
+  ab[0] = __float2half(__half2float(a) + __half2float(b));
+  ac[0] = __float2half(__half2float(a) + __half2float(c));
+#endif
+}
+
+// float overload: stores a * b through `ab` and a * c through `ac`.
+__device__ __forceinline__ void _mul3_op(float* ab, float* ac, const float a, const float b, const float c) {
+  ab[0] = a * b;
+  ac[0] = a * c;
+}
+
+// half overload: stores a * b through `ab` and a * c through `ac`.
+// When __CUDA_ARCH__ is at least 700 the half operator* is used directly; otherwise the
+// operands are converted to float, multiplied, and converted back to half.
+__device__ __forceinline__ void _mul3_op(half* ab, half* ac, const half a, const half b, const half c) {
+#if __CUDA_ARCH__ >= 700
+  ab[0] = a * b;
+  ac[0] = a * c;
+#else
+  ab[0] = __float2half(__half2float(a) * __half2float(b));
+  ac[0] = __float2half(__half2float(a) * __half2float(c));
+#endif
+}
+
+// Device functor forwarding to the _mul3_op overload for T:
+// writes a * b through `ab` and a * c through `ac`.
+template <typename T>
+struct Mul3SharedOp {
+  __device__ __forceinline__ void operator()(T* ab, T* ac,
+                                             const T a, const T b, const T c) const {
+    return _mul3_op(ab, ac, a, b, c);
+  }
+};
+
+// Device functor forwarding to the _add3_op overload for T:
+// writes a + b through `ab` and a + c through `ac`.
+template <typename T>
+struct Add3SharedOp {
+  __device__ __forceinline__ void operator()(T* ab, T* ac,
+                                             const T a, const T b, const T c) const {
+    return _add3_op(ab, ac, a, b, c);
+  }
+};
+
+// Applies `func` to up to NumElementsPerThread output positions per thread.
+//
+// A thread starts at NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x and
+// advances by NumThreadsPerBlock after each processed element; positions at or beyond N are
+// not touched. Each input is read at `position % count` (nA, nB, nC), so an input with fewer
+// elements than N is reused cyclically.
+template <typename T, typename TFunc, int NumThreadsPerBlock, int NumElementsPerThread>
+__global__ void AddMulKernel(T* output_ab, T* output_ac, const T* pA, const T* pB,
+                             const T* pC, CUDA_LONG nA, CUDA_LONG nB, CUDA_LONG nC,
+                             CUDA_LONG N, const TFunc func) {
+  CUDA_LONG pos = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
+#pragma unroll
+  for (int step = 0; step < NumElementsPerThread; ++step) {
+    // Once the position leaves the valid range it never re-enters it, so stop early.
+    if (pos >= N) {
+      break;
+    }
+    func(output_ab + pos, output_ac + pos, pA[pos % nA], pB[pos % nB], pC[pos % nC]);
+    pos += NumThreadsPerBlock;
+  }
+}
+
+// Host-side launcher shared by the typed LaunchAddOrMulSharedInputKernel specializations.
+//
+// Launches AddMulKernel on `stream` over max(countA, countB, countC) output positions with
+// 256 threads per block and 4 elements per thread, using Add3SharedOp when `addition` is true
+// and Mul3SharedOp otherwise. Pointers are reinterpreted to contrib::CudaT<T>::MappedType.
+//
+// Returns cudaGetLastError() after the launch, or when there is nothing to compute;
+// returns cudaErrorInvalidValue when the problem size does not fit the kernel's CUDA_LONG indices.
+template <typename T>
+cudaError_t _LaunchAddOrMulSharedInputKernel(cudaStream_t stream,
+                                             const T* pA, const T* pB, const T* pC,
+                                             T* output_ab, T* output_ac,
+                                             int64_t countA, int64_t countB, int64_t countC, bool addition) {
+  // The kernel reads every input at `position % count`, so an empty input must never reach it.
+  // This also covers the case where all counts are zero (a dim value of 0 in the output shape).
+  if (countA <= 0 || countB <= 0 || countC <= 0)
+    return cudaGetLastError();
+
+  const int64_t max_count = std::max(std::max(countA, countB), countC);
+
+  const int num_elements_per_thread = 4;
+  const int num_threads_per_block = 256;
+  const int num_el_th = num_threads_per_block * num_elements_per_thread;
+
+  // The kernel keeps its position in a CUDA_LONG and may step it past max_count by less than one
+  // block's worth of elements; reject sizes for which that bound is not representable instead of
+  // silently truncating the counts below.
+  const int64_t index_bound = max_count + num_el_th;
+  if (static_cast<int64_t>(static_cast<CUDA_LONG>(index_bound)) != index_bound)
+    return cudaErrorInvalidValue;
+
+  // Ceiling division: enough blocks to cover every output position.
+  const int blocksPerGrid = static_cast<int>((max_count + num_el_th - 1) / num_el_th);
+
+  using TT = typename contrib::CudaT<T>::MappedType;
+
+  TT* out_ab = reinterpret_cast<TT*>(output_ab);
+  TT* out_ac = reinterpret_cast<TT*>(output_ac);
+  const TT* in_a = reinterpret_cast<const TT*>(pA);
+  const TT* in_b = reinterpret_cast<const TT*>(pB);
+  const TT* in_c = reinterpret_cast<const TT*>(pC);
+  const CUDA_LONG n_a = static_cast<CUDA_LONG>(countA);
+  const CUDA_LONG n_b = static_cast<CUDA_LONG>(countB);
+  const CUDA_LONG n_c = static_cast<CUDA_LONG>(countC);
+  const CUDA_LONG n_out = static_cast<CUDA_LONG>(max_count);
+
+  if (addition) {
+    AddMulKernel<TT, Add3SharedOp<TT>, num_threads_per_block, num_elements_per_thread>
+        <<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
+            out_ab, out_ac, in_a, in_b, in_c, n_a, n_b, n_c, n_out, Add3SharedOp<TT>());
+  } else {
+    AddMulKernel<TT, Mul3SharedOp<TT>, num_threads_per_block, num_elements_per_thread>
+        <<<blocksPerGrid, num_threads_per_block, 0, stream>>>(
+            out_ab, out_ac, in_a, in_b, in_c, n_a, n_b, n_c, n_out, Mul3SharedOp<TT>());
+  }
+  // Kernel launches report configuration errors only through the last-error state.
+  return cudaGetLastError();
+}
+
+// float specialization: forwards all arguments unchanged to the generic launcher.
+template <>
+cudaError_t LaunchAddOrMulSharedInputKernel<float>(cudaStream_t stream,
+                                                   const float* input_a,
+                                                   const float* input_b,
+                                                   const float* input_c,
+                                                   float* output_ab, float* output_ac,
+                                                   int64_t length_a, int64_t length_b, int64_t length_c,
+                                                   bool addition) {
+  return _LaunchAddOrMulSharedInputKernel<float>(stream,
+                                                 input_a, input_b, input_c,
+                                                 output_ab, output_ac,
+                                                 length_a, length_b, length_c,
+                                                 addition);
+}
+
+// ortc::MFloat16 specialization: forwards all arguments unchanged to the generic launcher.
+template <>
+cudaError_t LaunchAddOrMulSharedInputKernel<ortc::MFloat16>(cudaStream_t stream,
+                                                            const ortc::MFloat16* input_a,
+                                                            const ortc::MFloat16* input_b,
+                                                            const ortc::MFloat16* input_c,
+                                                            ortc::MFloat16* output_ab, ortc::MFloat16* output_ac,
+                                                            int64_t length_a, int64_t length_b, int64_t length_c,
+                                                            bool addition) {
+  return _LaunchAddOrMulSharedInputKernel<ortc::MFloat16>(stream,
+                                                          input_a, input_b, input_c,
+                                                          output_ab, output_ac,
+                                                          length_a, length_b, length_c,
+                                                          addition);
+}
diff --git a/operators/cuda/add_mul_impl.cuh b/operators/cuda/add_mul_impl.cuh
@@ -0,0 +1,11 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+// Launches, on `stream`, the kernel that combines `input_a` with `input_b` into `output_ab`
+// and `input_a` with `input_c` into `output_ac`, element by element.
+//
+// stream                        CUDA stream the kernel is enqueued on
+// input_a, input_b, input_c     input buffers
+// output_ab, output_ac          output buffers
+// length_a, length_b, length_c  element counts of the three inputs
+// addition                      true selects addition, false selects multiplication
+//
+// Returns the CUDA error code reported by the launcher.
+// Only declared here; explicit specializations are provided in add_mul_impl.cu.
+template <typename T>
+cudaError_t LaunchAddOrMulSharedInputKernel(cudaStream_t stream, const T* input_a, const T* input_b, const T* input_c,
+                                            T* output_ab, T* output_ac,
+                                            int64_t length_a, int64_t length_b, int64_t length_c, bool addition);
diff --git a/operators/cuda/cuda_ops.cc b/operators/cuda/cuda_ops.cc
@@ -4,21 +4,36 @@
 #include "ocos.h"
 
 #ifdef USE_CUDA
+#include "cuda/add_mul.h"
 #include "cuda/fast_gelu.h"
 #include "cuda/negxplus1.h"
 #endif
 
 FxLoadCustomOpFactory LoadCustomOpClasses_Contrib = []() -> CustomOpArray& {
+
+#ifdef USE_CUDA
+  // Aliases for the two-argument template instantiations; presumably needed because the comma
+  // in the template argument list would otherwise split the CustomCudaStructV2 arguments.
+  // Guarded by USE_CUDA because cuda/add_mul.h is only included under that macro.
+  using AddSharedInputFloat32Type = typename contrib::AddOrMulSharedInput<float, true>;
+  using MulSharedInputFloat32Type = typename contrib::AddOrMulSharedInput<float, false>;
+
+#if ORT_API_VERSION >= 16
+  using AddSharedInputFloat16Type = typename contrib::AddOrMulSharedInput<ortc::MFloat16, true>;
+  using MulSharedInputFloat16Type = typename contrib::AddOrMulSharedInput<ortc::MFloat16, false>;
+#endif
+#endif  // USE_CUDA
+
+
   static OrtOpLoader op_loader(
       []() { return nullptr; }
 #ifdef USE_CUDA
       ,
+      CustomCudaStructV2("AddSharedInput", AddSharedInputFloat32Type),
       CustomCudaStructV2("FastGelu", contrib::FastGelu<float>),
+      CustomCudaStructV2("MulSharedInput", MulSharedInputFloat32Type),
       CustomCudaStructV2("NegXPlus1", contrib::NegXPlus1<float>),
 #if ORT_API_VERSION >= 16
 
+      CustomCudaStructV2("AddSharedInput", AddSharedInputFloat16Type),
       CustomCudaStructV2("FastGelu", contrib::FastGelu<ortc::MFloat16>),
       CustomCudaStructV2("FastGelu", contrib::FastGelu<ortc::BFloat16>),
+      CustomCudaStructV2("MulSharedInput", MulSharedInputFloat16Type),
       CustomCudaStructV2("NegXPlus1", contrib::NegXPlus1<ortc::MFloat16>)
 #endif
 #endif
diff --git a/test/cuda/test_cudaops.py b/test/cuda/test_cudaops.py
@@ -10,6 +10,10 @@
 import onnxruntime as _ort
 
 
+def has_cuda():
+    return "CUDAExecutionProvider" in _ort.get_available_providers()
+
+
 class NegXPlus1(OpRun):
     op_domain = "ai.onnx.contrib"
 
@@ -101,8 +105,6 @@ def test_cuda_fastgelu_f16(self):
             print("CUDAExecutionProvider not available, test_cuda_fastgelu_f16 skipped.")
 
     def _negxplus1_cuda(self, itype):
-        import onnxruntime
-
         dtype = np.float32 if itype == TensorProto.FLOAT else np.float16
         model1 = helper.make_model(
             helper.make_graph(
@@ -137,17 +139,128 @@ def _negxplus1_cuda(self, itype):
         ref = ReferenceEvaluator(model1, new_ops=[NegXPlus1])
         expected = ref.run(None, feeds1)[0]
 
-        opts = onnxruntime.SessionOptions()
+        opts = _ort.SessionOptions()
         opts.register_custom_ops_library(_get_library_path())
-        sess = onnxruntime.InferenceSession(model2.SerializeToString(), opts, providers=["CUDAExecutionProvider"])
+        sess = _ort.InferenceSession(model2.SerializeToString(), opts, providers=["CUDAExecutionProvider"])
         got = sess.run(None, feeds1)[0]
         assert_almost_equal(expected, got, decimal=5)
 
+    @unittest.skipIf(not has_cuda(), reason="CUDA is missing")
     def test_cuda_negxplus1(self):
-        eps = _ort.get_available_providers()
-        if "CUDAExecutionProvider" in eps:
-            self._negxplus1_cuda(TensorProto.FLOAT)
-            self._negxplus1_cuda(TensorProto.FLOAT16)
+        # Run the NegXPlus1 comparison for float32 first, then for float16.
+        self._negxplus1_cuda(TensorProto.FLOAT)
+        self._negxplus1_cuda(TensorProto.FLOAT16)
+
+    def _addmul_shared_input_cuda(self, itype, op_type, shapea=(3, 2, 3), shapeb=(3, 2, 3), shapec=(3, 2, 3)):
+        from onnx_extended.ortops.optim.cuda import get_ort_ext_libs
+
+        model1 = helper.make_model(
+            helper.make_graph(
+                [
+                    helper.make_node(op_type, ["X", "Y"], ["XY"]),
+                    helper.make_node(op_type, ["X", "Z"], ["XZ"]),
+                ],
+                "nd",
+                [
+                    helper.make_tensor_value_info("X", itype, [None, None, None]),
+                    helper.make_tensor_value_info("Y", itype, [None, None, None]),
+                    helper.make_tensor_value_info("Z", itype, [None, None, None]),
+                ],
+                [
+                    helper.make_tensor_value_info("XY", itype, [None, None, None]),
+                    helper.make_tensor_value_info("XZ", itype, [None, None, None]),
+                ],
+            ),
+            opset_imports=[helper.make_opsetid("", 18)],
+            ir_version=9,
+        )
+
+        model2 = helper.make_model(
+            helper.make_graph(
+                [
+                    helper.make_node(
+                        f"{op_type}SharedInput",
+                        ["X", "Y", "Z"],
+                        ["XY", "XZ"],
+                        domain="onnx_extended.ortops.optim.cuda",
+                    )
+                ],
+                "nd",
+                [
+                    helper.make_tensor_value_info("X", itype, [None, None, None]),
+                    helper.make_tensor_value_info("Y", itype, [None, None, None]),
+                    helper.make_tensor_value_info("Z", itype, [None, None, None]),
+                ],
+                [
+                    helper.make_tensor_value_info("XY", itype, [None, None, None]),
+                    helper.make_tensor_value_info("XZ", itype, [None, None, None]),
+                ],
+            ),
+            opset_imports=[
+                helper.make_opsetid("", 18),
+                helper.make_opsetid("onnx_extended.ortops.optim.cuda", 1),
+            ],
+            ir_version=9,
+        )
+
+        dtype = np.float32 if itype == TensorProto.FLOAT else np.float16
+        x = (np.arange(np.prod(shapea)) + 1).reshape((shapea)).astype(dtype)
+        y = (np.arange(np.prod(shapeb)) + 2).reshape((shapeb)).astype(dtype)
+        z = (np.arange(np.prod(shapec)) + 3).reshape((shapec)).astype(dtype)
+
+        feeds1 = dict(X=x, Y=y, Z=z)
+        ref = ReferenceEvaluator(model1)
+        expected = ref.run(None, feeds1)
+
+        opts = _ort.SessionOptions()
+        opts.register_custom_ops_library(get_ort_ext_libs()[0])
+        sess = _ort.InferenceSession(model2.SerializeToString(), opts, providers=["CUDAExecutionProvider"])
+        got = sess.run(None, feeds1)
+        for i in range(2):
+            assert_almost_equal(expected[i], got[i])
+
+    @unittest.skipIf(not has_cuda(), reason="CUDA is missing")
+    def test_add_shared_input_cuda(self):
+        self._addmul_shared_input_cuda(TensorProto.FLOAT, "Add")
+        self._addmul_shared_input_cuda(TensorProto.FLOAT16, "Add")
+
+    @unittest.skipIf(not has_cuda(), reason="CUDA is missing")
+    def test_mul_shared_input_cuda(self):
+        self._addmul_shared_input_cuda(TensorProto.FLOAT, "Mul")
+        self._addmul_shared_input_cuda(TensorProto.FLOAT16, "Mul")
+
+    @unittest.skipIf(not has_cuda(), reason="CUDA is missing")
+    def test_add_shared_input_cuda_broadcast1(self):
+        self._addmul_shared_input_cuda(
+            TensorProto.FLOAT,
+            "Add",
+            shapea=(3, 2, 3),
+            shapeb=(1, 2, 3),
+            shapec=(1, 2, 3),
+        )
+        self._addmul_shared_input_cuda(
+            TensorProto.FLOAT16,
+            "Add",
+            shapea=(3, 2, 3),
+            shapeb=(1, 2, 3),
+            shapec=(1, 2, 3),
+        )
+
+    @unittest.skipIf(not has_cuda(), reason="CUDA is missing")
+    def test_add_shared_input_cuda_broadcast2(self):
+        self._addmul_shared_input_cuda(
+            TensorProto.FLOAT,
+            "Add",
+            shapea=(1, 2, 3),
+            shapeb=(3, 2, 3),
+            shapec=(3, 2, 3),
+        )
+        self._addmul_shared_input_cuda(
+            TensorProto.FLOAT16,
+            "Add",
+            shapea=(1, 2, 3),
+            shapeb=(3, 2, 3),
+            shapec=(3, 2, 3),
+        )
 
 
 if __name__ == "__main__":