Merge pull request #1726 from bstatcomp/cl_kernel_generator_bugfix_common_subexprs

rok-cesnovar · web-flow · commit 1decedeb2f40 · 2020-02-22T08:38:24.000+01:00
Bugfix common subexpression elimination in kernel generator
diff --git a/stan/math/opencl/kernel_generator/binary_operation.hpp b/stan/math/opencl/kernel_generator/binary_operation.hpp
@@ -109,10 +109,20 @@ class binary_operation : public operation_cl<Derived, T_res, T_a, T_b> {
   template <typename T_a, typename T_b>                                       \
   class class_name : public binary_operation<class_name<T_a, T_b>,            \
                                              scalar_type_expr, T_a, T_b> {    \
+    using base                                                                \
+        = binary_operation<class_name<T_a, T_b>, scalar_type_expr, T_a, T_b>; \
+    using base::arguments_;                                                   \
+                                                                              \
    public:                                                                    \
     class_name(T_a&& a, T_b&& b) /* NOLINT */                                 \
-        : binary_operation<class_name<T_a, T_b>, scalar_type_expr, T_a, T_b>( \
-              std::forward<T_a>(a), std::forward<T_b>(b), operation) {}       \
+        : base(std::forward<T_a>(a), std::forward<T_b>(b), operation) {}      \
+    inline auto deep_copy() {                                                 \
+      auto&& a_copy = std::get<0>(arguments_).deep_copy();                    \
+      auto&& b_copy = std::get<1>(arguments_).deep_copy();                    \
+      return class_name<std::remove_reference_t<decltype(a_copy)>,            \
+                        std::remove_reference_t<decltype(b_copy)>>(           \
+          std::move(a_copy), std::move(b_copy));                              \
+    }                                                                         \
   };                                                                          \
                                                                               \
   template <typename T_a, typename T_b,                                       \
@@ -146,10 +156,20 @@ class binary_operation : public operation_cl<Derived, T_res, T_a, T_b> {
   template <typename T_a, typename T_b>                                       \
   class class_name : public binary_operation<class_name<T_a, T_b>,            \
                                              scalar_type_expr, T_a, T_b> {    \
+    using base                                                                \
+        = binary_operation<class_name<T_a, T_b>, scalar_type_expr, T_a, T_b>; \
+    using base::arguments_;                                                   \
+                                                                              \
    public:                                                                    \
     class_name(T_a&& a, T_b&& b) /* NOLINT */                                 \
-        : binary_operation<class_name<T_a, T_b>, scalar_type_expr, T_a, T_b>( \
-              std::forward<T_a>(a), std::forward<T_b>(b), operation) {}       \
+        : base(std::forward<T_a>(a), std::forward<T_b>(b), operation) {}      \
+    inline auto deep_copy() {                                                 \
+      auto&& a_copy = std::get<0>(arguments_).deep_copy();                    \
+      auto&& b_copy = std::get<1>(arguments_).deep_copy();                    \
+      return class_name<std::remove_reference_t<decltype(a_copy)>,            \
+                        std::remove_reference_t<decltype(b_copy)>>(           \
+          std::move(a_copy), std::move(b_copy));                              \
+    }                                                                         \
     inline matrix_cl_view view() const { __VA_ARGS__; }                       \
   };                                                                          \
                                                                               \
diff --git a/stan/math/opencl/kernel_generator/block.hpp b/stan/math/opencl/kernel_generator/block.hpp
@@ -57,6 +57,16 @@ class block_
     }
   }
 
+  /**
+   * Creates a deep copy of this expression, keeping the same block bounds.
+   * @return block_ over a deep copy of the argument
+   */
+  inline auto deep_copy() {
+    auto&& arg_copy = std::get<0>(arguments_).deep_copy();
+    return block_<std::remove_reference_t<decltype(arg_copy)>>{
+        std::move(arg_copy), start_row_, start_col_, rows_, cols_};
+  }
+
   /**
    * Generates kernel code for this expression.
    * @param i row index variable name
@@ -227,10 +237,10 @@ class block_
  */
 template <typename T,
           typename = require_all_valid_expressions_and_none_scalar_t<T>>
-inline block_<as_operation_cl_t<T>> block(T&& a, int start_row, int start_col,
-                                          int rows, int cols) {
-  return block_<as_operation_cl_t<T>>(as_operation_cl(std::forward<T>(a)),
-                                      start_row, start_col, rows, cols);
+inline auto block(T&& a, int start_row, int start_col, int rows, int cols) {
+  auto&& a_operation = as_operation_cl(std::forward<T>(a)).deep_copy();
+  return block_<std::remove_reference_t<decltype(a_operation)>>(
+      std::move(a_operation), start_row, start_col, rows, cols);
 }
 
 }  // namespace math
diff --git a/stan/math/opencl/kernel_generator/load.hpp b/stan/math/opencl/kernel_generator/load.hpp
@@ -44,6 +44,20 @@ class load_
    */
   explicit load_(T&& a) : a_(std::forward<T>(a)) {}
 
+  /**
+   * Creates a deep copy of this expression. Selected when called on an lvalue.
+   * @return load_ constructed from a reference to the stored argument
+   */
+  inline load_<T&> deep_copy() & { return load_<T&>(a_); }
+
+  /**
+   * Creates a deep copy of an expiring expression. The stored argument is
+   * forwarded into the copy instead of being referenced, so the result does
+   * not refer to a member of the temporary it was created from.
+   * @return load_ constructed from the forwarded stored argument
+   */
+  inline load_<T> deep_copy() && { return load_<T>(std::forward<T>(a_)); }
+
   /**
    * generates kernel code for this expression.
    * @param i row index variable name
diff --git a/stan/math/opencl/kernel_generator/operation_cl.hpp b/stan/math/opencl/kernel_generator/operation_cl.hpp
@@ -245,7 +245,8 @@ class operation_cl : public operation_cl_base {
    */
   inline int bottom_diagonal() const {
     return index_apply<N>([&](auto... Is) {
-      return std::min({std::get<Is>(arguments_).bottom_diagonal()...});
+      return std::min(std::initializer_list<int>(
+          {std::get<Is>(arguments_).bottom_diagonal()...}));
     });
   }
 
@@ -256,7 +257,8 @@ class operation_cl : public operation_cl_base {
    */
   inline int top_diagonal() const {
     return index_apply<N>([&](auto... Is) {
-      return std::max({std::get<Is>(arguments_).top_diagonal()...});
+      return std::max(std::initializer_list<int>(
+          {std::get<Is>(arguments_).top_diagonal()...}));
     });
   }
 };
diff --git a/stan/math/opencl/kernel_generator/rowwise_reduction.hpp b/stan/math/opencl/kernel_generator/rowwise_reduction.hpp
@@ -46,7 +46,7 @@ class rowwise_reduction
    * @param a the expression to reduce
    * @param init OpenCL source code of initialization value for reduction
    */
-  rowwise_reduction(T&& a, const std::string& init)
+  explicit rowwise_reduction(T&& a, const std::string& init)
       : base(std::forward<T>(a)), init_(init) {}
 
   /**
@@ -121,7 +121,7 @@ class rowwise_reduction
 
   /**
    * Determine index of top diagonal written.
-   * @return number of columns
+   * @return top diagonal
    */
   inline int top_diagonal() const { return 1; }
 };
@@ -149,10 +149,21 @@ struct sum_op {
 template <typename T>
 class rowwise_sum_
     : public rowwise_reduction<rowwise_sum_<T>, T, sum_op, true> {
+  using base = rowwise_reduction<rowwise_sum_<T>, T, sum_op, true>;
+  using base::arguments_;
+
  public:
-  explicit rowwise_sum_(T&& a)
-      : rowwise_reduction<rowwise_sum_<T>, T, sum_op, true>(std::forward<T>(a),
-                                                            "0") {}
+  explicit rowwise_sum_(T&& a) : base(std::forward<T>(a), "0") {}
+
+  /**
+   * Creates a deep copy of this expression by deep-copying its argument.
+   * @return rowwise_sum_ over the copied argument
+   */
+  inline auto deep_copy() {
+    auto&& arg_copy = std::get<0>(arguments_).deep_copy();
+    return rowwise_sum_<std::remove_reference_t<decltype(arg_copy)>>(
+        std::move(arg_copy));
+  }
 };
 
 /**
@@ -163,9 +174,10 @@ class rowwise_sum_
  */
 template <typename T,
           typename = require_all_valid_expressions_and_none_scalar_t<T>>
-inline rowwise_sum_<as_operation_cl_t<T>> rowwise_sum(T&& a) {
-  return rowwise_sum_<as_operation_cl_t<T>>(
-      as_operation_cl(std::forward<T>(a)));
+inline auto rowwise_sum(T&& a) {
+  auto&& arg_copy = as_operation_cl(std::forward<T>(a)).deep_copy();
+  return rowwise_sum_<std::remove_reference_t<decltype(arg_copy)>>(
+      std::move(arg_copy));
 }
 
 /**
@@ -205,11 +217,21 @@ class rowwise_max_
     : public rowwise_reduction<
           rowwise_max_<T>, T,
           max_op<typename std::remove_reference_t<T>::Scalar>, false> {
- public:
   using op = max_op<typename std::remove_reference_t<T>::Scalar>;
-  explicit rowwise_max_(T&& a)
-      : rowwise_reduction<rowwise_max_<T>, T, op, false>(std::forward<T>(a),
-                                                         op::init()) {}
+  using base = rowwise_reduction<rowwise_max_<T>, T, op, false>;
+  using base::arguments_;
+
+ public:
+  explicit rowwise_max_(T&& a) : base(std::forward<T>(a), op::init()) {}
+  /**
+   * Creates a deep copy of this expression by deep-copying its argument.
+   * @return rowwise_max_ over the copied argument
+   */
+  inline auto deep_copy() {
+    auto&& arg_copy = std::get<0>(arguments_).deep_copy();
+    return rowwise_max_<std::remove_reference_t<decltype(arg_copy)>>(
+        std::move(arg_copy));
+  }
 };
 
 /**
@@ -220,11 +242,11 @@ class rowwise_max_
  */
 template <typename T,
           typename = require_all_valid_expressions_and_none_scalar_t<T>>
-inline rowwise_max_<as_operation_cl_t<T>> rowwise_max(T&& a) {
-  return rowwise_max_<as_operation_cl_t<T>>(
-      as_operation_cl(std::forward<T>(a)));
+inline auto rowwise_max(T&& a) {
+  auto&& arg_copy = as_operation_cl(std::forward<T>(a)).deep_copy();
+  return rowwise_max_<std::remove_reference_t<decltype(arg_copy)>>(
+      std::move(arg_copy));
 }
-
 /**
  * Operation for min reduction.
  * @tparam T type to reduce
@@ -262,11 +284,21 @@ class rowwise_min_
     : public rowwise_reduction<
           rowwise_min_<T>, T,
           min_op<typename std::remove_reference_t<T>::Scalar>, false> {
- public:
   using op = min_op<typename std::remove_reference_t<T>::Scalar>;
-  explicit rowwise_min_(T&& a)
-      : rowwise_reduction<rowwise_min_<T>, T, op, false>(std::forward<T>(a),
-                                                         op::init()) {}
+  using base = rowwise_reduction<rowwise_min_<T>, T, op, false>;
+  using base::arguments_;
+
+ public:
+  explicit rowwise_min_(T&& a) : base(std::forward<T>(a), op::init()) {}
+  /**
+   * Creates a deep copy of this expression by deep-copying its argument.
+   * @return rowwise_min_ over the copied argument
+   */
+  inline auto deep_copy() {
+    auto&& arg_copy = std::get<0>(arguments_).deep_copy();
+    return rowwise_min_<std::remove_reference_t<decltype(arg_copy)>>(
+        std::move(arg_copy));
+  }
 };
 
 /**
@@ -277,9 +309,10 @@ class rowwise_min_
  */
 template <typename T,
           typename = require_all_valid_expressions_and_none_scalar_t<T>>
-inline rowwise_min_<as_operation_cl_t<T>> rowwise_min(T&& a) {
-  return rowwise_min_<as_operation_cl_t<T>>(
-      as_operation_cl(std::forward<T>(a)));
+inline auto rowwise_min(T&& a) {
+  auto&& arg_copy = as_operation_cl(std::forward<T>(a)).deep_copy();
+  return rowwise_min_<std::remove_reference_t<decltype(arg_copy)>>(
+      std::move(arg_copy));
 }
 
 }  // namespace math
diff --git a/stan/math/opencl/kernel_generator/scalar.hpp b/stan/math/opencl/kernel_generator/scalar.hpp
@@ -7,6 +7,7 @@
 #include <stan/math/opencl/kernel_generator/type_str.hpp>
 #include <stan/math/opencl/kernel_generator/name_generator.hpp>
 #include <stan/math/opencl/kernel_generator/operation_cl.hpp>
+#include <limits>
 #include <string>
 #include <type_traits>
 #include <set>
@@ -36,6 +37,12 @@ class scalar_ : public operation_cl<scalar_<T>, T> {
    */
   explicit scalar_(const T a) : a_(a) {}
 
+  /**
+   * Creates a deep copy of this expression.
+   * @return new scalar_ constructed from the stored value
+   */
+  inline scalar_<T> deep_copy() { return scalar_<T>(a_); }
+
   /**
    * generates kernel code for this expression.
    * @param i row index variable name
@@ -81,6 +88,18 @@ class scalar_ : public operation_cl<scalar_<T>, T> {
    * @return view
    */
   inline matrix_cl_view view() const { return matrix_cl_view::Entire; }
+
+  /**
+   * Determine index of bottom diagonal written.
+   * @return lowest value representable by int
+   */
+  inline int bottom_diagonal() const { return std::numeric_limits<int>::min(); }
+
+  /**
+   * Determine index of top diagonal written.
+   * @return highest value representable by int
+   */
+  inline int top_diagonal() const { return std::numeric_limits<int>::max(); }
 };
 
 }  // namespace math
diff --git a/stan/math/opencl/kernel_generator/select.hpp b/stan/math/opencl/kernel_generator/select.hpp
@@ -69,6 +69,20 @@ class select_ : public operation_cl<select_<T_condition, T_then, T_else>,
     }
   }
 
+  /**
+   * Creates a deep copy of this expression, copying each of its 3 arguments.
+   * @return select_ over copies of the condition, then and else arguments
+   */
+  inline auto deep_copy() {
+    auto&& condition_copy = std::get<0>(arguments_).deep_copy();
+    auto&& then_copy = std::get<1>(arguments_).deep_copy();
+    auto&& else_copy = std::get<2>(arguments_).deep_copy();
+    return select_<std::remove_reference_t<decltype(condition_copy)>,
+                   std::remove_reference_t<decltype(then_copy)>,
+                   std::remove_reference_t<decltype(else_copy)>>(
+        std::move(condition_copy), std::move(then_copy), std::move(else_copy));
+  }
+
   /**
    * generates kernel code for this (select) operation.
    * @param i row index variable name
diff --git a/stan/math/opencl/kernel_generator/unary_function_cl.hpp b/stan/math/opencl/kernel_generator/unary_function_cl.hpp
@@ -78,9 +78,16 @@ class unary_function_cl
 #define ADD_UNARY_FUNCTION(fun)                                               \
   template <typename T>                                                       \
   class fun##_ : public unary_function_cl<fun##_<T>, T> {                     \
+    using base = unary_function_cl<fun##_<T>, T>;                             \
+    using base::arguments_;                                                   \
+                                                                              \
    public:                                                                    \
-    explicit fun##_(T&& a)                                                    \
-        : unary_function_cl<fun##_<T>, T>(std::forward<T>(a), #fun) {}        \
+    explicit fun##_(T&& a) : base(std::forward<T>(a), #fun) {}                \
+    inline auto deep_copy() {                                                 \
+      auto&& arg_copy = std::get<0>(arguments_).deep_copy();                  \
+      return fun##_<std::remove_reference_t<decltype(arg_copy)>>{             \
+          std::move(arg_copy)};                                               \
+    }                                                                         \
     inline matrix_cl_view view() const { return matrix_cl_view::Entire; }     \
   };                                                                          \
                                                                               \
@@ -99,9 +106,16 @@ class unary_function_cl
 #define ADD_UNARY_FUNCTION_PASS_ZERO(fun)                                     \
   template <typename T>                                                       \
   class fun##_ : public unary_function_cl<fun##_<T>, T> {                     \
+    using base = unary_function_cl<fun##_<T>, T>;                             \
+    using base::arguments_;                                                   \
+                                                                              \
    public:                                                                    \
-    explicit fun##_(T&& a)                                                    \
-        : unary_function_cl<fun##_<T>, T>(std::forward<T>(a), #fun) {}        \
+    explicit fun##_(T&& a) : base(std::forward<T>(a), #fun) {}                \
+    inline auto deep_copy() {                                                 \
+      auto&& arg_copy = std::get<0>(arguments_).deep_copy();                  \
+      return fun##_<std::remove_reference_t<decltype(arg_copy)>>{             \
+          std::move(arg_copy)};                                               \
+    }                                                                         \
   };                                                                          \
                                                                               \
   template <typename T, typename Cond                                         \
diff --git a/test/unit/math/opencl/kernel_generator/block_test.cpp b/test/unit/math/opencl/kernel_generator/block_test.cpp
@@ -108,4 +108,22 @@ TEST(MathMatrixCL, lhs_block_test) {
   EXPECT_MATRIX_NEAR(res, correct, 1e-9);
 }
 
+TEST(MathMatrixCL, two_blocks_of_same_expression) {
+  using stan::math::block;
+  MatrixXd m(2, 3);
+  m << 1, 2, 3, 4, 5, 6;
+
+  matrix_cl<double> m_cl(m);
+
+  auto tmp = m_cl + 1;
+  auto tmp2 = block(tmp, 0, 0, 2, 2) + block(tmp, 0, 1, 2, 2);
+
+  matrix_cl<double> res_cl = tmp2;
+
+  MatrixXd res = stan::math::from_matrix_cl(res_cl);
+  MatrixXd correct = (m.block(0, 0, 2, 2) + m.block(0, 1, 2, 2)).array() + 2;
+
+  EXPECT_MATRIX_NEAR(res, correct, 1e-9);
+}
+
 #endif