intel · bader · Dec 2, 2022 · Apr 21, 2022 · May 11, 2022 · May 11, 2022
diff --git a/SYCL/Basic/half_builtins.cpp b/SYCL/Basic/half_builtins.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-device-code-split=per_kernel %s -o %t.out
 // RUN: %HOST_RUN_PLACEHOLDER %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
@@ -166,6 +166,13 @@ template <int N> bool check(vec<float, N> a, vec<float, N> b) {
 
 int main() {
   queue q;
+
+  if (!q.get_device().has(sycl::aspect::fp16)) {
+    std::cout << "skipping fp16 tests: requires fp16 device aspect."
+              << std::endl;
+    return 0;
+  }
+
   float16 a, b, c, d;
   for (int i = 0; i < SZ_max; i++) {
     a[i] = i / (float)SZ_max;

@@ -0,0 +1,51 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// tests oneapi extension native tanh math function for sycl::vec and
+// sycl::marray float cases.
+
+#include "ext_native_math_common.hpp"
+
+int main() {
+
+  sycl::queue q;
+
+  const float tv[16] = {-2.0, -1.5, -1.0, 0.0, 2.0,  1.5, 1.0,   0.0,
+                        -1.7, 1.7,  -1.2, 1.2, -3.0, 3.0, -10.0, 10.0};
+  const float tl[16] = {-0.97, -0.91, -0.77, -0.1, 0.95, 0.89, 0.75,  -0.1,
+                        -0.94, 0.92,  -0.84, 0.82, -1.0, 0.98, -1.10, 0.98};
+  const float tu[16] = {-0.95, -0.89, -0.75, 0.1,  0.97,  0.91, 0.77,  0.1,
+                        -0.92, 0.94,  -0.82, 0.84, -0.98, 1.00, -0.98, 1.10};
+
+  native_tanh_tester<float>(q, tv[0], tl[0], tu[0]);
+  native_tanh_tester<sycl::float2>(q, {tv[0], tv[1]}, {tl[0], tl[1]},
+                                   {tu[0], tu[1]});
+  native_tanh_tester<sycl::float3>(
+      q, {tv[0], tv[1], tv[2]}, {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
+
+  native_tanh_tester<sycl::float4>(q, {tv[0], tv[1], tv[2], tv[3]},
+                                   {tl[0], tl[1], tl[2], tl[3]},
+                                   {tu[0], tu[1], tu[2], tu[3]});
+  native_tanh_tester<sycl::marray<float, 3>>(
+      q, {tv[0], tv[1], tv[2]}, {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
+  native_tanh_tester<sycl::marray<float, 4>>(q, {tv[0], tv[1], tv[2], tv[3]},
+                                             {tl[0], tl[1], tl[2], tl[3]},
+                                             {tu[0], tu[1], tu[2], tu[3]});
+  native_tanh_tester<sycl::float8>(
+      q, {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7]},
+      {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7]},
+      {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7]});
+  native_tanh_tester<sycl::float16>(
+      q,
+      {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7], tv[8], tv[9],
+       tv[10], tv[11], tv[12], tv[13], tv[14], tv[15]},
+      {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7], tl[8], tl[9],
+       tl[10], tl[11], tl[12], tl[13], tl[14], tl[15]},
+      {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7], tu[8], tu[9],
+       tu[10], tu[11], tu[12], tu[13], tu[14], tu[15]});
+
+  return 0;
+}
@@ -0,0 +1,67 @@
+#include <cassert>
+#include <sycl/sycl.hpp>
+
+template <typename T, size_t N>
+void assert_out_of_bound(sycl::marray<T, N> val, sycl::marray<T, N> lower,
+                         sycl::marray<T, N> upper) {
+  for (int i = 0; i < N; i++) {
+    assert(lower[i] < val[i] && val[i] < upper[i]);
+  }
+}
+
+template <typename T> void assert_out_of_bound(T val, T lower, T upper) {
+  assert(sycl::all(lower < val && val < upper));
+}
+
+template <>
+void assert_out_of_bound<float>(float val, float lower, float upper) {
+  assert(lower < val && val < upper);
+}
+
+template <>
+void assert_out_of_bound<sycl::half>(sycl::half val, sycl::half lower,
+                                     sycl::half upper) {
+  assert(lower < val && val < upper);
+}
+
+template <typename T>
+void native_tanh_tester(sycl::queue q, T val, T up, T lo) {
+  T r = val;
+
+#ifdef SYCL_EXT_ONEAPI_NATIVE_MATH
+  {
+    sycl::buffer<T, 1> BufR(&r, sycl::range<1>(1));
+    q.submit([&](sycl::handler &cgh) {
+      auto AccR = BufR.template get_access<sycl::access::mode::read_write>(cgh);
+      cgh.single_task([=]() {
+        AccR[0] = sycl::ext::oneapi::experimental::native::tanh(AccR[0]);
+      });
+    });
+  }
+
+  assert_out_of_bound(r, up, lo);
+#else
+  assert(!"SYCL_EXT_ONEAPI_NATIVE_MATH not supported");
+#endif
+}
+
+template <typename T>
+void native_exp2_tester(sycl::queue q, T val, T up, T lo) {
+  T r = val;
+
+#ifdef SYCL_EXT_ONEAPI_NATIVE_MATH
+  {
+    sycl::buffer<T, 1> BufR(&r, sycl::range<1>(1));
+    q.submit([&](sycl::handler &cgh) {
+      auto AccR = BufR.template get_access<sycl::access::mode::read_write>(cgh);
+      cgh.single_task([=]() {
+        AccR[0] = sycl::ext::oneapi::experimental::native::exp2(AccR[0]);
+      });
+    });
+  }
+
+  assert_out_of_bound(r, up, lo);
+#else
+  assert(!"SYCL_EXT_ONEAPI_NATIVE_MATH not supported");
+#endif
+}
@@ -0,0 +1,94 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-device-code-split=per_kernel %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// OpenCL CPU driver does not support cl_khr_fp16 extension for this reason this
+// test is compiled with the -fsycl-device-code-split flag
+
+// tests oneapi extension native math functions for sycl::vec and sycl::marray
+// fp16 cases.
+
+#include "ext_native_math_common.hpp"
+
+int main() {
+
+  sycl::queue q;
+
+  if (!q.get_device().has(sycl::aspect::fp16)) {
+    std::cout << "skipping fp16 tests: requires fp16 device aspect."
+              << std::endl;
+    return 0;
+  }
+
+  const sycl::half tv[16] = {-2.0, -1.5, -1.0, 0.0, 2.0,  1.5, 1.0,   0.0,
+                             -1.7, 1.7,  -1.2, 1.2, -3.0, 3.0, -10.0, 10.0};
+  const sycl::half tl[16] = {-0.97, -0.91, -0.77, -0.1, 0.95,  0.89,
+                             0.75,  -0.1,  -0.94, 0.92, -0.84, 0.82,
+                             -1.0,  0.98,  -1.10, 0.98};
+  const sycl::half tu[16] = {-0.95, -0.89, -0.75, 0.1,  0.97,  0.91,
+                             0.77,  0.1,   -0.92, 0.94, -0.82, 0.84,
+                             -0.98, 1.00,  -0.98, 1.10};
+
+  native_tanh_tester<sycl::half>(q, tv[0], tl[0], tu[0]);
+  native_tanh_tester<sycl::half2>(q, {tv[0], tv[1]}, {tl[0], tl[1]},
+                                  {tu[0], tu[1]});
+  native_tanh_tester<sycl::half3>(q, {tv[0], tv[1], tv[2]},
+                                  {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
+  native_tanh_tester<sycl::marray<sycl::half, 3>>(
+      q, {tv[0], tv[1], tv[2]}, {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
+  native_tanh_tester<sycl::half4>(q, {tv[0], tv[1], tv[2], tv[3]},
+                                  {tl[0], tl[1], tl[2], tl[3]},
+                                  {tu[0], tu[1], tu[2], tu[3]});
+  native_tanh_tester<sycl::marray<sycl::half, 4>>(
+      q, {tv[0], tv[1], tv[2], tv[3]}, {tl[0], tl[1], tl[2], tl[3]},
+      {tu[0], tu[1], tu[2], tu[3]});
+  native_tanh_tester<sycl::half8>(
+      q, {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7]},
+      {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7]},
+      {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7]});
+  native_tanh_tester<sycl::half16>(
+      q,
+      {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7], tv[8], tv[9],
+       tv[10], tv[11], tv[12], tv[13], tv[14], tv[15]},
+      {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7], tl[8], tl[9],
+       tl[10], tl[11], tl[12], tl[13], tl[14], tl[15]},
+      {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7], tu[8], tu[9],
+       tu[10], tu[11], tu[12], tu[13], tu[14], tu[15]});
+
+  const sycl::half ev[16] = {-2.0, -1.5, -1.0, 0.0, 2.0, 1.5, 1.0, 0.0,
+                             -2.0, -1.5, -1.0, 0.0, 2.0, 1.5, 1.0, 0.0};
+  const sycl::half el[16] = {0.1, 0.34, 0.4, -0.9, 3.9, 2.7, 1.9, -0.9,
+                             0.1, 0.34, 0.4, -0.9, 3.9, 2.7, 1.9, -0.9};
+  const sycl::half eu[16] = {0.3, 0.36, 0.6, 1.1, 4.1, 2.9, 2.1, 1.1,
+                             0.3, 0.36, 0.6, 1.1, 4.1, 2.9, 2.1, 1.1};
+
+  native_exp2_tester<sycl::half>(q, ev[0], el[0], eu[0]);
+  native_exp2_tester<sycl::half2>(q, {ev[0], ev[1]}, {el[0], el[1]},
+                                  {eu[0], eu[1]});
+  native_exp2_tester<sycl::half3>(q, {ev[0], ev[1], ev[2]},
+                                  {el[0], el[1], el[2]}, {eu[0], eu[1], eu[2]});
+  native_exp2_tester<sycl::half4>(q, {ev[0], ev[1], ev[2], ev[3]},
+                                  {el[0], el[1], el[2], el[3]},
+                                  {eu[0], eu[1], eu[2], eu[3]});
+  native_exp2_tester<sycl::marray<sycl::half, 3>>(
+      q, {ev[0], ev[1], ev[2]}, {el[0], el[1], el[2]}, {eu[0], eu[1], eu[2]});
+  native_exp2_tester<sycl::marray<sycl::half, 4>>(
+      q, {ev[0], ev[1], ev[2], ev[3]}, {el[0], el[1], el[2], el[3]},
+      {eu[0], eu[1], eu[2], eu[3]});
+  native_exp2_tester<sycl::half8>(
+      q, {ev[0], ev[1], ev[2], ev[3], ev[4], ev[5], ev[6], ev[7]},
+      {el[0], el[1], el[2], el[3], el[4], el[5], el[6], el[7]},
+      {eu[0], eu[1], eu[2], eu[3], eu[4], eu[5], eu[6], eu[7]});
+  native_exp2_tester<sycl::half16>(
+      q,
+      {ev[0], ev[1], ev[2], ev[3], ev[4], ev[5], ev[6], ev[7], ev[8], ev[9],
+       ev[10], ev[11], ev[12], ev[13], ev[14], ev[15]},
+      {el[0], el[1], el[2], el[3], el[4], el[5], el[6], el[7], el[8], el[9],
+       el[10], el[11], el[12], el[13], el[14], el[15]},
+      {eu[0], eu[1], eu[2], eu[3], eu[4], eu[5], eu[6], eu[7], eu[8], eu[9],
+       eu[10], eu[11], eu[12], eu[13], eu[14], eu[15]});
+
+  return 0;
+}
diff --git a/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp b/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
@@ -0,0 +1,159 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+template <typename T1, typename T2> class TypeHelper;
+
+template <typename T> bool checkEqual(vec<T, 3> A, size_t B) {
+  T TB = B;
+  return A.x() == TB && A.y() == TB && A.z() == TB;
+}
+
+template <typename T> bool checkEqual(vec<T, 4> A, size_t B) {
+  T TB = B;
+  return A.x() == TB && A.y() == TB && A.z() == TB && A.w() == TB;
+}
+
+template <typename T, size_t N> bool checkEqual(marray<T, N> A, size_t B) {
+  for (int i = 0; i < N; i++) {
+    if (A[i] != B) {
+      return false;
+    }
+  }
+  return true;
+}
+
+#define HALF_PRECISION_OPERATOR(NAME)                                          \
+  template <typename T>                                                        \
+  void half_precision_math_test_##NAME(queue &deviceQueue, T result, T input,  \
+                                       size_t ref) {                           \
+    {                                                                          \
+      buffer<T, 1> buffer1(&result, 1);                                        \
+      buffer<T, 1> buffer2(&input, 1);                                         \
+      deviceQueue.submit([&](handler &cgh) {                                   \
+        accessor<T, 1, access::mode::write, target::device> res_access(        \
+            buffer1, cgh);                                                     \
+        accessor<T, 1, access::mode::write, target::device> input_access(      \
+            buffer2, cgh);                                                     \
+        cgh.single_task<TypeHelper<class half_precision##NAME, T>>([=]() {     \
+          res_access[0] = sycl::half_precision::NAME(input_access[0]);         \
+        });                                                                    \
+      });                                                                      \
+    }                                                                          \
+    assert(checkEqual(result, ref));                                           \
+  }
+
+HALF_PRECISION_OPERATOR(sin)
+HALF_PRECISION_OPERATOR(tan)
+HALF_PRECISION_OPERATOR(cos)
+HALF_PRECISION_OPERATOR(exp)
+HALF_PRECISION_OPERATOR(exp2)
+HALF_PRECISION_OPERATOR(exp10)
+HALF_PRECISION_OPERATOR(log)
+HALF_PRECISION_OPERATOR(log2)
+HALF_PRECISION_OPERATOR(log10)
+HALF_PRECISION_OPERATOR(sqrt)
+HALF_PRECISION_OPERATOR(rsqrt)
+HALF_PRECISION_OPERATOR(recip)
+
+#undef HALF_PRECISION_OPERATOR
+
+#define HALF_PRECISION_OPERATOR_2(NAME)                                        \
+  template <typename T>                                                        \
+  void half_precision_math_test_2_##NAME(queue &deviceQueue, T result,         \
+                                         T input1, T input2, size_t ref) {     \
+    {                                                                          \
+      buffer<T, 1> buffer1(&result, 1);                                        \
+      buffer<T, 1> buffer2(&input1, 1);                                        \
+      buffer<T, 1> buffer3(&input2, 1);                                        \
+      deviceQueue.submit([&](handler &cgh) {                                   \
+        accessor<T, 1, access::mode::write, target::device> res_access(        \
+            buffer1, cgh);                                                     \
+        accessor<T, 1, access::mode::write, target::device> input1_access(     \
+            buffer2, cgh);                                                     \
+        accessor<T, 1, access::mode::write, target::device> input2_access(     \
+            buffer3, cgh);                                                     \
+        cgh.single_task<TypeHelper<class half_precision2##NAME, T>>([=]() {    \
+          res_access[0] =                                                      \
+              sycl::half_precision::NAME(input1_access[0], input2_access[0]);  \
+        });                                                                    \
+      });                                                                      \
+    }                                                                          \
+    assert(checkEqual(result, ref));                                           \
+  }
+
+HALF_PRECISION_OPERATOR_2(divide)
+HALF_PRECISION_OPERATOR_2(powr)
+
+#undef HALF_PRECISION_OPERATOR_2
+
+template <typename T> void half_precision_math_tests_3(queue &deviceQueue) {
+  half_precision_math_test_sin(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  half_precision_math_test_tan(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  half_precision_math_test_cos(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
+  half_precision_math_test_exp(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
+  half_precision_math_test_exp2(deviceQueue, T{-1, -1, -1}, T{2, 2, 2}, 4);
+  half_precision_math_test_exp10(deviceQueue, T{-1, -1, -1}, T{2, 2, 2}, 100);
+  half_precision_math_test_log(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
+  half_precision_math_test_log2(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, 2);
+  half_precision_math_test_log10(deviceQueue, T{-1, -1, -1}, T{100, 100, 100},
+                                 2);
+  half_precision_math_test_sqrt(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, 2);
+  half_precision_math_test_rsqrt(deviceQueue, T{-1, -1, -1},
+                                 T{0.25, 0.25, 0.25}, 2);
+  half_precision_math_test_recip(deviceQueue, T{-1, -1, -1},
+                                 T{0.25, 0.25, 0.25}, 4);
+  half_precision_math_test_2_powr(deviceQueue, T{-1, -1, -1}, T{2, 2, 2},
+                                  T{2, 2, 2}, 4);
+  half_precision_math_test_2_divide(deviceQueue, T{-1, -1, -1}, T{4, 4, 4},
+                                    T{2, 2, 2}, 2);
+}
+
+template <typename T> void half_precision_math_tests_4(queue &deviceQueue) {
+  half_precision_math_test_sin(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0},
+                               0);
+  half_precision_math_test_tan(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0},
+                               0);
+  half_precision_math_test_cos(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0},
+                               1);
+  half_precision_math_test_exp(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0},
+                               1);
+  half_precision_math_test_exp2(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2},
+                                4);
+  half_precision_math_test_exp10(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2},
+                                 100);
+  half_precision_math_test_log(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1},
+                               0);
+  half_precision_math_test_log2(deviceQueue, T{-1, -1, -1, -1}, T{4, 4, 4, 4},
+                                2);
+  half_precision_math_test_log10(deviceQueue, T{-1, -1, -1, -1},
+                                 T{100, 100, 100, 100}, 2);
+  half_precision_math_test_sqrt(deviceQueue, T{-1, -1, -1, -1}, T{4, 4, 4, 4},
+                                2);
+  half_precision_math_test_rsqrt(deviceQueue, T{-1, -1, -1, -1},
+                                 T{0.25, 0.25, 0.25, 0.25}, 2);
+  half_precision_math_test_recip(deviceQueue, T{-1, -1, -1, -1},
+                                 T{0.25, 0.25, 0.25, 0.25}, 4);
+  half_precision_math_test_2_powr(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2},
+                                  T{2, 2, 2, 2}, 4);
+  half_precision_math_test_2_divide(deviceQueue, T{-1, -1, -1, -1},
+                                    T{4, 4, 4, 4}, T{2, 2, 2, 2}, 2);
+}
+
+int main() {
+  queue deviceQueue;
+
+  half_precision_math_tests_3<float3>(deviceQueue);
+  half_precision_math_tests_3<marray<float, 3>>(deviceQueue);
+
+  half_precision_math_tests_4<float4>(deviceQueue);
+  half_precision_math_tests_4<marray<float, 4>>(deviceQueue);
+  std::cout << "Pass" << std::endl;
+  return 0;
+}