|
| 1 | +// RUN: %{build} -fsycl-embed-ir -o %t.out |
| 2 | +// RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 \ |
| 3 | +// RUN: | FileCheck %s --implicit-check-not=ERROR |
| 4 | + |
| 5 | +// Test fusion with queue shortcut functions being involved. |
| 6 | + |
| 7 | +#include <sycl/sycl.hpp> |
| 8 | + |
| 9 | +using namespace sycl; |
| 10 | + |
| 11 | +template <int FusionStartPoint, int KernelNum> class Kernel; |
| 12 | + |
| 13 | +template <int FusionStartPoint> void test() { |
| 14 | + static_assert(0 <= FusionStartPoint && FusionStartPoint < 3, |
| 15 | + "Invalid fusion start point"); |
| 16 | + |
| 17 | + constexpr size_t size = 1024; |
| 18 | + constexpr float value = 10; |
| 19 | + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; |
| 20 | + std::array<float, size> h; |
| 21 | + h.fill(0); |
| 22 | + auto *ptr0 = sycl::malloc_device<float>(size, q); |
| 23 | + auto *ptr1 = sycl::malloc_device<float>(size, q); |
| 24 | + |
| 25 | + { |
| 26 | + range<1> r{size}; |
| 27 | + |
| 28 | + ext::codeplay::experimental::fusion_wrapper fw{q}; |
| 29 | + |
| 30 | + if constexpr (FusionStartPoint == 0) { |
| 31 | + fw.start_fusion(); |
| 32 | + } |
| 33 | + |
| 34 | + // ptr0(x) = value |
| 35 | + auto e0 = q.parallel_for<Kernel<FusionStartPoint, 0>>( |
| 36 | + r, [=](sycl::id<1> i) { ptr0[i] = value; }); |
| 37 | + // ptr1(x) = value / 2 |
| 38 | + auto e1 = q.parallel_for<Kernel<FusionStartPoint, 1>>( |
| 39 | + r, [=](sycl::id<1> i) { ptr1[i] = value / 2; }); |
| 40 | + |
| 41 | + if constexpr (FusionStartPoint == 1) { |
| 42 | + fw.start_fusion(); |
| 43 | + } |
| 44 | + |
| 45 | + // ptr0(x) = value / 2 if x < size / 2 else value |
| 46 | + auto e2 = q.memcpy(ptr0, ptr1, sizeof(float) * size / 2, {e0, e1}); |
| 47 | + |
| 48 | + if constexpr (FusionStartPoint == 2) { |
| 49 | + fw.start_fusion(); |
| 50 | + } |
| 51 | + |
| 52 | + // ptr0(x) = value / 2 + 1 if x < size / 2 else value + 1 |
| 53 | + auto e3 = q.parallel_for<Kernel<FusionStartPoint, 2>>( |
| 54 | + r, e2, [=](sycl::id<1> i) { ptr0[i]++; }); |
| 55 | + |
| 56 | + fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); |
| 57 | + |
| 58 | + // Copyback |
| 59 | + q.memcpy(h.data(), ptr0, sizeof(float) * size, e3).wait(); |
| 60 | + } |
| 61 | + |
| 62 | + sycl::free(ptr0, q); |
| 63 | + sycl::free(ptr1, q); |
| 64 | + |
| 65 | + assert(std::all_of(h.begin(), h.begin() + size / 2, |
| 66 | + [=](float f) { return f == value / 2 + 1; }) && |
| 67 | + "ERROR"); |
| 68 | + assert(std::all_of(h.begin() + size / 2, h.end(), |
| 69 | + [=](float f) { return f == value + 1; }) && |
| 70 | + "ERROR"); |
| 71 | +} |
| 72 | + |
| 73 | +int main() { |
| 74 | + std::cerr << "FusionStartPoint = 0:\n"; |
| 75 | + // COM: memcpy leads to a CG being created as it depends on CGs not producing |
| 76 | + // a PI event (coming from the CGs to be fused), so not safe to bypass. Fusion |
| 77 | + // should be cancelled as a dependency with an event to be fused is found. |
| 78 | + |
| 79 | + // CHECK: FusionStartPoint = 0: |
| 80 | + // CHECK-NEXT: WARNING: Not fusing 'copy usm' command group. Can only fuse device kernel command groups. |
| 81 | + // CHECK-NEXT: WARNING: Aborting fusion because synchronization with one of the kernels in the fusion list was requested |
| 82 | + test<0>(); |
| 83 | + |
| 84 | + std::cerr << "FusionStartPoint = 1:\n"; |
| 85 | + // COM: memcpy does not create CG, memory manager handles the operation |
| 86 | + // instead. As no dependency with a CG to be fused is found, events are issued |
| 87 | + // as usual and fusion takes place. |
| 88 | + |
| 89 | + // CHECK-NEXT: FusionStartPoint = 1: |
| 90 | + // CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found |
| 91 | + test<1>(); |
| 92 | + |
| 93 | + std::cerr << "FusionStartPoint = 2:\n"; |
| 94 | + // COM: Same as above. |
| 95 | + |
| 96 | + // CHECK-NEXT: FusionStartPoint = 2: |
| 97 | + // CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found |
| 98 | + test<2>(); |
| 99 | +} |
0 commit comments