This repository was archived by the owner on Mar 28, 2023. It is now read-only.
forked from llvm/llvm-test-suite
-
Notifications
You must be signed in to change notification settings - Fork 131
/
Copy pathjit_caching.cpp
142 lines (120 loc) · 5.05 KB
/
jit_caching.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %CPU_CHECK_PLACEHOLDER --implicit-check-not "COMPUTATION ERROR" --implicit-check-not "WRONG INTERNALIZATION"
// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %GPU_CHECK_PLACEHOLDER --implicit-check-not "COMPUTATION ERROR" --implicit-check-not "WRONG INTERNALIZATION"
// UNSUPPORTED: hip
// REQUIRES: fusion
// Test caching for JIT fused kernels. Also test for debug messages being
// printed when SYCL_RT_WARNING_LEVEL=1.
#include <iostream>
#include <sycl/sycl.hpp>
using namespace sycl;
constexpr size_t dataSize = 512;
enum class Internalization { None, Local, Private };
void performFusion(queue &q, Internalization internalize, range<1> globalSize,
int beta, int gamma, bool insertBarriers = false) {
int alpha = 1;
int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
for (size_t i = 0; i < dataSize; ++i) {
in1[i] = i * 2;
in2[i] = i * 3;
in3[i] = i * 4;
tmp[i] = -1;
out[i] = -1;
}
{
buffer<int> bIn1{in1, globalSize};
buffer<int> bIn2{in2, globalSize};
buffer<int> bIn3{in3, globalSize};
buffer<int> bTmp{tmp, globalSize};
buffer<int> bOut{out, globalSize};
ext::codeplay::experimental::fusion_wrapper fw{q};
fw.start_fusion();
assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
q.submit([&](handler &cgh) {
auto accIn1 = bIn1.get_access(cgh);
auto accIn2 = bIn2.get_access(cgh);
property_list properties{};
if (internalize == Internalization::Private) {
properties = {
sycl::ext::codeplay::experimental::property::promote_private{}};
} else if (internalize == Internalization::Local) {
properties = {
sycl::ext::codeplay::experimental::property::promote_local{}};
}
accessor<int> accTmp = bTmp.get_access(cgh, properties);
cgh.parallel_for<class KernelOne>(globalSize, [=](id<1> i) {
accTmp[i] = accIn1[i] + accIn2[i] * alpha;
});
});
q.submit([&](handler &cgh) {
property_list properties{};
if (internalize == Internalization::Private) {
properties = {
sycl::ext::codeplay::experimental::property::promote_private{}};
} else if (internalize == Internalization::Local) {
properties = {
sycl::ext::codeplay::experimental::property::promote_local{}};
}
accessor<int> accTmp = bTmp.get_access(cgh, properties);
auto accIn3 = bIn3.get_access(cgh);
auto accOut = bOut.get_access(cgh);
cgh.parallel_for<class KernelTwo>(globalSize, [=](id<1> i) {
accOut[i] = accTmp[i] * accIn3[i] * beta * gamma;
});
});
if (insertBarriers) {
fw.complete_fusion();
} else {
fw.complete_fusion(
{ext::codeplay::experimental::property::no_barriers{}});
}
assert(!fw.is_in_fusion_mode() &&
"Queue should not be in fusion mode anymore");
}
// Check the results
size_t numErrors = 0;
size_t numInternalized = 0;
for (size_t i = 0; i < dataSize; ++i) {
if (i < globalSize.size() && out[i] != (20 * i * i * beta * gamma)) {
++numErrors;
}
if (tmp[i] == -1) {
++numInternalized;
}
}
if (numErrors) {
std::cout << "COMPUTATION ERROR\n";
}
if ((internalize == Internalization::None) && numInternalized) {
std::cout << "WRONG INTERNALIZATION\n";
}
}
int main() {
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
// Initial invocation
performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 1);
// CHECK: JIT DEBUG: Compiling new kernel, no suitable cached kernel found
// Identical invocation, should lead to JIT cache hit.
performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 1);
// CHECK-NEXT: JIT DEBUG: Re-using cached JIT kernel
// CHECK-NEXT: INFO: Re-using existing device binary for fused kernel
// Invocation with a different beta. Because beta was identical to alpha so
// far, this should lead to a cache miss.
performFusion(q, Internalization::Private, range<1>{dataSize}, 2, 1);
// CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found
// Invocation with barrier insertion should lead to a cache miss.
performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 1,
/* insertBarriers */ true);
// CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found
// Invocation with different internalization target should lead to a cache
// miss.
performFusion(q, Internalization::None, range<1>{dataSize}, 1, 1);
// CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found
// Invocation with a different gamma should lead to a cache miss because gamma
// participates in constant propagation.
performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 2);
// CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found
return 0;
}