5
5
#include < algorithm>
6
6
#include < iostream>
7
7
#include < unistd.h>
8
+ #include < thread>
8
9
9
10
#include < hc.hpp>
10
11
#include < hc_am.hpp>
13
14
#include " hsa/hsa_ext_amd.h"
14
15
15
16
16
- static inline void * allocate_shared_mem (size_t size, hc::accelerator accelerator) {
17
+ static inline void * allocate_shared_mem (size_t size, hc::accelerator accelerator, int allocationMode ) {
17
18
18
19
void * hostPinned = hc::am_alloc (sizeof (std::atomic<unsigned int >), accelerator
19
- , amHostCoherent
20
+ , allocationMode
20
21
);
21
22
printf (" shared memory address: %p\n " ,hostPinned);
22
23
assert (hostPinned != nullptr );
23
24
return hostPinned;
24
25
}
25
26
27
// Device-side cache-maintenance hook called around the shared-counter
// accesses in the GPU kernels.
//
// The guarded sequence waits for all outstanding vector memory operations
// (s_waitcnt vmcnt(0)) and then writes back / invalidates the volatile L1
// cache (buffer_wbinvl1_vol) — per the AMD GCN ISA. It is currently disabled
// with '#if 0', so this function compiles to a no-op; flip the guard to
// re-enable the explicit flush when testing non-coherent allocation modes.
static inline void __buffer_flush() [[hc]] {
#if 0
  asm volatile(
    "s_waitcnt vmcnt(0);"
    "buffer_wbinvl1_vol;"
  );
#endif
}
26
35
27
36
28
37
int main (int argc, char * argv[]) {
@@ -35,17 +44,21 @@ int main(int argc, char* argv[]) {
35
44
// initial value of the counter
36
45
unsigned int initValue = 1234 ;
37
46
47
+ int allocationMode = amHostCoherent;
48
+
49
+ bool threadEmptyKernel = false ;
50
+
38
51
enum P2PTest {
39
52
LOCKFREE_ATOMIC_COUNTER = 0
40
- ,LOCK_ATOMIC_COUNTER_SAME_CACHELINE
41
-
53
+ ,LOCK_ATOMIC_COUNTER_SAME_CACHELINE = 1
54
+ ,LOCK_ATOMIC_COUNTER_DIFFERENT_CACHELINE = 2
42
55
,INVALID_P2P_TEST // last entry
43
56
};
44
57
P2PTest test = LOCKFREE_ATOMIC_COUNTER;
45
58
46
59
// process the command line arguments
47
60
{
48
- const char * options = " h :i:p:t:" ;
61
+ const char * options = " eh :i:p:t:a :" ;
49
62
int opt;
50
63
while ((opt = getopt (argc, argv, options))!=-1 ) {
51
64
switch (opt) {
@@ -62,6 +75,14 @@ int main(int argc, char* argv[]) {
62
75
test = (P2PTest) atoi (optarg );
63
76
assert (test < INVALID_P2P_TEST);
64
77
break ;
78
+ case ' a' :
79
+ {
80
+ allocationMode = atoi (optarg );
81
+ break ;
82
+ }
83
+ case ' e' :
84
+ threadEmptyKernel = true ;
85
+ break ;
65
86
default :
66
87
abort ();
67
88
}
@@ -106,17 +127,26 @@ int main(int argc, char* argv[]) {
106
127
switch (test) {
107
128
108
129
case LOCKFREE_ATOMIC_COUNTER:
109
- hostPinned = (char *) allocate_shared_mem (sizeof (std::atomic<unsigned int >), currentAccelerator);
130
+ hostPinned = (char *) allocate_shared_mem (sizeof (std::atomic<unsigned int >), currentAccelerator, allocationMode );
110
131
shared_counter = new (hostPinned) std::atomic<unsigned int >(initValue);
111
132
break ;
112
133
113
134
case LOCK_ATOMIC_COUNTER_SAME_CACHELINE:
114
135
// create the counter and the lock on the same cacheline
115
- hostPinned = (char *) allocate_shared_mem (sizeof (std::atomic<unsigned int >)*2 , currentAccelerator);
136
+ hostPinned = (char *) allocate_shared_mem (sizeof (std::atomic<unsigned int >)*2 , currentAccelerator, allocationMode );
116
137
shared_counter = new (hostPinned) std::atomic<unsigned int >(initValue);
117
138
lock = new (hostPinned + sizeof (std::atomic<unsigned int >)) std::atomic<unsigned int >(0 );
118
139
break ;
119
140
141
+ case LOCK_ATOMIC_COUNTER_DIFFERENT_CACHELINE:
142
+ // create the counter and the lock on different cachelines
143
+ hostPinned = (char *) allocate_shared_mem (sizeof (std::atomic<unsigned int >), currentAccelerator, allocationMode);
144
+ shared_counter = new (hostPinned) std::atomic<unsigned int >(initValue);
145
+ hostPinned = (char *) allocate_shared_mem (sizeof (std::atomic<unsigned int >), currentAccelerator, allocationMode);
146
+ lock = new (hostPinned) std::atomic<unsigned int >(0 );
147
+ break ;
148
+
149
+
120
150
default :
121
151
abort ();
122
152
}
@@ -162,6 +192,9 @@ int main(int argc, char* argv[]) {
162
192
#pragma nounroll
163
193
while (count < hits) {
164
194
unsigned int expected = next;
195
+
196
+ __buffer_flush ();
197
+
165
198
if (std::atomic_compare_exchange_weak_explicit (shared_counter
166
199
, &expected
167
200
, expected + 1
@@ -171,6 +204,10 @@ int main(int argc, char* argv[]) {
171
204
last = expected;
172
205
next+=numGPUs;
173
206
count++;
207
+
208
+
209
+ __buffer_flush ();
210
+
174
211
}
175
212
} // while(count < hits)
176
213
finalValue[0 ] = last;
@@ -180,7 +217,9 @@ int main(int argc, char* argv[]) {
180
217
181
218
182
219
case LOCK_ATOMIC_COUNTER_SAME_CACHELINE:
220
+ case LOCK_ATOMIC_COUNTER_DIFFERENT_CACHELINE:
183
221
222
+
184
223
futures.push_back (
185
224
hc::parallel_for_each (gpus[i].get_default_view ()
186
225
, hc::extent<1 >(1 )
@@ -210,6 +249,9 @@ int main(int argc, char* argv[]) {
210
249
while (count < hits) {
211
250
unsigned int expected = next;
212
251
unsigned int unlocked = 0 ;
252
+
253
+ __buffer_flush ();
254
+
213
255
if (std::atomic_compare_exchange_weak_explicit (lock
214
256
, &unlocked
215
257
, (unsigned int )1
@@ -224,6 +266,9 @@ int main(int argc, char* argv[]) {
224
266
shared_counter->store (expected + 1 , std::memory_order_relaxed);
225
267
}
226
268
lock->store (0 , std::memory_order_release);
269
+
270
+
271
+ __buffer_flush ();
227
272
}
228
273
}
229
274
finalValue[0 ] = last;
@@ -240,18 +285,56 @@ int main(int argc, char* argv[]) {
240
285
}
241
286
printf (" All GPUs have started\n " );
242
287
288
+
289
+ std::atomic<bool > allDone (false );
290
+ std::thread* emptyKernels = nullptr ;
291
+ if (threadEmptyKernel) {
292
+
293
+ // we create a host thread that keeps on
294
+ // launching empty kernels to different devices
295
+ // we hope that the barrier after kernel would
296
+ // flush and invalidate the cache
297
+ emptyKernels = new std::thread ([&]() {
298
+
299
+ std::vector<hc::accelerator_view> acc_views;
300
+ for (auto && a : gpus) {
301
+ acc_views.push_back (a.create_view ());
302
+ }
303
+
304
+ while (!allDone.load ()) {
305
+ for (auto && v : acc_views) {
306
+ // launch an empty kernel
307
+ hc::parallel_for_each (v, hc::extent<1 >(1 ), [](hc::index <1 > i) [[hc]] {
308
+
309
+ });
310
+ }
311
+ }
312
+ });
313
+ }
314
+
315
+
243
316
for (int i = 0 ; i < futures.size (); ++i) {
244
317
printf (" Waiting for GPU #%d to finish\n " , i);
245
318
futures[i].wait ();
246
319
printf (" GPU #%d actual final value: %u, expected final value: %u\n\n "
247
320
, i, finalValues[i][0 ], initValue + (hits-1 ) * numGPUs + i);
248
321
}
249
322
323
+ if (emptyKernels != nullptr ) {
324
+ allDone.store (true );
325
+ emptyKernels->join ();
326
+ delete emptyKernels;
327
+ }
328
+
250
329
switch (test) {
251
330
case LOCKFREE_ATOMIC_COUNTER:
252
331
case LOCK_ATOMIC_COUNTER_SAME_CACHELINE:
253
332
hc::am_free (shared_counter);
254
333
break ;
334
+ case LOCK_ATOMIC_COUNTER_DIFFERENT_CACHELINE:
335
+ hc::am_free (shared_counter);
336
+ hc::am_free (lock);
337
+ break ;
255
338
default : ;
256
339
}
257
340
0 commit comments