Skip to content

Commit f852325

Browse files
committed
allow memory allocation from non-coherent host memory, add option to create a thread to submit empty kernels
1 parent 32d681c commit f852325

File tree

1 file changed

+90
-7
lines changed

1 file changed

+90
-7
lines changed

pingpong/pingpong.cpp

+90-7
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <algorithm>
66
#include <iostream>
77
#include <unistd.h>
8+
#include <thread>
89

910
#include <hc.hpp>
1011
#include <hc_am.hpp>
@@ -13,16 +14,24 @@
1314
#include "hsa/hsa_ext_amd.h"
1415

1516

16-
static inline void* allocate_shared_mem(size_t size, hc::accelerator accelerator) {
17+
static inline void* allocate_shared_mem(size_t size, hc::accelerator accelerator, int allocationMode) {
1718

1819
void* hostPinned = hc::am_alloc(sizeof(std::atomic<unsigned int>), accelerator
19-
, amHostCoherent
20+
, allocationMode
2021
);
2122
printf("shared memory address: %p\n",hostPinned);
2223
assert(hostPinned != nullptr);
2324
return hostPinned;
2425
}
2526

27+
// Cache-flush hook marked [[hc]]; main calls it before and after the atomic
// operations inside the counter/lock polling loops.
// It is currently a no-op: the whole body is compiled out by the `#if 0`.
// NOTE(review): the disabled assembly presumably waits for outstanding vector
// memory operations (s_waitcnt vmcnt(0)) and then writes back / invalidates
// the L1 cache (buffer_wbinvl1_vol) -- confirm against the GCN ISA manual
// before enabling it.
static inline void __buffer_flush() [[hc]] {
28+
#if 0
// disabled experiment: kept for reference, never compiled
29+
asm volatile (
30+
"s_waitcnt vmcnt(0);"
31+
"buffer_wbinvl1_vol;"
32+
);
33+
#endif
34+
}
2635

2736

2837
int main(int argc, char* argv[]) {
@@ -35,17 +44,21 @@ int main(int argc, char* argv[]) {
3544
// initial value of the counter
3645
unsigned int initValue = 1234;
3746

47+
int allocationMode = amHostCoherent;
48+
49+
bool threadEmptyKernel = false;
50+
3851
enum P2PTest {
3952
LOCKFREE_ATOMIC_COUNTER = 0
40-
,LOCK_ATOMIC_COUNTER_SAME_CACHELINE
41-
53+
,LOCK_ATOMIC_COUNTER_SAME_CACHELINE = 1
54+
,LOCK_ATOMIC_COUNTER_DIFFERENT_CACHELINE = 2
4255
,INVALID_P2P_TEST // last entry
4356
};
4457
P2PTest test = LOCKFREE_ATOMIC_COUNTER;
4558

4659
// process the command line arguments
4760
{
48-
const char* options = "h:i:p:t:";
61+
const char* options = "eh:i:p:t:a:";
4962
int opt;
5063
while ((opt = getopt(argc, argv, options))!=-1) {
5164
switch(opt) {
@@ -62,6 +75,14 @@ int main(int argc, char* argv[]) {
6275
test = (P2PTest) atoi(optarg);
6376
assert(test < INVALID_P2P_TEST);
6477
break;
78+
case 'a':
79+
{
80+
allocationMode = atoi(optarg);
81+
break;
82+
}
83+
case 'e':
84+
threadEmptyKernel = true;
85+
break;
6586
default:
6687
abort();
6788
}
@@ -106,17 +127,26 @@ int main(int argc, char* argv[]) {
106127
switch(test) {
107128

108129
case LOCKFREE_ATOMIC_COUNTER:
109-
hostPinned = (char*) allocate_shared_mem(sizeof(std::atomic<unsigned int>), currentAccelerator);
130+
hostPinned = (char*) allocate_shared_mem(sizeof(std::atomic<unsigned int>), currentAccelerator, allocationMode);
110131
shared_counter = new(hostPinned) std::atomic<unsigned int>(initValue);
111132
break;
112133

113134
case LOCK_ATOMIC_COUNTER_SAME_CACHELINE:
114135
// create the counter and the lock on the same cacheline
115-
hostPinned = (char*) allocate_shared_mem(sizeof(std::atomic<unsigned int>)*2, currentAccelerator);
136+
hostPinned = (char*) allocate_shared_mem(sizeof(std::atomic<unsigned int>)*2, currentAccelerator, allocationMode);
116137
shared_counter = new(hostPinned) std::atomic<unsigned int>(initValue);
117138
lock = new(hostPinned + sizeof(std::atomic<unsigned int>)) std::atomic<unsigned int>(0);
118139
break;
119140

141+
case LOCK_ATOMIC_COUNTER_DIFFERENT_CACHELINE:
142+
// create the counter and the lock in two separate allocations (intended to be on different cachelines)
143+
hostPinned = (char*) allocate_shared_mem(sizeof(std::atomic<unsigned int>), currentAccelerator, allocationMode);
144+
shared_counter = new(hostPinned) std::atomic<unsigned int>(initValue);
145+
hostPinned = (char*) allocate_shared_mem(sizeof(std::atomic<unsigned int>), currentAccelerator, allocationMode);
146+
lock = new(hostPinned) std::atomic<unsigned int>(0);
147+
break;
148+
149+
120150
default:
121151
abort();
122152
}
@@ -162,6 +192,9 @@ int main(int argc, char* argv[]) {
162192
#pragma nounroll
163193
while (count < hits) {
164194
unsigned int expected = next;
195+
196+
__buffer_flush();
197+
165198
if (std::atomic_compare_exchange_weak_explicit(shared_counter
166199
, &expected
167200
, expected + 1
@@ -171,6 +204,10 @@ int main(int argc, char* argv[]) {
171204
last = expected;
172205
next+=numGPUs;
173206
count++;
207+
208+
209+
__buffer_flush();
210+
174211
}
175212
} // while(count < hits)
176213
finalValue[0] = last;
@@ -180,7 +217,9 @@ int main(int argc, char* argv[]) {
180217

181218

182219
case LOCK_ATOMIC_COUNTER_SAME_CACHELINE:
220+
case LOCK_ATOMIC_COUNTER_DIFFERENT_CACHELINE:
183221

222+
184223
futures.push_back(
185224
hc::parallel_for_each(gpus[i].get_default_view()
186225
, hc::extent<1>(1)
@@ -210,6 +249,9 @@ int main(int argc, char* argv[]) {
210249
while (count < hits) {
211250
unsigned int expected = next;
212251
unsigned int unlocked = 0;
252+
253+
__buffer_flush();
254+
213255
if (std::atomic_compare_exchange_weak_explicit(lock
214256
, &unlocked
215257
, (unsigned int)1
@@ -224,6 +266,9 @@ int main(int argc, char* argv[]) {
224266
shared_counter->store(expected + 1, std::memory_order_relaxed);
225267
}
226268
lock->store(0, std::memory_order_release);
269+
270+
271+
__buffer_flush();
227272
}
228273
}
229274
finalValue[0] = last;
@@ -240,18 +285,56 @@ int main(int argc, char* argv[]) {
240285
}
241286
printf("All GPUs have started\n");
242287

288+
289+
std::atomic<bool> allDone(false);
290+
std::thread* emptyKernels = nullptr;
291+
if (threadEmptyKernel) {
292+
293+
// we create a host thread that keeps on
294+
// launching empty kernels to different devices
295+
// we hope that the barrier after kernel would
296+
// flush and invalidate the cache
297+
emptyKernels = new std::thread([&]() {
298+
299+
std::vector<hc::accelerator_view> acc_views;
300+
for(auto&& a : gpus) {
301+
acc_views.push_back(a.create_view());
302+
}
303+
304+
while(!allDone.load()) {
305+
for(auto&& v : acc_views) {
306+
// launch an empty kernel
307+
hc::parallel_for_each(v, hc::extent<1>(1), [](hc::index<1> i) [[hc]] {
308+
309+
});
310+
}
311+
}
312+
});
313+
}
314+
315+
243316
for (int i = 0; i < futures.size(); ++i) {
244317
printf("Waiting for GPU #%d to finish\n", i);
245318
futures[i].wait();
246319
printf("GPU #%d actual final value: %u, expected final value: %u\n\n"
247320
, i, finalValues[i][0], initValue + (hits-1) * numGPUs + i);
248321
}
249322

323+
if (emptyKernels != nullptr) {
324+
allDone.store(true);
325+
emptyKernels->join();
326+
delete emptyKernels;
327+
}
328+
250329
switch(test) {
251330
case LOCKFREE_ATOMIC_COUNTER:
252331
case LOCK_ATOMIC_COUNTER_SAME_CACHELINE:
253332
hc::am_free(shared_counter);
254333
break;
334+
case LOCK_ATOMIC_COUNTER_DIFFERENT_CACHELINE:
335+
hc::am_free(shared_counter);
336+
hc::am_free(lock);
337+
break;
255338
default: ;
256339
}
257340

0 commit comments

Comments
 (0)