@@ -35,14 +35,17 @@ int main(int argc, char* argv[]) {
   // initial value of the counter
   unsigned int initValue = 1234;

+  enum P2PTest {
+    LOCKFREE_ATOMIC_COUNTER = 0
+    ,LOCK_ATOMIC_COUNTER_SAME_CACHELINE

-  // use lock implementation of counter
-  bool useLock = true;
-
+    ,INVALID_P2P_TEST   // last entry
+  };
+  P2PTest test = LOCKFREE_ATOMIC_COUNTER;

   // process the command line arguments
   {
-    const char* options = "h:i:p:";
+    const char* options = "h:i:p:t:";
     int opt;
     while ((opt = getopt(argc, argv, options)) != -1) {
       switch (opt) {
@@ -55,6 +58,10 @@ int main(int argc, char* argv[]) {
       case 'p':
         maxPlayers = atoi(optarg);
         break;
+      case 't':
+        test = (P2PTest) atoi(optarg);
+        assert(test < INVALID_P2P_TEST);
+        break;
       default:
         abort();
       }
@@ -63,6 +70,7 @@ int main(int argc, char* argv[]) {
     printf("Max players: %d\n", maxPlayers);
     printf("# of hits: %d\n", hits);
     printf("Counter initial value: %d\n", initValue);
+    printf("test: %d\n", test);
   }

   am_status_t amStatus;
@@ -91,50 +99,28 @@ int main(int argc, char* argv[]) {
   unsigned int numGPUs = std::min((unsigned int)gpus.size(), maxPlayers);

   char* hostPinned = nullptr;
+
+  std::atomic<unsigned int>* shared_counter = nullptr;
+  std::atomic<unsigned int>* lock = nullptr;

+  switch (test) {

+    case LOCKFREE_ATOMIC_COUNTER:
+      hostPinned = (char*) allocate_shared_mem(sizeof(std::atomic<unsigned int>), currentAccelerator);
+      shared_counter = new (hostPinned) std::atomic<unsigned int>(initValue);
+      break;

+    case LOCK_ATOMIC_COUNTER_SAME_CACHELINE:
+      // create the counter and the lock on the same cacheline
+      hostPinned = (char*) allocate_shared_mem(sizeof(std::atomic<unsigned int>) * 2, currentAccelerator);
+      shared_counter = new (hostPinned) std::atomic<unsigned int>(initValue);
+      lock = new (hostPinned + sizeof(std::atomic<unsigned int>)) std::atomic<unsigned int>(0);
+      break;

-#if 1
-  hostPinned = (char*) allocate_shared_mem(sizeof(std::atomic<unsigned int>), currentAccelerator);
-
-#else
-#if USE_HC_AM
-  hostPinned = hc::am_alloc(sizeof(std::atomic<unsigned int>), currentAccelerator
-                            , amHostCoherent
-                            );
-  printf("shared memory address: %p\n", hostPinned);
-  assert(hostPinned != nullptr);
-#else
-  hsa_amd_memory_pool_t* alloc_region = static_cast<hsa_amd_memory_pool_t*>(currentAccelerator.get_hsa_am_finegrained_system_region());
-  assert(alloc_region->handle != -1);
-
-  hsa_status_t hs;
-  hs = hsa_amd_memory_pool_allocate(*alloc_region, sizeof(std::atomic<unsigned int>), 0, (void**)&hostPinned);
-  assert(hs == HSA_STATUS_SUCCESS);
-
-
-  hsa_agent_t agents[numGPUs];
-  for (int i = 0; i < numGPUs; i++) {
-    agents[i] = *(static_cast<hsa_agent_t*>(gpus[i].get_default_view().get_hsa_agent()));
-  }
-  hs = hsa_amd_agents_allow_access(numGPUs, agents, nullptr, hostPinned);
-  assert(hs == HSA_STATUS_SUCCESS);
-#endif
-#endif
-
-
-
-  std::atomic<unsigned int>* shared_counter = new (hostPinned) std::atomic<unsigned int>(initValue);
-
-
-  std::atomic<unsigned int>* lock = nullptr;
-  if (useLock) {
-    hostPinned = (char*) allocate_shared_mem(sizeof(std::atomic<unsigned int>), currentAccelerator);
-    lock = new (hostPinned) std::atomic<unsigned int>(0);
+    default:
+      abort();
   }

-
   std::vector<hc::completion_future> futures;
   std::vector<hc::array_view<unsigned int,1>> finalValues;

@@ -143,69 +129,110 @@ int main(int argc, char* argv[]) {
     hc::array_view<unsigned int,1> finalValue(1);
     finalValues.push_back(finalValue);

-    futures.push_back(
-      hc::parallel_for_each(gpus[i].get_default_view()
+
+    switch (test) {
+      case LOCKFREE_ATOMIC_COUNTER:
+
+
+        futures.push_back(
+          hc::parallel_for_each(gpus[i].get_default_view()
                              , hc::extent<1>(1)
                              , [=](hc::index<1> idx) [[hc]] {

-      // spin for a while here to ensure that all GPUs have started
-      // and that each of them have loaded the inital value of
-      // "shared_counter" into their cache
-      #pragma nounroll
-      for (int j = 0; j < (1024 * 1024 * 16); ++j) {
-        if (shared_counter->load(std::memory_order_relaxed) == 0xFFFFFFFF)
-          break;
-      }
-
-      // counts how many times this GPU has updated the shared_counter
-      unsigned int count = 0;
-
-      unsigned int gpuID = i;
-      unsigned int next = initValue + gpuID;
-
-      // last known value of shared_counter observed by this GPU
-      unsigned int last = shared_counter->load(std::memory_order_relaxed);
-
-
-      // each GPU waits for its turn (according to the gpuID) to increment the shared_counter
-      #pragma nounroll
-      while (count < hits) {
-        unsigned int expected = next;
-        if (useLock) {
-          unsigned int unlocked = 0;
-          if (std::atomic_compare_exchange_weak_explicit(lock
-                                                        , &unlocked
-                                                        , (unsigned int)1
-                                                        , std::memory_order_seq_cst
-                                                        , std::memory_order_relaxed
-                                                        )) {
-
-            if (shared_counter->load(std::memory_order_relaxed) == expected) {
+            // spin for a while here to ensure that all GPUs have started
+            // and that each of them has loaded the initial value of
+            // "shared_counter" into their cache
+            #pragma nounroll
+            for (int j = 0; j < (1024 * 1024 * 16); ++j) {
+              if (shared_counter->load(std::memory_order_relaxed) == 0xFFFFFFFF)
+                break;
+            }
+
+            // counts how many times this GPU has updated the shared_counter
+            unsigned int count = 0;
+
+            unsigned int gpuID = i;
+            unsigned int next = initValue + gpuID;
+
+            // last known value of shared_counter observed by this GPU
+            unsigned int last = shared_counter->load(std::memory_order_relaxed);
+
+
+            // each GPU waits for its turn (according to the gpuID) to increment the shared_counter
+            #pragma nounroll
+            while (count < hits) {
+              unsigned int expected = next;
+              if (std::atomic_compare_exchange_weak_explicit(shared_counter
+                                                            , &expected
+                                                            , expected + 1
+                                                            , std::memory_order_seq_cst
+                                                            , std::memory_order_relaxed
+                                                            )) {
                 last = expected;
                 next += numGPUs;
                 count++;
-
-              shared_counter->store(expected + 1, std::memory_order_relaxed);
-              lock->store(0, std::memory_order_release);
               }
+            }  // while(count < hits)
+            finalValue[0] = last;
+          })
+        );
+        break;
+
+
+      case LOCK_ATOMIC_COUNTER_SAME_CACHELINE:
+
+        futures.push_back(
+          hc::parallel_for_each(gpus[i].get_default_view()
+                              , hc::extent<1>(1)
+                              , [=](hc::index<1> idx) [[hc]] {
+
+            // spin for a while here to ensure that all GPUs have started
+            // and that each of them has loaded the initial value of
+            // "shared_counter" into their cache
+            #pragma nounroll
+            for (int j = 0; j < (1024 * 1024 * 16); ++j) {
+              if (shared_counter->load(std::memory_order_relaxed) == 0xFFFFFFFF)
+                break;
             }
-          }
-          else {
-            if (std::atomic_compare_exchange_weak_explicit(shared_counter
-                                                          , &expected
-                                                          , expected + 1
-                                                          , std::memory_order_seq_cst
-                                                          , std::memory_order_relaxed
-                                                          )) {
-              last = expected;
-              next += numGPUs;
-              count++;
+
+            // counts how many times this GPU has updated the shared_counter
+            unsigned int count = 0;
+
+            unsigned int gpuID = i;
+            unsigned int next = initValue + gpuID;
+
+            // last known value of shared_counter observed by this GPU
+            unsigned int last = shared_counter->load(std::memory_order_relaxed);
+
+
+            // each GPU waits for its turn (according to the gpuID) to increment the shared_counter
+            #pragma nounroll
+            while (count < hits) {
+              unsigned int expected = next;
+              unsigned int unlocked = 0;
+              if (std::atomic_compare_exchange_weak_explicit(lock
+                                                            , &unlocked
+                                                            , (unsigned int)1
+                                                            , std::memory_order_seq_cst
+                                                            , std::memory_order_relaxed
+                                                            )) {
+
+                if (shared_counter->load(std::memory_order_relaxed) == expected) {
+                  last = expected;
+                  next += numGPUs;
+                  count++;
+                  shared_counter->store(expected + 1, std::memory_order_relaxed);
+                }
+                lock->store(0, std::memory_order_release);
+              }
             }
-          }
-        }
-        finalValue[0] = last;
-      })
-    );
+            finalValue[0] = last;
+          })
+        );
+        break;
+      default:
+        abort();
+    }

     std::cout << "GPU %" << i << " (";
     std::wcout << gpus[i].get_description();
@@ -220,20 +247,12 @@ int main(int argc, char* argv[]) {
            , i, finalValues[i][0], initValue + (hits-1) * numGPUs + i);
   }

-  if (hostPinned) {
-#if 1
-    hc::am_free(shared_counter);
-    if (useLock) {
-      hc::am_free(lock);
-    }
-#else
-#if USE_HC_AM
-    hc::am_free(hostPinned);
-#else
-    hs = hsa_amd_memory_pool_free(hostPinned);
-    assert(hs == HSA_STATUS_SUCCESS);
-#endif
-#endif
+  switch (test) {
+    case LOCKFREE_ATOMIC_COUNTER:
+    case LOCK_ATOMIC_COUNTER_SAME_CACHELINE:
+      hc::am_free(shared_counter);
+      break;
+    default: ;
   }

   return 0;
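
For readers who want to try the synchronization pattern outside the HC/HSA stack, below is a minimal host-only sketch of the turn-taking CAS loop that the LOCKFREE_ATOMIC_COUNTER kernel runs on each GPU. It is an illustration under stated assumptions, not part of this commit: std::thread stands in for the GPU agents, and initValue/numPlayers/hits are hard-coded stand-ins for the command-line options.

// Host-only illustration (assumed names, not from the commit): each "player"
// thread may bump the counter only when it holds the value reserved for its
// turn, mirroring the LOCKFREE_ATOMIC_COUNTER kernel above.
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
  const unsigned int initValue  = 1234;  // stand-in for the -i option
  const unsigned int numPlayers = 4;     // stand-in for the -p option
  const unsigned int hits       = 10;    // stand-in for the -h option

  std::atomic<unsigned int> shared_counter(initValue);

  std::vector<std::thread> players;
  for (unsigned int id = 0; id < numPlayers; ++id) {
    players.emplace_back([&, id] {
      unsigned int count = 0;
      unsigned int next  = initValue + id;   // value this player is allowed to publish
      while (count < hits) {
        unsigned int expected = next;
        // The CAS succeeds only when it is this player's turn.
        if (std::atomic_compare_exchange_weak_explicit(&shared_counter
                                                      , &expected
                                                      , expected + 1
                                                      , std::memory_order_seq_cst
                                                      , std::memory_order_relaxed)) {
          next += numPlayers;                // reserve the player's next turn
          ++count;
        }
      }
    });
  }
  for (auto& t : players) t.join();

  // Expected final value: initValue + numPlayers * hits.
  std::printf("final counter: %u\n", shared_counter.load());
  return 0;
}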