9
9
#include <math.h>
10
10
#include <complex.h>
11
11
#include <string.h>
12
+ #if _OPENMP
13
+ #include <omp.h>
14
+ #endif
12
15
/*
 * Smaller of two values. The '(' must directly follow the macro name for
 * this to be a function-like macro. Each argument may be evaluated twice,
 * so do not pass expressions with side effects (e.g. i++).
 */
#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))
/* Larger of two values; same double-evaluation caveat as TACO_MIN. */
#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
/*
 * Accesses member _a of the object reached by dereferencing __ctx__ and
 * casting the result to ___context___*. Both __ctx__ and the ___context___
 * type must be in scope at the expansion site.
 */
#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)
@@ -26,6 +29,10 @@ typedef struct {
26
29
int32_t vals_size ; // values array size
27
30
} taco_tensor_t ;
28
31
#endif
32
/*
 * Serial fallbacks for the two OpenMP runtime queries this file uses.
 * Compiled only when _OPENMP is not defined (or is 0), i.e. when <omp.h>
 * was not included above, so the per-thread workspace indexing still
 * compiles and behaves as a single thread.
 */
#if !_OPENMP
/* Serial build: the only thread is thread 0. */
int omp_get_thread_num(void) { return 0; }
/* Serial build: at most one thread ever runs. */
int omp_get_max_threads(void) { return 1; }
#endif
29
36
/*
 * Three-way comparison of two ints; the signature matches the comparator
 * shape expected by qsort()/bsearch().
 *
 * a, b: pointers to the int values to compare (must not be NULL).
 * Returns a negative value if *a < *b, zero if they are equal, and a
 * positive value if *a > *b.
 *
 * The result is built from two relational tests rather than the subtraction
 * *a - *b, because that subtraction overflows (undefined behaviour) when the
 * operands are far apart, e.g. INT_MIN and INT_MAX, and can yield the wrong
 * sign.
 */
int cmp(const void *a, const void *b) {
  const int lhs = *(const int *)a;
  const int rhs = *(const int *)b;
  return (lhs > rhs) - (lhs < rhs);
}
@@ -122,14 +129,18 @@ int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
122
129
int * restrict C2_crd = (int * )(C -> indices [1 ][1 ]);
123
130
double * restrict C_vals = (double * )(C -> vals );
124
131
132
+ double * restrict workspace_all = 0 ;
133
+ int32_t * restrict workspace_index_list_all = 0 ;
134
+ workspace_index_list_all = (int32_t * )malloc (sizeof (int32_t ) * (C2_dimension * omp_get_max_threads ()));
135
+ bool * restrict workspace_already_set_all = calloc ((C2_dimension * omp_get_max_threads ()), sizeof (bool ));
136
+ workspace_all = (double * )malloc (sizeof (double ) * (C2_dimension * omp_get_max_threads ()));
137
+
125
138
#pragma omp parallel for schedule(runtime)
126
139
for (int32_t i = 0 ; i < B1_dimension ; i ++ ) {
127
140
int32_t workspace_index_list_size = 0 ;
128
- double * restrict workspace = 0 ;
129
- int32_t * restrict workspace_index_list = 0 ;
130
- workspace_index_list = (int32_t * )malloc (sizeof (int32_t ) * C2_dimension );
131
- bool * restrict workspace_already_set = calloc (C2_dimension , sizeof (bool ));
132
- workspace = (double * )malloc (sizeof (double ) * C2_dimension );
141
+ double * restrict workspace = workspace_all + C2_dimension * omp_get_thread_num ();
142
+ int32_t * restrict workspace_index_list = workspace_index_list_all + C2_dimension * omp_get_thread_num ();
143
+ bool * restrict workspace_already_set = workspace_already_set_all + C2_dimension * omp_get_thread_num ();
133
144
for (int32_t kB = B2_pos [i ]; kB < B2_pos [(i + 1 )]; kB ++ ) {
134
145
int32_t k = B2_crd [kB ];
135
146
for (int32_t jC = C2_pos [k ]; jC < C2_pos [(k + 1 )]; jC ++ ) {
@@ -153,11 +164,12 @@ int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
153
164
A_vals [pA2 ] = workspace [j ];
154
165
workspace_already_set [j ] = 0 ;
155
166
}
156
- free (workspace_index_list );
157
- free (workspace_already_set );
158
- free (workspace );
159
167
}
160
168
169
+ free (workspace_index_list_all );
170
+ free (workspace_already_set_all );
171
+ free (workspace_all );
172
+
161
173
for (int32_t p = 0 ; p < A1_dimension ; p ++ ) {
162
174
A2_pos [A1_dimension - p ] = A2_pos [((A1_dimension - p ) - 1 )];
163
175
}
@@ -184,12 +196,15 @@ int assemble(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
184
196
int32_t * restrict A2_nnz = 0 ;
185
197
A2_nnz = (int32_t * )malloc (sizeof (int32_t ) * B1_dimension );
186
198
199
+ int32_t * restrict qworkspace_index_list_all = 0 ;
200
+ qworkspace_index_list_all = (int32_t * )malloc (sizeof (int32_t ) * (C2_dimension * omp_get_max_threads ()));
201
+ bool * restrict qworkspace_already_set_all = calloc ((C2_dimension * omp_get_max_threads ()), sizeof (bool ));
202
+
187
203
#pragma omp parallel for schedule(runtime)
188
204
for (int32_t i = 0 ; i < B1_dimension ; i ++ ) {
189
205
int32_t qworkspace_index_list_size = 0 ;
190
- int32_t * restrict qworkspace_index_list = 0 ;
191
- qworkspace_index_list = (int32_t * )malloc (sizeof (int32_t ) * C2_dimension );
192
- bool * restrict qworkspace_already_set = calloc (C2_dimension , sizeof (bool ));
206
+ int32_t * restrict qworkspace_index_list = qworkspace_index_list_all + C2_dimension * omp_get_thread_num ();
207
+ bool * restrict qworkspace_already_set = qworkspace_already_set_all + C2_dimension * omp_get_thread_num ();
193
208
for (int32_t kB = B2_pos [i ]; kB < B2_pos [(i + 1 )]; kB ++ ) {
194
209
int32_t k = B2_crd [kB ];
195
210
for (int32_t jC = C2_pos [k ]; jC < C2_pos [(k + 1 )]; jC ++ ) {
@@ -208,10 +223,11 @@ int assemble(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
208
223
qworkspace_already_set [j ] = 0 ;
209
224
}
210
225
A2_nnz [i ] = tjA2_nnz_val ;
211
- free (qworkspace_index_list );
212
- free (qworkspace_already_set );
213
226
}
214
227
228
+ free (qworkspace_index_list_all );
229
+ free (qworkspace_already_set_all );
230
+
215
231
A2_pos = (int32_t * )malloc (sizeof (int32_t ) * (A1_dimension + 1 ));
216
232
A2_pos [0 ] = 0 ;
217
233
for (int32_t i = 0 ; i < A1_dimension ; i ++ ) {
@@ -220,12 +236,15 @@ int assemble(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
220
236
A2_crd = (int32_t * )malloc (sizeof (int32_t ) * A2_pos [A1_dimension ]);
221
237
A_vals = (double * )malloc (sizeof (double ) * A2_pos [A1_dimension ]);
222
238
239
+ int32_t * restrict workspace_index_list_all = 0 ;
240
+ workspace_index_list_all = (int32_t * )malloc (sizeof (int32_t ) * (C2_dimension * omp_get_max_threads ()));
241
+ bool * restrict workspace_already_set_all = calloc ((C2_dimension * omp_get_max_threads ()), sizeof (bool ));
242
+
223
243
#pragma omp parallel for schedule(runtime)
224
244
for (int32_t i = 0 ; i < B1_dimension ; i ++ ) {
225
245
int32_t workspace_index_list_size = 0 ;
226
- int32_t * restrict workspace_index_list = 0 ;
227
- workspace_index_list = (int32_t * )malloc (sizeof (int32_t ) * C2_dimension );
228
- bool * restrict workspace_already_set = calloc (C2_dimension , sizeof (bool ));
246
+ int32_t * restrict workspace_index_list = workspace_index_list_all + C2_dimension * omp_get_thread_num ();
247
+ bool * restrict workspace_already_set = workspace_already_set_all + C2_dimension * omp_get_thread_num ();
229
248
for (int32_t kB0 = B2_pos [i ]; kB0 < B2_pos [(i + 1 )]; kB0 ++ ) {
230
249
int32_t k = B2_crd [kB0 ];
231
250
for (int32_t jC0 = C2_pos [k ]; jC0 < C2_pos [(k + 1 )]; jC0 ++ ) {
@@ -246,10 +265,11 @@ int assemble(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
246
265
A2_crd [pA2 ] = j ;
247
266
workspace_already_set [j ] = 0 ;
248
267
}
249
- free (workspace_index_list );
250
- free (workspace_already_set );
251
268
}
252
269
270
+ free (workspace_index_list_all );
271
+ free (workspace_already_set_all );
272
+
253
273
for (int32_t p = 0 ; p < A1_dimension ; p ++ ) {
254
274
A2_pos [A1_dimension - p ] = A2_pos [((A1_dimension - p ) - 1 )];
255
275
}
@@ -281,12 +301,15 @@ int evaluate(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
281
301
int32_t * restrict A2_nnz = 0 ;
282
302
A2_nnz = (int32_t * )malloc (sizeof (int32_t ) * B1_dimension );
283
303
304
+ int32_t * restrict qworkspace_index_list_all = 0 ;
305
+ qworkspace_index_list_all = (int32_t * )malloc (sizeof (int32_t ) * (C2_dimension * omp_get_max_threads ()));
306
+ bool * restrict qworkspace_already_set_all = calloc ((C2_dimension * omp_get_max_threads ()), sizeof (bool ));
307
+
284
308
#pragma omp parallel for schedule(runtime)
285
309
for (int32_t i = 0 ; i < B1_dimension ; i ++ ) {
286
310
int32_t qworkspace_index_list_size = 0 ;
287
- int32_t * restrict qworkspace_index_list = 0 ;
288
- qworkspace_index_list = (int32_t * )malloc (sizeof (int32_t ) * C2_dimension );
289
- bool * restrict qworkspace_already_set = calloc (C2_dimension , sizeof (bool ));
311
+ int32_t * restrict qworkspace_index_list = qworkspace_index_list_all + C2_dimension * omp_get_thread_num ();
312
+ bool * restrict qworkspace_already_set = qworkspace_already_set_all + C2_dimension * omp_get_thread_num ();
290
313
for (int32_t kB = B2_pos [i ]; kB < B2_pos [(i + 1 )]; kB ++ ) {
291
314
int32_t k = B2_crd [kB ];
292
315
for (int32_t jC = C2_pos [k ]; jC < C2_pos [(k + 1 )]; jC ++ ) {
@@ -305,10 +328,11 @@ int evaluate(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
305
328
qworkspace_already_set [j ] = 0 ;
306
329
}
307
330
A2_nnz [i ] = tjA2_nnz_val ;
308
- free (qworkspace_index_list );
309
- free (qworkspace_already_set );
310
331
}
311
332
333
+ free (qworkspace_index_list_all );
334
+ free (qworkspace_already_set_all );
335
+
312
336
A2_pos = (int32_t * )malloc (sizeof (int32_t ) * (A1_dimension + 1 ));
313
337
A2_pos [0 ] = 0 ;
314
338
for (int32_t i = 0 ; i < A1_dimension ; i ++ ) {
@@ -317,14 +341,18 @@ int evaluate(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
317
341
A2_crd = (int32_t * )malloc (sizeof (int32_t ) * A2_pos [A1_dimension ]);
318
342
A_vals = (double * )malloc (sizeof (double ) * A2_pos [A1_dimension ]);
319
343
344
+ double * restrict workspace_all = 0 ;
345
+ int32_t * restrict workspace_index_list_all = 0 ;
346
+ workspace_index_list_all = (int32_t * )malloc (sizeof (int32_t ) * (C2_dimension * omp_get_max_threads ()));
347
+ bool * restrict workspace_already_set_all = calloc ((C2_dimension * omp_get_max_threads ()), sizeof (bool ));
348
+ workspace_all = (double * )malloc (sizeof (double ) * (C2_dimension * omp_get_max_threads ()));
349
+
320
350
#pragma omp parallel for schedule(runtime)
321
351
for (int32_t i = 0 ; i < B1_dimension ; i ++ ) {
322
352
int32_t workspace_index_list_size = 0 ;
323
- double * restrict workspace = 0 ;
324
- int32_t * restrict workspace_index_list = 0 ;
325
- workspace_index_list = (int32_t * )malloc (sizeof (int32_t ) * C2_dimension );
326
- bool * restrict workspace_already_set = calloc (C2_dimension , sizeof (bool ));
327
- workspace = (double * )malloc (sizeof (double ) * C2_dimension );
353
+ double * restrict workspace = workspace_all + C2_dimension * omp_get_thread_num ();
354
+ int32_t * restrict workspace_index_list = workspace_index_list_all + C2_dimension * omp_get_thread_num ();
355
+ bool * restrict workspace_already_set = workspace_already_set_all + C2_dimension * omp_get_thread_num ();
328
356
for (int32_t kB0 = B2_pos [i ]; kB0 < B2_pos [(i + 1 )]; kB0 ++ ) {
329
357
int32_t k = B2_crd [kB0 ];
330
358
for (int32_t jC0 = C2_pos [k ]; jC0 < C2_pos [(k + 1 )]; jC0 ++ ) {
@@ -350,11 +378,12 @@ int evaluate(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
350
378
A_vals [pA2 ] = workspace [j ];
351
379
workspace_already_set [j ] = 0 ;
352
380
}
353
- free (workspace_index_list );
354
- free (workspace_already_set );
355
- free (workspace );
356
381
}
357
382
383
+ free (workspace_index_list_all );
384
+ free (workspace_already_set_all );
385
+ free (workspace_all );
386
+
358
387
for (int32_t p = 0 ; p < A1_dimension ; p ++ ) {
359
388
A2_pos [A1_dimension - p ] = A2_pos [((A1_dimension - p ) - 1 )];
360
389
}
0 commit comments