|
1 | 1 | // Generated by the Tensor Algebra Compiler (tensor-compiler.org)
|
2 |
| -// taco "y(i)=A(i,j)*x(j)" -f=y:d:0 -f=A:ds:0,1 -f=x:d:0 -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c |
| 2 | +// taco "y(i)=A(i,j)*x(j)" -f=y:d:0 -f=A:ds:0,1 -f=x:d:0 -s=split(i,i0,i1,32) -s=reorder(i0,i1,j) -s=parallelize(i0,CPUThread,NoRaces) -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c |
3 | 3 | #ifndef TACO_C_HEADERS
|
4 | 4 | #define TACO_C_HEADERS
|
5 | 5 | #include <stdio.h>
|
@@ -118,14 +118,23 @@ int compute(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
|
118 | 118 | int x1_dimension = (int)(x->dimensions[0]);
|
119 | 119 | double* restrict x_vals = (double*)(x->vals);
|
120 | 120 |
|
| 121 | + #pragma omp parallel for schedule(static) |
| 122 | + for (int32_t py = 0; py < y1_dimension; py++) { |
| 123 | + y_vals[py] = 0.0; |
| 124 | + } |
| 125 | + |
121 | 126 | #pragma omp parallel for schedule(runtime)
|
122 |
| - for (int32_t i = 0; i < A1_dimension; i++) { |
123 |
| - double y_val = 0.0; |
124 |
| - for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) { |
125 |
| - int32_t j = A2_crd[jA]; |
126 |
| - y_val += A_vals[jA] * x_vals[j]; |
| 127 | + for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) { |
| 128 | + for (int32_t i1 = 0; i1 < 32; i1++) { |
| 129 | + int32_t i = i0 * 32 + i1; |
| 130 | + if (i >= A1_dimension) |
| 131 | + continue; |
| 132 | + |
| 133 | + for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) { |
| 134 | + int32_t j = A2_crd[jA]; |
| 135 | + y_vals[i] = y_vals[i] + A_vals[jA] * x_vals[j]; |
| 136 | + } |
127 | 137 | }
|
128 |
| - y_vals[i] = y_val; |
129 | 138 | }
|
130 | 139 | return 0;
|
131 | 140 | }
|
@@ -153,14 +162,23 @@ int evaluate(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
|
153 | 162 | int32_t y_capacity = y1_dimension;
|
154 | 163 | y_vals = (double*)malloc(sizeof(double) * y_capacity);
|
155 | 164 |
|
| 165 | + #pragma omp parallel for schedule(static) |
| 166 | + for (int32_t py = 0; py < y_capacity; py++) { |
| 167 | + y_vals[py] = 0.0; |
| 168 | + } |
| 169 | + |
156 | 170 | #pragma omp parallel for schedule(runtime)
|
157 |
| - for (int32_t i = 0; i < A1_dimension; i++) { |
158 |
| - double y_val = 0.0; |
159 |
| - for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) { |
160 |
| - int32_t j = A2_crd[jA]; |
161 |
| - y_val += A_vals[jA] * x_vals[j]; |
| 171 | + for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) { |
| 172 | + for (int32_t i1 = 0; i1 < 32; i1++) { |
| 173 | + int32_t i = i0 * 32 + i1; |
| 174 | + if (i >= A1_dimension) |
| 175 | + continue; |
| 176 | + |
| 177 | + for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) { |
| 178 | + int32_t j = A2_crd[jA]; |
| 179 | + y_vals[i] = y_vals[i] + A_vals[jA] * x_vals[j]; |
| 180 | + } |
162 | 181 | }
|
163 |
| - y_vals[i] = y_val; |
164 | 182 | }
|
165 | 183 |
|
166 | 184 | y->vals = (uint8_t*)y_vals;
|
@@ -218,12 +236,12 @@ int pack_A(taco_tensor_t *A, int* A_COO1_pos, int* A_COO1_crd, int* A_COO2_crd,
|
218 | 236 | jA_COO++;
|
219 | 237 | }
|
220 | 238 | if (A_capacity <= jA) {
|
221 |
| - A_vals = (double*)realloc(A_vals, sizeof(double) * (A_capacity * 2)); |
| 239 | + A_vals = (double*)realloc(A_vals, sizeof(double) * A_capacity * 2); |
222 | 240 | A_capacity *= 2;
|
223 | 241 | }
|
224 | 242 | A_vals[jA] = A_COO_val;
|
225 | 243 | if (A2_crd_size <= jA) {
|
226 |
| - A2_crd = (int32_t*)realloc(A2_crd, sizeof(int32_t) * (A2_crd_size * 2)); |
| 244 | + A2_crd = (int32_t*)realloc(A2_crd, sizeof(int32_t) * A2_crd_size * 2); |
227 | 245 | A2_crd_size *= 2;
|
228 | 246 | }
|
229 | 247 | A2_crd[jA] = j;
|
@@ -294,12 +312,12 @@ int unpack(int** y_COO1_pos_ptr, int** y_COO1_crd_ptr, double** y_COO_vals_ptr,
|
294 | 312 |
|
295 | 313 | for (int32_t i = 0; i < y1_dimension; i++) {
|
296 | 314 | if (y_COO_capacity <= iy_COO) {
|
297 |
| - y_COO_vals = (double*)realloc(y_COO_vals, sizeof(double) * (y_COO_capacity * 2)); |
| 315 | + y_COO_vals = (double*)realloc(y_COO_vals, sizeof(double) * y_COO_capacity * 2); |
298 | 316 | y_COO_capacity *= 2;
|
299 | 317 | }
|
300 | 318 | y_COO_vals[iy_COO] = y_vals[i];
|
301 | 319 | if (y_COO1_crd_size <= iy_COO) {
|
302 |
| - y_COO1_crd = (int32_t*)realloc(y_COO1_crd, sizeof(int32_t) * (y_COO1_crd_size * 2)); |
| 320 | + y_COO1_crd = (int32_t*)realloc(y_COO1_crd, sizeof(int32_t) * y_COO1_crd_size * 2); |
303 | 321 | y_COO1_crd_size *= 2;
|
304 | 322 | }
|
305 | 323 | y_COO1_crd[iy_COO] = i;
|
|
0 commit comments