|
1 | 1 | // Generated by the Tensor Algebra Compiler (tensor-compiler.org)
|
2 |
| -// taco "A(i,j)=B(i,j,k)*c(k)" -f=A:ss:0,1 -f=B:sss:0,1,2 -f=c:d:0 -s="fuse(i,j,f)" -s="pos(f,fpos,B)" -s="split(fpos,chunk,fpos2,8)" -s="reorder(chunk,fpos2,k)" -s="parallelize(chunk,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c |
| 2 | +// taco "A(i,j)=B(i,j,k)*c(k)" -f=A:ds:0,1 -f=B:sss:0,1,2 -f=c:d:0 -s="assemble(A,Insert)" -s="parallelize(i,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c |
3 | 3 |
|
4 | 4 | int assemble(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *c) {
|
5 |
| - int* restrict A1_pos = (int*)(A->indices[0][0]); |
6 |
| - int* restrict A1_crd = (int*)(A->indices[0][1]); |
| 5 | + int A1_dimension = (int)(A->dimensions[0]); |
7 | 6 | int* restrict A2_pos = (int*)(A->indices[1][0]);
|
8 | 7 | int* restrict A2_crd = (int*)(A->indices[1][1]);
|
9 | 8 | double* restrict A_vals = (double*)(A->vals);
|
10 |
| - int B2_dimension = (int)(B->dimensions[1]); |
| 9 | + int B1_dimension = (int)(B->dimensions[0]); |
11 | 10 | int* restrict B1_pos = (int*)(B->indices[0][0]);
|
12 | 11 | int* restrict B1_crd = (int*)(B->indices[0][1]);
|
13 | 12 | int* restrict B2_pos = (int*)(B->indices[1][0]);
|
14 | 13 | int* restrict B2_crd = (int*)(B->indices[1][1]);
|
| 14 | + int* restrict B3_pos = (int*)(B->indices[2][0]); |
| 15 | + int* restrict B3_crd = (int*)(B->indices[2][1]); |
| 16 | + int c1_dimension = (int)(c->dimensions[0]); |
15 | 17 |
|
16 |
| - int32_t pB2_begin = 0; |
17 |
| - int32_t pB2_end = B1_pos[1]; |
| 18 | + int32_t* restrict A2_nnz = calloc(B1_dimension, sizeof(int32_t)); |
18 | 19 |
|
19 |
| - A1_pos = (int32_t*)malloc(sizeof(int32_t) * 2); |
20 |
| - A1_pos[0] = 0; |
21 |
| - int32_t A1_crd_size = 1048576; |
22 |
| - A1_crd = (int32_t*)malloc(sizeof(int32_t) * A1_crd_size); |
23 |
| - int32_t iA = 0; |
24 |
| - int32_t A2_pos_size = 1048576; |
25 |
| - A2_pos = (int32_t*)malloc(sizeof(int32_t) * A2_pos_size); |
| 20 | + #pragma omp parallel for schedule(runtime) |
| 21 | + for (int32_t iB = B1_pos[0]; iB < B1_pos[1]; iB++) { |
| 22 | + int32_t i = B1_crd[iB]; |
| 23 | + int32_t tjA2_nnz_val = 0; |
| 24 | + for (int32_t jB = B2_pos[iB]; jB < B2_pos[(iB + 1)]; jB++) { |
| 25 | + bool qtkA_val = 0; |
| 26 | + for (int32_t kB = B3_pos[jB]; kB < B3_pos[(jB + 1)]; kB++) { |
| 27 | + int32_t k = B3_crd[kB]; |
| 28 | + qtkA_val = 1; |
| 29 | + } |
| 30 | + tjA2_nnz_val += (int32_t)qtkA_val; |
| 31 | + } |
| 32 | + A2_nnz[i] = tjA2_nnz_val; |
| 33 | + } |
| 34 | + |
| 35 | + A2_pos = (int32_t*)malloc(sizeof(int32_t) * (A1_dimension + 1)); |
26 | 36 | A2_pos[0] = 0;
|
27 |
| - int32_t A2_crd_size = 1048576; |
28 |
| - A2_crd = (int32_t*)malloc(sizeof(int32_t) * A2_crd_size); |
29 |
| - int32_t jA = 0; |
| 37 | + for (int32_t i = 0; i < A1_dimension; i++) { |
| 38 | + A2_pos[i + 1] = A2_pos[i] + A2_nnz[i]; |
| 39 | + } |
| 40 | + A2_crd = (int32_t*)malloc(sizeof(int32_t) * A2_pos[A1_dimension]); |
| 41 | + A_vals = (double*)malloc(sizeof(double) * A2_pos[A1_dimension]); |
30 | 42 |
|
31 | 43 | #pragma omp parallel for schedule(runtime)
|
32 |
| - for (int32_t chunk = 0; chunk < ((B2_pos[B1_pos[1]] + 7) / 8); chunk++) { |
33 |
| - int32_t fposB = chunk * 8; |
34 |
| - int32_t i_pos = taco_binarySearchBefore(B2_pos, pB2_begin, pB2_end, fposB); |
35 |
| - int32_t i = B1_crd[i_pos]; |
36 |
| - for (int32_t fpos2 = 0; fpos2 < 8; fpos2++) { |
37 |
| - int32_t fposB = chunk * 8 + fpos2; |
38 |
| - if (fposB >= B2_pos[B1_pos[1]]) |
39 |
| - continue; |
| 44 | + for (int32_t iB0 = B1_pos[0]; iB0 < B1_pos[1]; iB0++) { |
| 45 | + int32_t i = B1_crd[iB0]; |
40 | 46 |
|
41 |
| - int32_t f = B2_crd[fposB]; |
42 |
| - if (fposB == B2_pos[(i_pos + 1)]) { |
43 |
| - i_pos++; |
44 |
| - i = B1_crd[i_pos]; |
45 |
| - } |
46 |
| - if (pA2_begin < jA) { |
47 |
| - if (A1_crd_size <= iA) { |
48 |
| - A1_crd = (int32_t*)realloc(A1_crd, sizeof(int32_t) * (A1_crd_size * 2)); |
49 |
| - A1_crd_size *= 2; |
50 |
| - } |
51 |
| - A1_crd[iA] = fpos2; |
52 |
| - iA++; |
| 47 | + for (int32_t jB0 = B2_pos[iB0]; jB0 < B2_pos[(iB0 + 1)]; jB0++) { |
| 48 | + int32_t j = B2_crd[jB0]; |
| 49 | + bool tkA_set = 0; |
| 50 | + for (int32_t kB0 = B3_pos[jB0]; kB0 < B3_pos[(jB0 + 1)]; kB0++) { |
| 51 | + int32_t k = B3_crd[kB0]; |
| 52 | + tkA_set = 1; |
53 | 53 | }
|
54 |
| - if (A2_crd_size <= jA) { |
55 |
| - A2_crd = (int32_t*)realloc(A2_crd, sizeof(int32_t) * (A2_crd_size * 2)); |
56 |
| - A2_crd_size *= 2; |
| 54 | + if (tkA_set) { |
| 55 | + int32_t pA2 = A2_pos[i]; |
| 56 | + A2_pos[i] = A2_pos[i] + 1; |
| 57 | + A2_crd[pA2] = j; |
57 | 58 | }
|
58 |
| - A2_crd[jA] = fpos2; |
59 |
| - jA++; |
60 | 59 | }
|
| 60 | + } |
61 | 61 |
|
62 |
| - A1_pos[1] = iA; |
63 |
| - A2_pos[iA + 1] = jA; |
| 62 | + for (int32_t p = 0; p < A1_dimension; p++) { |
| 63 | + A2_pos[A1_dimension - p] = A2_pos[((A1_dimension - p) - 1)]; |
64 | 64 | }
|
| 65 | + A2_pos[0] = 0; |
65 | 66 |
|
66 |
| - A_vals = (double*)malloc(sizeof(double) * jA); |
| 67 | + free(A2_nnz); |
67 | 68 |
|
68 |
| - A->indices[0][0] = (uint8_t*)(A1_pos); |
69 |
| - A->indices[0][1] = (uint8_t*)(A1_crd); |
70 | 69 | A->indices[1][0] = (uint8_t*)(A2_pos);
|
71 | 70 | A->indices[1][1] = (uint8_t*)(A2_crd);
|
72 | 71 | A->vals = (uint8_t*)A_vals;
|
|
0 commit comments