Skip to content

Commit 22b8ffb

Browse files
committed
Fixed SpMV and TTV examples
1 parent 53937a5 commit 22b8ffb

File tree

8 files changed

+199
-287
lines changed

8 files changed

+199
-287
lines changed

examples/generate_examples.sh

+1-1
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ mv taco_kernel.c spadd_full.c
1515
mv taco_compute.c spadd_compute.c
1616
mv taco_assembly.c spadd_assembly.c
1717

18-
taco "A(i,j)=B(i,j,k)*c(k)" -f=A:ss:0,1 -f=B:sss:0,1,2 -f=c:d:0 -s="fuse(i,j,f)" -s="pos(f,fpos,B)" -s="split(fpos,chunk,fpos2,8)" -s="reorder(chunk,fpos2,k)" -s="parallelize(chunk,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
18+
taco "A(i,j)=B(i,j,k)*c(k)" -f=A:ds:0,1 -f=B:sss:0,1,2 -f=c:d:0 -s="assemble(A,Insert)" -s="parallelize(i,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
1919
mv taco_kernel.c ttv_full.c
2020
mv taco_compute.c ttv_compute.c
2121
mv taco_assembly.c ttv_assembly.c

examples/spmv_compute.c

+1-6
Original file line number | Diff line number | Diff line change
@@ -11,11 +11,6 @@ int compute(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
1111
int x1_dimension = (int)(x->dimensions[0]);
1212
double* restrict x_vals = (double*)(x->vals);
1313

14-
#pragma omp parallel for schedule(static)
15-
for (int32_t py = 0; py < y1_dimension; py++) {
16-
y_vals[py] = 0.0;
17-
}
18-
1914
#pragma omp parallel for schedule(runtime)
2015
for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) {
2116
for (int32_t i1 = 0; i1 < 32; i1++) {
@@ -28,7 +23,7 @@ int compute(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
2823
int32_t j = A2_crd[jA];
2924
tjy_val += A_vals[jA] * x_vals[j];
3025
}
31-
y_vals[i] = y_vals[i] + tjy_val;
26+
y_vals[i] = tjy_val;
3227
}
3328
}
3429
return 0;

examples/spmv_full.c

+2-12
Original file line number | Diff line number | Diff line change
@@ -118,11 +118,6 @@ int compute(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
118118
int x1_dimension = (int)(x->dimensions[0]);
119119
double* restrict x_vals = (double*)(x->vals);
120120

121-
#pragma omp parallel for schedule(static)
122-
for (int32_t py = 0; py < y1_dimension; py++) {
123-
y_vals[py] = 0.0;
124-
}
125-
126121
#pragma omp parallel for schedule(runtime)
127122
for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) {
128123
for (int32_t i1 = 0; i1 < 32; i1++) {
@@ -135,7 +130,7 @@ int compute(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
135130
int32_t j = A2_crd[jA];
136131
tjy_val += A_vals[jA] * x_vals[j];
137132
}
138-
y_vals[i] = y_vals[i] + tjy_val;
133+
y_vals[i] = tjy_val;
139134
}
140135
}
141136
return 0;
@@ -164,11 +159,6 @@ int evaluate(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
164159
int32_t y_capacity = y1_dimension;
165160
y_vals = (double*)malloc(sizeof(double) * y_capacity);
166161

167-
#pragma omp parallel for schedule(static)
168-
for (int32_t py = 0; py < y_capacity; py++) {
169-
y_vals[py] = 0.0;
170-
}
171-
172162
#pragma omp parallel for schedule(runtime)
173163
for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) {
174164
for (int32_t i1 = 0; i1 < 32; i1++) {
@@ -181,7 +171,7 @@ int evaluate(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
181171
int32_t j = A2_crd[jA];
182172
tjy_val += A_vals[jA] * x_vals[j];
183173
}
184-
y_vals[i] = y_vals[i] + tjy_val;
174+
y_vals[i] = tjy_val;
185175
}
186176
}
187177

examples/ttv_assembly.c

+45-46
Original file line number | Diff line number | Diff line change
@@ -1,72 +1,71 @@
11
// Generated by the Tensor Algebra Compiler (tensor-compiler.org)
2-
// taco "A(i,j)=B(i,j,k)*c(k)" -f=A:ss:0,1 -f=B:sss:0,1,2 -f=c:d:0 -s="fuse(i,j,f)" -s="pos(f,fpos,B)" -s="split(fpos,chunk,fpos2,8)" -s="reorder(chunk,fpos2,k)" -s="parallelize(chunk,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
2+
// taco "A(i,j)=B(i,j,k)*c(k)" -f=A:ds:0,1 -f=B:sss:0,1,2 -f=c:d:0 -s="assemble(A,Insert)" -s="parallelize(i,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
33

44
int assemble(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *c) {
5-
int* restrict A1_pos = (int*)(A->indices[0][0]);
6-
int* restrict A1_crd = (int*)(A->indices[0][1]);
5+
int A1_dimension = (int)(A->dimensions[0]);
76
int* restrict A2_pos = (int*)(A->indices[1][0]);
87
int* restrict A2_crd = (int*)(A->indices[1][1]);
98
double* restrict A_vals = (double*)(A->vals);
10-
int B2_dimension = (int)(B->dimensions[1]);
9+
int B1_dimension = (int)(B->dimensions[0]);
1110
int* restrict B1_pos = (int*)(B->indices[0][0]);
1211
int* restrict B1_crd = (int*)(B->indices[0][1]);
1312
int* restrict B2_pos = (int*)(B->indices[1][0]);
1413
int* restrict B2_crd = (int*)(B->indices[1][1]);
14+
int* restrict B3_pos = (int*)(B->indices[2][0]);
15+
int* restrict B3_crd = (int*)(B->indices[2][1]);
16+
int c1_dimension = (int)(c->dimensions[0]);
1517

16-
int32_t pB2_begin = 0;
17-
int32_t pB2_end = B1_pos[1];
18+
int32_t* restrict A2_nnz = calloc(B1_dimension, sizeof(int32_t));
1819

19-
A1_pos = (int32_t*)malloc(sizeof(int32_t) * 2);
20-
A1_pos[0] = 0;
21-
int32_t A1_crd_size = 1048576;
22-
A1_crd = (int32_t*)malloc(sizeof(int32_t) * A1_crd_size);
23-
int32_t iA = 0;
24-
int32_t A2_pos_size = 1048576;
25-
A2_pos = (int32_t*)malloc(sizeof(int32_t) * A2_pos_size);
20+
#pragma omp parallel for schedule(runtime)
21+
for (int32_t iB = B1_pos[0]; iB < B1_pos[1]; iB++) {
22+
int32_t i = B1_crd[iB];
23+
int32_t tjA2_nnz_val = 0;
24+
for (int32_t jB = B2_pos[iB]; jB < B2_pos[(iB + 1)]; jB++) {
25+
bool qtkA_val = 0;
26+
for (int32_t kB = B3_pos[jB]; kB < B3_pos[(jB + 1)]; kB++) {
27+
int32_t k = B3_crd[kB];
28+
qtkA_val = 1;
29+
}
30+
tjA2_nnz_val += (int32_t)qtkA_val;
31+
}
32+
A2_nnz[i] = tjA2_nnz_val;
33+
}
34+
35+
A2_pos = (int32_t*)malloc(sizeof(int32_t) * (A1_dimension + 1));
2636
A2_pos[0] = 0;
27-
int32_t A2_crd_size = 1048576;
28-
A2_crd = (int32_t*)malloc(sizeof(int32_t) * A2_crd_size);
29-
int32_t jA = 0;
37+
for (int32_t i = 0; i < A1_dimension; i++) {
38+
A2_pos[i + 1] = A2_pos[i] + A2_nnz[i];
39+
}
40+
A2_crd = (int32_t*)malloc(sizeof(int32_t) * A2_pos[A1_dimension]);
41+
A_vals = (double*)malloc(sizeof(double) * A2_pos[A1_dimension]);
3042

3143
#pragma omp parallel for schedule(runtime)
32-
for (int32_t chunk = 0; chunk < ((B2_pos[B1_pos[1]] + 7) / 8); chunk++) {
33-
int32_t fposB = chunk * 8;
34-
int32_t i_pos = taco_binarySearchBefore(B2_pos, pB2_begin, pB2_end, fposB);
35-
int32_t i = B1_crd[i_pos];
36-
for (int32_t fpos2 = 0; fpos2 < 8; fpos2++) {
37-
int32_t fposB = chunk * 8 + fpos2;
38-
if (fposB >= B2_pos[B1_pos[1]])
39-
continue;
44+
for (int32_t iB0 = B1_pos[0]; iB0 < B1_pos[1]; iB0++) {
45+
int32_t i = B1_crd[iB0];
4046

41-
int32_t f = B2_crd[fposB];
42-
if (fposB == B2_pos[(i_pos + 1)]) {
43-
i_pos++;
44-
i = B1_crd[i_pos];
45-
}
46-
if (pA2_begin < jA) {
47-
if (A1_crd_size <= iA) {
48-
A1_crd = (int32_t*)realloc(A1_crd, sizeof(int32_t) * (A1_crd_size * 2));
49-
A1_crd_size *= 2;
50-
}
51-
A1_crd[iA] = fpos2;
52-
iA++;
47+
for (int32_t jB0 = B2_pos[iB0]; jB0 < B2_pos[(iB0 + 1)]; jB0++) {
48+
int32_t j = B2_crd[jB0];
49+
bool tkA_set = 0;
50+
for (int32_t kB0 = B3_pos[jB0]; kB0 < B3_pos[(jB0 + 1)]; kB0++) {
51+
int32_t k = B3_crd[kB0];
52+
tkA_set = 1;
5353
}
54-
if (A2_crd_size <= jA) {
55-
A2_crd = (int32_t*)realloc(A2_crd, sizeof(int32_t) * (A2_crd_size * 2));
56-
A2_crd_size *= 2;
54+
if (tkA_set) {
55+
int32_t pA2 = A2_pos[i];
56+
A2_pos[i] = A2_pos[i] + 1;
57+
A2_crd[pA2] = j;
5758
}
58-
A2_crd[jA] = fpos2;
59-
jA++;
6059
}
60+
}
6161

62-
A1_pos[1] = iA;
63-
A2_pos[iA + 1] = jA;
62+
for (int32_t p = 0; p < A1_dimension; p++) {
63+
A2_pos[A1_dimension - p] = A2_pos[((A1_dimension - p) - 1)];
6464
}
65+
A2_pos[0] = 0;
6566

66-
A_vals = (double*)malloc(sizeof(double) * jA);
67+
free(A2_nnz);
6768

68-
A->indices[0][0] = (uint8_t*)(A1_pos);
69-
A->indices[0][1] = (uint8_t*)(A1_crd);
7069
A->indices[1][0] = (uint8_t*)(A2_pos);
7170
A->indices[1][1] = (uint8_t*)(A2_crd);
7271
A->vals = (uint8_t*)A_vals;

examples/ttv_compute.c

+25-24
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,10 @@
11
// Generated by the Tensor Algebra Compiler (tensor-compiler.org)
2-
// taco "A(i,j)=B(i,j,k)*c(k)" -f=A:ss:0,1 -f=B:sss:0,1,2 -f=c:d:0 -s="fuse(i,j,f)" -s="pos(f,fpos,B)" -s="split(fpos,chunk,fpos2,8)" -s="reorder(chunk,fpos2,k)" -s="parallelize(chunk,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
2+
// taco "A(i,j)=B(i,j,k)*c(k)" -f=A:ds:0,1 -f=B:sss:0,1,2 -f=c:d:0 -s="assemble(A,Insert)" -s="parallelize(i,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
33

44
int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *c) {
5+
int A1_dimension = (int)(A->dimensions[0]);
6+
int* restrict A2_pos = (int*)(A->indices[1][0]);
57
double* restrict A_vals = (double*)(A->vals);
6-
int B2_dimension = (int)(B->dimensions[1]);
78
int* restrict B1_pos = (int*)(B->indices[0][0]);
89
int* restrict B1_crd = (int*)(B->indices[0][1]);
910
int* restrict B2_pos = (int*)(B->indices[1][0]);
@@ -14,33 +15,33 @@ int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *c) {
1415
int c1_dimension = (int)(c->dimensions[0]);
1516
double* restrict c_vals = (double*)(c->vals);
1617

17-
int32_t pB2_begin = 0;
18-
int32_t pB2_end = B1_pos[1];
19-
20-
int32_t jA = 0;
18+
#pragma omp parallel for schedule(static)
19+
for (int32_t pA = 0; pA < A2_pos[A1_dimension]; pA++) {
20+
A_vals[pA] = 0.0;
21+
}
2122

2223
#pragma omp parallel for schedule(runtime)
23-
for (int32_t chunk = 0; chunk < ((B2_pos[B1_pos[1]] + 7) / 8); chunk++) {
24-
int32_t fposB = chunk * 8;
25-
int32_t i_pos = taco_binarySearchBefore(B2_pos, pB2_begin, pB2_end, fposB);
26-
int32_t i = B1_crd[i_pos];
27-
for (int32_t fpos2 = 0; fpos2 < 8; fpos2++) {
28-
int32_t fposB = chunk * 8 + fpos2;
29-
if (fposB >= B2_pos[B1_pos[1]])
30-
continue;
31-
32-
int32_t f = B2_crd[fposB];
33-
if (fposB == B2_pos[(i_pos + 1)]) {
34-
i_pos++;
35-
i = B1_crd[i_pos];
36-
}
37-
A_vals[jA] = 0.0;
38-
for (int32_t kB = B3_pos[fposB]; kB < B3_pos[(fposB + 1)]; kB++) {
24+
for (int32_t iB = B1_pos[0]; iB < B1_pos[1]; iB++) {
25+
int32_t i = B1_crd[iB];
26+
for (int32_t jB = B2_pos[iB]; jB < B2_pos[(iB + 1)]; jB++) {
27+
double tkA_val = 0.0;
28+
bool tkA_set = 0;
29+
for (int32_t kB = B3_pos[jB]; kB < B3_pos[(jB + 1)]; kB++) {
3930
int32_t k = B3_crd[kB];
40-
A_vals[jA] = A_vals[jA] + B_vals[kB] * c_vals[k];
31+
tkA_val += B_vals[kB] * c_vals[k];
32+
tkA_set = 1;
33+
}
34+
if (tkA_set) {
35+
int32_t pA2 = A2_pos[i];
36+
A2_pos[i] = A2_pos[i] + 1;
37+
A_vals[pA2] = tkA_val;
4138
}
42-
jA++;
4339
}
4440
}
41+
42+
for (int32_t p = 0; p < A1_dimension; p++) {
43+
A2_pos[A1_dimension - p] = A2_pos[((A1_dimension - p) - 1)];
44+
}
45+
A2_pos[0] = 0;
4546
return 0;
4647
}

0 commit comments

Comments
 (0)