Skip to content

Commit 3ddea7b

Browse files
committed
Updated example generated code
1 parent c136464 commit 3ddea7b

17 files changed

+1372
-458
lines changed

examples/add_assembly.c

-100
This file was deleted.

examples/generate_examples.sh

+12 -7
Original file line number | Diff line number | Diff line change
@@ -1,21 +1,26 @@
11
#!/bin/bash
22

3-
taco "y(i)=A(i,j)*x(j)" -f=y:d:0 -f=A:ds:0,1 -f=x:d:0 -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
3+
taco "y(i)=A(i,j)*x(j)" -f=y:d:0 -f=A:ds:0,1 -f=x:d:0 -s="split(i,i0,i1,32)" -s="reorder(i0,i1,j)" -s="parallelize(i0,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
44
mv taco_kernel.c spmv_full.c
55
mv taco_compute.c spmv_compute.c
66
mv taco_assembly.c spmv_assembly.c
77

8-
taco "A(i,j)=B(i,j)+C(i,j)" -f=A:ds:0,1 -f=B:ds:0,1 -f=C:ds:0,1 -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
9-
mv taco_kernel.c add_full.c
10-
mv taco_compute.c add_compute.c
11-
mv taco_assembly.c add_assembly.c
8+
taco "A(i,j)=B(i,k)*C(k,j)" -f=A:ds:0,1 -f=B:ds:0,1 -f=C:ds:0,1 -s="reorder(i,k,j)" -s="precompute(B(i,k)*C(k,j),j,j)" -s="assemble(A,Insert)" -s="parallelize(i,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
9+
mv taco_kernel.c spgemm_full.c
10+
mv taco_compute.c spgemm_compute.c
11+
mv taco_assembly.c spgemm_assembly.c
1212

13-
taco "A(i,j)=B(i,j,k)*c(k)" -f=A:ss:0,1 -f=B:sss:0,1,2 -f=c:d:0 -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
13+
taco "A(i,j)=B(i,j)+C(i,j)" -f=A:ds:0,1 -f=B:ds:0,1 -f=C:ds:0,1 -s="assemble(A,Insert)" -s="parallelize(i,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
14+
mv taco_kernel.c spadd_full.c
15+
mv taco_compute.c spadd_compute.c
16+
mv taco_assembly.c spadd_assembly.c
17+
18+
taco "A(i,j)=B(i,j,k)*c(k)" -f=A:ss:0,1 -f=B:sss:0,1,2 -f=c:d:0 -s="fuse(i,j,f)" -s="pos(f,fpos,B)" -s="split(fpos,chunk,fpos2,8)" -s="reorder(chunk,fpos2,k)" -s="parallelize(chunk,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
1419
mv taco_kernel.c ttv_full.c
1520
mv taco_compute.c ttv_compute.c
1621
mv taco_assembly.c ttv_assembly.c
1722

18-
taco "A(i,j)=B(i,k,l)*C(k,j)*D(l,j)" -f=A:dd:0,1 -f=B:sss:0,1,2 -f=C:dd:0,1 -f=D:dd:0,1 -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
23+
taco "A(i,j)=B(i,k,l)*D(l,j)*C(k,j)" -f=A:dd:0,1 -f=B:sss:0,1,2 -f=D:dd:0,1 -f=C:dd:0,1 -s="reorder(i,k,l,j)" -s="precompute(B(i,k,l)*D(l,j),j,j)" -s="split(i,i0,i1,32)" -s="parallelize(i0,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
1924
mv taco_kernel.c mttkrp_full.c
2025
mv taco_compute.c mttkrp_compute.c
2126
mv taco_assembly.c mttkrp_assembly.c

examples/mttkrp_assembly.c

+2 -2
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
// Generated by the Tensor Algebra Compiler (tensor-compiler.org)
2-
// taco "A(i,j)=B(i,k,l)*C(k,j)*D(l,j)" -f=A:dd:0,1 -f=B:sss:0,1,2 -f=C:dd:0,1 -f=D:dd:0,1 -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
2+
// taco "A(i,j)=B(i,k,l)*D(l,j)*C(k,j)" -f=A:dd:0,1 -f=B:sss:0,1,2 -f=D:dd:0,1 -f=C:dd:0,1 -s="reorder(i,k,l,j)" -s="precompute(B(i,k,l)*D(l,j),j,j)" -s="split(i,i0,i1,32)" -s="parallelize(i0,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
33

4-
int assemble(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) {
4+
int assemble(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *D, taco_tensor_t *C) {
55
int A1_dimension = (int)(A->dimensions[0]);
66
int A2_dimension = (int)(A->dimensions[1]);
77
double* restrict A_vals = (double*)(A->vals);

examples/mttkrp_compute.c

+46 -15
Original file line number | Diff line number | Diff line change
@@ -1,44 +1,75 @@
11
// Generated by the Tensor Algebra Compiler (tensor-compiler.org)
2-
// taco "A(i,j)=B(i,k,l)*C(k,j)*D(l,j)" -f=A:dd:0,1 -f=B:sss:0,1,2 -f=C:dd:0,1 -f=D:dd:0,1 -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
2+
// taco "A(i,j)=B(i,k,l)*D(l,j)*C(k,j)" -f=A:dd:0,1 -f=B:sss:0,1,2 -f=D:dd:0,1 -f=C:dd:0,1 -s="reorder(i,k,l,j)" -s="precompute(B(i,k,l)*D(l,j),j,j)" -s="split(i,i0,i1,32)" -s="parallelize(i0,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
33

4-
int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) {
4+
int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *D, taco_tensor_t *C) {
55
int A1_dimension = (int)(A->dimensions[0]);
66
int A2_dimension = (int)(A->dimensions[1]);
77
double* restrict A_vals = (double*)(A->vals);
8+
int B1_dimension = (int)(B->dimensions[0]);
89
int* restrict B1_pos = (int*)(B->indices[0][0]);
910
int* restrict B1_crd = (int*)(B->indices[0][1]);
1011
int* restrict B2_pos = (int*)(B->indices[1][0]);
1112
int* restrict B2_crd = (int*)(B->indices[1][1]);
1213
int* restrict B3_pos = (int*)(B->indices[2][0]);
1314
int* restrict B3_crd = (int*)(B->indices[2][1]);
1415
double* restrict B_vals = (double*)(B->vals);
15-
int C1_dimension = (int)(C->dimensions[0]);
16-
int C2_dimension = (int)(C->dimensions[1]);
17-
double* restrict C_vals = (double*)(C->vals);
1816
int D1_dimension = (int)(D->dimensions[0]);
1917
int D2_dimension = (int)(D->dimensions[1]);
2018
double* restrict D_vals = (double*)(D->vals);
19+
int C1_dimension = (int)(C->dimensions[0]);
20+
int C2_dimension = (int)(C->dimensions[1]);
21+
double* restrict C_vals = (double*)(C->vals);
2122

2223
#pragma omp parallel for schedule(static)
2324
for (int32_t pA = 0; pA < (A1_dimension * A2_dimension); pA++) {
2425
A_vals[pA] = 0.0;
2526
}
2627

2728
#pragma omp parallel for schedule(runtime)
28-
for (int32_t iB = B1_pos[0]; iB < B1_pos[1]; iB++) {
29+
for (int32_t i0 = 0; i0 < ((B1_dimension + 31) / 32); i0++) {
30+
int32_t pB1_begin = i0 * 32;
31+
int32_t iB = taco_binarySearchAfter(B1_crd, B1_pos[0], B1_pos[1], pB1_begin);
32+
int32_t pB1_end = B1_pos[1];
33+
int32_t iB0 = B1_crd[iB];
2934
int32_t i = B1_crd[iB];
30-
for (int32_t kB = B2_pos[iB]; kB < B2_pos[(iB + 1)]; kB++) {
31-
int32_t k = B2_crd[kB];
32-
for (int32_t lB = B3_pos[kB]; lB < B3_pos[(kB + 1)]; lB++) {
33-
int32_t l = B3_crd[lB];
34-
for (int32_t j = 0; j < D2_dimension; j++) {
35-
int32_t jA = i * A2_dimension + j;
36-
int32_t jC = k * C2_dimension + j;
37-
int32_t jD = l * D2_dimension + j;
38-
A_vals[jA] = A_vals[jA] + (B_vals[lB] * C_vals[jC]) * D_vals[jD];
35+
int32_t i1 = i - i0 * 32;
36+
int32_t i1_end = 32;
37+
38+
while (iB < pB1_end && i1 < i1_end) {
39+
iB0 = B1_crd[iB];
40+
i = B1_crd[iB];
41+
if (iB0 == i) {
42+
double* restrict workspace = 0;
43+
workspace = (double*)malloc(sizeof(double) * C2_dimension);
44+
45+
for (int32_t kB = B2_pos[iB]; kB < B2_pos[(iB + 1)]; kB++) {
46+
int32_t k = B2_crd[kB];
47+
for (int32_t pworkspace = 0; pworkspace < C2_dimension; pworkspace++) {
48+
workspace[pworkspace] = 0.0;
49+
}
50+
for (int32_t lB = B3_pos[kB]; lB < B3_pos[(kB + 1)]; lB++) {
51+
int32_t l = B3_crd[lB];
52+
for (int32_t j = 0; j < C2_dimension; j++) {
53+
int32_t jD = l * D2_dimension + j;
54+
workspace[j] = workspace[j] + B_vals[lB] * D_vals[jD];
55+
}
56+
}
57+
for (int32_t j = 0; j < C2_dimension; j++) {
58+
int32_t jA = i * A2_dimension + j;
59+
int32_t jC = k * C2_dimension + j;
60+
A_vals[jA] = A_vals[jA] + workspace[j] * C_vals[jC];
61+
}
3962
}
63+
64+
free(workspace);
4065
}
66+
iB += (int32_t)(iB0 == i);
67+
iB0 = B1_crd[iB];
68+
i = B1_crd[iB];
69+
i1 = i - i0 * 32;
4170
}
4271
}
72+
73+
A->vals = (uint8_t*)A_vals;
4374
return 0;
4475
}

0 commit comments

Comments
 (0)