tensor-compiler
diff --git a/examples/add_assembly.c b/examples/add_assembly.c
Lines changed: 0 additions & 100 deletions
diff --git a/examples/generate_examples.sh b/examples/generate_examples.sh
Lines changed: 12 additions & 7 deletions
diff --git a/examples/mttkrp_assembly.c b/examples/mttkrp_assembly.c
Lines changed: 2 additions & 2 deletions
diff --git a/examples/mttkrp_compute.c b/examples/mttkrp_compute.c
Lines changed: 46 additions & 15 deletions
@@ -1,21 +1,26 @@
 #!/bin/bash
 
-taco "y(i)=A(i,j)*x(j)" -f=y:d:0 -f=A:ds:0,1 -f=x:d:0 -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
+taco "y(i)=A(i,j)*x(j)" -f=y:d:0 -f=A:ds:0,1 -f=x:d:0 -s="split(i,i0,i1,32)" -s="reorder(i0,i1,j)" -s="parallelize(i0,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c # SpMV example; schedule splits i by 32 into i0/i1 and parallelizes i0; outputs are renamed to spmv_* below
 mv taco_kernel.c spmv_full.c
 mv taco_compute.c spmv_compute.c
 mv taco_assembly.c spmv_assembly.c
 
-taco "A(i,j)=B(i,j)+C(i,j)" -f=A:ds:0,1 -f=B:ds:0,1 -f=C:ds:0,1 -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
-mv taco_kernel.c add_full.c
-mv taco_compute.c add_compute.c
-mv taco_assembly.c add_assembly.c
+taco "A(i,j)=B(i,k)*C(k,j)" -f=A:ds:0,1 -f=B:ds:0,1 -f=C:ds:0,1 -s="reorder(i,k,j)" -s="precompute(B(i,k)*C(k,j),j,j)" -s="assemble(A,Insert)" -s="parallelize(i,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c # new SpGEMM example (A = B * C, all three ds); outputs are renamed to spgemm_* below
+mv taco_kernel.c spgemm_full.c
+mv taco_compute.c spgemm_compute.c
+mv taco_assembly.c spgemm_assembly.c
 
-taco "A(i,j)=B(i,j,k)*c(k)" -f=A:ss:0,1 -f=B:sss:0,1,2 -f=c:d:0 -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
+taco "A(i,j)=B(i,j)+C(i,j)" -f=A:ds:0,1 -f=B:ds:0,1 -f=C:ds:0,1 -s="assemble(A,Insert)" -s="parallelize(i,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c # same expression as the removed add_* example, now with a schedule; outputs are renamed to spadd_* below
+mv taco_kernel.c spadd_full.c
+mv taco_compute.c spadd_compute.c
+mv taco_assembly.c spadd_assembly.c
+
+taco "A(i,j)=B(i,j,k)*c(k)" -f=A:ss:0,1 -f=B:sss:0,1,2 -f=c:d:0 -s="fuse(i,j,f)" -s="pos(f,fpos,B)" -s="split(fpos,chunk,fpos2,8)" -s="reorder(chunk,fpos2,k)" -s="parallelize(chunk,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c # TTV example; schedule fuses i,j, splits the fused position variable by 8 and parallelizes the chunks
 mv taco_kernel.c ttv_full.c
 mv taco_compute.c ttv_compute.c
 mv taco_assembly.c ttv_assembly.c
 
-taco "A(i,j)=B(i,k,l)*C(k,j)*D(l,j)" -f=A:dd:0,1 -f=B:sss:0,1,2 -f=C:dd:0,1 -f=D:dd:0,1 -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
+taco "A(i,j)=B(i,k,l)*D(l,j)*C(k,j)" -f=A:dd:0,1 -f=B:sss:0,1,2 -f=D:dd:0,1 -f=C:dd:0,1 -s="reorder(i,k,l,j)" -s="precompute(B(i,k,l)*D(l,j),j,j)" -s="split(i,i0,i1,32)" -s="parallelize(i0,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c # MTTKRP example; operands are now listed B, D, C, and the regenerated mttkrp_* functions take (A, B, D, C) in that order
 mv taco_kernel.c mttkrp_full.c
 mv taco_compute.c mttkrp_compute.c
 mv taco_assembly.c mttkrp_assembly.c
@@ -1,7 +1,7 @@
 // Generated by the Tensor Algebra Compiler (tensor-compiler.org)
-// taco "A(i,j)=B(i,k,l)*C(k,j)*D(l,j)" -f=A:dd:0,1 -f=B:sss:0,1,2 -f=C:dd:0,1 -f=D:dd:0,1 -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
+// taco "A(i,j)=B(i,k,l)*D(l,j)*C(k,j)" -f=A:dd:0,1 -f=B:sss:0,1,2 -f=D:dd:0,1 -f=C:dd:0,1 -s="reorder(i,k,l,j)" -s="precompute(B(i,k,l)*D(l,j),j,j)" -s="split(i,i0,i1,32)" -s="parallelize(i0,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
 
-int assemble(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) {
+int assemble(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *D, taco_tensor_t *C) {
   int A1_dimension = (int)(A->dimensions[0]);
   int A2_dimension = (int)(A->dimensions[1]);
   double* restrict A_vals = (double*)(A->vals);
 
@@ -1,44 +1,75 @@
 // Generated by the Tensor Algebra Compiler (tensor-compiler.org)
-// taco "A(i,j)=B(i,k,l)*C(k,j)*D(l,j)" -f=A:dd:0,1 -f=B:sss:0,1,2 -f=C:dd:0,1 -f=D:dd:0,1 -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
+// taco "A(i,j)=B(i,k,l)*D(l,j)*C(k,j)" -f=A:dd:0,1 -f=B:sss:0,1,2 -f=D:dd:0,1 -f=C:dd:0,1 -s="reorder(i,k,l,j)" -s="precompute(B(i,k,l)*D(l,j),j,j)" -s="split(i,i0,i1,32)" -s="parallelize(i0,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
 
-int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C, taco_tensor_t *D) {
+int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *D, taco_tensor_t *C) { // parameter order is now A, B, D, C (C and D swapped relative to the removed signature)
   int A1_dimension = (int)(A->dimensions[0]);
   int A2_dimension = (int)(A->dimensions[1]);
   double* restrict A_vals = (double*)(A->vals);
+  int B1_dimension = (int)(B->dimensions[0]); // bounds the blocked i0 loop below
   int* restrict B1_pos = (int*)(B->indices[0][0]);
   int* restrict B1_crd = (int*)(B->indices[0][1]);
   int* restrict B2_pos = (int*)(B->indices[1][0]);
   int* restrict B2_crd = (int*)(B->indices[1][1]);
   int* restrict B3_pos = (int*)(B->indices[2][0]);
   int* restrict B3_crd = (int*)(B->indices[2][1]);
   double* restrict B_vals = (double*)(B->vals);
-  int C1_dimension = (int)(C->dimensions[0]);
-  int C2_dimension = (int)(C->dimensions[1]);
-  double* restrict C_vals = (double*)(C->vals);
   int D1_dimension = (int)(D->dimensions[0]);
   int D2_dimension = (int)(D->dimensions[1]);
   double* restrict D_vals = (double*)(D->vals);
+  int C1_dimension = (int)(C->dimensions[0]); // C is unpacked after D, following the new operand order
+  int C2_dimension = (int)(C->dimensions[1]); // used as the j extent of the workspace and of both j loops
+  double* restrict C_vals = (double*)(C->vals);
 
   #pragma omp parallel for schedule(static)
   for (int32_t pA = 0; pA < (A1_dimension * A2_dimension); pA++) {
     A_vals[pA] = 0.0;
   }
 
   #pragma omp parallel for schedule(runtime)
-  for (int32_t iB = B1_pos[0]; iB < B1_pos[1]; iB++) {
+  for (int32_t i0 = 0; i0 < ((B1_dimension + 31) / 32); i0++) { // one iteration per block of 32 i coordinates; the bound is ceil(B1_dimension / 32)
+    int32_t pB1_begin = i0 * 32; // first i coordinate belonging to this block
+    int32_t iB = taco_binarySearchAfter(B1_crd, B1_pos[0], B1_pos[1], pB1_begin); // presumably the first position in B1_crd whose coordinate is >= pB1_begin; helper is defined elsewhere -- TODO confirm
+    int32_t pB1_end = B1_pos[1]; // end of B's first-level position range
+    int32_t iB0 = B1_crd[iB]; // NOTE(review): B1_crd[iB] is read before iB < pB1_end is tested in the while condition -- confirm this is in bounds when no entry falls in this block
     int32_t i = B1_crd[iB];
-    for (int32_t kB = B2_pos[iB]; kB < B2_pos[(iB + 1)]; kB++) {
-      int32_t k = B2_crd[kB];
-      for (int32_t lB = B3_pos[kB]; lB < B3_pos[(kB + 1)]; lB++) {
-        int32_t l = B3_crd[lB];
-        for (int32_t j = 0; j < D2_dimension; j++) {
-          int32_t jA = i * A2_dimension + j;
-          int32_t jC = k * C2_dimension + j;
-          int32_t jD = l * D2_dimension + j;
-          A_vals[jA] = A_vals[jA] + (B_vals[lB] * C_vals[jC]) * D_vals[jD];
+    int32_t i1 = i - i0 * 32; // offset of coordinate i inside the current block
+    int32_t i1_end = 32; // block width; matches the 32 in split(i,i0,i1,32) from the header command
+
+    while (iB < pB1_end && i1 < i1_end) { // visit B's stored first-level entries until the range ends or i leaves this block
+      iB0 = B1_crd[iB];
+      i = B1_crd[iB];
+      if (iB0 == i) { // always true here: iB0 and i were both just loaded from B1_crd[iB]
+        double* restrict workspace = 0;
+        workspace = (double*)malloc(sizeof(double) * C2_dimension); // scratch buffer allocated for each visited iB and freed below; NOTE(review): the malloc result is not checked before use
+
+        for (int32_t kB = B2_pos[iB]; kB < B2_pos[(iB + 1)]; kB++) {
+          int32_t k = B2_crd[kB];
+          for (int32_t pworkspace = 0; pworkspace < C2_dimension; pworkspace++) { // clear the workspace before accumulating for this k
+            workspace[pworkspace] = 0.0;
+          }
+          for (int32_t lB = B3_pos[kB]; lB < B3_pos[(kB + 1)]; lB++) {
+            int32_t l = B3_crd[lB];
+            for (int32_t j = 0; j < C2_dimension; j++) { // j is bounded by C2_dimension but D is strided by D2_dimension; presumably equal since both index j -- TODO confirm
+              int32_t jD = l * D2_dimension + j;
+              workspace[j] = workspace[j] + B_vals[lB] * D_vals[jD]; // workspace(j) += B(i,k,l) * D(l,j)
+            }
+          }
+          for (int32_t j = 0; j < C2_dimension; j++) {
+            int32_t jA = i * A2_dimension + j;
+            int32_t jC = k * C2_dimension + j;
+            A_vals[jA] = A_vals[jA] + workspace[j] * C_vals[jC]; // A(i,j) += workspace(j) * C(k,j)
+          }
         }
+
+        free(workspace);
       }
+      iB += (int32_t)(iB0 == i); // the comparison converts to 0 or 1; iB0 and i are unchanged since they were loaded, so iB advances by one
+      iB0 = B1_crd[iB]; // NOTE(review): once iB reaches pB1_end this reads B1_crd[pB1_end] before the loop condition re-checks the bound -- confirm that slot exists
+      i = B1_crd[iB];
+      i1 = i - i0 * 32; // recompute the in-block offset for the next loop test
     }
   }
+
+  A->vals = (uint8_t*)A_vals; // store the values pointer back into A, cast to uint8_t*
   return 0;
 }