tensor-compiler
diff --git a/examples/generate_examples.sh b/examples/generate_examples.sh
Lines changed: 1 addition & 1 deletion
diff --git a/examples/spmv_compute.c b/examples/spmv_compute.c
Lines changed: 1 addition & 6 deletions
diff --git a/examples/spmv_full.c b/examples/spmv_full.c
Lines changed: 2 additions & 12 deletions
diff --git a/examples/ttv_assembly.c b/examples/ttv_assembly.c
Lines changed: 45 additions & 46 deletions
diff --git a/examples/ttv_compute.c b/examples/ttv_compute.c
Lines changed: 25 additions & 24 deletions
@@ -15,7 +15,7 @@ mv taco_kernel.c spadd_full.c
 mv taco_compute.c spadd_compute.c
 mv taco_assembly.c spadd_assembly.c
 
-taco "A(i,j)=B(i,j,k)*c(k)" -f=A:ss:0,1 -f=B:sss:0,1,2 -f=c:d:0 -s="fuse(i,j,f)" -s="pos(f,fpos,B)" -s="split(fpos,chunk,fpos2,8)" -s="reorder(chunk,fpos2,k)" -s="parallelize(chunk,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
+taco "A(i,j)=B(i,j,k)*c(k)" -f=A:ds:0,1 -f=B:sss:0,1,2 -f=c:d:0 -s="assemble(A,Insert)" -s="parallelize(i,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
 mv taco_kernel.c ttv_full.c
 mv taco_compute.c ttv_compute.c
 mv taco_assembly.c ttv_assembly.c
 
@@ -11,11 +11,6 @@ int compute(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
   int x1_dimension = (int)(x->dimensions[0]);
   double* restrict x_vals = (double*)(x->vals);
 
-  #pragma omp parallel for schedule(static)
-  for (int32_t py = 0; py < y1_dimension; py++) {
-    y_vals[py] = 0.0;
-  }
-
   #pragma omp parallel for schedule(runtime)
   for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) {
     for (int32_t i1 = 0; i1 < 32; i1++) {
@@ -28,7 +23,7 @@ int compute(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
         int32_t j = A2_crd[jA];
         tjy_val += A_vals[jA] * x_vals[j];
       }
-      y_vals[i] = y_vals[i] + tjy_val;
+      y_vals[i] = tjy_val;
     }
   }
   return 0;
 
@@ -118,11 +118,6 @@ int compute(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
   int x1_dimension = (int)(x->dimensions[0]);
   double* restrict x_vals = (double*)(x->vals);
 
-  #pragma omp parallel for schedule(static)
-  for (int32_t py = 0; py < y1_dimension; py++) {
-    y_vals[py] = 0.0;
-  }
-
   #pragma omp parallel for schedule(runtime)
   for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) {
     for (int32_t i1 = 0; i1 < 32; i1++) {
@@ -135,7 +130,7 @@ int compute(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
         int32_t j = A2_crd[jA];
         tjy_val += A_vals[jA] * x_vals[j];
       }
-      y_vals[i] = y_vals[i] + tjy_val;
+      y_vals[i] = tjy_val;
     }
   }
   return 0;
@@ -164,11 +159,6 @@ int evaluate(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
   int32_t y_capacity = y1_dimension;
   y_vals = (double*)malloc(sizeof(double) * y_capacity);
 
-  #pragma omp parallel for schedule(static)
-  for (int32_t py = 0; py < y_capacity; py++) {
-    y_vals[py] = 0.0;
-  }
-
   #pragma omp parallel for schedule(runtime)
   for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) {
     for (int32_t i1 = 0; i1 < 32; i1++) {
@@ -181,7 +171,7 @@ int evaluate(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
         int32_t j = A2_crd[jA];
         tjy_val += A_vals[jA] * x_vals[j];
       }
-      y_vals[i] = y_vals[i] + tjy_val;
+      y_vals[i] = tjy_val;
     }
   }
 
 
@@ -1,72 +1,71 @@
 // Generated by the Tensor Algebra Compiler (tensor-compiler.org)
-// taco "A(i,j)=B(i,j,k)*c(k)" -f=A:ss:0,1 -f=B:sss:0,1,2 -f=c:d:0 -s="fuse(i,j,f)" -s="pos(f,fpos,B)" -s="split(fpos,chunk,fpos2,8)" -s="reorder(chunk,fpos2,k)" -s="parallelize(chunk,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
+// taco "A(i,j)=B(i,j,k)*c(k)" -f=A:ds:0,1 -f=B:sss:0,1,2 -f=c:d:0 -s="assemble(A,Insert)" -s="parallelize(i,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
 
 int assemble(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *c) {
-  int* restrict A1_pos = (int*)(A->indices[0][0]);
-  int* restrict A1_crd = (int*)(A->indices[0][1]);
+  int A1_dimension = (int)(A->dimensions[0]);
   int* restrict A2_pos = (int*)(A->indices[1][0]);
   int* restrict A2_crd = (int*)(A->indices[1][1]);
   double* restrict A_vals = (double*)(A->vals);
-  int B2_dimension = (int)(B->dimensions[1]);
+  int B1_dimension = (int)(B->dimensions[0]);
   int* restrict B1_pos = (int*)(B->indices[0][0]);
   int* restrict B1_crd = (int*)(B->indices[0][1]);
   int* restrict B2_pos = (int*)(B->indices[1][0]);
   int* restrict B2_crd = (int*)(B->indices[1][1]);
+  int* restrict B3_pos = (int*)(B->indices[2][0]);
+  int* restrict B3_crd = (int*)(B->indices[2][1]);
+  int c1_dimension = (int)(c->dimensions[0]);
 
-  int32_t pB2_begin = 0;
-  int32_t pB2_end = B1_pos[1];
+  int32_t* restrict A2_nnz = calloc(B1_dimension, sizeof(int32_t));
 
-  A1_pos = (int32_t*)malloc(sizeof(int32_t) * 2);
-  A1_pos[0] = 0;
-  int32_t A1_crd_size = 1048576;
-  A1_crd = (int32_t*)malloc(sizeof(int32_t) * A1_crd_size);
-  int32_t iA = 0;
-  int32_t A2_pos_size = 1048576;
-  A2_pos = (int32_t*)malloc(sizeof(int32_t) * A2_pos_size);
+  #pragma omp parallel for schedule(runtime)
+  for (int32_t iB = B1_pos[0]; iB < B1_pos[1]; iB++) {
+    int32_t i = B1_crd[iB];
+    int32_t tjA2_nnz_val = 0;
+    for (int32_t jB = B2_pos[iB]; jB < B2_pos[(iB + 1)]; jB++) {
+      bool qtkA_val = 0;
+      for (int32_t kB = B3_pos[jB]; kB < B3_pos[(jB + 1)]; kB++) {
+        int32_t k = B3_crd[kB];
+        qtkA_val = 1;
+      }
+      tjA2_nnz_val += (int32_t)qtkA_val;
+    }
+    A2_nnz[i] = tjA2_nnz_val;
+  }
+
+  A2_pos = (int32_t*)malloc(sizeof(int32_t) * (A1_dimension + 1));
   A2_pos[0] = 0;
-  int32_t A2_crd_size = 1048576;
-  A2_crd = (int32_t*)malloc(sizeof(int32_t) * A2_crd_size);
-  int32_t jA = 0;
+  for (int32_t i = 0; i < A1_dimension; i++) {
+    A2_pos[i + 1] = A2_pos[i] + A2_nnz[i];
+  }
+  A2_crd = (int32_t*)malloc(sizeof(int32_t) * A2_pos[A1_dimension]);
+  A_vals = (double*)malloc(sizeof(double) * A2_pos[A1_dimension]);
 
   #pragma omp parallel for schedule(runtime)
-  for (int32_t chunk = 0; chunk < ((B2_pos[B1_pos[1]] + 7) / 8); chunk++) {
-    int32_t fposB = chunk * 8;
-    int32_t i_pos = taco_binarySearchBefore(B2_pos, pB2_begin, pB2_end, fposB);
-    int32_t i = B1_crd[i_pos];
-    for (int32_t fpos2 = 0; fpos2 < 8; fpos2++) {
-      int32_t fposB = chunk * 8 + fpos2;
-      if (fposB >= B2_pos[B1_pos[1]])
-        continue;
+  for (int32_t iB0 = B1_pos[0]; iB0 < B1_pos[1]; iB0++) {
+    int32_t i = B1_crd[iB0];
 
-      int32_t f = B2_crd[fposB];
-      if (fposB == B2_pos[(i_pos + 1)]) {
-        i_pos++;
-        i = B1_crd[i_pos];
-      }
-      if (pA2_begin < jA) {
-        if (A1_crd_size <= iA) {
-          A1_crd = (int32_t*)realloc(A1_crd, sizeof(int32_t) * (A1_crd_size * 2));
-          A1_crd_size *= 2;
-        }
-        A1_crd[iA] = fpos2;
-        iA++;
+    for (int32_t jB0 = B2_pos[iB0]; jB0 < B2_pos[(iB0 + 1)]; jB0++) {
+      int32_t j = B2_crd[jB0];
+      bool tkA_set = 0;
+      for (int32_t kB0 = B3_pos[jB0]; kB0 < B3_pos[(jB0 + 1)]; kB0++) {
+        int32_t k = B3_crd[kB0];
+        tkA_set = 1;
       }
-      if (A2_crd_size <= jA) {
-        A2_crd = (int32_t*)realloc(A2_crd, sizeof(int32_t) * (A2_crd_size * 2));
-        A2_crd_size *= 2;
+      if (tkA_set) {
+        int32_t pA2 = A2_pos[i];
+        A2_pos[i] = A2_pos[i] + 1;
+        A2_crd[pA2] = j;
       }
-      A2_crd[jA] = fpos2;
-      jA++;
     }
+  }
 
-    A1_pos[1] = iA;
-    A2_pos[iA + 1] = jA;
+  for (int32_t p = 0; p < A1_dimension; p++) {
+    A2_pos[A1_dimension - p] = A2_pos[((A1_dimension - p) - 1)];
   }
+  A2_pos[0] = 0;
 
-  A_vals = (double*)malloc(sizeof(double) * jA);
+  free(A2_nnz);
 
-  A->indices[0][0] = (uint8_t*)(A1_pos);
-  A->indices[0][1] = (uint8_t*)(A1_crd);
   A->indices[1][0] = (uint8_t*)(A2_pos);
   A->indices[1][1] = (uint8_t*)(A2_crd);
   A->vals = (uint8_t*)A_vals;
 
@@ -1,9 +1,10 @@
 // Generated by the Tensor Algebra Compiler (tensor-compiler.org)
-// taco "A(i,j)=B(i,j,k)*c(k)" -f=A:ss:0,1 -f=B:sss:0,1,2 -f=c:d:0 -s="fuse(i,j,f)" -s="pos(f,fpos,B)" -s="split(fpos,chunk,fpos2,8)" -s="reorder(chunk,fpos2,k)" -s="parallelize(chunk,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
+// taco "A(i,j)=B(i,j,k)*c(k)" -f=A:ds:0,1 -f=B:sss:0,1,2 -f=c:d:0 -s="assemble(A,Insert)" -s="parallelize(i,CPUThread,NoRaces)" -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
 
 int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *c) {
+  int A1_dimension = (int)(A->dimensions[0]);
+  int* restrict A2_pos = (int*)(A->indices[1][0]);
   double* restrict A_vals = (double*)(A->vals);
-  int B2_dimension = (int)(B->dimensions[1]);
   int* restrict B1_pos = (int*)(B->indices[0][0]);
   int* restrict B1_crd = (int*)(B->indices[0][1]);
   int* restrict B2_pos = (int*)(B->indices[1][0]);
@@ -14,33 +15,33 @@ int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *c) {
   int c1_dimension = (int)(c->dimensions[0]);
   double* restrict c_vals = (double*)(c->vals);
 
-  int32_t pB2_begin = 0;
-  int32_t pB2_end = B1_pos[1];
-
-  int32_t jA = 0;
+  #pragma omp parallel for schedule(static)
+  for (int32_t pA = 0; pA < A2_pos[A1_dimension]; pA++) {
+    A_vals[pA] = 0.0;
+  }
 
   #pragma omp parallel for schedule(runtime)
-  for (int32_t chunk = 0; chunk < ((B2_pos[B1_pos[1]] + 7) / 8); chunk++) {
-    int32_t fposB = chunk * 8;
-    int32_t i_pos = taco_binarySearchBefore(B2_pos, pB2_begin, pB2_end, fposB);
-    int32_t i = B1_crd[i_pos];
-    for (int32_t fpos2 = 0; fpos2 < 8; fpos2++) {
-      int32_t fposB = chunk * 8 + fpos2;
-      if (fposB >= B2_pos[B1_pos[1]])
-        continue;
-
-      int32_t f = B2_crd[fposB];
-      if (fposB == B2_pos[(i_pos + 1)]) {
-        i_pos++;
-        i = B1_crd[i_pos];
-      }
-      A_vals[jA] = 0.0;
-      for (int32_t kB = B3_pos[fposB]; kB < B3_pos[(fposB + 1)]; kB++) {
+  for (int32_t iB = B1_pos[0]; iB < B1_pos[1]; iB++) {
+    int32_t i = B1_crd[iB];
+    for (int32_t jB = B2_pos[iB]; jB < B2_pos[(iB + 1)]; jB++) {
+      double tkA_val = 0.0;
+      bool tkA_set = 0;
+      for (int32_t kB = B3_pos[jB]; kB < B3_pos[(jB + 1)]; kB++) {
         int32_t k = B3_crd[kB];
-        A_vals[jA] = A_vals[jA] + B_vals[kB] * c_vals[k];
+        tkA_val += B_vals[kB] * c_vals[k];
+        tkA_set = 1;
+      }
+      if (tkA_set) {
+        int32_t pA2 = A2_pos[i];
+        A2_pos[i] = A2_pos[i] + 1;
+        A_vals[pA2] = tkA_val;
       }
-      jA++;
     }
   }
+
+  for (int32_t p = 0; p < A1_dimension; p++) {
+    A2_pos[A1_dimension - p] = A2_pos[((A1_dimension - p) - 1)];
+  }
+  A2_pos[0] = 0;
   return 0;
 }
Original file line number	Diff line number	Diff line change
`@@ -11,11 +11,6 @@ int compute(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {`
`11`	`11`	`int x1_dimension = (int)(x->dimensions[0]);`
`12`	`12`	`double* restrict x_vals = (double*)(x->vals);`
`13`	`13`
`14`		`- #pragma omp parallel for schedule(static)`
`15`		`- for (int32_t py = 0; py < y1_dimension; py++) {`
`16`		`- y_vals[py] = 0.0;`
`17`		`- }`
`18`		`-`
`19`	`14`	`#pragma omp parallel for schedule(runtime)`
`20`	`15`	`for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) {`
`21`	`16`	`for (int32_t i1 = 0; i1 < 32; i1++) {`
`@@ -28,7 +23,7 @@ int compute(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {`
`28`	`23`	`int32_t j = A2_crd[jA];`
`29`	`24`	`tjy_val += A_vals[jA] * x_vals[j];`
`30`	`25`	`}`
`31`		`- y_vals[i] = y_vals[i] + tjy_val;`
	`26`	`+ y_vals[i] = tjy_val;`
`32`	`27`	`}`
`33`	`28`	`}`
`34`	`29`	`return 0;`