tensor-compiler
diff --git a/codegen.html b/codegen.html
Lines changed: 31 additions & 0 deletions
diff --git a/examples/spmv_assembly.c b/examples/spmv_assembly.c
Lines changed: 2 additions & 2 deletions
diff --git a/examples/spmv_compute.c b/examples/spmv_compute.c
Lines changed: 17 additions & 8 deletions
diff --git a/examples/spmv_full.c b/examples/spmv_full.c
Lines changed: 35 additions & 17 deletions
@@ -21,6 +21,8 @@
     <script src="javascripts/jquery.ui.touch-punch.min.js"></script>
     <script src="javascripts/FileSaver.min.js"></script>
     <script src="javascripts/parser.js"></script>
+    <script src="javascripts/parser-indices.js"></script>
+    <script src="javascripts/default-schedules.js"></script>
     <script src="javascripts/demo.js"></script>
     <script type="text/x-mathjax-config">
       MathJax.Hub.Config({tex2jax: {inlineMath: [['$','$'], ['\\(','\\)']]}});
@@ -126,6 +128,35 @@ <h6 style="margin-bottom: 0px; margin-top: 18px">Input a tensor algebra expressi
             </div>
             <div class="mdl-layout-spacer"></div>
           </div>
+
+          <div class="mdl-grid" style="padding-top: 6px">
+            <div class="mdl-layout-spacer"></div>
+            <div class="mdl-cell mdl-cell--9-col">
+              <div>
+                <button id="btnSchedule" class="mdl-button mdl-js-button mdl-button--raised mdl-js-ripple-effect demo-btn" style="margin-bottom: 10px; width: 30%">
+                  Add Scheduling Command
+                </button>
+
+                <span style="font-size: 12px; margin-left: 30px">Documentation on the scheduling language can be found <a href="http://tensor-compiler.org/docs/scheduling/index.html">here</a>.</span>
+
+                <div id="btnDefaults" style="float: right">
+                  <button id="btnCPU" class="mdl-button mdl-js-button mdl-button--raised mdl-js-ripple-effect demo-btn" style="margin-right: 10px; width: inherit">
+                    SpMV CPU
+                  </button>
+
+                  <button id="btnGPU" class="mdl-button mdl-js-button mdl-button--raised mdl-js-ripple-effect demo-btn" style="width: inherit">
+                    SpMV GPU
+                  </button>
+                </div>
+              </div>
+              <table class="mdl-data-table mdl-js-data-table" style="width: 100%; margin-bottom: 8px">
+                <tbody id="tblSchedule">
+                </tbody>
+              </table>
+            </div>
+            <div class="mdl-layout-spacer"></div>
+          </div>
+
           <div class="mdl-grid" style="padding-top: 6px">
             <div class="mdl-layout-spacer"></div>
             <div class="mdl-cell mdl-cell--9-col">
 
@@ -1,5 +1,5 @@
 // Generated by the Tensor Algebra Compiler (tensor-compiler.org)
-// taco "y(i)=A(i,j)*x(j)" -f=y:d:0 -f=A:ds:0,1 -f=x:d:0 -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
+// taco "y(i)=A(i,j)*x(j)" -f=y:d:0 -f=A:ds:0,1 -f=x:d:0 -s=split(i,i0,i1,32) -s=reorder(i0,i1,j) -s=parallelize(i0,CPUThread,NoRaces) -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
 
 int assemble(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
   int y1_dimension = (int)(y->dimensions[0]);
@@ -9,4 +9,4 @@ int assemble(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
 
   y->vals = (uint8_t*)y_vals;
   return 0;
-}
+}
@@ -1,5 +1,5 @@
 // Generated by the Tensor Algebra Compiler (tensor-compiler.org)
-// taco "y(i)=A(i,j)*x(j)" -f=y:d:0 -f=A:ds:0,1 -f=x:d:0 -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
+// taco "y(i)=A(i,j)*x(j)" -f=y:d:0 -f=A:ds:0,1 -f=x:d:0 -s=split(i,i0,i1,32) -s=reorder(i0,i1,j) -s=parallelize(i0,CPUThread,NoRaces) -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
 
 int compute(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
   int y1_dimension = (int)(y->dimensions[0]);
@@ -11,14 +11,23 @@ int compute(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
   int x1_dimension = (int)(x->dimensions[0]);
   double* restrict x_vals = (double*)(x->vals);
 
+  #pragma omp parallel for schedule(static)
+  for (int32_t py = 0; py < y1_dimension; py++) {
+    y_vals[py] = 0.0;
+  }
+
   #pragma omp parallel for schedule(runtime)
-  for (int32_t i = 0; i < A1_dimension; i++) {
-    double y_val = 0.0;
-    for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
-      int32_t j = A2_crd[jA];
-      y_val += A_vals[jA] * x_vals[j];
+  for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) {
+    for (int32_t i1 = 0; i1 < 32; i1++) {
+      int32_t i = i0 * 32 + i1;
+      if (i >= A1_dimension)
+        continue;
+
+      for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
+        int32_t j = A2_crd[jA];
+        y_vals[i] = y_vals[i] + A_vals[jA] * x_vals[j];
+      }
     }
-    y_vals[i] = y_val;
   }
   return 0;
-}
+}
@@ -1,5 +1,5 @@
 // Generated by the Tensor Algebra Compiler (tensor-compiler.org)
-// taco "y(i)=A(i,j)*x(j)" -f=y:d:0 -f=A:ds:0,1 -f=x:d:0 -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
+// taco "y(i)=A(i,j)*x(j)" -f=y:d:0 -f=A:ds:0,1 -f=x:d:0 -s=split(i,i0,i1,32) -s=reorder(i0,i1,j) -s=parallelize(i0,CPUThread,NoRaces) -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
 #ifndef TACO_C_HEADERS
 #define TACO_C_HEADERS
 #include <stdio.h>
@@ -118,14 +118,23 @@ int compute(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
   int x1_dimension = (int)(x->dimensions[0]);
   double* restrict x_vals = (double*)(x->vals);
 
+  #pragma omp parallel for schedule(static)
+  for (int32_t py = 0; py < y1_dimension; py++) {
+    y_vals[py] = 0.0;
+  }
+
   #pragma omp parallel for schedule(runtime)
-  for (int32_t i = 0; i < A1_dimension; i++) {
-    double y_val = 0.0;
-    for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
-      int32_t j = A2_crd[jA];
-      y_val += A_vals[jA] * x_vals[j];
+  for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) {
+    for (int32_t i1 = 0; i1 < 32; i1++) {
+      int32_t i = i0 * 32 + i1;
+      if (i >= A1_dimension)
+        continue;
+
+      for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
+        int32_t j = A2_crd[jA];
+        y_vals[i] = y_vals[i] + A_vals[jA] * x_vals[j];
+      }
     }
-    y_vals[i] = y_val;
   }
   return 0;
 }
@@ -153,14 +162,23 @@ int evaluate(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
   int32_t y_capacity = y1_dimension;
   y_vals = (double*)malloc(sizeof(double) * y_capacity);
 
+  #pragma omp parallel for schedule(static)
+  for (int32_t py = 0; py < y_capacity; py++) {
+    y_vals[py] = 0.0;
+  }
+
   #pragma omp parallel for schedule(runtime)
-  for (int32_t i = 0; i < A1_dimension; i++) {
-    double y_val = 0.0;
-    for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
-      int32_t j = A2_crd[jA];
-      y_val += A_vals[jA] * x_vals[j];
+  for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) {
+    for (int32_t i1 = 0; i1 < 32; i1++) {
+      int32_t i = i0 * 32 + i1;
+      if (i >= A1_dimension)
+        continue;
+
+      for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
+        int32_t j = A2_crd[jA];
+        y_vals[i] = y_vals[i] + A_vals[jA] * x_vals[j];
+      }
     }
-    y_vals[i] = y_val;
   }
 
   y->vals = (uint8_t*)y_vals;
@@ -218,12 +236,12 @@ int pack_A(taco_tensor_t *A, int* A_COO1_pos, int* A_COO1_crd, int* A_COO2_crd,
         jA_COO++;
       }
       if (A_capacity <= jA) {
-        A_vals = (double*)realloc(A_vals, sizeof(double) * (A_capacity * 2));
+        A_vals = (double*)realloc(A_vals, sizeof(double) * A_capacity * 2);
         A_capacity *= 2;
       }
       A_vals[jA] = A_COO_val;
       if (A2_crd_size <= jA) {
-        A2_crd = (int32_t*)realloc(A2_crd, sizeof(int32_t) * (A2_crd_size * 2));
+        A2_crd = (int32_t*)realloc(A2_crd, sizeof(int32_t) * A2_crd_size * 2);
         A2_crd_size *= 2;
       }
       A2_crd[jA] = j;
@@ -294,12 +312,12 @@ int unpack(int** y_COO1_pos_ptr, int** y_COO1_crd_ptr, double** y_COO_vals_ptr,
 
   for (int32_t i = 0; i < y1_dimension; i++) {
     if (y_COO_capacity <= iy_COO) {
-      y_COO_vals = (double*)realloc(y_COO_vals, sizeof(double) * (y_COO_capacity * 2));
+      y_COO_vals = (double*)realloc(y_COO_vals, sizeof(double) * y_COO_capacity * 2);
       y_COO_capacity *= 2;
     }
     y_COO_vals[iy_COO] = y_vals[i];
     if (y_COO1_crd_size <= iy_COO) {
-      y_COO1_crd = (int32_t*)realloc(y_COO1_crd, sizeof(int32_t) * (y_COO1_crd_size * 2));
+      y_COO1_crd = (int32_t*)realloc(y_COO1_crd, sizeof(int32_t) * y_COO1_crd_size * 2);
       y_COO1_crd_size *= 2;
     }
     y_COO1_crd[iy_COO] = i;