Commit 735be6f
Author: Jessica Shi
Commit message: add some more features
1 parent dbbfdb8

5 files changed: +229 additions, -137 deletions

codegen.html

Lines changed: 14 additions & 3 deletions

@@ -23,6 +23,7 @@
     <script src="javascripts/parser.js"></script>
     <script src="javascripts/parser-indices.js"></script>
     <script src="javascripts/parser-accesses.js"></script>
+    <script src="javascripts/default-schedules.js"></script>
     <script src="javascripts/demo.js"></script>
     <script type="text/x-mathjax-config">
       MathJax.Hub.Config({tex2jax: {inlineMath: [['$','$'], ['\\(','\\)']]}});
@@ -132,12 +133,22 @@ <h6 style="margin-bottom: 0px; margin-top: 18px">Input a tensor algebra expressi
     <div class="mdl-grid" style="padding-top: 6px">
       <div class="mdl-layout-spacer"></div>
       <div class="mdl-cell mdl-cell--9-col">
+        <!-- <div>
+          <span style="font-size: 14px;margin-left: 50px">Documentation on the scheduling language can be found here [URL to come].</span>
+        </div> -->
         <div>
-          <button id="btnSchedule" class="mdl-button mdl-js-button mdl-button--raised mdl-js-ripple-effect demo-btn" style="margin-bottom:10px; width: 30%">
+          <button id="btnSchedule" class="mdl-button mdl-js-button mdl-button--raised mdl-js-ripple-effect demo-btn" style="margin-bottom: 10px; width: 30%">
             Add Scheduling Command
           </button>
-          <div class="mdl-textfield" style="width: 69%">
-            <span style="font-size: 14px;margin-left: 50px">Documentation on the scheduling language can be found here [URL to come].</span>
+
+          <div id="btnDefaults" style="float: right">
+            <button id="btnCPU" class="mdl-button mdl-js-button mdl-button--raised mdl-js-ripple-effect demo-btn" style="margin-right: 10px; width: inherit">
+              SpMV CPU
+            </button>
+
+            <button id="btnGPU" class="mdl-button mdl-js-button mdl-button--raised mdl-js-ripple-effect demo-btn" style="width: inherit">
+              SpMV GPU
+            </button>
           </div>
         </div>
       <table class="mdl-data-table mdl-js-data-table" style="width: 100%;
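The two new buttons (`btnCPU`, `btnGPU`) presumably load the default SpMV schedules defined in the new `javascripts/default-schedules.js` into the demo. As a rough sketch of the lookup side of that wiring (the helper `getDefaultSchedule` and the abbreviated tables below are hypothetical, not from this commit; the real click handlers would live in `javascripts/demo.js`):

```javascript
// Abbreviated, hypothetical copy of the shape of default_CPU_schedules /
// default_GPU_schedules from javascripts/default-schedules.js.
var cpuSchedules = {
  spmv: [
    { command: "split", parameters: ["i", "i0", "i1", 32] },
    { command: "reorder", numReordered: 3, parameters: ["i0", "i1", "j"] },
    { command: "parallelize", parameters: ["i0", "CPU Thread", "No Races"] }
  ]
};
var gpuSchedules = { spmv: [] };  // placeholder; see default-schedules.js

// Return the command list for a kernel on a target, or [] if none is defined.
function getDefaultSchedule(target, kernel) {
  var table = target === "GPU" ? gpuSchedules : cpuSchedules;
  return table[kernel] || [];
}
```

A button's click handler could then iterate over `getDefaultSchedule("CPU", "spmv")` and add one scheduling-command row to the table per entry.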

documentation/docs/scheduling.md

Lines changed: 44 additions & 22 deletions

@@ -92,17 +92,16 @@ for (int32_t i0 = 0; i0 < ((A1_dimension + 15) / 16); i0++) {
 }
 ```
 
-# Divide
+<!-- (not yet implemented) -->
+<!-- # Divide
 
-The `divide(i, i0, i1, divideFactor)` transformation divides an index variable `i` into two nested index variables `i0` and `i1`. The size of the outer index variable `i0` is then held constant at `divideFactor`, which must be a positive integer.
-
-[TODO example, divide not implemented yet.]
+The `divide(i, i0, i1, divideFactor)` transformation divides an index variable `i` into two nested index variables `i0` and `i1`. The size of the outer index variable `i0` is then held constant at `divideFactor`, which must be a positive integer. -->
 
 # Precompute
 
 The `precompute(expr, i, iw, workspace)` transformation, which is described in more detail [here](http://tensor-compiler.org/taco-workspaces.pdf), leverages scratchpad memories and reorders computations to increase locality.
 
-Given a subexpression `expr` to precompute, an index variable `i` to precompute over, and an index variable `iw` (can be the same or different as `i`) to precompute with, the precomputed results are stored in the tensor variable `workspace`.
+Given a subexpression `expr` to precompute, an index variable `i` to precompute over, and an index variable `iw` (which can be the same or different as `i`) to precompute with, the precomputed results are stored in the tensor variable `workspace`.
 
 For the SpMV example, if `rhs` is the right hand side of the original statement, we could have:
 ```c++
@@ -165,31 +164,54 @@ for (int32_t ibound = 0; ibound < 100; ibound++) {
 
 The `unroll(i, unrollFactor)` transformation unrolls the loop corresponding to an index variable `i` by `unrollFactor` number of iterations, where `unrollFactor` is a positive integer.
 
-[TODO example, can't get unroll to work?]
+For the SpMV example, we could have
+```c++
+stmt = stmt.split(i, i0, i1, 32);
+stmt = stmt.unroll(i0, 4);
+```
+```c
+if ((((A1_dimension + 31) / 32) * 32 + 32) + (((A1_dimension + 31) / 32) * 32 + 32) >= A1_dimension) {
+  for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) {
+    for (int32_t i1 = 0; i1 < 32; i1++) {
+      int32_t i = i0 * 32 + i1;
+      if (i >= A1_dimension)
+        continue;
+
+      for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
+        int32_t j = A2_crd[jA];
+        y_vals[i] = y_vals[i] + A_vals[jA] * x_vals[j];
+      }
+    }
+  }
+}
+else {
+  #pragma unroll 4
+  for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) {
+    for (int32_t i1 = 0; i1 < 32; i1++) {
+      int32_t i = i0 * 32 + i1;
+      for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
+        int32_t j = A2_crd[jA];
+        y_vals[i] = y_vals[i] + A_vals[jA] * x_vals[j];
+      }
+    }
+  }
+}
+```
 
 # Parallelize
 
-The `parallelize(i, parallel_unit, output_race_strategy)` transformation tags an index variable `i` for parallel execution on hardware type `parallel_unit`. Data races are handled by an `output_race_strategy`.
+The `parallelize(i, parallel_unit, output_race_strategy)` transformation tags an index variable `i` for parallel execution on hardware type `parallel_unit`. Data races are handled by an `output_race_strategy`. Since the other transformations expect serial code, `parallelize` must come last in a series of transformations.
 
-Since the other transformations expect serial code, `parallelize` must come last in a series of transformations. For the SpMV example, we could have
+For the SpMV example, we could have
 ```c++
-IndexVar i0("i0"), i1("i1");
-stmt = stmt.split(i, i0, i1, 32);
-stmt = stmt.reorder({i0, i1, j});
-stmt = stmt.parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
+stmt = stmt.parallelize(i, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
 ```
 ```c
 #pragma omp parallel for schedule(runtime)
-for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) {
-  for (int32_t i1 = 0; i1 < 32; i1++) {
-    int32_t i = i0 * 32 + i1;
-    if (i >= A1_dimension)
-      continue;
-
-    for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
-      int32_t j = A2_crd[jA];
-      y_vals[i] = y_vals[i] + A_vals[jA] * x_vals[j];
-    }
+for (int32_t i = 0; i < A1_dimension; i++) {
+  for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
+    int32_t j = A2_crd[jA];
+    y_vals[i] = y_vals[i] + A_vals[jA] * x_vals[j];
   }
 }
 ```
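The updated documentation states that `parallelize` must come last in a series of transformations. That ordering rule can be checked mechanically against command lists shaped like the ones this commit adds in `javascripts/default-schedules.js`; the checker below is a hypothetical sketch, not part of the commit:

```javascript
// Verify that once a "parallelize" command appears in a schedule, every later
// command is also "parallelize" (several trailing parallelize commands are
// fine, as in the default GPU SpMV schedule).
function parallelizeComesLast(schedule) {
  var seen = false;
  for (var i = 0; i < schedule.length; i++) {
    if (schedule[i].command === "parallelize") {
      seen = true;
    } else if (seen) {
      return false;  // a serial transformation follows a parallelize
    }
  }
  return true;
}
```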

javascripts/default-schedules.js

Lines changed: 115 additions & 0 deletions

@@ -0,0 +1,115 @@
+
+var NNZ_PER_THREAD = 8;
+var WARP_SIZE = 32;
+var BLOCK_SIZE = 256;
+
+var default_CPU_schedules = {
+  spmv: [
+    {
+      command: "split",
+      parameters: ["i", "i0", "i1", 32]
+    },
+    {
+      command: "reorder",
+      numReordered: 3,
+      parameters: ["i0", "i1", "j"]
+    },
+    {
+      command: "parallelize",
+      parameters: ["i0", "CPU Thread", "No Races"]
+    }
+  ],
+  add: [],
+  ttv: [
+    {
+      command: "fuse",
+      parameters: ["i", "j", "f"]
+    },
+    {
+      command: "pos",
+      parameters: ["f", "fpos", "B"]
+    },
+    {
+      command: "split",
+      parameters: ["fpos", "chunk", "fpos2", 8]
+    },
+    {
+      command: "reorder",
+      numReordered: 3,
+      parameters: ["chunk", "fpos2", "k"]
+    },
+    {
+      command: "parallelize",
+      parameters: ["chunk", "CPU Thread", "No Races"]
+    }
+  ],
+  mttkrp: [
+    {
+      command: "reorder",
+      numReordered: 4,
+      parameters: ["i", "k", "l", "j"]
+    },
+    {
+      command: "precompute",
+      parameters: ["j", "j", "B(i,k,l) * D(l,j)"]
+    },
+    {
+      command: "split",
+      parameters: ["i", "i0", "i1", 32]
+    },
+    {
+      command: "parallelize",
+      parameters: ["i0", "CPU Thread", "No Races"]
+    }
+  ]
+}
+
+var default_GPU_schedules = {
+  spmv: [
+    {
+      command: "fuse",
+      parameters: ["i", "j", "f"]
+    },
+    {
+      command: "pos",
+      parameters: ["f", "fpos", "A"]
+    },
+    {
+      command: "split",
+      parameters: ["fpos", "block", "fpos1", NNZ_PER_THREAD * BLOCK_SIZE]
+    },
+    {
+      command: "split",
+      parameters: ["fpos1", "warp", "fpos2", NNZ_PER_THREAD * WARP_SIZE]
+    },
+    {
+      command: "split",
+      parameters: ["fpos2", "thread", "thr_nz", NNZ_PER_THREAD]
+    },
+    {
+      command: "reorder",
+      numReordered: 4,
+      parameters: ["block", "warp", "thread", "thr_nz"]
+    },
+    {
+      command: "precompute",
+      parameters: ["thr_nz", "thr_nz_pre", "A(i, j) * x(j)"]
+    },
+    {
+      command: "unroll",
+      parameters: ["thr_nz_pre", NNZ_PER_THREAD]
+    },
+    {
+      command: "parallelize",
+      parameters: ["block", "GPU Block", "Ignore Races"]
+    },
+    {
+      command: "parallelize",
+      parameters: ["warp", "GPU Warp", "Ignore Races"]
+    },
+    {
+      command: "parallelize",
+      parameters: ["thread", "GPU Thread", "Atomics"]
+    }
+  ]
+}
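Each schedule entry pairs a scheduling-language command name with its argument list, so rendering one as a command string is a one-liner; it also makes the GPU SpMV split factors concrete (8 * 256 = 2048 nonzeros per GPU block, 8 * 32 = 256 per warp). The pretty-printer `formatCommand` below is a hypothetical sketch, not part of this commit:

```javascript
// Hypothetical pretty-printer: render one schedule entry as a
// scheduling-language call, e.g. "split(fpos, block, fpos1, 2048)".
function formatCommand(entry) {
  return entry.command + "(" + entry.parameters.join(", ") + ")";
}

// Constants mirrored from default-schedules.js; the split factors below
// evaluate to the concrete per-block and per-warp nonzero counts.
var NNZ_PER_THREAD = 8, WARP_SIZE = 32, BLOCK_SIZE = 256;
var blockSplit = {
  command: "split",
  parameters: ["fpos", "block", "fpos1", NNZ_PER_THREAD * BLOCK_SIZE]
};
```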
