Commit dbbfdb8

Jessica Shi committed: update scheduling documentation
1 parent c2c8d10
3 files changed: +112 -21 lines changed

codegen.html

Lines changed: 8 additions & 3 deletions
@@ -132,9 +132,14 @@ <h6 style="margin-bottom: 0px; margin-top: 18px">Input a tensor algebra expressi
 <div class="mdl-grid" style="padding-top: 6px">
 <div class="mdl-layout-spacer"></div>
 <div class="mdl-cell mdl-cell--9-col">
-<button id="btnSchedule" class="mdl-button mdl-js-button mdl-button--raised mdl-js-ripple-effect demo-btn" style="margin-bottom:10px; width: 30%">
-Add Scheduling Command
-</button>
+<div>
+<button id="btnSchedule" class="mdl-button mdl-js-button mdl-button--raised mdl-js-ripple-effect demo-btn" style="margin-bottom:10px; width: 30%">
+Add Scheduling Command
+</button>
+<div class="mdl-textfield" style="width: 69%">
+<span style="font-size: 14px;margin-left: 50px">Documentation on the scheduling language can be found here [URL to come].</span>
+</div>
+</div>
 <table class="mdl-data-table mdl-js-data-table" style="width: 100%;
 margin-bottom: 8px">
 <tbody id="tblSchedule">

documentation/docs/scheduling.md

Lines changed: 100 additions & 14 deletions
@@ -8,31 +8,30 @@ Tensor<double> x("x", {64}, {Dense});
 Tensor<double> y("y", {512}, {Dense});
 
 IndexVar i("i"), j("j");
-y(i) = A(i, j) * x(j);
+Access matrix = A(i, j);
+y(i) = matrix * x(j);
 IndexStmt stmt = y.getAssignment().concretize();
 ```
 ```c
 for (int32_t i = 0; i < A1_dimension; i++) {
-  double y_val = 0.0;
-  for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
-    int32_t j = A2_crd[jA];
-    y_val += A_vals[jA] * x_vals[j];
-  }
-  y_vals[i] = y_val;
+  for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
+    int32_t j = A2_crd[jA];
+    y_vals[i] = y_vals[i] + A_vals[jA] * x_vals[j];
+  }
 }
 ```
 # Pos
 
-The `pos(i, ipos, access)` transformation takes in an index variable `i` that operates over the coordinate space of `access` and replaces it with a derived index variable `ipos` that operates over the same iteration range, but with respect to the position space.
+The `pos(i, ipos, access)` transformation takes in an index variable `i` that iterates over the coordinate space of `access` and replaces it with a derived index variable `ipos` that iterates over the same iteration range, but with respect to the position space.
 
 Since the `pos` transformation is not valid for dense level formats, for the SpMV example, the following would result in an error:
 ```c++
-stmt = stmt.pos(i, IndexVar("ipos"), A);
+stmt = stmt.pos(i, IndexVar("ipos"), matrix);
 ```
 
 We could instead have:
 ```c++
-stmt = stmt.pos(j, IndexVar("jpos"), A);
+stmt = stmt.pos(j, IndexVar("jpos"), matrix);
 ```
 ```c
 for (int32_t i = 0; i < A1_dimension; i++) {
@@ -50,9 +49,24 @@ for (int32_t i = 0; i < A1_dimension; i++) {
 
 The `fuse(i, j, f)` transformation takes in two index variables `i` and `j`, where `j` is directly nested under `i`, and collapses them into a fused index variable `f` that iterates over the product of the coordinates `i` and `j`.
 
-For the SpMV example, we could have:
+`fuse` helps facilitate other transformations, such as iterating over the position space of several index variables, as in this SpMV example:
 ```c++
-stmt = stmt.fuse(i, j, IndexVar("f"));
+IndexVar f("f");
+stmt = stmt.fuse(i, j, f);
+stmt = stmt.pos(f, IndexVar("fpos"), matrix);
+```
+```c
+for (int32_t fposA = 0; fposA < A2_pos[A1_dimension]; fposA++) {
+  if (fposA >= A2_pos[A1_dimension])
+    continue;
+
+  int32_t f = A2_crd[fposA];
+  while (fposA == A2_pos[(i_pos + 1)]) {
+    i_pos++;
+    i = i_pos;
+  }
+  y_vals[i] = y_vals[i] + A_vals[fposA] * x_vals[f];
+}
 ```
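(In the generated loop above, `i_pos` tracks the position of the current row while the `while` loop recovers its coordinate `i` from `A2_pos`; the declarations of `i_pos` and `i` presumably sit just above the excerpted window of the full generated kernel.)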
 
 # Split
@@ -61,7 +75,7 @@ The `split(i, i0, i1, splitFactor)` transformation splits (strip-mines) an index
 
 For the SpMV example, we could have:
 ```c++
-stmt = stmt.split(j, IndexVar("i0"), IndexVar("i1"), 16);
+stmt = stmt.split(i, IndexVar("i0"), IndexVar("i1"), 16);
 ```
 ```c
 for (int32_t i0 = 0; i0 < ((A1_dimension + 15) / 16); i0++) {
@@ -80,8 +94,39 @@ for (int32_t i0 = 0; i0 < ((A1_dimension + 15) / 16); i0++) {
 
 # Divide
 
+The `divide(i, i0, i1, divideFactor)` transformation divides an index variable `i` into two nested index variables `i0` and `i1`. The size of the outer index variable `i0` is then held constant at `divideFactor`, which must be a positive integer.
+
+[TODO example, divide not implemented yet.]
+
 # Precompute
 
+The `precompute(expr, i, iw, workspace)` transformation, which is described in more detail [here](http://tensor-compiler.org/taco-workspaces.pdf), leverages scratchpad memories and reorders computations to increase locality.
+
+Given a subexpression `expr` to precompute, an index variable `i` to precompute over, and an index variable `iw` (which can be the same as or different from `i`) to precompute with, the precomputed results are stored in the tensor variable `workspace`.
+
+For the SpMV example, if `rhs` is the right-hand side of the original statement, we could have:
+```c++
+TensorVar workspace("workspace", Type(Float64, {Dimension(64)}), taco::dense);
+stmt = stmt.precompute(rhs, j, j, workspace);
+```
+```c
+for (int32_t i = 0; i < A1_dimension; i++) {
+  double* restrict workspace = 0;
+  workspace = (double*)malloc(sizeof(double) * 64);
+  for (int32_t pworkspace = 0; pworkspace < 64; pworkspace++) {
+    workspace[pworkspace] = 0.0;
+  }
+  for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
+    int32_t j = A2_crd[jA];
+    workspace[j] = A_vals[jA] * x_vals[j];
+  }
+  for (int32_t j = 0; j < 64; j++) {
+    y_vals[i] = y_vals[i] + workspace[j];
+  }
+  free(workspace);
+}
+```
+
 # Reorder
 
 The `reorder(vars)` transformation takes in a new ordering for a set of index variables in the expression that are directly nested in the iteration order.
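For the `divide` transformation in the Divide section above, no generated code can be shown yet since `divide` is unimplemented; a hypothetical sketch of the call, assuming only the documented `divide(i, i0, i1, divideFactor)` signature, might read:

```c++
// Hypothetical sketch only: divide is not implemented yet, so this call just
// illustrates the documented divide(i, i0, i1, divideFactor) signature.
IndexVar i0("i0"), i1("i1");
stmt = stmt.divide(i, i0, i1, 4);  // i0 would take exactly 4 values; i1 covers the rest
```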
@@ -101,11 +146,52 @@ for (int32_t jA = A2_pos[iA]; jA < A2_pos[(iA + 1)]; jA++) {
 
 # Bound
 
+The `bound(i, ibound, bound, bound_type)` transformation replaces an index variable `i` with an index variable `ibound` that obeys a compile-time constraint on its iteration space, incorporating knowledge about the size or structured sparsity pattern of the corresponding input. The meaning of `bound` depends on the `bound_type`.
+
+For the SpMV example, we could have
+```c++
+stmt = stmt.bound(i, IndexVar("ibound"), 100, BoundType::MaxExact);
+```
+```c
+for (int32_t ibound = 0; ibound < 100; ibound++) {
+  for (int32_t jA = A2_pos[ibound]; jA < A2_pos[(ibound + 1)]; jA++) {
+    int32_t j = A2_crd[jA];
+    y_vals[ibound] = y_vals[ibound] + A_vals[jA] * x_vals[j];
+  }
+}
+```
+
 # Unroll
 
-# Parallelize
+The `unroll(i, unrollFactor)` transformation unrolls the loop corresponding to an index variable `i` by `unrollFactor` iterations, where `unrollFactor` is a positive integer.
+
+[TODO example, can't get unroll to work?]
 
+# Parallelize
 
+The `parallelize(i, parallel_unit, output_race_strategy)` transformation tags an index variable `i` for parallel execution on hardware type `parallel_unit`. Data races are handled according to the `output_race_strategy`.
 
+Since the other transformations expect serial code, `parallelize` must come last in a series of transformations. For the SpMV example, we could have
+```c++
+IndexVar i0("i0"), i1("i1");
+stmt = stmt.split(i, i0, i1, 32);
+stmt = stmt.reorder({i0, i1, j});
+stmt = stmt.parallelize(i0, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
+```
+```c
+#pragma omp parallel for schedule(runtime)
+for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) {
+  for (int32_t i1 = 0; i1 < 32; i1++) {
+    int32_t i = i0 * 32 + i1;
+    if (i >= A1_dimension)
+      continue;
+
+    for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
+      int32_t j = A2_crd[jA];
+      y_vals[i] = y_vals[i] + A_vals[jA] * x_vals[j];
+    }
+  }
+}
+```
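For the `unroll` transformation in the Unroll section above, the example is still marked TODO; a hypothetical sketch, assuming the documented `unroll(i, unrollFactor)` signature and reusing the split from the parallelize example, might read:

```c++
// Hypothetical sketch only: the unroll example in the docs is still TODO, so
// this just illustrates the documented unroll(i, unrollFactor) signature.
IndexVar i0("i0"), i1("i1");
stmt = stmt.split(i, i0, i1, 32);  // a fixed inner trip count makes unrolling feasible
stmt = stmt.unroll(i1, 4);         // unroll the inner loop by a factor of 4
```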


server/taco_server.py

Lines changed: 4 additions & 4 deletions
@@ -32,7 +32,7 @@ def do_POST(self):
         computePath = prefix + "taco_compute.c"
         assemblyPath = prefix + "taco_assembly.c"
         cmd = tacoPath + " " + cmd + " -write-source=" + writePath + " -write-compute=" + computePath + " -write-assembly=" + assemblyPath
-
+
         try:
             subprocess.check_output(str.split(cmd), timeout=3, stderr=subprocess.STDOUT)
             with open(writePath, 'r') as f:
@@ -52,10 +52,10 @@ def do_POST(self):
             if search is not None:
                 response['error'] = search.group()[3:-1]
             else:
-                response['error'] = 'Expression is currently not supported'
+                response['error'] = 'Expression and/or schedule is currently not supported'
                 logFile = "/home/ubuntu/errors.log"
-        except Exception as e:
-            response['error'] = 'Expression is currently not supported'
+        except:
+            response['error'] = 'Expression and/or schedule is currently not supported'
             logFile = "/home/ubuntu/errors.log"
 
         ip = ".".join(self.client_address[0].split('.')[0:-2]) + ".*.*"
