Merge pull request #11 from jwshi21/sched-docs

stephenchouca · web-flow · commit 9b1205d54bc6 · 2020-08-25T16:17:38.000-04:00
add scheduling documentation
diff --git a/documentation/docs/scheduling.md b/documentation/docs/scheduling.md
@@ -0,0 +1,219 @@
+The scheduling language enables users to specify and compose transformations to further optimize the code generated by taco. 
+
+Consider the following SpMV computation and associated code, which we will transform below:
+```c++
+Format csr({Dense,Sparse});
+Tensor<double> A("A", {512, 64}, csr);
+Tensor<double> x("x", {64}, {Dense});
+Tensor<double> y("y", {512}, {Dense});
+
+IndexVar i("i"), j("j"); 
+Access matrix = A(i, j);
+y(i) = matrix * x(j);
+IndexStmt stmt = y.getAssignment().concretize();
+```
+```c
+for (int32_t i = 0; i < A1_dimension; i++) {
+    for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
+        int32_t j = A2_crd[jA];
+        y_vals[i] = y_vals[i] + A_vals[jA] * x_vals[j];
+    }
+}
+```
+# Pos
+
+The `pos(i, ipos, access)` transformation takes in an index variable `i` that iterates over the coordinate space of `access` and replaces it with a derived index variable `ipos` that iterates over the same iteration range, but with respect to the the position space. 
+
+Since the `pos` transformation is not valid for dense level formats, for the SpMV example, the following would result in an error:
+```c++
+stmt = stmt.pos(i, IndexVar("ipos"), matrix);
+```
+
+We could instead have: 
+```c++
+stmt = stmt.pos(j, IndexVar("jpos"), matrix);
+```
+```c
+for (int32_t i = 0; i < A1_dimension; i++) {
+    for (int32_t jposA = A2_pos[i]; jposA < A2_pos[(i + 1)]; jposA++) {
+      	if (jposA < A2_pos[i] || jposA >= A2_pos[(i + 1)])
+        	continue;
+
+    	int32_t j = A2_crd[jposA];
+    	y_vals[i] = y_vals[i] + A_vals[jposA] * x_vals[j];
+    }
+} 
+```
+
+# Fuse
+
+The `fuse(i, j, f)` transformation takes in two index variables `i` and `j`, where `j` is directly nested under `i`, and collapses them into a fused index variable `f` that iterates over the product of the coordinates `i` and `j`. 
+
+`fuse` helps facilitate other transformations, such as iterating over the position space of several index variables, as in this SpMV example: 
+```c++
+IndexVar f("f");
+stmt = stmt.fuse(i, j, f);
+stmt = stmt.pos(f, IndexVar("fpos"), matrix);
+```
+```c
+for (int32_t fposA = 0; fposA < A2_pos[A1_dimension]; fposA++) {
+    if (fposA >= A2_pos[A1_dimension])
+        continue;
+
+    int32_t f = A2_crd[fposA];
+    while (fposA == A2_pos[(i_pos + 1)]) {
+        i_pos++;
+        i = i_pos;
+    }
+    y_vals[i] = y_vals[i] + A_vals[fposA] * x_vals[f];
+}
+```
+
+# Split 
+
+The `split(i, i0, i1, splitFactor)` transformation splits (strip-mines) an index variable `i` into two nested index variables `i0` and `i1`. The size of the inner index variable `i1` is then held constant at `splitFactor`, which must be a positive integer.
+
+For the SpMV example, we could have: 
+```c++
+stmt = stmt.split(i, IndexVar("i0"), IndexVar("i1"), 16);
+```
+```c
+for (int32_t i0 = 0; i0 < ((A1_dimension + 15) / 16); i0++) {
+    for (int32_t i1 = 0; i1 < 16; i1++) {
+      	int32_t i = i0 * 16 + i1;
+      	if (i >= A1_dimension)
+        	continue;
+
+    	for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
+        	int32_t j = A2_crd[jA];
+        	y_vals[i] = y_vals[i] + A_vals[jA] * x_vals[j];
+    	}
+    }
+}
+```
+
+<!-- (not yet implemented) -->
+<!-- # Divide
+
+The `divide(i, i0, i1, divideFactor)` transformation divides an index variable `i` into two nested index variables `i0` and `i1`. The size of the outer index variable `i0` is then held constant at `divideFactor`, which must be a positive integer.  -->
+
+# Precompute
+
+The `precompute(expr, i, iw, workspace)` transformation, which is described in more detail [here](http://tensor-compiler.org/taco-workspaces.pdf), leverages scratchpad memories and reorders computations to  increase locality. 
+
+Given a subexpression `expr` to precompute, an index variable `i` to precompute over, and an index variable `iw` (which can be the same or different as `i`) to precompute with, the precomputed results are stored in the tensor variable `workspace`. 
+
+For the SpMV example, if `rhs` is the right hand side of the original statement, we could have: 
+```c++
+TensorVar workspace("workspace", Type(Float64, {Dimension(64)}), taco::dense);
+stmt = stmt.precompute(rhs, j, j, workspace);
+```
+```c
+for (int32_t i = 0; i < A1_dimension; i++) {
+    double* restrict workspace = 0;
+    workspace = (double*)malloc(sizeof(double) * 64);
+    for (int32_t pworkspace = 0; pworkspace < 64; pworkspace++) {
+        workspace[pworkspace] = 0.0;
+    }
+    for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
+        int32_t j = A2_crd[jA];
+        workspace[j] = A_vals[jA] * x_vals[j];
+    }
+    for (int32_t j = 0; j < ; j++) {
+        y_vals[i] = y_vals[i] + workspace[j];
+    }
+    free(workspace);
+  }
+```
+
+# Reorder
+
+The `reorder(vars)` transformation takes in a new ordering for a set of index variables in the expression that are directly nested in the iteration order. 
+
+For the SpMV example, we could have: 
+```c++
+stmt = stmt.reorder({j, i});
+```
+```c
+for (int32_t jA = A2_pos[iA]; jA < A2_pos[(iA + 1)]; jA++) {
+    int32_t j = A2_crd[jA];
+    for (int32_t i = 0; i < A1_dimension; i++) {
+    	y_vals[i] = y_vals[i] + A_vals[jA] * x_vals[j];
+    }
+ }
+```
+
+# Bound
+
+The `bound(i, ibound, bound, bound_type)` transformation replaces an index variable `i` with an index variable `ibound` that obeys a compile-time constraint on its iteration space, incorporating knowledge about the size or structured sparsity pattern of the corresponding input. The meaning of `bound` depends on the `bound_type`.
+
+For the SpMV example, we could have
+```c++
+stmt = stmt.bound(i, IndexVar("ibound"), 100, BoundType::MaxExact); 
+```
+```c
+for (int32_t ibound = 0; ibound < 100; ibound++) {
+    for (int32_t jA = A2_pos[ibound]; jA < A2_pos[(ibound + 1)]; jA++) {
+        int32_t j = A2_crd[jA];
+        y_vals[ibound] = y_vals[ibound] + A_vals[jA] * x_vals[j];
+    }
+}
+```
+
+# Unroll
+
+The `unroll(i, unrollFactor)` transformation unrolls the loop corresponding to an index variable `i` by `unrollFactor` number of iterations, where `unrollFactor` is a positive integer. 
+
+For the SpMV example, we could have
+```c++
+stmt = stmt.split(i, i0, i1, 32);
+stmt = stmt.unroll(i0, 4);
+```
+```c
+if ((((A1_dimension + 31) / 32) * 32 + 32) + (((A1_dimension + 31) / 32) * 32 + 32) >= A1_dimension) {
+    for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) {
+        for (int32_t i1 = 0; i1 < 32; i1++) {
+            int32_t i = i0 * 32 + i1;
+            if (i >= A1_dimension)
+                continue;
+
+            for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
+                int32_t j = A2_crd[jA];
+                y_vals[i] = y_vals[i] + A_vals[jA] * x_vals[j];
+            }
+        }
+    }
+}
+else {
+    #pragma unroll 4
+    for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) {
+        for (int32_t i1 = 0; i1 < 32; i1++) {
+            int32_t i = i0 * 32 + i1;
+            for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
+                int32_t j = A2_crd[jA];
+                y_vals[i] = y_vals[i] + A_vals[jA] * x_vals[j];
+            }
+        }
+    }
+}
+```
+
+# Parallelize
+
+The `parallelize(i, parallel_unit, output_race_strategy)` transformation tags an index variable `i` for parallel execution on hardware type `parallel_unit`. Data races are handled by an `output_race_strategy`. Since the other transformations expect serial code, `parallelize` must come last in a series of transformations. 
+
+For the SpMV example, we could have
+```c++
+stmt = stmt.parallelize(i, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces);
+```
+```c
+#pragma omp parallel for schedule(runtime)
+for (int32_t i = 0; i < A1_dimension; i++) {
+    for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
+        int32_t j = A2_crd[jA];
+        y_vals[i] = y_vals[i] + A_vals[jA] * x_vals[j];
+    }
+}
+```
+
+
diff --git a/documentation/mkdocs.yml b/documentation/mkdocs.yml
@@ -15,6 +15,7 @@ pages:
     - C++ Library:
         - 'Defining Tensors': 'tensors.md'
         - 'Computing on Tensors': 'computations.md'
+        - 'Providing a Schedule': 'scheduling.md'
     - Python Library:
         - 'Tutorial': 'tutorial.md'
         - 'Defining Tensors': 'pytensors.md'