Skip to content

Commit 1bcf55e

Browse files
Merge pull request #12 from jwshi21/master
add scheduling language selection
2 parents aad76b3 + ba85b80 commit 1bcf55e

11 files changed

+2170
-63
lines changed

codegen.html

+31
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,8 @@
2121
<script src="javascripts/jquery.ui.touch-punch.min.js"></script>
2222
<script src="javascripts/FileSaver.min.js"></script>
2323
<script src="javascripts/parser.js"></script>
24+
<script src="javascripts/parser-indices.js"></script>
25+
<script src="javascripts/default-schedules.js"></script>
2426
<script src="javascripts/demo.js"></script>
2527
<script type="text/x-mathjax-config">
2628
MathJax.Hub.Config({tex2jax: {inlineMath: [['$','$'], ['\\(','\\)']]}});
@@ -126,6 +128,35 @@ <h6 style="margin-bottom: 0px; margin-top: 18px">Input a tensor algebra expressi
126128
</div>
127129
<div class="mdl-layout-spacer"></div>
128130
</div>
131+
132+
<div class="mdl-grid" style="padding-top: 6px">
133+
<div class="mdl-layout-spacer"></div>
134+
<div class="mdl-cell mdl-cell--9-col">
135+
<div>
136+
<button id="btnSchedule" class="mdl-button mdl-js-button mdl-button--raised mdl-js-ripple-effect demo-btn" style="margin-bottom: 10px; width: 30%">
137+
Add Scheduling Command
138+
</button>
139+
140+
<span style="font-size: 12px; margin-left: 30px">Documentation on the scheduling language can be found <a href="http://tensor-compiler.org/docs/scheduling/index.html">here</a>.</span>
141+
142+
<div id="btnDefaults" style="float: right">
143+
<button id="btnCPU" class="mdl-button mdl-js-button mdl-button--raised mdl-js-ripple-effect demo-btn" style="margin-right: 10px; width: inherit">
144+
SpMV CPU
145+
</button>
146+
147+
<button id="btnGPU" class="mdl-button mdl-js-button mdl-button--raised mdl-js-ripple-effect demo-btn" style="width: inherit">
148+
SpMV GPU
149+
</button>
150+
</div>
151+
</div>
152+
<table class="mdl-data-table mdl-js-data-table" style="width: 100%; margin-bottom: 8px">
153+
<tbody id="tblSchedule">
154+
</tbody>
155+
</table>
156+
</div>
157+
<div class="mdl-layout-spacer"></div>
158+
</div>
159+
129160
<div class="mdl-grid" style="padding-top: 6px">
130161
<div class="mdl-layout-spacer"></div>
131162
<div class="mdl-cell mdl-cell--9-col">

examples/spmv_assembly.c

+2-2
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
// Generated by the Tensor Algebra Compiler (tensor-compiler.org)
2-
// taco "y(i)=A(i,j)*x(j)" -f=y:d:0 -f=A:ds:0,1 -f=x:d:0 -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
2+
// taco "y(i)=A(i,j)*x(j)" -f=y:d:0 -f=A:ds:0,1 -f=x:d:0 -s=split(i,i0,i1,32) -s=reorder(i0,i1,j) -s=parallelize(i0,CPUThread,NoRaces) -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
33

44
int assemble(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
55
int y1_dimension = (int)(y->dimensions[0]);
@@ -9,4 +9,4 @@ int assemble(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
99

1010
y->vals = (uint8_t*)y_vals;
1111
return 0;
12-
}
12+
}

examples/spmv_compute.c

+17-8
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
// Generated by the Tensor Algebra Compiler (tensor-compiler.org)
2-
// taco "y(i)=A(i,j)*x(j)" -f=y:d:0 -f=A:ds:0,1 -f=x:d:0 -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
2+
// taco "y(i)=A(i,j)*x(j)" -f=y:d:0 -f=A:ds:0,1 -f=x:d:0 -s=split(i,i0,i1,32) -s=reorder(i0,i1,j) -s=parallelize(i0,CPUThread,NoRaces) -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
33

44
int compute(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
55
int y1_dimension = (int)(y->dimensions[0]);
@@ -11,14 +11,23 @@ int compute(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
1111
int x1_dimension = (int)(x->dimensions[0]);
1212
double* restrict x_vals = (double*)(x->vals);
1313

14+
#pragma omp parallel for schedule(static)
15+
for (int32_t py = 0; py < y1_dimension; py++) {
16+
y_vals[py] = 0.0;
17+
}
18+
1419
#pragma omp parallel for schedule(runtime)
15-
for (int32_t i = 0; i < A1_dimension; i++) {
16-
double y_val = 0.0;
17-
for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
18-
int32_t j = A2_crd[jA];
19-
y_val += A_vals[jA] * x_vals[j];
20+
for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) {
21+
for (int32_t i1 = 0; i1 < 32; i1++) {
22+
int32_t i = i0 * 32 + i1;
23+
if (i >= A1_dimension)
24+
continue;
25+
26+
for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
27+
int32_t j = A2_crd[jA];
28+
y_vals[i] = y_vals[i] + A_vals[jA] * x_vals[j];
29+
}
2030
}
21-
y_vals[i] = y_val;
2231
}
2332
return 0;
24-
}
33+
}

examples/spmv_full.c

+35-17
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
// Generated by the Tensor Algebra Compiler (tensor-compiler.org)
2-
// taco "y(i)=A(i,j)*x(j)" -f=y:d:0 -f=A:ds:0,1 -f=x:d:0 -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
2+
// taco "y(i)=A(i,j)*x(j)" -f=y:d:0 -f=A:ds:0,1 -f=x:d:0 -s=split(i,i0,i1,32) -s=reorder(i0,i1,j) -s=parallelize(i0,CPUThread,NoRaces) -write-source=taco_kernel.c -write-compute=taco_compute.c -write-assembly=taco_assembly.c
33
#ifndef TACO_C_HEADERS
44
#define TACO_C_HEADERS
55
#include <stdio.h>
@@ -118,14 +118,23 @@ int compute(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
118118
int x1_dimension = (int)(x->dimensions[0]);
119119
double* restrict x_vals = (double*)(x->vals);
120120

121+
#pragma omp parallel for schedule(static)
122+
for (int32_t py = 0; py < y1_dimension; py++) {
123+
y_vals[py] = 0.0;
124+
}
125+
121126
#pragma omp parallel for schedule(runtime)
122-
for (int32_t i = 0; i < A1_dimension; i++) {
123-
double y_val = 0.0;
124-
for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
125-
int32_t j = A2_crd[jA];
126-
y_val += A_vals[jA] * x_vals[j];
127+
for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) {
128+
for (int32_t i1 = 0; i1 < 32; i1++) {
129+
int32_t i = i0 * 32 + i1;
130+
if (i >= A1_dimension)
131+
continue;
132+
133+
for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
134+
int32_t j = A2_crd[jA];
135+
y_vals[i] = y_vals[i] + A_vals[jA] * x_vals[j];
136+
}
127137
}
128-
y_vals[i] = y_val;
129138
}
130139
return 0;
131140
}
@@ -153,14 +162,23 @@ int evaluate(taco_tensor_t *y, taco_tensor_t *A, taco_tensor_t *x) {
153162
int32_t y_capacity = y1_dimension;
154163
y_vals = (double*)malloc(sizeof(double) * y_capacity);
155164

165+
#pragma omp parallel for schedule(static)
166+
for (int32_t py = 0; py < y_capacity; py++) {
167+
y_vals[py] = 0.0;
168+
}
169+
156170
#pragma omp parallel for schedule(runtime)
157-
for (int32_t i = 0; i < A1_dimension; i++) {
158-
double y_val = 0.0;
159-
for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
160-
int32_t j = A2_crd[jA];
161-
y_val += A_vals[jA] * x_vals[j];
171+
for (int32_t i0 = 0; i0 < ((A1_dimension + 31) / 32); i0++) {
172+
for (int32_t i1 = 0; i1 < 32; i1++) {
173+
int32_t i = i0 * 32 + i1;
174+
if (i >= A1_dimension)
175+
continue;
176+
177+
for (int32_t jA = A2_pos[i]; jA < A2_pos[(i + 1)]; jA++) {
178+
int32_t j = A2_crd[jA];
179+
y_vals[i] = y_vals[i] + A_vals[jA] * x_vals[j];
180+
}
162181
}
163-
y_vals[i] = y_val;
164182
}
165183

166184
y->vals = (uint8_t*)y_vals;
@@ -218,12 +236,12 @@ int pack_A(taco_tensor_t *A, int* A_COO1_pos, int* A_COO1_crd, int* A_COO2_crd,
218236
jA_COO++;
219237
}
220238
if (A_capacity <= jA) {
221-
A_vals = (double*)realloc(A_vals, sizeof(double) * (A_capacity * 2));
239+
A_vals = (double*)realloc(A_vals, sizeof(double) * A_capacity * 2);
222240
A_capacity *= 2;
223241
}
224242
A_vals[jA] = A_COO_val;
225243
if (A2_crd_size <= jA) {
226-
A2_crd = (int32_t*)realloc(A2_crd, sizeof(int32_t) * (A2_crd_size * 2));
244+
A2_crd = (int32_t*)realloc(A2_crd, sizeof(int32_t) * A2_crd_size * 2);
227245
A2_crd_size *= 2;
228246
}
229247
A2_crd[jA] = j;
@@ -294,12 +312,12 @@ int unpack(int** y_COO1_pos_ptr, int** y_COO1_crd_ptr, double** y_COO_vals_ptr,
294312

295313
for (int32_t i = 0; i < y1_dimension; i++) {
296314
if (y_COO_capacity <= iy_COO) {
297-
y_COO_vals = (double*)realloc(y_COO_vals, sizeof(double) * (y_COO_capacity * 2));
315+
y_COO_vals = (double*)realloc(y_COO_vals, sizeof(double) * y_COO_capacity * 2);
298316
y_COO_capacity *= 2;
299317
}
300318
y_COO_vals[iy_COO] = y_vals[i];
301319
if (y_COO1_crd_size <= iy_COO) {
302-
y_COO1_crd = (int32_t*)realloc(y_COO1_crd, sizeof(int32_t) * (y_COO1_crd_size * 2));
320+
y_COO1_crd = (int32_t*)realloc(y_COO1_crd, sizeof(int32_t) * y_COO1_crd_size * 2);
303321
y_COO1_crd_size *= 2;
304322
}
305323
y_COO1_crd[iy_COO] = i;

0 commit comments

Comments
 (0)