Open
Description
Original code:
/* Array extents: LEN elements for the 1-D arrays, LEN2 x LEN2 for the 2-D arrays. */
#define LEN 32000
#define LEN2 256
/* Base repetition count; s351 runs its outer loop 8*ntimes times. */
static int ntimes = 200000;
/* Global work arrays. s351 reads b[] and c[0] and updates a[]; the rest are only passed to dummy(). */
float a[LEN], b[LEN], c[LEN], d[LEN], e[LEN];
float aa[LEN2][LEN2], bb[LEN2][LEN2], cc[LEN2][LEN2], dd[LEN2][LEN2];
/*
 * Declared here but not defined in this snippet; s351 calls it once per outer iteration.
 * NOTE(review): presumably an opaque call that keeps the compiler from collapsing the
 * repeated passes -- confirm against its definition.
 */
int dummy(float[LEN], float[LEN], float[LEN], float[LEN], float[LEN],
float[LEN2][LEN2], float[LEN2][LEN2], float[LEN2][LEN2], float);
/*
 * Original (hand-unrolled) kernel: a[i] += alpha * b[i] for every i in [0, LEN).
 *
 * The inner loop advances i by 5 and updates five consecutive elements per
 * iteration. LEN (32000) is a multiple of 5, so a[i + 4] / b[i + 4] never go
 * past the end of the arrays. The whole pass is repeated 8*ntimes times, with
 * a call to dummy() after each pass.
 *
 * Returns 0 unconditionally.
 */
int s351()
{
/* Scale factor is read from c[0] once, before any pass runs. */
float alpha = c[0];
for (int nl = 0; nl < 8*ntimes; nl++) {
/* Manually unrolled by 5: one statement per element of the current group. */
for (int i = 0; i < LEN; i += 5) {
a[i] += alpha * b[i];
a[i + 1] += alpha * b[i + 1];
a[i + 2] += alpha * b[i + 2];
a[i + 3] += alpha * b[i + 3];
a[i + 4] += alpha * b[i + 4];
}
/* The literal 0. is a double; the prototype's last parameter is float. */
dummy(a, b, c, d, e, aa, bb, cc, 0.);
}
return 0;
}
Option:
-Ofast -march=armv8.2-a+sve
With the original code, loop-vectorize applies only interleaving; the loop is not vectorized (VF=1, IC=2).
Manually rerolled version of the original code:
/*
 * Manually rerolled variant of the kernel: a[i] += alpha * b[i] for every i
 * in [0, LEN), one element per inner-loop iteration (i += 1).
 *
 * The pass is repeated 8*ntimes times, with a call to dummy() after each pass.
 *
 * Returns 0 unconditionally.
 */
int s351()
{
/* Scale factor is read from c[0] once, before any pass runs. */
float alpha = c[0];
for (int nl = 0; nl < 8*ntimes; nl++) {
/* Unit-stride loop with a single update statement per iteration. */
for (int i = 0; i < LEN; i += 1) {
a[i] += alpha * b[i];
}
/* The literal 0. is a double; the prototype's last parameter is float. */
dummy(a, b, c, d, e, aa, bb, cc, 0.);
}
return 0;
}
With the manually rerolled code, loop-vectorize vectorizes the loop (VF=vscale x 4, IC=2).
I am filing this issue as a valid use case for the loop-reroll pass.