35
35
#define COL_MAJOR 1
36
36
#endif
37
37
38
+ // #define MATMUL_DEBUG
39
+
38
40
namespace xf {
39
41
namespace dsp {
40
42
namespace aie {
@@ -94,13 +96,28 @@ constexpr loHi getUnTileShuffleOffsetsInt16(unsigned M, unsigned N, unsigned vec
94
96
loHi ret = {.lo = offLo, .hi = offHi};
95
97
return ret;
96
98
}
99
+
100
/**
 * @brief Compile-time choice of how many T_D elements make up one working vector
 *        for a buffer of inRow * inCol elements.
 *
 * The starting point is the number of T_D elements that fit in 512 bits:
 *   - if the whole inRow * inCol buffer is smaller than that, the buffer size is returned;
 *   - if that count divides inCol exactly, or is larger than inCol, it is returned unchanged;
 *   - otherwise it is halved repeatedly until it divides inCol exactly
 *     (a value of 1 always terminates the search).
 *
 * NOTE(review): the reason for starting at 512 bits is not documented here; the caller
 * (doUnTile) separately asserts that the result fits within 1024 bits - confirm the
 * intended register width with the kernel owner.
 *
 * @tparam T_D   element type; sizeof(T_D) must not exceed 64 bytes.
 * @tparam inRow first dimension of the buffer, in elements.
 * @tparam inCol second dimension of the buffer, in elements.
 * @return the selected element count (int, to keep the existing interface).
 */
template <typename T_D, unsigned inRow, unsigned inCol>
static constexpr int getVecSize() {
    constexpr unsigned baseVectorBytes = 512 / 8;
    // Without this guard the element count below would be 0 and `inCol % 0` would be
    // rejected inside the constant expression with a far less helpful diagnostic.
    static_assert(sizeof(T_D) <= baseVectorBytes, "getVecSize: T_D is larger than the 512-bit base vector.");
    constexpr unsigned minVBuffSizeforType = baseVectorBytes / sizeof(T_D);

    if constexpr (minVBuffSizeforType > (inRow * inCol)) {
        // The whole buffer fits in a single base vector.
        return static_cast<int>(inRow * inCol);
    } else if constexpr (inCol % minVBuffSizeforType == 0 || minVBuffSizeforType > inCol) {
        return static_cast<int>(minVBuffSizeforType);
    } else {
        // inCol is larger than the base vector but not a multiple of it: shrink by
        // halving until it divides inCol. Kept unsigned to avoid a mixed-sign modulo.
        unsigned vSize = minVBuffSizeforType;
        while (inCol % vSize != 0) {
            vSize /= 2;
        }
        return static_cast<int>(vSize);
    }
}
115
+
97
116
template <unsigned M, unsigned N, unsigned inRow, unsigned inCol, unsigned leadingDim, typename T_D>
98
117
static void doUnTile (T_D* __restrict inPtr, T_D* outPtr) {
99
118
constexpr unsigned minGranularity = (128 / 8 ) / sizeof (T_D);
100
119
constexpr unsigned loadSize = (N >= minGranularity) ? N : minGranularity;
101
- constexpr unsigned minVBuffSizeforType = (512 / 8 ) / sizeof (T_D);
102
- constexpr unsigned vectorSize = (minVBuffSizeforType > (inRow * inCol)) ? (inRow * inCol) : minVBuffSizeforType;
103
-
120
+ constexpr unsigned vectorSize = getVecSize<T_D, inRow, inCol>();
104
121
// static_assert(N >= minGranularity, "Granularity is awkward");
105
122
static_assert (vectorSize <= (1024 / 8 ) / sizeof (T_D), " calculated vector size too large for vector register." );
106
123
static_assert (!(leadingDim == COL_MAJOR && std::is_same_v<T_D, int16>),
@@ -110,9 +127,6 @@ static void doUnTile(T_D* __restrict inPtr, T_D* outPtr) {
110
127
loHi offsets = std::is_same_v<T_D, int16> ? getUnTileShuffleOffsetsInt16 (M, N, vectorSize, leadingDim)
111
128
: getUnTileShuffleOffsets (M, N, vectorSize, leadingDim);
112
129
113
- // printf("M: %d, N: %d, vectorSize: %d, loadSize: %d, leadingDim: %d\n", M, N, vectorSize, loadSize, leadingDim);
114
- // printf("Offsets: lo : %0X, hi: %0X\n", offsets.lo, offsets.hi);
115
-
116
130
const unsigned loadsPerVector = vectorSize / loadSize;
117
131
const unsigned tilesPerVector = vectorSize / (M * N);
118
132
const unsigned colsPerLoad =
@@ -129,8 +143,7 @@ static void doUnTile(T_D* __restrict inPtr, T_D* outPtr) {
129
143
130
144
const unsigned vectorsPerCol = inRow / rowsPerVector;
131
145
const unsigned vectorsPerRow = inCol / colsPerVector;
132
- // printf("colsPerLoad: %d, rowsPerLoad: %d, colsPerVector: %d, rowsPerVector: %d, vectorsPerCol: %d, vectorsPerRow:
133
- // %d\n",colsPerLoad, rowsPerLoad, colsPerVector, rowsPerVector, vectorsPerCol, vectorsPerRow );
146
+
134
147
// Loop through a row first if row major
135
148
const unsigned outerLoopCount = (leadingDim == ROW_MAJOR) ? vectorsPerCol : vectorsPerRow;
136
149
const unsigned innerLoopCount = (leadingDim == ROW_MAJOR) ? vectorsPerRow : vectorsPerCol;
@@ -145,9 +158,6 @@ static void doUnTile(T_D* __restrict inPtr, T_D* outPtr) {
145
158
const unsigned outerDimStoreIncr = (leadingDim == ROW_MAJOR) ? inCol : inRow;
146
159
const unsigned innerDimStoreIncr = storeSize;
147
160
148
- // printf("outerLoopCount: %d, innerLoopCount: %d, outerDimPerVector: %d, innerDimPerVector: %d, storeSize: %d \n",
149
- // outerLoopCount, innerLoopCount, outerDimPerVector, innerDimPerVector, storeSize);
150
-
151
161
const bool shuffleIsNeeded = (leadingDim == COL_MAJOR) || ((leadingDim == ROW_MAJOR) && (loadSize > N));
152
162
153
163
for (unsigned outerDimIdx = 0 ; outerDimIdx < outerLoopCount; ++outerDimIdx)
@@ -161,8 +171,6 @@ static void doUnTile(T_D* __restrict inPtr, T_D* outPtr) {
161
171
for (unsigned innerDimIdx = 0 ; innerDimIdx < innerLoopCount; ++innerDimIdx)
162
172
chess_loop_count ((innerLoopCount)) chess_prepare_for_pipelining {
163
173
const unsigned ptrInnerBase = innerDimIdx * innerLoopIncr;
164
- // printf("outerDimIdx: %d, ptrOuterBase: %d, innerDimIdx: %d, ptrInnerBase: %d\n",outerDimIdx,
165
- // ptrOuterBase, innerDimIdx, ptrInnerBase);
166
174
167
175
aie::vector<T_D, vectorSize> vec;
168
176
@@ -188,29 +196,17 @@ static void doUnTile(T_D* __restrict inPtr, T_D* outPtr) {
188
196
loadSize * loadIdx; // unlikely
189
197
190
198
const unsigned loadPtr = innerLoadPtr + ptrInnerBase + ptrOuterBase;
191
-
192
- // printf("loadPtr=%d, innerLoadPtr=%d\n", loadPtr, innerLoadPtr);
193
- // load
194
-
195
199
vec.insert (loadIdx, aie::load_v<loadSize>(inPtr + loadPtr));
196
200
}
197
201
198
- // myprint(vec, true, "beforeShuffle: ");
199
202
if
200
- constexpr (shuffleIsNeeded) {
201
- // printf("We need to do a shuffle\n");
202
- vec = doShuffle (vec, 0 , offsets);
203
- // myprint(vec, true, "afterShuffle: ");
204
- }
203
+ constexpr (shuffleIsNeeded) { vec = doShuffle (vec, 0 , offsets); }
205
204
#pragma unroll((outerDimPerVector))
206
205
for (unsigned outerStoreIdx = 0 ; outerStoreIdx < outerDimPerVector; ++outerStoreIdx) {
207
206
const unsigned storeOuterPtr = outerStoreIdx * outerDimStoreIncr;
208
207
#pragma unroll((std::max(innerDimPerVector / storeSize, (unsigned) 1)))
209
208
for (unsigned innerStoreIdx = 0 ;
210
209
innerStoreIdx < std::max (innerDimPerVector / storeSize, (unsigned )1 ); ++innerStoreIdx) {
211
- // printf("outerStoreIdx=%d, storeOuterPtr=%d, innerStoreIdx=%d, storeInnerPtr=%d\n",
212
- // outerStoreIdx,storeOuterPtr, innerStoreIdx, innerStoreIdx*storeSize);
213
-
214
210
// If we don't shuffle and still load multiple outerDims, then we need to skip over that.
215
211
const unsigned sliceIdx =
216
212
(!shuffleIsNeeded && outerDimPerVector > 1 )
@@ -220,23 +216,12 @@ static void doUnTile(T_D* __restrict inPtr, T_D* outPtr) {
220
216
innerDimIdx * innerDimPerVector +
221
217
outerDimIdx * outerDimPerVector * outerDimStoreIncr;
222
218
223
- // printf("storePtr=%d, sliceIdx=%d\n", storePtr, sliceIdx);
224
-
225
219
// store direct to window
226
220
aie::store_v (outPtr + storePtr, vec.template extract <storeSize>(sliceIdx));
227
221
}
228
222
}
229
223
}
230
224
}
231
-
232
- const unsigned tileSize = (M * N);
233
- // for (unsigned AChunk=0; AChunk<(inRow*inCol); AChunk+=tileSize){
234
- // aie::vector<T_D, tileSize> APost = aie::load_v<tileSize>(outPtr); outPtr += tileSize;
235
- // // //aie::vector<T_D, sizeTileA> A1 = aie::load_v<sizeTileA>(pA1); pA1 += sizeTileA;
236
- // myprint(APost,true,"A0postProc: ");
237
- // // myprint(A1,true,"A1preProc: ");
238
- // //
239
- // }
240
225
}
241
226
242
227
namespace aie = ::aie;
0 commit comments