Commit 1204c74

SDA USR (sdausr) authored and GitHub Enterprise committed

Squashed 'dsp' changes from 52e7cdc..b62ae54 (#713)

b62ae54 Merge pull request #466 from RepoOps/update_makefile_20220927-234745
a62c9e9 update Makefile
2659ee2 Merge pull request #463 from uvimalku/main
26cddc6 fix vector size matmul
19bdd68 fix vector sizes for matrix_mult untiler

Co-authored-by: sdausr <[email protected]>

1 parent 9a01c38 · commit 1204c74

File tree

1 file changed: dsp/L1/src/aie/matrix_mult_untiler.cpp (+22 −37 lines)
@@ -35,6 +35,8 @@
 #define COL_MAJOR 1
 #endif
 
+//#define MATMUL_DEBUG
+
 namespace xf {
 namespace dsp {
 namespace aie {
@@ -94,13 +96,28 @@ constexpr loHi getUnTileShuffleOffsetsInt16(unsigned M, unsigned N, unsigned vec
     loHi ret = {.lo = offLo, .hi = offHi};
     return ret;
 }
+
+template <typename T_D, unsigned inRow, unsigned inCol>
+static constexpr int getVecSize() {
+    constexpr unsigned minVBuffSizeforType = (512 / 8) / sizeof(T_D); // not sure why this is 512 bits?
+    if
+        constexpr(minVBuffSizeforType > (inRow * inCol)) { return inRow * inCol; }
+    else if
+        constexpr(inCol % minVBuffSizeforType == 0 || minVBuffSizeforType > inCol) { return minVBuffSizeforType; }
+    else {
+        int vSize = minVBuffSizeforType;
+        while (inCol % vSize != 0) {
+            vSize /= 2;
+        }
+        return vSize;
+    }
+}
+
 template <unsigned M, unsigned N, unsigned inRow, unsigned inCol, unsigned leadingDim, typename T_D>
 static void doUnTile(T_D* __restrict inPtr, T_D* outPtr) {
     constexpr unsigned minGranularity = (128 / 8) / sizeof(T_D);
     constexpr unsigned loadSize = (N >= minGranularity) ? N : minGranularity;
-    constexpr unsigned minVBuffSizeforType = (512 / 8) / sizeof(T_D);
-    constexpr unsigned vectorSize = (minVBuffSizeforType > (inRow * inCol)) ? (inRow * inCol) : minVBuffSizeforType;
-
+    constexpr unsigned vectorSize = getVecSize<T_D, inRow, inCol>();
     // static_assert(N >= minGranularity, "Granularity is awkward");
     static_assert(vectorSize <= (1024 / 8) / sizeof(T_D), "calculated vector size too large for vector register.");
     static_assert(!(leadingDim == COL_MAJOR && std::is_same_v<T_D, int16>),
@@ -110,9 +127,6 @@ static void doUnTile(T_D* __restrict inPtr, T_D* outPtr) {
     loHi offsets = std::is_same_v<T_D, int16> ? getUnTileShuffleOffsetsInt16(M, N, vectorSize, leadingDim)
                                               : getUnTileShuffleOffsets(M, N, vectorSize, leadingDim);
 
-    // printf("M: %d, N: %d, vectorSize: %d, loadSize: %d, leadingDim: %d\n", M, N, vectorSize, loadSize, leadingDim);
-    // printf("Offsets: lo : %0X, hi: %0X\n", offsets.lo, offsets.hi);
-
     const unsigned loadsPerVector = vectorSize / loadSize;
     const unsigned tilesPerVector = vectorSize / (M * N);
     const unsigned colsPerLoad =
@@ -129,8 +143,7 @@ static void doUnTile(T_D* __restrict inPtr, T_D* outPtr) {
 
     const unsigned vectorsPerCol = inRow / rowsPerVector;
     const unsigned vectorsPerRow = inCol / colsPerVector;
-    // printf("colsPerLoad: %d, rowsPerLoad: %d, colsPerVector: %d, rowsPerVector: %d, vectorsPerCol: %d, vectorsPerRow:
-    // %d\n",colsPerLoad, rowsPerLoad, colsPerVector, rowsPerVector, vectorsPerCol, vectorsPerRow );
+
     // Loop through a row first if row major
     const unsigned outerLoopCount = (leadingDim == ROW_MAJOR) ? vectorsPerCol : vectorsPerRow;
     const unsigned innerLoopCount = (leadingDim == ROW_MAJOR) ? vectorsPerRow : vectorsPerCol;
@@ -145,9 +158,6 @@ static void doUnTile(T_D* __restrict inPtr, T_D* outPtr) {
     const unsigned outerDimStoreIncr = (leadingDim == ROW_MAJOR) ? inCol : inRow;
     const unsigned innerDimStoreIncr = storeSize;
 
-    // printf("outerLoopCount: %d, innerLoopCount: %d, outerDimPerVector: %d, innerDimPerVector: %d, storeSize: %d \n",
-    // outerLoopCount, innerLoopCount, outerDimPerVector, innerDimPerVector, storeSize);
-
     const bool shuffleIsNeeded = (leadingDim == COL_MAJOR) || ((leadingDim == ROW_MAJOR) && (loadSize > N));
 
     for (unsigned outerDimIdx = 0; outerDimIdx < outerLoopCount; ++outerDimIdx)
@@ -161,8 +171,6 @@ static void doUnTile(T_D* __restrict inPtr, T_D* outPtr) {
         for (unsigned innerDimIdx = 0; innerDimIdx < innerLoopCount; ++innerDimIdx)
             chess_loop_count((innerLoopCount)) chess_prepare_for_pipelining {
                 const unsigned ptrInnerBase = innerDimIdx * innerLoopIncr;
-                // printf("outerDimIdx: %d, ptrOuterBase: %d, innerDimIdx: %d, ptrInnerBase: %d\n",outerDimIdx,
-                // ptrOuterBase, innerDimIdx, ptrInnerBase);
 
                 aie::vector<T_D, vectorSize> vec;
 
@@ -188,29 +196,17 @@ static void doUnTile(T_D* __restrict inPtr, T_D* outPtr) {
                             loadSize * loadIdx; // unlikely
 
                     const unsigned loadPtr = innerLoadPtr + ptrInnerBase + ptrOuterBase;
-
-                    // printf("loadPtr=%d, innerLoadPtr=%d\n", loadPtr, innerLoadPtr);
-                    // load
-
                     vec.insert(loadIdx, aie::load_v<loadSize>(inPtr + loadPtr));
                 }
 
-                // myprint(vec, true, "beforeShuffle: ");
                 if
-                    constexpr(shuffleIsNeeded) {
-                    // printf("We need to do a shuffle\n");
-                    vec = doShuffle(vec, 0, offsets);
-                    // myprint(vec, true, "afterShuffle: ");
-                }
+                    constexpr(shuffleIsNeeded) { vec = doShuffle(vec, 0, offsets); }
 #pragma unroll((outerDimPerVector))
                 for (unsigned outerStoreIdx = 0; outerStoreIdx < outerDimPerVector; ++outerStoreIdx) {
                     const unsigned storeOuterPtr = outerStoreIdx * outerDimStoreIncr;
 #pragma unroll((std::max(innerDimPerVector / storeSize, (unsigned) 1)))
                     for (unsigned innerStoreIdx = 0;
                          innerStoreIdx < std::max(innerDimPerVector / storeSize, (unsigned)1); ++innerStoreIdx) {
-                        // printf("outerStoreIdx=%d, storeOuterPtr=%d, innerStoreIdx=%d, storeInnerPtr=%d\n",
-                        // outerStoreIdx,storeOuterPtr, innerStoreIdx, innerStoreIdx*storeSize);
-
                         // If we don't shuffle and still load multiple outerDims, then we need to skip over that.
                         const unsigned sliceIdx =
                             (!shuffleIsNeeded && outerDimPerVector > 1)
@@ -220,23 +216,12 @@ static void doUnTile(T_D* __restrict inPtr, T_D* outPtr) {
                             innerDimIdx * innerDimPerVector +
                             outerDimIdx * outerDimPerVector * outerDimStoreIncr;
 
-                        // printf("storePtr=%d, sliceIdx=%d\n", storePtr, sliceIdx);
-
                         // store direct to window
                         aie::store_v(outPtr + storePtr, vec.template extract<storeSize>(sliceIdx));
                     }
                 }
             }
         }
-
-    const unsigned tileSize = (M * N);
-    // for (unsigned AChunk=0; AChunk<(inRow*inCol); AChunk+=tileSize){
-    //     aie::vector<T_D, tileSize> APost = aie::load_v<tileSize>(outPtr); outPtr += tileSize;
-    ////     //aie::vector<T_D, sizeTileA> A1 = aie::load_v<sizeTileA>(pA1); pA1 += sizeTileA;
-    //     myprint(APost,true,"A0postProc: ");
-    ////     myprint(A1,true,"A1preProc: ");
-    ////
-    //}
 }
 
 namespace aie = ::aie;

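The functional change in this commit is the compile-time vector-size selection at the top of doUnTile(). Previously, vectorSize was simply the 512-bit buffer width (in elements) clamped to the matrix size, with no check that it divides the untiled row length; the new getVecSize() helper keeps halving the candidate width until it divides inCol. The sketch below is a standalone, host-compilable restatement of the new rule next to the removed one-line rule, included only to illustrate the difference. The helper names getVecSizeNew/getVecSizeOld and the int16, 16 x 48 example shape are assumptions chosen here for demonstration; they are not taken from the library or its tests.

// Standalone sketch (plain C++17, no AIE intrinsics) contrasting the old and new
// compile-time vector-size rules from matrix_mult_untiler.cpp.
#include <cstdint>
#include <cstdio>

// New rule (mirrors the getVecSize() added in this commit): start from the 512-bit
// buffer width in elements and halve until it evenly divides the untiled row length.
template <typename T_D, unsigned inRow, unsigned inCol>
static constexpr int getVecSizeNew() {
    constexpr unsigned minVBuffSizeforType = (512 / 8) / sizeof(T_D);
    if constexpr (minVBuffSizeforType > (inRow * inCol)) {
        return inRow * inCol; // whole matrix fits in a single vector
    } else if constexpr (inCol % minVBuffSizeforType == 0 || minVBuffSizeforType > inCol) {
        return minVBuffSizeforType; // default width already lines up with inCol
    } else {
        int vSize = minVBuffSizeforType;
        while (inCol % vSize != 0) {
            vSize /= 2; // shrink until vSize divides inCol
        }
        return vSize;
    }
}

// Old rule (the two lines removed in this commit): clamp to the matrix size only,
// with no divisibility check against inCol.
template <typename T_D, unsigned inRow, unsigned inCol>
static constexpr int getVecSizeOld() {
    constexpr unsigned minVBuffSizeforType = (512 / 8) / sizeof(T_D);
    return (minVBuffSizeforType > (inRow * inCol)) ? (inRow * inCol) : minVBuffSizeforType;
}

int main() {
    // int16 data: 512 bits / 16 bits = 32 lanes. With an assumed 16 x 48 input,
    // 48 is not a multiple of 32, so the old rule picks 32 (which does not divide
    // the 48-element rows), while the new rule halves down to 16, which does.
    std::printf("old vectorSize = %d\n", getVecSizeOld<std::int16_t, 16, 48>()); // prints 32
    std::printf("new vectorSize = %d\n", getVecSizeNew<std::int16_t, 16, 48>()); // prints 16
    return 0;
}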