Skip to content

Commit 5147002

Browse files
authored
Enable AVX-512 in Memmove unrolling (#84348)
1 parent 082c5b7 commit 5147002

File tree

4 files changed

+110
-27
lines changed

4 files changed

+110
-27
lines changed

src/coreclr/jit/codegenxarch.cpp

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2581,12 +2581,8 @@ void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
25812581
regNumber src = genConsumeReg(srcIndir->Addr());
25822582
unsigned size = tree->Size();
25832583

2584-
// TODO-XARCH-AVX512: Consider enabling it here
2585-
unsigned simdSize = (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
2586-
? YMM_REGSIZE_BYTES
2587-
: XMM_REGSIZE_BYTES;
2588-
2589-
if (size >= simdSize)
2584+
const unsigned simdSize = compiler->roundDownSIMDSize(size);
2585+
if ((size >= simdSize) && (simdSize > 0))
25902586
{
25912587
// Number of SIMD regs needed to save the whole src to regs.
25922588
unsigned numberOfSimdRegs = tree->AvailableTempRegCount(RBM_ALLFLOAT);
@@ -2603,33 +2599,37 @@ void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
26032599
}
26042600

26052601
auto emitSimdLoadStore = [&](bool load) {
2606-
unsigned offset = 0;
2607-
int regIndex = 0;
2608-
instruction simdMov = simdUnalignedMovIns();
2602+
unsigned offset = 0;
2603+
int regIndex = 0;
2604+
instruction simdMov = simdUnalignedMovIns();
2605+
unsigned curSimdSize = simdSize;
26092606
do
26102607
{
2608+
assert(curSimdSize >= XMM_REGSIZE_BYTES);
26112609
if (load)
26122610
{
26132611
// vmovdqu ymm, ymmword ptr[src + offset]
2614-
GetEmitter()->emitIns_R_AR(simdMov, EA_ATTR(simdSize), tempRegs[regIndex++], src, offset);
2612+
GetEmitter()->emitIns_R_AR(simdMov, EA_ATTR(curSimdSize), tempRegs[regIndex++], src, offset);
26152613
}
26162614
else
26172615
{
26182616
// vmovdqu ymmword ptr[dst + offset], ymm
2619-
GetEmitter()->emitIns_AR_R(simdMov, EA_ATTR(simdSize), tempRegs[regIndex++], dst, offset);
2617+
GetEmitter()->emitIns_AR_R(simdMov, EA_ATTR(curSimdSize), tempRegs[regIndex++], dst, offset);
26202618
}
2621-
offset += simdSize;
2619+
offset += curSimdSize;
26222620
if (size == offset)
26232621
{
26242622
break;
26252623
}
26262624

2625+
// Overlap with the previously processed data. We'll always use SIMD for simplicity
26272626
assert(size > offset);
2628-
if ((size - offset) < simdSize)
2627+
unsigned remainder = size - offset;
2628+
if (remainder < curSimdSize)
26292629
{
2630-
// Overlap with the previously processed data. We'll always use SIMD for simplicity
2631-
// TODO-CQ: Consider using smaller SIMD reg or GPR for the remainder.
2632-
offset = size - simdSize;
2630+
// Switch to smaller SIMD size if necessary
2631+
curSimdSize = compiler->roundUpSIMDSize(remainder);
2632+
offset = size - curSimdSize;
26332633
}
26342634
} while (true);
26352635
};

src/coreclr/jit/compiler.h

Lines changed: 91 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8950,6 +8950,82 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
89508950
#endif
89518951
}
89528952

8953+
//------------------------------------------------------------------------
8954+
// roundUpSIMDSize: rounds the given size up to the nearest SIMD size
8955+
// available on the target. Examples on XARCH:
8956+
//
8957+
// size: 7 -> XMM
8958+
// size: 30 -> YMM (or XMM if target doesn't support AVX)
8959+
// size: 70 -> ZMM (or YMM or XMM depending on target)
8960+
//
8961+
// Arguments:
8962+
// size - size of the data to process with SIMD
8963+
//
// Return Value:
// The chosen SIMD register size. The result never exceeds
// maxSIMDStructBytes(), so it can be smaller than 'size' when 'size'
// is larger than the widest register the target offers. On ARM64 the
// result is always FP_REGSIZE_BYTES.
//
8964+
// Notes:
8965+
// It's only supposed to be used for scenarios where we can
8966+
// perform an overlapped load/store.
// The result may be larger than 'size', so a single access of the
// returned width covers more bytes than were requested.
8967+
//
8968+
unsigned int roundUpSIMDSize(unsigned size)
8969+
{
8970+
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
8971+
unsigned maxSimdSize = maxSIMDStructBytes();
8972+
assert(maxSimdSize <= ZMM_REGSIZE_BYTES);
8973+
// XMM is returned explicitly only when something wider exists; if XMM is
// already the maximum, the final 'return maxSimdSize' yields the same value.
if (size <= XMM_REGSIZE_BYTES && maxSimdSize > XMM_REGSIZE_BYTES)
8974+
{
8975+
return XMM_REGSIZE_BYTES;
8976+
}
8977+
// Same pattern one step up: YMM is chosen here only when the maximum is wider than YMM.
if (size <= YMM_REGSIZE_BYTES && maxSimdSize > YMM_REGSIZE_BYTES)
8978+
{
8979+
return YMM_REGSIZE_BYTES;
8980+
}
8981+
// Either 'size' needs the widest register, or the widest register is all we have.
return maxSimdSize;
8982+
#elif defined(TARGET_ARM64)
8983+
// Only one SIMD size is expected here (see the assert), so there is nothing to select.
assert(maxSIMDStructBytes() == FP_REGSIZE_BYTES);
8984+
return FP_REGSIZE_BYTES;
8985+
#else
8986+
assert(!"roundUpSIMDSize() unimplemented on target arch");
8987+
unreached();
8988+
#endif
8989+
}
8990+
8991+
//------------------------------------------------------------------------
8992+
// roundDownSIMDSize: rounds the given size down to the nearest SIMD size
8993+
// available on the target. Examples on XARCH:
8994+
//
8995+
// size: 7 -> 0
8996+
// size: 30 -> XMM (not enough for AVX)
8997+
// size: 60 -> YMM (or XMM if target doesn't support AVX)
8998+
// size: 70 -> ZMM, YMM or XMM - the widest size the current target offers
8999+
//
9000+
// Arguments:
9001+
// size - size of the data to process with SIMD
9002+
//
// Return Value:
// The widest SIMD register size that does not exceed 'size' and is
// not wider than maxSIMDStructBytes(), or 0 when 'size' is smaller
// than the smallest SIMD register (XMM_REGSIZE_BYTES on XARCH,
// FP_REGSIZE_BYTES on ARM64). Callers have to handle the 0 case.
//
9003+
unsigned int roundDownSIMDSize(unsigned size)
9004+
{
9005+
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
9006+
unsigned maxSimdSize = maxSIMDStructBytes();
9007+
assert(maxSimdSize <= ZMM_REGSIZE_BYTES);
9008+
if (size >= maxSimdSize)
9009+
{
9010+
// Size is bigger than max SIMD size the current target supports
9011+
return maxSimdSize;
9012+
}
9013+
// From here on 'size' is strictly smaller than the target's maximum SIMD size.
if (size >= YMM_REGSIZE_BYTES && maxSimdSize >= YMM_REGSIZE_BYTES)
9014+
{
9015+
// Size is >= YMM but not enough for ZMM -> YMM
9016+
return YMM_REGSIZE_BYTES;
9017+
}
9018+
// Return 0 if size is even less than XMM, otherwise - XMM
9019+
return size >= XMM_REGSIZE_BYTES ? XMM_REGSIZE_BYTES : 0;
9020+
#elif defined(TARGET_ARM64)
9021+
// Only one SIMD size is expected here (see the assert): it either fits or it does not.
assert(maxSIMDStructBytes() == FP_REGSIZE_BYTES);
9022+
return size >= FP_REGSIZE_BYTES ? FP_REGSIZE_BYTES : 0;
9023+
#else
9024+
assert(!"roundDownSIMDSize() unimplemented on target arch");
9025+
unreached();
9026+
#endif
9027+
}
9028+
89539029
unsigned int minSIMDStructBytes()
89549030
{
89559031
return emitTypeSize(TYP_SIMD8);
@@ -8997,6 +9073,14 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
89979073
{
89989074
return false;
89999075
}
9076+
// Stub variant of roundUpSIMDSize: ignores 'size' and always returns 0.
// NOTE(review): the closing '#endif // FEATURE_SIMD' below suggests this is the
// fallback for builds without FEATURE_SIMD - confirm against the full header.
unsigned int roundUpSIMDSize(unsigned size)
9077+
{
9078+
return 0;
9079+
}
9080+
// Stub variant of roundDownSIMDSize: ignores 'size' and always returns 0.
// NOTE(review): the closing '#endif // FEATURE_SIMD' below suggests this is the
// fallback for builds without FEATURE_SIMD - confirm against the full header.
unsigned int roundDownSIMDSize(unsigned size)
9081+
{
9082+
return 0;
9083+
}
90009084
#endif // FEATURE_SIMD
90019085

90029086
public:
@@ -9027,9 +9111,13 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
90279111
{
90289112
maxRegSize = maxSIMDStructBytes();
90299113
#if defined(TARGET_XARCH)
9030-
// TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial
9031-
maxRegSize = min(maxRegSize, YMM_REGSIZE_BYTES);
9032-
threshold = maxRegSize;
9114+
if (type != UnrollKind::Memmove)
9115+
{
9116+
// TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial.
9117+
// Enabled for Memmove only for now.
9118+
maxRegSize = min(maxRegSize, YMM_REGSIZE_BYTES);
9119+
}
9120+
threshold = maxRegSize;
90339121
#elif defined(TARGET_ARM64)
90349122
// ldp/stp instructions can load/store two 16-byte vectors at once, e.g.:
90359123
//

src/coreclr/jit/emitxarch.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7901,7 +7901,7 @@ void emitter::emitIns_I_ARX(
79017901
void emitter::emitIns_R_ARX(
79027902
instruction ins, emitAttr attr, regNumber reg, regNumber base, regNumber index, unsigned scale, int disp)
79037903
{
7904-
assert(!CodeGen::instIsFP(ins) && (EA_SIZE(attr) <= EA_32BYTE) && (reg != REG_NA));
7904+
assert(!CodeGen::instIsFP(ins) && (EA_SIZE(attr) <= EA_64BYTE) && (reg != REG_NA));
79057905
noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg));
79067906

79077907
if ((ins == INS_lea) && (reg == base) && (index == REG_NA) && (disp == 0))

src/coreclr/jit/lsraxarch.cpp

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1536,13 +1536,8 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
15361536
// Lowering was expected to get rid of memmove in case of zero
15371537
assert(size > 0);
15381538

1539-
// TODO-XARCH-AVX512: Consider enabling it here
1540-
unsigned simdSize =
1541-
(size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
1542-
? YMM_REGSIZE_BYTES
1543-
: XMM_REGSIZE_BYTES;
1544-
1545-
if (size >= simdSize)
1539+
const unsigned simdSize = compiler->roundDownSIMDSize(size);
1540+
if ((size >= simdSize) && (simdSize > 0))
15461541
{
15471542
unsigned simdRegs = size / simdSize;
15481543
if ((size % simdSize) != 0)

0 commit comments

Comments
 (0)