Skip to content

Enable AVX-512 in Memmove unrolling #84348

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Apr 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 16 additions & 16 deletions src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2581,12 +2581,8 @@ void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
regNumber src = genConsumeReg(srcIndir->Addr());
unsigned size = tree->Size();

// TODO-XARCH-AVX512: Consider enabling it here
unsigned simdSize = (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
? YMM_REGSIZE_BYTES
: XMM_REGSIZE_BYTES;

if (size >= simdSize)
const unsigned simdSize = compiler->roundDownSIMDSize(size);
if ((size >= simdSize) && (simdSize > 0))
{
// Number of SIMD regs needed to save the whole src to regs.
unsigned numberOfSimdRegs = tree->AvailableTempRegCount(RBM_ALLFLOAT);
Expand All @@ -2603,33 +2599,37 @@ void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
}

auto emitSimdLoadStore = [&](bool load) {
unsigned offset = 0;
int regIndex = 0;
instruction simdMov = simdUnalignedMovIns();
unsigned offset = 0;
int regIndex = 0;
instruction simdMov = simdUnalignedMovIns();
unsigned curSimdSize = simdSize;
do
{
assert(curSimdSize >= XMM_REGSIZE_BYTES);
if (load)
{
// vmovdqu ymm, ymmword ptr[src + offset]
GetEmitter()->emitIns_R_AR(simdMov, EA_ATTR(simdSize), tempRegs[regIndex++], src, offset);
GetEmitter()->emitIns_R_AR(simdMov, EA_ATTR(curSimdSize), tempRegs[regIndex++], src, offset);
}
else
{
// vmovdqu ymmword ptr[dst + offset], ymm
GetEmitter()->emitIns_AR_R(simdMov, EA_ATTR(simdSize), tempRegs[regIndex++], dst, offset);
GetEmitter()->emitIns_AR_R(simdMov, EA_ATTR(curSimdSize), tempRegs[regIndex++], dst, offset);
}
offset += simdSize;
offset += curSimdSize;
if (size == offset)
{
break;
}

// Overlap with the previously processed data. We'll always use SIMD for simplicity
assert(size > offset);
if ((size - offset) < simdSize)
unsigned remainder = size - offset;
if (remainder < curSimdSize)
{
// Overlap with the previously processed data. We'll always use SIMD for simplicity
// TODO-CQ: Consider using smaller SIMD reg or GPR for the remainder.
offset = size - simdSize;
// Switch to smaller SIMD size if necessary
curSimdSize = compiler->roundUpSIMDSize(remainder);
offset = size - curSimdSize;
}
} while (true);
};
Expand Down
94 changes: 91 additions & 3 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -8908,6 +8908,82 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#endif
}

//------------------------------------------------------------------------
// roundUpSIMDSize: rounds the given size up to the nearest SIMD register
//    size available on the target. Examples on XARCH:
//
//    size: 7  -> XMM
//    size: 30 -> YMM (or XMM if the target doesn't support AVX)
//    size: 70 -> ZMM (or YMM or XMM depending on the target)
//
// Arguments:
//    size - size of the data to process with SIMD
//
// Return Value:
//    The register size, in bytes, to use for the data. When 'size' is
//    larger than the widest available register, the widest size is
//    returned.
//
// Notes:
//    It's only supposed to be used for scenarios where we can
//    perform an overlapped load/store (the returned size may exceed
//    'size', so the tail must be allowed to overlap already-processed
//    data).
//
unsigned int roundUpSIMDSize(unsigned size)
{
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
    const unsigned maxSimdSize = maxSIMDStructBytes();
    assert(maxSimdSize <= ZMM_REGSIZE_BYTES);
    if ((maxSimdSize > XMM_REGSIZE_BYTES) && (size <= XMM_REGSIZE_BYTES))
    {
        // Fits in (or under) an XMM register, and wider registers exist.
        return XMM_REGSIZE_BYTES;
    }
    if ((maxSimdSize > YMM_REGSIZE_BYTES) && (size <= YMM_REGSIZE_BYTES))
    {
        // Fits in (or under) a YMM register, and ZMM is available.
        return YMM_REGSIZE_BYTES;
    }
    // Either the data needs the widest register, or nothing wider exists.
    return maxSimdSize;
#elif defined(TARGET_ARM64)
    // ARM64 exposes a single SIMD register size.
    assert(maxSIMDStructBytes() == FP_REGSIZE_BYTES);
    return FP_REGSIZE_BYTES;
#else
    assert(!"roundUpSIMDSize() unimplemented on target arch");
    unreached();
#endif
}

//------------------------------------------------------------------------
// roundDownSIMDSize: rounds the given size down to the nearest SIMD register
//    size available on the target. Examples on XARCH:
//
//    size: 7  -> 0 (too small even for XMM)
//    size: 30 -> XMM (not enough for AVX)
//    size: 60 -> YMM (or XMM if the target doesn't support AVX)
//    size: 70 -> ZMM/YMM/XMM, whatever the current system can offer
//
// Arguments:
//    size - size of the data to process with SIMD
//
// Return Value:
//    The largest available register size not exceeding 'size', or 0 when
//    'size' is smaller than the smallest SIMD register.
//
unsigned int roundDownSIMDSize(unsigned size)
{
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
    const unsigned maxSimdSize = maxSIMDStructBytes();
    assert(maxSimdSize <= ZMM_REGSIZE_BYTES);
    if (size >= maxSimdSize)
    {
        // The data is at least as large as the widest register available.
        return maxSimdSize;
    }
    if ((maxSimdSize >= YMM_REGSIZE_BYTES) && (size >= YMM_REGSIZE_BYTES))
    {
        // Large enough for YMM, but not enough for ZMM.
        return YMM_REGSIZE_BYTES;
    }
    if (size >= XMM_REGSIZE_BYTES)
    {
        return XMM_REGSIZE_BYTES;
    }
    // Too small for any SIMD register.
    return 0;
#elif defined(TARGET_ARM64)
    // ARM64 exposes a single SIMD register size.
    assert(maxSIMDStructBytes() == FP_REGSIZE_BYTES);
    return (size >= FP_REGSIZE_BYTES) ? FP_REGSIZE_BYTES : 0;
#else
    assert(!"roundDownSIMDSize() unimplemented on target arch");
    unreached();
#endif
}

unsigned int minSIMDStructBytes()
{
return emitTypeSize(TYP_SIMD8);
Expand Down Expand Up @@ -8955,6 +9031,14 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
{
return false;
}
// FEATURE_SIMD is disabled: no SIMD register size is available, report 0.
unsigned int roundUpSIMDSize(unsigned size)
{
    return 0;
}
// FEATURE_SIMD is disabled: no SIMD register size is available, report 0.
unsigned int roundDownSIMDSize(unsigned size)
{
    return 0;
}
#endif // FEATURE_SIMD

public:
Expand Down Expand Up @@ -8985,9 +9069,13 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
{
maxRegSize = maxSIMDStructBytes();
#if defined(TARGET_XARCH)
// TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial
maxRegSize = min(maxRegSize, YMM_REGSIZE_BYTES);
threshold = maxRegSize;
if (type != UnrollKind::Memmove)
{
// TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial.
// Enabled for Memmove only for now.
maxRegSize = min(maxRegSize, YMM_REGSIZE_BYTES);
}
threshold = maxRegSize;
#elif defined(TARGET_ARM64)
// ldp/stp instructions can load/store two 16-byte vectors at once, e.g.:
//
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/jit/emitxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7814,7 +7814,7 @@ void emitter::emitIns_I_ARX(
void emitter::emitIns_R_ARX(
instruction ins, emitAttr attr, regNumber reg, regNumber base, regNumber index, unsigned scale, int disp)
{
assert(!CodeGen::instIsFP(ins) && (EA_SIZE(attr) <= EA_32BYTE) && (reg != REG_NA));
assert(!CodeGen::instIsFP(ins) && (EA_SIZE(attr) <= EA_64BYTE) && (reg != REG_NA));
noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg));

if ((ins == INS_lea) && (reg == base) && (index == REG_NA) && (disp == 0))
Expand Down
9 changes: 2 additions & 7 deletions src/coreclr/jit/lsraxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1536,13 +1536,8 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
// Lowering was expected to get rid of memmove in case of zero
assert(size > 0);

// TODO-XARCH-AVX512: Consider enabling it here
unsigned simdSize =
(size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
? YMM_REGSIZE_BYTES
: XMM_REGSIZE_BYTES;

if (size >= simdSize)
const unsigned simdSize = compiler->roundDownSIMDSize(size);
if ((size >= simdSize) && (simdSize > 0))
{
unsigned simdRegs = size / simdSize;
if ((size % simdSize) != 0)
Expand Down