Commit 65889d1

EgorBo, MichalPetryka, and BruceForstall authored

Unroll Buffer.Memmove for arm64 (#83740)

Co-authored-by: Michał Petryka <[email protected]>
Co-authored-by: Bruce Forstall <[email protected]>

1 parent d795694 · commit 65889d1

12 files changed · +442 −35 lines changed

src/coreclr/jit/codegenarmarch.cpp

Lines changed: 137 additions & 1 deletion
@@ -3050,6 +3050,133 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
     }
 }
 
+//------------------------------------------------------------------------
+// genCodeForMemmove: Perform an unrolled memmove. The idea is that we can
+//    ignore the fact that src and dst might overlap if we save the whole
+//    src to temp regs in advance, e.g. for memmove(dst: x1, src: x0, len: 30):
+//
+//       ldr   q16, [x0]
+//       ldr   q17, [x0, #0x0E]
+//       str   q16, [x1]
+//       str   q17, [x1, #0x0E]
+//
+// Arguments:
+//    tree - GenTreeBlk node
+//
+void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
+{
+#ifdef TARGET_ARM64
+    // TODO-CQ: Support addressing modes, for now we don't use them
+    GenTreeIndir* srcIndir = tree->Data()->AsIndir();
+    assert(srcIndir->isContained() && !srcIndir->Addr()->isContained());
+
+    regNumber dst  = genConsumeReg(tree->Addr());
+    regNumber src  = genConsumeReg(srcIndir->Addr());
+    unsigned  size = tree->Size();
+
+    auto emitLoadStore = [&](bool load, unsigned regSize, regNumber tempReg, unsigned offset) {
+        var_types memType;
+        switch (regSize)
+        {
+            case 1:
+                memType = TYP_UBYTE;
+                break;
+            case 2:
+                memType = TYP_USHORT;
+                break;
+            case 4:
+                memType = TYP_INT;
+                break;
+            case 8:
+                memType = TYP_LONG;
+                break;
+            case 16:
+                memType = TYP_SIMD16;
+                break;
+            default:
+                unreached();
+        }
+        if (load)
+        {
+            GetEmitter()->emitIns_R_R_I(ins_Load(memType), emitTypeSize(memType), tempReg, src, offset);
+        }
+        else
+        {
+            GetEmitter()->emitIns_R_R_I(ins_Store(memType), emitTypeSize(memType), tempReg, dst, offset);
+        }
+    };
+
+    // Eventually, we'll emit CPYP+CPYM+CPYE on armv9 for large sizes here.
+
+    // Let's not use stp/ldp here and rely on the underlying peephole optimizations to merge subsequent
+    // ldr/str pairs into stp/ldp, see https://github.com/dotnet/runtime/issues/64815
+    unsigned simdSize = FP_REGSIZE_BYTES;
+    if (size >= simdSize)
+    {
+        // Number of SIMD regs needed to save the whole src to regs.
+        const unsigned numberOfSimdRegs = tree->AvailableTempRegCount(RBM_ALLFLOAT);
+
+        // Pop all temp regs to a local array; currently, this impl is limited by LSRA's MaxInternalCount
+        regNumber tempRegs[LinearScan::MaxInternalCount] = {};
+        for (unsigned i = 0; i < numberOfSimdRegs; i++)
+        {
+            tempRegs[i] = tree->ExtractTempReg(RBM_ALLFLOAT);
+        }
+
+        auto emitSimdLoadStore = [&](bool load) {
+            unsigned offset   = 0;
+            int      regIndex = 0;
+            do
+            {
+                emitLoadStore(load, simdSize, tempRegs[regIndex++], offset);
+                offset += simdSize;
+                if (size == offset)
+                {
+                    break;
+                }
+                if ((size - offset) < simdSize)
+                {
+                    // Overlap with the previously processed data. We'll always use SIMD for simplicity
+                    // TODO-CQ: Consider using smaller SIMD reg or GPR for the remainder.
+                    offset = size - simdSize;
+                }
+            } while (true);
+        };
+
+        // load everything from SRC to temp regs
+        emitSimdLoadStore(/* load */ true);
+        // store them to DST
+        emitSimdLoadStore(/* load */ false);
+    }
+    else
+    {
+        // Here we work with size 1..15
+        assert((size > 0) && (size < FP_REGSIZE_BYTES));
+
+        // Use overlapping loads/stores, e.g. for size == 9: "ldr x2, [x0]; ldr x3, [x0, #0x01]".
+        const unsigned loadStoreSize = 1 << BitOperations::Log2(size);
+        if (loadStoreSize == size)
+        {
+            const regNumber tmpReg = tree->GetSingleTempReg(RBM_ALLINT);
+            emitLoadStore(/* load */ true, loadStoreSize, tmpReg, 0);
+            emitLoadStore(/* load */ false, loadStoreSize, tmpReg, 0);
+        }
+        else
+        {
+            assert(tree->AvailableTempRegCount() == 2);
+            const regNumber tmpReg1 = tree->ExtractTempReg(RBM_ALLINT);
+            const regNumber tmpReg2 = tree->ExtractTempReg(RBM_ALLINT);
+            emitLoadStore(/* load */ true, loadStoreSize, tmpReg1, 0);
+            emitLoadStore(/* load */ true, loadStoreSize, tmpReg2, size - loadStoreSize);
+            emitLoadStore(/* load */ false, loadStoreSize, tmpReg1, 0);
+            emitLoadStore(/* load */ false, loadStoreSize, tmpReg2, size - loadStoreSize);
+        }
+    }
+#else // TARGET_ARM64
+    unreached();
+#endif
+}
+
 //------------------------------------------------------------------------
 // genCodeForInitBlkHelper - Generate code for an InitBlk node by the means of the VM memcpy helper call
 //
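The safety argument behind the new function, restated outside JIT terms: every load from src is emitted before the first store to dst, so overlap between the buffers can be ignored, and a trailing partial chunk is handled by backing the last access up so it ends exactly at size. Below is a minimal C++ model of the SIMD path (16 <= size <= 64, matching the arm64 memmove unroll threshold); the function name, the local arrays standing in for q-registers, and the fixed cap of four chunks are illustrative, not part of the commit:

```cpp
#include <cassert>
#include <cstring>

// Model of the unrolled memmove: buffer the ENTIRE source in "temp regs"
// (local arrays) before the first store, so an overlapping dst cannot
// clobber source bytes that have not been read yet.
void memmoveUnrolledModel(unsigned char* dst, const unsigned char* src, unsigned size)
{
    const unsigned simdSize = 16; // stand-in for FP_REGSIZE_BYTES
    assert((size >= simdSize) && (size <= 4 * simdSize));

    unsigned char tempRegs[4][16]; // stand-ins for q16, q17, ...
    unsigned      offsets[4];
    unsigned      used = 0;

    for (unsigned offset = 0;;) // load phase: "ldr qN, [x0, #offset]"
    {
        std::memcpy(tempRegs[used], src + offset, simdSize);
        offsets[used++] = offset;
        offset += simdSize;
        if (offset == size)
        {
            break;
        }
        if ((size - offset) < simdSize)
        {
            offset = size - simdSize; // overlap the remainder with prior data
        }
    }
    for (unsigned i = 0; i < used; i++) // store phase: "str qN, [x1, #offset]"
    {
        std::memcpy(dst + offsets[i], tempRegs[i], simdSize);
    }
}
```

For size == 30 this performs exactly the two overlapping 16-byte loads and stores shown in the function header comment above.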
@@ -4370,13 +4497,22 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* blkOp)
             break;
 
         case GenTreeBlk::BlkOpKindUnroll:
+        case GenTreeBlk::BlkOpKindUnrollMemmove:
             if (isCopyBlk)
             {
                 if (blkOp->gtBlkOpGcUnsafe)
                 {
                     GetEmitter()->emitDisableGC();
                 }
-                genCodeForCpBlkUnroll(blkOp);
+                if (blkOp->gtBlkOpKind == GenTreeBlk::BlkOpKindUnroll)
+                {
+                    genCodeForCpBlkUnroll(blkOp);
+                }
+                else
+                {
+                    assert(blkOp->gtBlkOpKind == GenTreeBlk::BlkOpKindUnrollMemmove);
+                    genCodeForMemmove(blkOp);
+                }
                 if (blkOp->gtBlkOpGcUnsafe)
                 {
                     GetEmitter()->emitEnableGC();

src/coreclr/jit/codegenxarch.cpp

Lines changed: 6 additions & 6 deletions
@@ -2556,8 +2556,8 @@ void CodeGen::genStackPointerDynamicAdjustmentWithProbe(regNumber regSpDelta)
 
 //------------------------------------------------------------------------
 // genCodeForMemmove: Perform an unrolled memmove. The idea is that we can
-//    ignore the fact that dst and src might overlap if we save the whole
-//    dst to temp regs in advance, e.g. for memmove(rax, rcx, 120):
+//    ignore the fact that src and dst might overlap if we save the whole
+//    src to temp regs in advance, e.g. for memmove(dst: rcx, src: rax, len: 120):
 //
 //       vmovdqu  ymm0, ymmword ptr[rax +  0]
 //       vmovdqu  ymm1, ymmword ptr[rax + 32]
@@ -2598,7 +2598,7 @@ void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
     // temporary SIMD registers to fully load the source and avoid any potential issues with overlap.
     assert(numberOfSimdRegs * simdSize >= size);
 
-    // Pop all temp regs to a local array, currently, this impl is limitted with LSRA's MaxInternalCount
+    // Pop all temp regs to a local array; currently, this impl is limited by LSRA's MaxInternalCount
     regNumber tempRegs[LinearScan::MaxInternalCount] = {};
     for (unsigned i = 0; i < numberOfSimdRegs; i++)
     {
@@ -2630,7 +2630,7 @@ void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
         assert(size > offset);
         if ((size - offset) < simdSize)
         {
-            // Overlap with the previosly processed data. We'll always use SIMD for that for simplicity
+            // Overlap with the previously processed data. We'll always use SIMD for simplicity
             // TODO-CQ: Consider using smaller SIMD reg or GPR for the remainder.
             offset = size - simdSize;
         }
@@ -3285,7 +3285,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
 
     size -= bytesWritten;
 
-    // Handle the remainder by overlapping with previosly processed data (only for zeroing)
+    // Handle the remainder by overlapping with previously processed data (only for zeroing)
     if (zeroing && (size > 0) && (size < regSize) && (regSize >= XMM_REGSIZE_BYTES))
     {
         if (isPow2(size) && (size <= REGSIZE_BYTES))
@@ -3550,7 +3550,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
 
     assert((size >= 0) && (size < regSize));
 
-    // Handle the remainder by overlapping with previosly processed data
+    // Handle the remainder by overlapping with previously processed data
     if ((size > 0) && (size < regSize))
     {
         assert(regSize >= XMM_REGSIZE_BYTES);
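The "overlap with previously processed data" trick these comment fixes refer to: instead of emitting narrower loads/stores for the tail, the last full-width copy is shifted back so it ends exactly at size, re-copying a few already-written bytes. A self-contained C++ sketch, assuming dst and src do not overlap (as the CpBlk/InitBlk unroll paths may assume, unlike memmove); the helper name is hypothetical:

```cpp
#include <cassert>
#include <cstddef>
#include <cstring>

// Copy 'size' bytes in full regSize-wide chunks; the final chunk is backed
// up so it ends at 'size', overlapping data that was already copied.
void copyWithOverlappingTail(unsigned char* dst, const unsigned char* src,
                             size_t size, size_t regSize)
{
    assert(size >= regSize); // smaller sizes take a different path
    size_t offset = 0;
    while ((size - offset) >= regSize)
    {
        std::memcpy(dst + offset, src + offset, regSize); // one SIMD load+store
        offset += regSize;
    }
    if (offset < size)
    {
        // e.g. size = 40, regSize = 32: this second copy covers [8, 40)
        std::memcpy(dst + (size - regSize), src + (size - regSize), regSize);
    }
}
```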

src/coreclr/jit/compiler.h

Lines changed: 17 additions & 10 deletions
@@ -8941,22 +8941,24 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 //
 unsigned int getUnrollThreshold(UnrollKind type, bool canUseSimd = true)
 {
-    unsigned threshold = TARGET_POINTER_SIZE;
+    unsigned maxRegSize = REGSIZE_BYTES;
+    unsigned threshold  = maxRegSize;
 
 #if defined(FEATURE_SIMD)
     if (canUseSimd)
     {
-        threshold = maxSIMDStructBytes();
-#if defined(TARGET_ARM64)
+        maxRegSize = maxSIMDStructBytes();
+#if defined(TARGET_XARCH)
+        // TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial
+        maxRegSize = min(maxRegSize, YMM_REGSIZE_BYTES);
+        threshold  = maxRegSize;
+#elif defined(TARGET_ARM64)
         // ldp/stp instructions can load/store two 16-byte vectors at once, e.g.:
         //
         //   ldp q0, q1, [x1]
         //   stp q0, q1, [x0]
         //
-        threshold *= 2;
-#elif defined(TARGET_XARCH)
-        // TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial
-        threshold = min(threshold, YMM_REGSIZE_BYTES);
+        threshold = maxRegSize * 2;
 #endif
     }
 #if defined(TARGET_XARCH)
@@ -8987,12 +8989,17 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
     // | arm         |  32  |  16  | no SIMD support
     // | loongarch64 |  64  |  32  | no SIMD support
     //
-    // We might want to use a different multiplier for trully hot/cold blocks based on PGO data
+    // We might want to use a different multiplier for truly hot/cold blocks based on PGO data
     //
     threshold *= 4;
 
-    // NOTE: Memmove's unrolling is currently limitted with LSRA -
-    // up to LinearScan::MaxInternalCount number of temp regs, e.g. 5*32=160 bytes for AVX cpu.
+    if (type == UnrollKind::Memmove)
+    {
+        // NOTE: Memmove's unrolling is currently limited by LSRA -
+        // up to LinearScan::MaxInternalCount number of temp regs, e.g. 5*16=80 bytes on arm64
+        threshold = maxRegSize * 4;
+    }
+
     return threshold;
 }
 
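Plugging in the numbers makes the new maxRegSize/threshold split concrete: on arm64 with SIMD, maxRegSize = 16, doubled to 32 because ldp/stp move two vectors at once, then multiplied by 4 for hot blocks, giving 128 bytes for memcpy-style unrolling; memmove is instead capped at maxRegSize * 4 = 64 bytes so the whole source fits in LSRA's at-most-five internal registers (5 * 16 = 80). A hypothetical standalone model with the constants inlined (not the JIT's actual API):

```cpp
// Model of getUnrollThreshold for arm64 with SIMD enabled.
unsigned unrollThresholdArm64Model(bool isMemmove)
{
    const unsigned maxRegSize = 16;      // one 16-byte SIMD register
    unsigned threshold = maxRegSize * 2; // ldp/stp pair two vectors: 32

    threshold *= 4;                      // hot-block multiplier: 128 bytes

    if (isMemmove)
    {
        // Memmove must buffer the whole source in temp regs; LSRA caps the
        // internal count, so the threshold is 4 * 16 = 64 bytes.
        threshold = maxRegSize * 4;
    }
    return threshold; // 128 for memcpy-like unrolls, 64 for memmove
}
```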

src/coreclr/jit/gentree.cpp

Lines changed: 52 additions & 0 deletions
@@ -1419,6 +1419,27 @@ bool CallArg::IsArgAddedLate() const
     }
 }
 
+//---------------------------------------------------------------
+// IsUserArg: Check if this is an argument that can be treated as
+//   user-defined (in IL).
+//
+// Remarks:
+//   "this" and ShiftLow/ShiftHigh are recognized as user-defined
+//
+bool CallArg::IsUserArg() const
+{
+    switch (static_cast<WellKnownArg>(m_wellKnownArg))
+    {
+        case WellKnownArg::None:
+        case WellKnownArg::ShiftLow:
+        case WellKnownArg::ShiftHigh:
+        case WellKnownArg::ThisPointer:
+            return true;
+        default:
+            return false;
+    }
+}
+
 #ifdef DEBUG
 //---------------------------------------------------------------
 // CheckIsStruct: Verify that the struct ABI information is consistent with the IR node.
@@ -1603,6 +1624,37 @@ CallArg* CallArgs::GetArgByIndex(unsigned index)
     return cur;
 }
 
+//---------------------------------------------------------------
+// GetUserArgByIndex: Get an argument with the specified index.
+//   Unlike GetArgByIndex, this function ignores non-user args
+//   like r2r cells.
+//
+// Parameters:
+//   index - The index of the argument to find.
+//
+// Returns:
+//   A pointer to the argument.
+//
+// Remarks:
+//   This function assumes enough arguments exist. Also, see IsUserArg's
+//   comments
+//
+CallArg* CallArgs::GetUserArgByIndex(unsigned index)
+{
+    CallArg* cur = m_head;
+    assert((cur != nullptr) && "Not enough user arguments in GetUserArgByIndex");
+    for (unsigned i = 0; i < index || !cur->IsUserArg();)
+    {
+        if (cur->IsUserArg())
+        {
+            i++;
+        }
+        cur = cur->GetNext();
+        assert((cur != nullptr) && "Not enough user arguments in GetUserArgByIndex");
+    }
+    return cur;
+}
+
 //---------------------------------------------------------------
 // GetIndex: Get the index for the specified argument.
 //
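GetUserArgByIndex indexes only the arguments IsUserArg accepts, walking past JIT-added ones such as r2r cells. A standalone C++ model of the same counting loop over a plain array; the Arg struct and function name are hypothetical:

```cpp
#include <cassert>
#include <cstddef>

struct Arg
{
    bool isUserArg; // true for user-visible args ("this", IL args, ...)
};

// Return the index-th argument among user args only, skipping the rest.
const Arg* getUserArgByIndexModel(const Arg* args, size_t count, unsigned index)
{
    unsigned seen = 0;
    for (size_t i = 0; i < count; i++)
    {
        if (args[i].isUserArg && (seen++ == index))
        {
            return &args[i];
        }
    }
    assert(!"Not enough user arguments");
    return nullptr;
}
```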

src/coreclr/jit/gentree.h

Lines changed: 4 additions & 0 deletions
@@ -4644,6 +4644,8 @@ class CallArg
 
     bool IsArgAddedLate() const;
 
+    bool IsUserArg() const;
+
 #ifdef DEBUG
     void Dump(Compiler* comp);
     // Check that the value of 'AbiInfo.IsStruct' is consistent.
@@ -4704,6 +4706,7 @@ class CallArgs
     CallArg* GetThisArg();
     CallArg* GetRetBufferArg();
     CallArg* GetArgByIndex(unsigned index);
+    CallArg* GetUserArgByIndex(unsigned index);
     unsigned GetIndex(CallArg* arg);
 
     bool IsEmpty() const
@@ -4772,6 +4775,7 @@ class CallArgs
     unsigned OutgoingArgsStackSize() const;
 
     unsigned CountArgs();
+    unsigned CountUserArgs();
 
     template <CallArg* (CallArg::*Next)()>
     class CallArgIterator
