Skip to content

Commit 59a17b1

Browse files
EgorBo, SingleAccretion, jakobbotsch
authored
Unroll Buffer.Memmove for constant lengths (#83638)
Co-authored-by: SingleAccretion <[email protected]> Co-authored-by: Jakob Botsch Nielsen <[email protected]>
1 parent edb161a commit 59a17b1

File tree

15 files changed

+499
-12
lines changed

15 files changed

+499
-12
lines changed

src/coreclr/jit/codegen.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1270,6 +1270,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
12701270
#endif // !TARGET_XARCH
12711271

12721272
void genLclHeap(GenTree* tree);
1273+
void genCodeForMemmove(GenTreeBlk* tree);
12731274

12741275
bool genIsRegCandidateLocal(GenTree* tree)
12751276
{

src/coreclr/jit/codegenxarch.cpp

Lines changed: 156 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2554,6 +2554,152 @@ void CodeGen::genStackPointerDynamicAdjustmentWithProbe(regNumber regSpDelta)
25542554
inst_Mov(TYP_I_IMPL, REG_SPBASE, regSpDelta, /* canSkip */ false);
25552555
}
25562556

2557+
//------------------------------------------------------------------------
// genCodeForMemmove: Perform an unrolled memmove. The idea is that we can
//    ignore the fact that dst and src might overlap if we save the whole
//    src to temp regs in advance, e.g. for memmove(dst=rax, src=rcx, 120):
//
//       vmovdqu  ymm0, ymmword ptr[rcx +  0]
//       vmovdqu  ymm1, ymmword ptr[rcx + 32]
//       vmovdqu  ymm2, ymmword ptr[rcx + 64]
//       vmovdqu  ymm3, ymmword ptr[rcx + 88]  ; 88 = 120 - 32: last load overlaps the previous one
//       vmovdqu  ymmword ptr[rax +  0], ymm0
//       vmovdqu  ymmword ptr[rax + 32], ymm1
//       vmovdqu  ymmword ptr[rax + 64], ymm2
//       vmovdqu  ymmword ptr[rax + 88], ymm3
//
// Arguments:
//    tree - GenTreeBlk node (STORE_BLK marked BlkOpKindUnrollMemmove by Lowering)
//
void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
{
    // Not yet finished for x86
    assert(TARGET_POINTER_SIZE == 8);

    // TODO-CQ: Support addressing modes, for now we don't use them
    GenTreeIndir* srcIndir = tree->Data()->AsIndir();
    assert(srcIndir->isContained() && !srcIndir->Addr()->isContained());

    regNumber dst  = genConsumeReg(tree->Addr());
    regNumber src  = genConsumeReg(srcIndir->Addr());
    unsigned  size = tree->Size();

    // Pick the widest available SIMD width: 32-byte (YMM) moves when AVX is
    // available and the block is at least that big, otherwise 16-byte (XMM).
    // TODO-XARCH-AVX512: Consider enabling it here
    unsigned simdSize = (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
                            ? YMM_REGSIZE_BYTES
                            : XMM_REGSIZE_BYTES;

    if (size >= simdSize)
    {
        // Number of SIMD regs needed to save the whole src to regs.
        unsigned numberOfSimdRegs = tree->AvailableTempRegCount(RBM_ALLFLOAT);

        // Lowering takes care to only introduce this node such that we will always have enough
        // temporary SIMD registers to fully load the source and avoid any potential issues with overlap.
        assert(numberOfSimdRegs * simdSize >= size);

        // Pop all temp regs to a local array; currently this impl is limited by LSRA's MaxInternalCount.
        regNumber tempRegs[LinearScan::MaxInternalCount] = {};
        for (unsigned i = 0; i < numberOfSimdRegs; i++)
        {
            tempRegs[i] = tree->ExtractTempReg(RBM_ALLFLOAT);
        }

        // Emits one SIMD load (load == true) or store (load == false) per temp
        // register, walking the block in simdSize steps. When the tail is
        // smaller than simdSize, the final access is backed up to (size - simdSize)
        // so it overlaps the previous one instead of over-reading/writing.
        auto emitSimdLoadStore = [&](bool load) {
            unsigned    offset   = 0;
            int         regIndex = 0;
            instruction simdMov  = simdUnalignedMovIns();
            do
            {
                if (load)
                {
                    // vmovdqu  ymm, ymmword ptr[src + offset]
                    GetEmitter()->emitIns_R_AR(simdMov, EA_ATTR(simdSize), tempRegs[regIndex++], src, offset);
                }
                else
                {
                    // vmovdqu  ymmword ptr[dst + offset], ymm
                    GetEmitter()->emitIns_AR_R(simdMov, EA_ATTR(simdSize), tempRegs[regIndex++], dst, offset);
                }
                offset += simdSize;
                if (size == offset)
                {
                    break;
                }

                assert(size > offset);
                if ((size - offset) < simdSize)
                {
                    // Overlap with the previously processed data. We'll always use SIMD for that for simplicity
                    // TODO-CQ: Consider using smaller SIMD reg or GPR for the remainder.
                    offset = size - simdSize;
                }
            } while (true);
        };

        // load everything from SRC to temp regs
        emitSimdLoadStore(/* load */ true);
        // store them to DST
        emitSimdLoadStore(/* load */ false);
    }
    else
    {
        // Here we work with size 1..15 (x64)
        assert((size > 0) && (size < XMM_REGSIZE_BYTES));

        // Emits a single scalar load or store of `size` bytes (1/2/4/8 only)
        // at [src + offset] / [dst + offset] via the given temp GPR.
        auto emitScalarLoadStore = [&](bool load, int size, regNumber tempReg, int offset) {
            var_types memType;
            switch (size)
            {
                case 1:
                    memType = TYP_UBYTE;
                    break;
                case 2:
                    memType = TYP_USHORT;
                    break;
                case 4:
                    memType = TYP_INT;
                    break;
                case 8:
                    memType = TYP_LONG;
                    break;
                default:
                    unreached();
            }

            if (load)
            {
                // mov  reg, qword ptr [src + offset]
                GetEmitter()->emitIns_R_AR(ins_Load(memType), emitTypeSize(memType), tempReg, src, offset);
            }
            else
            {
                // mov  qword ptr [dst + offset], reg
                GetEmitter()->emitIns_AR_R(ins_Store(memType), emitTypeSize(memType), tempReg, dst, offset);
            }
        };

        // Use overlapping loads/stores, e.g. for size == 9: "mov [dst], tmpReg1; mov [dst+1], tmpReg2".
        // loadStoreSize is the largest power of two not exceeding size.
        unsigned loadStoreSize = 1 << BitOperations::Log2(size);
        if (loadStoreSize == size)
        {
            // Exact power-of-two size: a single load/store pair suffices.
            regNumber tmpReg = tree->GetSingleTempReg(RBM_ALLINT);
            emitScalarLoadStore(/* load */ true, loadStoreSize, tmpReg, 0);
            emitScalarLoadStore(/* load */ false, loadStoreSize, tmpReg, 0);
        }
        else
        {
            // Two overlapping accesses: [0, loadStoreSize) and
            // [size - loadStoreSize, size). Both loads are done before either
            // store so a src/dst overlap cannot corrupt the second read.
            assert(tree->AvailableTempRegCount() == 2);
            regNumber tmpReg1 = tree->ExtractTempReg(RBM_ALLINT);
            regNumber tmpReg2 = tree->ExtractTempReg(RBM_ALLINT);
            emitScalarLoadStore(/* load */ true, loadStoreSize, tmpReg1, 0);
            emitScalarLoadStore(/* load */ true, loadStoreSize, tmpReg2, size - loadStoreSize);
            emitScalarLoadStore(/* load */ false, loadStoreSize, tmpReg1, 0);
            emitScalarLoadStore(/* load */ false, loadStoreSize, tmpReg2, size - loadStoreSize);
        }
    }
}
2702+
25572703
//------------------------------------------------------------------------
25582704
// genLclHeap: Generate code for localloc.
25592705
//
@@ -2921,6 +3067,7 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* storeBlkNode)
29213067
genCodeForInitBlkRepStos(storeBlkNode);
29223068
}
29233069
break;
3070+
case GenTreeBlk::BlkOpKindUnrollMemmove:
29243071
case GenTreeBlk::BlkOpKindUnroll:
29253072
if (isCopyBlk)
29263073
{
@@ -2930,7 +3077,15 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* storeBlkNode)
29303077
GetEmitter()->emitDisableGC();
29313078
}
29323079
#endif
2933-
genCodeForCpBlkUnroll(storeBlkNode);
3080+
if (storeBlkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindUnroll)
3081+
{
3082+
genCodeForCpBlkUnroll(storeBlkNode);
3083+
}
3084+
else
3085+
{
3086+
assert(storeBlkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindUnrollMemmove);
3087+
genCodeForMemmove(storeBlkNode);
3088+
}
29343089
#ifndef JIT32_GCENCODER
29353090
if (storeBlkNode->gtBlkOpGcUnsafe)
29363091
{

src/coreclr/jit/compiler.h

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8924,8 +8924,9 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
89248924
public:
89258925
enum UnrollKind
89268926
{
8927-
Memset, // Initializing memory with some value
8928-
Memcpy // Copying memory from src to dst
8927+
Memset,
8928+
Memcpy,
8929+
Memmove
89298930
};
89308931

89318932
//------------------------------------------------------------------------
@@ -8955,7 +8956,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
89558956
threshold *= 2;
89568957
#elif defined(TARGET_XARCH)
89578958
// TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial
8958-
threshold = max(threshold, YMM_REGSIZE_BYTES);
8959+
threshold = min(threshold, YMM_REGSIZE_BYTES);
89598960
#endif
89608961
}
89618962
#if defined(TARGET_XARCH)
@@ -8988,7 +8989,11 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
89888989
//
89898990
// We might want to use a different multiplier for truly hot/cold blocks based on PGO data
89908991
//
8991-
return threshold * 4;
8992+
threshold *= 4;
8993+
8994+
// NOTE: Memmove's unrolling is currently limited by LSRA -
8995+
// up to LinearScan::MaxInternalCount number of temp regs, e.g. 5*32=160 bytes for AVX cpu.
8996+
return threshold;
89928997
}
89938998

89948999
//------------------------------------------------------------------------

src/coreclr/jit/gentree.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11977,6 +11977,10 @@ void Compiler::gtDispTree(GenTree* tree,
1197711977
case GenTreeBlk::BlkOpKindUnroll:
1197811978
printf(" (Unroll)");
1197911979
break;
11980+
11981+
case GenTreeBlk::BlkOpKindUnrollMemmove:
11982+
printf(" (Memmove)");
11983+
break;
1198011984
#ifndef TARGET_X86
1198111985
case GenTreeBlk::BlkOpKindHelper:
1198211986
printf(" (Helper)");

src/coreclr/jit/gentree.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7322,6 +7322,7 @@ struct GenTreeBlk : public GenTreeIndir
73227322
BlkOpKindRepInstr,
73237323
#endif
73247324
BlkOpKindUnroll,
7325+
BlkOpKindUnrollMemmove,
73257326
} gtBlkOpKind;
73267327

73277328
#ifndef JIT32_GCENCODER

src/coreclr/jit/importercalls.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3810,6 +3810,13 @@ GenTree* Compiler::impIntrinsic(GenTree* newobjThis,
38103810
break;
38113811
}
38123812

3813+
case NI_System_Buffer_Memmove:
3814+
{
3815+
// We'll try to unroll this in lower for constant input.
3816+
isSpecial = true;
3817+
break;
3818+
}
3819+
38133820
case NI_System_BitConverter_DoubleToInt64Bits:
38143821
{
38153822
GenTree* op1 = impStackTop().val;
@@ -7903,6 +7910,13 @@ NamedIntrinsic Compiler::lookupNamedIntrinsic(CORINFO_METHOD_HANDLE method)
79037910
result = NI_System_BitConverter_Int64BitsToDouble;
79047911
}
79057912
}
7913+
else if (strcmp(className, "Buffer") == 0)
7914+
{
7915+
if (strcmp(methodName, "Memmove") == 0)
7916+
{
7917+
result = NI_System_Buffer_Memmove;
7918+
}
7919+
}
79067920
break;
79077921
}
79087922

src/coreclr/jit/lower.cpp

Lines changed: 74 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -473,8 +473,14 @@ GenTree* Lowering::LowerNode(GenTree* node)
473473
return LowerSwitch(node);
474474

475475
case GT_CALL:
476-
LowerCall(node);
477-
break;
476+
{
477+
GenTree* newNode = LowerCall(node);
478+
if (newNode != nullptr)
479+
{
480+
return newNode;
481+
}
482+
}
483+
break;
478484

479485
case GT_LT:
480486
case GT_LE:
@@ -1775,14 +1781,64 @@ GenTree* Lowering::AddrGen(void* addr)
17751781
return AddrGen((ssize_t)addr);
17761782
}
17771783

1784+
//------------------------------------------------------------------------
// LowerCallMemmove: Replace Buffer.Memmove(DST, SRC, CNS_SIZE) with a GT_STORE_BLK:
//
//    *  STORE_BLK struct<CNS_SIZE> (copy) (Memmove)
//    +--*  LCL_VAR   byref  dst
//    \--*  IND struct
//       \--*  LCL_VAR   byref  src
//
// Arguments:
//    call - GenTreeCall node to replace with STORE_BLK
//
// Return Value:
//    The replacement STORE_BLK node, or nullptr when the call is left
//    untouched (length is not a constant, is zero, or exceeds the
//    Memmove unroll threshold).
//
GenTree* Lowering::LowerCallMemmove(GenTreeCall* call)
{
    assert(comp->lookupNamedIntrinsic(call->gtCallMethHnd) == NI_System_Buffer_Memmove);
    assert(call->gtArgs.CountArgs() == 3);

    // Only a constant length can be unrolled.
    GenTree* lengthArg = call->gtArgs.GetArgByIndex(2)->GetNode();
    if (lengthArg->IsIntegralConst())
    {
        ssize_t cnsSize = lengthArg->AsIntCon()->IconValue();
        // TODO-CQ: drop the whole thing in case of 0
        if ((cnsSize > 0) && (cnsSize <= (ssize_t)comp->getUnrollThreshold(Compiler::UnrollKind::Memmove)))
        {
            // Args are (dst, src, length) — see assert above for the expected arity.
            GenTree* dstAddr = call->gtArgs.GetArgByIndex(0)->GetNode();
            GenTree* srcAddr = call->gtArgs.GetArgByIndex(1)->GetNode();

            // TODO-CQ: Try to create an addressing mode
            GenTreeIndir* srcBlk = comp->gtNewIndir(TYP_STRUCT, srcAddr);
            srcBlk->gtFlags |= GTF_GLOB_REF;
            srcBlk->SetContained();

            GenTreeBlk* storeBlk = new (comp, GT_STORE_BLK)
                GenTreeBlk(GT_STORE_BLK, TYP_STRUCT, dstAddr, srcBlk, comp->typGetBlkLayout((unsigned)cnsSize));
            storeBlk->gtFlags |= (GTF_BLK_UNALIGNED | GTF_ASG | GTF_EXCEPT | GTF_GLOB_REF);

            // TODO-CQ: Use GenTreeObj::BlkOpKindUnroll here if srcAddr and dstAddr don't overlap, thus, we can
            // unroll this memmove as memcpy - it doesn't require lots of temp registers
            storeBlk->gtBlkOpKind = GenTreeObj::BlkOpKindUnrollMemmove;

            // Splice the new nodes into the LIR range in place of the call;
            // the now-unused length constant is removed as well.
            BlockRange().InsertBefore(call, srcBlk);
            BlockRange().InsertBefore(call, storeBlk);
            BlockRange().Remove(lengthArg);
            BlockRange().Remove(call);

            return storeBlk;
        }
    }
    return nullptr;
}
1833+
17781834
// do lowering steps for a call
17791835
// this includes:
17801836
// - adding the placement nodes (either stack or register variety) for arguments
17811837
// - lowering the expression that calculates the target address
17821838
// - adding nodes for other operations that occur after the call sequence starts and before
17831839
// control transfer occurs (profiling and tail call helpers, pinvoke incantations)
17841840
//
1785-
void Lowering::LowerCall(GenTree* node)
1841+
GenTree* Lowering::LowerCall(GenTree* node)
17861842
{
17871843
GenTreeCall* call = node->AsCall();
17881844

@@ -1793,6 +1849,20 @@ void Lowering::LowerCall(GenTree* node)
17931849
// All runtime lookups are expected to be expanded in fgExpandRuntimeLookups
17941850
assert(!call->IsExpRuntimeLookup());
17951851

1852+
if (call->gtCallMoreFlags & GTF_CALL_M_SPECIAL_INTRINSIC)
1853+
{
1854+
#ifdef TARGET_AMD64
1855+
if (comp->lookupNamedIntrinsic(call->gtCallMethHnd) == NI_System_Buffer_Memmove)
1856+
{
1857+
GenTree* newNode = LowerCallMemmove(call);
1858+
if (newNode != nullptr)
1859+
{
1860+
return newNode->gtNext;
1861+
}
1862+
}
1863+
#endif
1864+
}
1865+
17961866
call->ClearOtherRegs();
17971867
LowerArgsForCall(call);
17981868

@@ -1911,6 +1981,7 @@ void Lowering::LowerCall(GenTree* node)
19111981
JITDUMP("lowering call (after):\n");
19121982
DISPTREERANGE(BlockRange(), call);
19131983
JITDUMP("\n");
1984+
return nullptr;
19141985
}
19151986

19161987
// Inserts profiler hook, GT_PROF_HOOK for a tail call node.

src/coreclr/jit/lower.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,8 @@ class Lowering final : public Phase
126126
// ------------------------------
127127
// Call Lowering
128128
// ------------------------------
129-
void LowerCall(GenTree* call);
129+
GenTree* LowerCall(GenTree* call);
130+
GenTree* LowerCallMemmove(GenTreeCall* call);
130131
void LowerCFGCall(GenTreeCall* call);
131132
void MoveCFGCallArg(GenTreeCall* call, GenTree* node);
132133
#ifndef TARGET_64BIT

0 commit comments

Comments
 (0)