Unroll Buffer.Memmove for constant lengths #83638
@@ -2554,6 +2554,152 @@ void CodeGen::genStackPointerDynamicAdjustmentWithProbe(regNumber regSpDelta)
    inst_Mov(TYP_I_IMPL, REG_SPBASE, regSpDelta, /* canSkip */ false);
}

//------------------------------------------------------------------------
// genCodeForMemmove: Perform an unrolled memmove. The idea is that we can
//    ignore the fact that src and dst might overlap if we save the whole
//    src to temp regs in advance, e.g. for memmove(dst: rcx, src: rax, len: 120):
//
//       vmovdqu  ymm0, ymmword ptr[rax +  0]
//       vmovdqu  ymm1, ymmword ptr[rax + 32]
//       vmovdqu  ymm2, ymmword ptr[rax + 64]
//       vmovdqu  ymm3, ymmword ptr[rax + 88]
//       vmovdqu  ymmword ptr[rcx +  0], ymm0
//       vmovdqu  ymmword ptr[rcx + 32], ymm1
//       vmovdqu  ymmword ptr[rcx + 64], ymm2
//       vmovdqu  ymmword ptr[rcx + 88], ymm3
//
// Arguments:
//    tree - GenTreeBlk node
//
void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
{
    // Not yet finished for x86
    assert(TARGET_POINTER_SIZE == 8);
Comment on lines +2576 to +2577:

> Just curious -- how do we expect the logic/heuristics to be different on x86?

EgorBo: A special byte register for 1-byte accesses, and up to 4 GPRs instead of 2 for the 1..15 byte range. I plan to file a follow-up to add arm64, and then I'll probably add x86 and arm32 when I have time.

> Why would we use more registers on x86? There are fewer registers available.

EgorBo: No, but in order to memmove 15 bytes you need …
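For illustration, here is a standalone C++ sketch (not part of this PR; the function name is hypothetical) of the overlapping-move trick under discussion. All loads are issued before any store, which is what makes overlap between dst and src harmless:

```cpp
// Hypothetical sketch: moving 15 bytes the way the unrolled codegen does,
// using two overlapping 8-byte GPR moves (x64). On x86, GPRs are 4 bytes
// wide, so covering the same 1..15 byte range needs up to four temp registers.
#include <cstdint>
#include <cstring>

void Memmove15(uint8_t* dst, const uint8_t* src)
{
    uint64_t lo, hi;
    std::memcpy(&lo, src, 8);     // mov reg1, qword ptr [src + 0]
    std::memcpy(&hi, src + 7, 8); // mov reg2, qword ptr [src + 7]  (overlaps reg1's bytes)
    std::memcpy(dst, &lo, 8);     // mov qword ptr [dst + 0], reg1
    std::memcpy(dst + 7, &hi, 8); // mov qword ptr [dst + 7], reg2
}
```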
    // TODO-CQ: Support addressing modes, for now we don't use them
    GenTreeIndir* srcIndir = tree->Data()->AsIndir();
    assert(srcIndir->isContained() && !srcIndir->Addr()->isContained());

    regNumber dst  = genConsumeReg(tree->Addr());
    regNumber src  = genConsumeReg(srcIndir->Addr());
    unsigned  size = tree->Size();

    // TODO-XARCH-AVX512: Consider enabling it here
    unsigned simdSize = (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
                            ? YMM_REGSIZE_BYTES
                            : XMM_REGSIZE_BYTES;

    if (size >= simdSize)
    {
        // Number of SIMD regs needed to save the whole src to regs.
        unsigned numberOfSimdRegs = tree->AvailableTempRegCount(RBM_ALLFLOAT);

        // Lowering takes care to only introduce this node such that we will always have enough
        // temporary SIMD registers to fully load the source and avoid any potential issues with overlap.
        assert(numberOfSimdRegs * simdSize >= size);

        // Pop all temp regs to a local array; currently this impl is limited by LSRA's MaxInternalCount
        regNumber tempRegs[LinearScan::MaxInternalCount] = {};
        for (unsigned i = 0; i < numberOfSimdRegs; i++)
        {
            tempRegs[i] = tree->ExtractTempReg(RBM_ALLFLOAT);
        }

        auto emitSimdLoadStore = [&](bool load) {
            unsigned    offset   = 0;
            int         regIndex = 0;
            instruction simdMov  = simdUnalignedMovIns();
            do
            {
                if (load)
                {
                    // vmovdqu  ymm, ymmword ptr[src + offset]
                    GetEmitter()->emitIns_R_AR(simdMov, EA_ATTR(simdSize), tempRegs[regIndex++], src, offset);
                }
                else
                {
                    // vmovdqu  ymmword ptr[dst + offset], ymm
                    GetEmitter()->emitIns_AR_R(simdMov, EA_ATTR(simdSize), tempRegs[regIndex++], dst, offset);
                }
                offset += simdSize;
                if (size == offset)
                {
                    break;
                }

                assert(size > offset);
                if ((size - offset) < simdSize)
                {
                    // Overlap with the previously processed data. We'll always use SIMD for that for simplicity
                    // TODO-CQ: Consider using smaller SIMD reg or GPR for the remainder.
                    offset = size - simdSize;
                }
            } while (true);
        };

        // load everything from SRC to temp regs
        emitSimdLoadStore(/* load */ true);
        // store them to DST
        emitSimdLoadStore(/* load */ false);
    }
    else
    {
        // Here we work with size 1..15 (x64)
        assert((size > 0) && (size < XMM_REGSIZE_BYTES));

        auto emitScalarLoadStore = [&](bool load, int size, regNumber tempReg, int offset) {
            var_types memType;
            switch (size)
            {
                case 1:
                    memType = TYP_UBYTE;
                    break;
                case 2:
                    memType = TYP_USHORT;
                    break;
                case 4:
                    memType = TYP_INT;
                    break;
                case 8:
                    memType = TYP_LONG;
                    break;
                default:
                    unreached();
            }

            if (load)
            {
                // mov reg, qword ptr [src + offset]
                GetEmitter()->emitIns_R_AR(ins_Load(memType), emitTypeSize(memType), tempReg, src, offset);
            }
            else
            {
                // mov qword ptr [dst + offset], reg
                GetEmitter()->emitIns_AR_R(ins_Store(memType), emitTypeSize(memType), tempReg, dst, offset);
            }
        };

        // Use overlapping loads/stores, e.g. for size == 9: "mov [dst], tmpReg1; mov [dst+1], tmpReg2".
        unsigned loadStoreSize = 1 << BitOperations::Log2(size);
        if (loadStoreSize == size)
        {
            regNumber tmpReg = tree->GetSingleTempReg(RBM_ALLINT);
            emitScalarLoadStore(/* load */ true, loadStoreSize, tmpReg, 0);
            emitScalarLoadStore(/* load */ false, loadStoreSize, tmpReg, 0);
        }
        else
        {
            assert(tree->AvailableTempRegCount() == 2);
            regNumber tmpReg1 = tree->ExtractTempReg(RBM_ALLINT);
            regNumber tmpReg2 = tree->ExtractTempReg(RBM_ALLINT);
            emitScalarLoadStore(/* load */ true, loadStoreSize, tmpReg1, 0);
            emitScalarLoadStore(/* load */ true, loadStoreSize, tmpReg2, size - loadStoreSize);
            emitScalarLoadStore(/* load */ false, loadStoreSize, tmpReg1, 0);
            emitScalarLoadStore(/* load */ false, loadStoreSize, tmpReg2, size - loadStoreSize);
        }
    }
}
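As a sanity check of the SIMD remainder handling above, here is a small standalone C++ sketch (not JIT code) that reproduces the offset sequence emitSimdLoadStore generates for size = 120 with 32-byte registers; it prints 0, 32, 64, 88, matching the ymm example in the function header:

```cpp
// Reproduces the unroll offsets for size = 120, simdSize = 32 (AVX case).
#include <cstdio>

int main()
{
    const unsigned size = 120, simdSize = 32;
    unsigned offset = 0;
    while (true)
    {
        std::printf("move %u bytes at offset %u\n", simdSize, offset);
        offset += simdSize;
        if (offset == size)
            break;
        if (size - offset < simdSize)
        {
            // Pull the final move back so it ends exactly at `size`,
            // overlapping bytes that were already covered.
            offset = size - simdSize;
        }
    }
}
```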

//------------------------------------------------------------------------
// genLclHeap: Generate code for localloc.
//
@@ -2921,6 +3067,7 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* storeBlkNode)
                genCodeForInitBlkRepStos(storeBlkNode);
            }
            break;
+       case GenTreeBlk::BlkOpKindUnrollMemmove:
        case GenTreeBlk::BlkOpKindUnroll:
            if (isCopyBlk)
            {
@@ -2930,7 +3077,15 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* storeBlkNode)
                    GetEmitter()->emitDisableGC();
                }
#endif
-               genCodeForCpBlkUnroll(storeBlkNode);
+               if (storeBlkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindUnroll)
+               {
+                   genCodeForCpBlkUnroll(storeBlkNode);
+               }
+               else
+               {
+                   assert(storeBlkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindUnrollMemmove);
+                   genCodeForMemmove(storeBlkNode);
+               }
#ifndef JIT32_GCENCODER
                if (storeBlkNode->gtBlkOpGcUnsafe)
                {
@@ -8925,8 +8925,9 @@
public:
    enum UnrollKind
    {
-       Memset, // Initializing memory with some value
-       Memcpy  // Copying memory from src to dst
+       Memset,
+       Memcpy,
+       Memmove
    };

> Unused?

EgorBo: it is used in …
//------------------------------------------------------------------------

@@ -8956,7 +8957,7 @@
        threshold *= 2;
#elif defined(TARGET_XARCH)
        // TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial
-       threshold = max(threshold, YMM_REGSIZE_BYTES);
+       threshold = min(threshold, YMM_REGSIZE_BYTES);
#endif
    }
#if defined(TARGET_XARCH)
@@ -8989,7 +8990,11 @@
    //
    // We might want to use a different multiplier for truly hot/cold blocks based on PGO data
    //
-   return threshold * 4;
+   threshold *= 4;
+
+   // NOTE: Memmove's unrolling is currently limited by LSRA -
+   // up to LinearScan::MaxInternalCount number of temp regs, e.g. 5*32=160 bytes for AVX cpu.
+   return threshold;
Comment on lines +8995 to +8997:

> It is not obvious what the significance of this comment is at a glance.

EgorBo: It's just a hint to whoever decides to change the thresholds. E.g. I planned to play with the thresholds for memset/memcpy in very hot places based on PGO data, but I (or whoever) will have to be careful with memmove.
}
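To make the note concrete, here is a tiny standalone C++ sketch (not the JIT's code) of the hard cap LSRA imposes on the memmove unroll size, assuming MaxInternalCount is 5, as in the "5*32=160" example above:

```cpp
// Back-of-the-envelope check of the LSRA-imposed memmove unroll cap.
#include <cstdio>

int main()
{
    const unsigned maxInternalCount = 5;  // LinearScan::MaxInternalCount (assumed)
    const unsigned ymmRegSizeBytes  = 32; // YMM register width (AVX)
    const unsigned xmmRegSizeBytes  = 16; // XMM register width (SSE)

    // However large the tuned threshold gets, codegen can only park the
    // whole source in this many temp SIMD registers:
    std::printf("AVX cap: %u bytes\n", maxInternalCount * ymmRegSizeBytes); // 160
    std::printf("SSE cap: %u bytes\n", maxInternalCount * xmmRegSizeBytes); // 80
}
```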

//------------------------------------------------------------------------
@@ -473,8 +473,14 @@ GenTree* Lowering::LowerNode(GenTree* node)
            return LowerSwitch(node);

        case GT_CALL:
-           LowerCall(node);
-           break;
+       {
+           GenTree* newNode = LowerCall(node);
+           if (newNode != nullptr)
+           {
+               return newNode;
+           }
+       }
+       break;

        case GT_LT:
        case GT_LE:
@@ -1775,14 +1781,64 @@ GenTree* Lowering::AddrGen(void* addr)
    return AddrGen((ssize_t)addr);
}

//------------------------------------------------------------------------
// LowerCallMemmove: Replace Buffer.Memmove(DST, SRC, CNS_SIZE) with a GT_STORE_BLK:
//
//    *  STORE_BLK struct<CNS_SIZE> (copy) (Unroll)
//    +--*  LCL_VAR   byref  dst
//    \--*  IND   struct
//       \--*  LCL_VAR   byref  src
//
// Arguments:
//    call - GenTreeCall node to replace with STORE_BLK
//
GenTree* Lowering::LowerCallMemmove(GenTreeCall* call)
{
    assert(comp->lookupNamedIntrinsic(call->gtCallMethHnd) == NI_System_Buffer_Memmove);
    assert(call->gtArgs.CountArgs() == 3);

    GenTree* lengthArg = call->gtArgs.GetArgByIndex(2)->GetNode();
    if (lengthArg->IsIntegralConst())
    {
        ssize_t cnsSize = lengthArg->AsIntCon()->IconValue();
        // TODO-CQ: drop the whole thing in case of 0
        if ((cnsSize > 0) && (cnsSize <= (ssize_t)comp->getUnrollThreshold(Compiler::UnrollKind::Memmove)))
        {
            GenTree* dstAddr = call->gtArgs.GetArgByIndex(0)->GetNode();
            GenTree* srcAddr = call->gtArgs.GetArgByIndex(1)->GetNode();

            // TODO-CQ: Try to create an addressing mode
            GenTreeIndir* srcBlk = comp->gtNewIndir(TYP_STRUCT, srcAddr);
            srcBlk->gtFlags |= GTF_GLOB_REF;
            srcBlk->SetContained();

            GenTreeBlk* storeBlk = new (comp, GT_STORE_BLK)
                GenTreeBlk(GT_STORE_BLK, TYP_STRUCT, dstAddr, srcBlk, comp->typGetBlkLayout((unsigned)cnsSize));
            storeBlk->gtFlags |= (GTF_BLK_UNALIGNED | GTF_ASG | GTF_EXCEPT | GTF_GLOB_REF);

            // TODO-CQ: Use GenTreeObj::BlkOpKindUnroll here if srcAddr and dstAddr don't overlap; thus we can
            // unroll this memmove as memcpy - it doesn't require lots of temp registers
            storeBlk->gtBlkOpKind = GenTreeObj::BlkOpKindUnrollMemmove;

            BlockRange().InsertBefore(call, srcBlk);
            BlockRange().InsertBefore(call, storeBlk);
            BlockRange().Remove(lengthArg);
            BlockRange().Remove(call);

            return storeBlk;
        }
    }
    return nullptr;
}

// do lowering steps for a call
// this includes:
//   - adding the placement nodes (either stack or register variety) for arguments
//   - lowering the expression that calculates the target address
//   - adding nodes for other operations that occur after the call sequence starts and before
//     control transfer occurs (profiling and tail call helpers, pinvoke incantations)
//
-void Lowering::LowerCall(GenTree* node)
+GenTree* Lowering::LowerCall(GenTree* node)
{
    GenTreeCall* call = node->AsCall();
@@ -1793,6 +1849,20 @@ void Lowering::LowerCall(GenTree* node)
    // All runtime lookups are expected to be expanded in fgExpandRuntimeLookups
    assert(!call->IsExpRuntimeLookup());

    if (call->gtCallMoreFlags & GTF_CALL_M_SPECIAL_INTRINSIC)
    {
#ifdef TARGET_AMD64
        if (comp->lookupNamedIntrinsic(call->gtCallMethHnd) == NI_System_Buffer_Memmove)
        {
            GenTree* newNode = LowerCallMemmove(call);
            if (newNode != nullptr)
            {
                return newNode->gtNext;
            }
        }
#endif
    }

    call->ClearOtherRegs();
    LowerArgsForCall(call);
@@ -1911,6 +1981,7 @@ void Lowering::LowerCall(GenTree* node)
    JITDUMP("lowering call (after):\n");
    DISPTREERANGE(BlockRange(), call);
    JITDUMP("\n");
+   return nullptr;
}

> Couldn't this return …

EgorBo: I can't return …

// Inserts profiler hook, GT_PROF_HOOK for a tail call node.