Skip to content

Commit 84ffa0f

Browse files
authored
JIT: Support bitwise field insertions for return registers (dotnet#113178)
Based on the new `FIELD_LIST` support for returns this PR adds support for the JIT to combine smaller fields via bitwise operations when returned, instead of spilling these to stack. win-x64 examples: ```csharp static int? Test() { return Environment.TickCount; } ``` ```diff call System.Environment:get_TickCount():int - mov dword ptr [rsp+0x24], eax - mov byte ptr [rsp+0x20], 1 - mov rax, qword ptr [rsp+0x20] - ;; size=19 bbWeight=1 PerfScore 4.00 + mov eax, eax + shl rax, 32 + or rax, 1 + ;; size=15 bbWeight=1 PerfScore 2.00 ``` (the `mov eax, eax` is unnecessary, but not that simple to get rid of) ```csharp static (int x, float y) Test(int x, float y) { return (x, y); } ``` ```diff - mov dword ptr [rsp], ecx - vmovss dword ptr [rsp+0x04], xmm1 - mov rax, qword ptr [rsp] + vmovd eax, xmm1 + shl rax, 32 + mov ecx, ecx + or rax, rcx ;; size=13 bbWeight=1 PerfScore 3.00 ``` An arm64 example: ```csharp static Memory<int> ToMemory(int[] arr) { return arr.AsMemory(); } ``` ```diff G_M45070_IG01: ;; offset=0x0000 - stp fp, lr, [sp, #-0x20]! + stp fp, lr, [sp, #-0x10]! mov fp, sp - str xzr, [fp, #0x10] // [V03 tmp2] - ;; size=12 bbWeight=1 PerfScore 2.50 -G_M45070_IG02: ;; offset=0x000C + ;; size=8 bbWeight=1 PerfScore 1.50 +G_M45070_IG02: ;; offset=0x0008 cbz x0, G_M45070_IG06 ;; size=4 bbWeight=1 PerfScore 1.00 -G_M45070_IG03: ;; offset=0x0010 - str x0, [fp, #0x10] // [V07 tmp6] - str wzr, [fp, #0x18] // [V08 tmp7] - ldr x0, [fp, #0x10] // [V07 tmp6] - ldr w0, [x0, #0x08] - str w0, [fp, #0x1C] // [V09 tmp8] - ;; size=20 bbWeight=0.80 PerfScore 6.40 -G_M45070_IG04: ;; offset=0x0024 - ldp x0, x1, [fp, #0x10] // [V03 tmp2], [V03 tmp2+0x08] - ;; size=4 bbWeight=1 PerfScore 3.00 -G_M45070_IG05: ;; offset=0x0028 - ldp fp, lr, [sp], #0x20 +G_M45070_IG03: ;; offset=0x000C + ldr w1, [x0, #0x08] + ;; size=4 bbWeight=0.80 PerfScore 2.40 +G_M45070_IG04: ;; offset=0x0010 + mov w1, w1 + mov x2, xzr + orr x1, x2, x1, LSL #32 + ;; size=12 bbWeight=1 PerfScore 2.00 +G_M45070_IG05: ;; offset=0x001C + ldp fp, lr, [sp], #0x10 ret lr ;; size=8 bbWeight=1 PerfScore 2.00 -G_M45070_IG06: ;; offset=0x0030 - str xzr, [fp, #0x10] // [V07 tmp6] - str xzr, [fp, #0x18] +G_M45070_IG06: ;; offset=0x0024 + mov x0, xzr + mov w1, wzr b G_M45070_IG04 - ;; size=12 bbWeight=0.20 PerfScore 0.60 + ;; size=12 bbWeight=0.20 PerfScore 0.40 ``` (sneak peek -- this codegen requires some supplementary changes, and there's additional opportunities here) This is the return counterpart to dotnet#112740. That PR has a bunch of regressions that makes it look like we need to support returns/call arguments first, before we try to support parameters. There's a few follow-ups here: - Support for float->float insertions (when a float value needs to be returned as the 1st, 2nd, .... field of a SIMD register) - Support for coalescing memory loads, particularly because the fields of the `FIELD_LIST` come from a promoted struct that ended up DNER. In those cases we should be able to recombine the fields back to a single large field, instead of combining them with bitwise operations. - Support for constant folding the bitwise insertions. This requires some more constant folding support in lowering. - The JIT has lots of (now outdated) restrictions based around multi-reg returns that get in the way. Lifting these should improve things considerably.
1 parent 7f00027 commit 84ffa0f

File tree

8 files changed

+259
-160
lines changed

8 files changed

+259
-160
lines changed

src/coreclr/jit/compiler.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6590,7 +6590,7 @@ class Compiler
65906590
#endif // FEATURE_SIMD
65916591
GenTree* fgMorphIndexAddr(GenTreeIndexAddr* tree);
65926592
GenTree* fgMorphExpandCast(GenTreeCast* tree);
6593-
GenTreeFieldList* fgMorphLclArgToFieldList(GenTreeLclVarCommon* lcl);
6593+
GenTreeFieldList* fgMorphLclToFieldList(GenTreeLclVar* lcl);
65946594
GenTreeCall* fgMorphArgs(GenTreeCall* call);
65956595

65966596
void fgMakeOutgoingStructArgCopy(GenTreeCall* call, CallArg* arg);
@@ -6665,7 +6665,7 @@ class Compiler
66656665
GenTree* fgMorphCopyBlock(GenTree* tree);
66666666
private:
66676667
GenTree* fgMorphSmpOp(GenTree* tree, MorphAddrContext* mac, bool* optAssertionPropDone = nullptr);
6668-
void fgTryReplaceStructLocalWithField(GenTree* tree);
6668+
void fgTryReplaceStructLocalWithFields(GenTree** use);
66696669
GenTree* fgMorphFinalizeIndir(GenTreeIndir* indir);
66706670
GenTree* fgOptimizeCast(GenTreeCast* cast);
66716671
GenTree* fgOptimizeCastOnStore(GenTree* store);

src/coreclr/jit/gentree.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3095,6 +3095,19 @@ struct GenTreeOp : public GenTreeUnOp
30953095
// then sets the flag GTF_DIV_BY_CNS_OPT and GTF_DONT_CSE on the constant
30963096
void CheckDivideByConstOptimized(Compiler* comp);
30973097

3098+
GenTree*& ReturnValueRef()
3099+
{
3100+
assert(OperIs(GT_RETURN, GT_RETFILT, GT_SWIFT_ERROR_RET));
3101+
#ifdef SWIFT_SUPPORT
3102+
if (OperIs(GT_SWIFT_ERROR_RET))
3103+
{
3104+
return gtOp2;
3105+
}
3106+
#endif // SWIFT_SUPPORT
3107+
3108+
return gtOp1;
3109+
}
3110+
30983111
GenTree* GetReturnValue() const
30993112
{
31003113
assert(OperIs(GT_RETURN, GT_RETFILT, GT_SWIFT_ERROR_RET));

src/coreclr/jit/lclmorph.cpp

Lines changed: 0 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1240,38 +1240,6 @@ class LocalAddressVisitor final : public GenTreeVisitor<LocalAddressVisitor>
12401240
PopValue();
12411241
break;
12421242

1243-
case GT_RETURN:
1244-
if (TopValue(0).Node() != node)
1245-
{
1246-
assert(TopValue(1).Node() == node);
1247-
assert(TopValue(0).Node() == node->gtGetOp1());
1248-
GenTreeUnOp* ret = node->AsUnOp();
1249-
GenTree* retVal = ret->gtGetOp1();
1250-
if (retVal->OperIs(GT_LCL_VAR))
1251-
{
1252-
// TODO-1stClassStructs: this block is a temporary workaround to keep diffs small,
1253-
// having `doNotEnreg` affect block init and copy transformations that affect many methods.
1254-
// I have a change that introduces more precise and effective solution for that, but it would
1255-
// be merged separately.
1256-
GenTreeLclVar* lclVar = retVal->AsLclVar();
1257-
unsigned lclNum = lclVar->GetLclNum();
1258-
if (!m_compiler->compMethodReturnsMultiRegRetType() &&
1259-
!m_compiler->lvaIsImplicitByRefLocal(lclVar->GetLclNum()))
1260-
{
1261-
LclVarDsc* varDsc = m_compiler->lvaGetDesc(lclNum);
1262-
if (varDsc->lvFieldCnt > 1)
1263-
{
1264-
m_compiler->lvaSetVarDoNotEnregister(
1265-
lclNum DEBUGARG(DoNotEnregisterReason::BlockOpRet));
1266-
}
1267-
}
1268-
}
1269-
1270-
EscapeValue(TopValue(0), node);
1271-
PopValue();
1272-
}
1273-
break;
1274-
12751243
case GT_CALL:
12761244
while (TopValue(0).Node() != node)
12771245
{

src/coreclr/jit/lower.cpp

Lines changed: 211 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -538,13 +538,14 @@ GenTree* Lowering::LowerNode(GenTree* node)
538538

539539
case GT_CAST:
540540
{
541-
GenTree* nextNode = LowerCast(node);
542-
#if defined(TARGET_XARCH)
543-
if (nextNode != nullptr)
541+
if (!TryRemoveCast(node->AsCast()))
544542
{
545-
return nextNode;
543+
GenTree* nextNode = LowerCast(node);
544+
if (nextNode != nullptr)
545+
{
546+
return nextNode;
547+
}
546548
}
547-
#endif // TARGET_XARCH
548549
}
549550
break;
550551

@@ -4752,20 +4753,7 @@ void Lowering::LowerRetFieldList(GenTreeOp* ret, GenTreeFieldList* fieldList)
47524753
return;
47534754
}
47544755

4755-
unsigned regIndex = 0;
4756-
for (GenTreeFieldList::Use& use : fieldList->Uses())
4757-
{
4758-
var_types regType = retDesc.GetReturnRegType(regIndex);
4759-
if (varTypeUsesIntReg(regType) != varTypeUsesIntReg(use.GetNode()))
4760-
{
4761-
GenTree* bitcast = comp->gtNewOperNode(GT_BITCAST, regType, use.GetNode());
4762-
BlockRange().InsertBefore(fieldList, bitcast);
4763-
use.SetNode(bitcast);
4764-
LowerNode(bitcast);
4765-
}
4766-
4767-
regIndex++;
4768-
}
4756+
LowerFieldListToFieldListOfRegisters(fieldList);
47694757
}
47704758

47714759
//----------------------------------------------------------------------------------------------
@@ -4777,54 +4765,196 @@ void Lowering::LowerRetFieldList(GenTreeOp* ret, GenTreeFieldList* fieldList)
47774765
// fieldList - The FIELD_LIST node
47784766
//
47794767
// Returns:
4780-
// True if the fields of the FIELD_LIST map cleanly to the ABI returned
4781-
// registers. Insertions of bitcasts may still be required.
4768+
// True if the fields of the FIELD_LIST are all direct insertions into the
4769+
// return registers.
47824770
//
47834771
bool Lowering::IsFieldListCompatibleWithReturn(GenTreeFieldList* fieldList)
47844772
{
47854773
JITDUMP("Checking if field list [%06u] is compatible with return ABI: ", Compiler::dspTreeID(fieldList));
47864774
const ReturnTypeDesc& retDesc = comp->compRetTypeDesc;
47874775
unsigned numRetRegs = retDesc.GetReturnRegCount();
4788-
unsigned regIndex = 0;
47894776

4790-
for (const GenTreeFieldList::Use& use : fieldList->Uses())
4777+
GenTreeFieldList::Use* use = fieldList->Uses().GetHead();
4778+
for (unsigned i = 0; i < numRetRegs; i++)
47914779
{
4792-
if (regIndex >= numRetRegs)
4793-
{
4794-
JITDUMP("it is not; too many fields\n");
4795-
return false;
4796-
}
4780+
unsigned regStart = retDesc.GetReturnFieldOffset(i);
4781+
var_types regType = retDesc.GetReturnRegType(i);
4782+
unsigned regEnd = regStart + genTypeSize(regType);
47974783

4798-
unsigned offset = retDesc.GetReturnFieldOffset(regIndex);
4799-
if (offset != use.GetOffset())
4784+
// TODO-CQ: Could just create a 0 for this.
4785+
if (use == nullptr)
48004786
{
4801-
JITDUMP("it is not; field %u register starts at offset %u, but field starts at offset %u\n", regIndex,
4802-
offset, use.GetOffset());
4787+
JITDUMP("it is not; register %u has no corresponding field\n", i);
48034788
return false;
48044789
}
48054790

4806-
var_types fieldType = genActualType(use.GetNode());
4807-
var_types regType = genActualType(retDesc.GetReturnRegType(regIndex));
4808-
if (genTypeSize(fieldType) != genTypeSize(regType))
4791+
do
48094792
{
4810-
JITDUMP("it is not; field %u register has type %s but field has type %s\n", regIndex, varTypeName(regType),
4811-
varTypeName(fieldType));
4812-
return false;
4813-
}
4793+
unsigned fieldStart = use->GetOffset();
48144794

4815-
regIndex++;
4795+
if (fieldStart < regStart)
4796+
{
4797+
// Not fully contained in a register.
4798+
// TODO-CQ: Could just remove these fields if they don't partially overlap with the next register.
4799+
JITDUMP("it is not; field [%06u] starts before register %u\n", Compiler::dspTreeID(use->GetNode()), i);
4800+
return false;
4801+
}
4802+
4803+
if (fieldStart >= regEnd)
4804+
{
4805+
break;
4806+
}
4807+
4808+
unsigned fieldEnd = fieldStart + genTypeSize(use->GetType());
4809+
if (fieldEnd > regEnd)
4810+
{
4811+
JITDUMP("it is not; field [%06u] ends after register %u\n", Compiler::dspTreeID(use->GetNode()), i);
4812+
return false;
4813+
}
4814+
4815+
// float -> float insertions are not yet supported
4816+
if (varTypeUsesFloatReg(use->GetNode()) && varTypeUsesFloatReg(regType) && (fieldStart != regStart))
4817+
{
4818+
JITDUMP("it is not; field [%06u] requires an insertion into register %u\n",
4819+
Compiler::dspTreeID(use->GetNode()), i);
4820+
return false;
4821+
}
4822+
4823+
use = use->GetNext();
4824+
} while (use != nullptr);
48164825
}
48174826

4818-
if (regIndex != numRetRegs)
4827+
if (use != nullptr)
48194828
{
4820-
JITDUMP("it is not; too few fields\n");
4829+
// TODO-CQ: Could just remove these fields.
4830+
JITDUMP("it is not; field [%06u] corresponds to no register\n", Compiler::dspTreeID(use->GetNode()));
48214831
return false;
48224832
}
48234833

48244834
JITDUMP("it is\n");
48254835
return true;
48264836
}
48274837

4838+
//----------------------------------------------------------------------------------------------
4839+
// LowerFieldListToFieldListOfRegisters:
4840+
// Lower the specified field list into one that is compatible with the return
4841+
// registers.
4842+
//
4843+
// Arguments:
4844+
// fieldList - The field list
4845+
//
4846+
void Lowering::LowerFieldListToFieldListOfRegisters(GenTreeFieldList* fieldList)
4847+
{
4848+
const ReturnTypeDesc& retDesc = comp->compRetTypeDesc;
4849+
unsigned numRegs = retDesc.GetReturnRegCount();
4850+
4851+
GenTreeFieldList::Use* use = fieldList->Uses().GetHead();
4852+
assert(fieldList->Uses().IsSorted());
4853+
4854+
for (unsigned i = 0; i < numRegs; i++)
4855+
{
4856+
unsigned regStart = retDesc.GetReturnFieldOffset(i);
4857+
var_types regType = genActualType(retDesc.GetReturnRegType(i));
4858+
unsigned regEnd = regStart + genTypeSize(regType);
4859+
4860+
GenTreeFieldList::Use* regEntry = use;
4861+
4862+
assert(use != nullptr);
4863+
4864+
GenTree* fieldListPrev = fieldList->gtPrev;
4865+
4866+
do
4867+
{
4868+
unsigned fieldStart = use->GetOffset();
4869+
4870+
assert(fieldStart >= regStart);
4871+
4872+
if (fieldStart >= regEnd)
4873+
{
4874+
break;
4875+
}
4876+
4877+
var_types fieldType = use->GetType();
4878+
GenTree* value = use->GetNode();
4879+
4880+
unsigned insertOffset = fieldStart - regStart;
4881+
GenTreeFieldList::Use* nextUse = use->GetNext();
4882+
4883+
// First ensure the value does not have upper bits set that
4884+
// interfere with the next field.
4885+
if ((nextUse != nullptr) && (nextUse->GetOffset() < regEnd) &&
4886+
(fieldStart + genTypeSize(genActualType(fieldType)) > nextUse->GetOffset()))
4887+
{
4888+
assert(varTypeIsSmall(fieldType));
4889+
// This value may interfere with the next field. Ensure that doesn't happen.
4890+
if (comp->fgCastNeeded(value, varTypeToUnsigned(fieldType)))
4891+
{
4892+
value = comp->gtNewCastNode(TYP_INT, value, true, varTypeToUnsigned(fieldType));
4893+
BlockRange().InsertBefore(fieldList, value);
4894+
}
4895+
}
4896+
4897+
// If this is a float -> int insertion, then we need the bitcast now.
4898+
if (varTypeUsesFloatReg(value) && varTypeUsesIntReg(regType))
4899+
{
4900+
assert((genTypeSize(value) == 4) || (genTypeSize(value) == 8));
4901+
var_types castType = genTypeSize(value) == 4 ? TYP_INT : TYP_LONG;
4902+
value = comp->gtNewBitCastNode(castType, value);
4903+
BlockRange().InsertBefore(fieldList, value);
4904+
}
4905+
4906+
if (insertOffset + genTypeSize(fieldType) > genTypeSize(genActualType(value)))
4907+
{
4908+
value = comp->gtNewCastNode(TYP_LONG, value, true, TYP_LONG);
4909+
BlockRange().InsertBefore(fieldList, value);
4910+
}
4911+
4912+
if (fieldStart != regStart)
4913+
{
4914+
GenTree* shiftAmount = comp->gtNewIconNode((ssize_t)insertOffset * BITS_PER_BYTE);
4915+
value = comp->gtNewOperNode(GT_LSH, genActualType(value), value, shiftAmount);
4916+
BlockRange().InsertBefore(fieldList, shiftAmount, value);
4917+
}
4918+
4919+
if (regEntry != use)
4920+
{
4921+
GenTree* prevValue = regEntry->GetNode();
4922+
if (genActualType(value) != genActualType(regEntry->GetNode()))
4923+
{
4924+
prevValue = comp->gtNewCastNode(TYP_LONG, prevValue, true, TYP_LONG);
4925+
BlockRange().InsertBefore(fieldList, prevValue);
4926+
regEntry->SetNode(prevValue);
4927+
}
4928+
4929+
value = comp->gtNewOperNode(GT_OR, genActualType(value), prevValue, value);
4930+
BlockRange().InsertBefore(fieldList, value);
4931+
4932+
// Remove this field from the FIELD_LIST.
4933+
regEntry->SetNext(use->GetNext());
4934+
}
4935+
4936+
regEntry->SetNode(value);
4937+
regEntry->SetType(genActualType(value));
4938+
use = regEntry->GetNext();
4939+
} while (use != nullptr);
4940+
4941+
assert(regEntry != nullptr);
4942+
if (varTypeUsesIntReg(regEntry->GetNode()) != varTypeUsesIntReg(regType))
4943+
{
4944+
GenTree* bitCast = comp->gtNewBitCastNode(regType, regEntry->GetNode());
4945+
BlockRange().InsertBefore(fieldList, bitCast);
4946+
regEntry->SetNode(bitCast);
4947+
}
4948+
4949+
if (fieldListPrev->gtNext != fieldList)
4950+
{
4951+
LowerRange(fieldListPrev->gtNext, fieldList->gtPrev);
4952+
}
4953+
}
4954+
4955+
assert(use == nullptr);
4956+
}
4957+
48284958
//----------------------------------------------------------------------------------------------
48294959
// LowerStoreLocCommon: platform independent part of local var or field store lowering.
48304960
//
@@ -8792,6 +8922,45 @@ void Lowering::ContainCheckRet(GenTreeUnOp* ret)
87928922
#endif // FEATURE_MULTIREG_RET
87938923
}
87948924

8925+
//------------------------------------------------------------------------
8926+
// TryRemoveCast:
8927+
// Try to remove a cast node by changing its operand.
8928+
//
8929+
// Arguments:
8930+
// node - Cast node
8931+
//
8932+
// Returns:
8933+
// True if the cast was removed.
8934+
//
8935+
bool Lowering::TryRemoveCast(GenTreeCast* node)
8936+
{
8937+
if (comp->opts.OptimizationDisabled())
8938+
{
8939+
return false;
8940+
}
8941+
8942+
if (node->gtOverflow())
8943+
{
8944+
return false;
8945+
}
8946+
8947+
GenTree* op = node->CastOp();
8948+
if (!op->OperIsConst())
8949+
{
8950+
return false;
8951+
}
8952+
8953+
GenTree* folded = comp->gtFoldExprConst(node);
8954+
assert(folded == node);
8955+
if (folded->OperIs(GT_CAST))
8956+
{
8957+
return false;
8958+
}
8959+
8960+
op->SetUnusedValue();
8961+
return true;
8962+
}
8963+
87958964
//------------------------------------------------------------------------
87968965
// TryRemoveBitCast:
87978966
// Try to remove a bitcast node by changing its operand.

0 commit comments

Comments
 (0)