Skip to content

Commit e13f0dc

Browse files
Optimize stackalloc zeroing via BLK (#83255)
Co-authored-by: SingleAccretion <[email protected]>
1 parent 3e6ad47 commit e13f0dc

File tree

6 files changed

+502
-118
lines changed

6 files changed

+502
-118
lines changed

src/coreclr/jit/codegenxarch.cpp

Lines changed: 26 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -2742,18 +2742,10 @@ void CodeGen::genLclHeap(GenTree* tree)
27422742

27432743
// compute the amount of memory to allocate to properly STACK_ALIGN.
27442744
size_t amount = 0;
2745-
if (size->IsCnsIntOrI())
2745+
if (size->IsCnsIntOrI() && size->isContained())
27462746
{
2747-
// If size is a constant, then it must be contained.
2748-
assert(size->isContained());
2749-
2750-
// If amount is zero then return null in targetReg
27512747
amount = size->AsIntCon()->gtIconVal;
2752-
if (amount == 0)
2753-
{
2754-
instGen_Set_Reg_To_Zero(EA_PTRSIZE, targetReg);
2755-
goto BAILOUT;
2756-
}
2748+
assert((amount > 0) && (amount <= UINT_MAX));
27572749

27582750
// 'amount' is the total number of bytes to localloc to properly STACK_ALIGN
27592751
amount = AlignUp(amount, STACK_ALIGN);
@@ -2848,77 +2840,44 @@ void CodeGen::genLclHeap(GenTree* tree)
28482840
goto ALLOC_DONE;
28492841
}
28502842

2851-
inst_RV_IV(INS_add, REG_SPBASE, compiler->lvaOutgoingArgSpaceSize, EA_PTRSIZE);
2852-
stackAdjustment += (target_size_t)compiler->lvaOutgoingArgSpaceSize;
2853-
locAllocStackOffset = stackAdjustment;
2843+
if (size->IsCnsIntOrI() && size->isContained())
2844+
{
2845+
stackAdjustment = 0;
2846+
locAllocStackOffset = (target_size_t)compiler->lvaOutgoingArgSpaceSize;
2847+
}
2848+
else
2849+
{
2850+
inst_RV_IV(INS_add, REG_SPBASE, compiler->lvaOutgoingArgSpaceSize, EA_PTRSIZE);
2851+
stackAdjustment += (target_size_t)compiler->lvaOutgoingArgSpaceSize;
2852+
locAllocStackOffset = stackAdjustment;
2853+
}
28542854
}
28552855
#endif
28562856

2857-
if (size->IsCnsIntOrI())
2857+
if (size->IsCnsIntOrI() && size->isContained())
28582858
{
28592859
// We should reach here only for non-zero, constant size allocations.
28602860
assert(amount > 0);
28612861
assert((amount % STACK_ALIGN) == 0);
2862-
assert((amount % REGSIZE_BYTES) == 0);
28632862

2864-
// For small allocations we will generate up to six push 0 inline
2865-
size_t cntRegSizedWords = amount / REGSIZE_BYTES;
2866-
if (compiler->info.compInitMem && (cntRegSizedWords <= 6))
2863+
// We should reach here only for non-zero, constant size allocations which we zero
2864+
// via BLK explicitly, so just bump the stack pointer.
2865+
if ((amount >= compiler->eeGetPageSize()) || (TARGET_POINTER_SIZE == 4))
28672866
{
2868-
for (; cntRegSizedWords != 0; cntRegSizedWords--)
2869-
{
2870-
inst_IV(INS_push_hide, 0); // push_hide means don't track the stack
2871-
}
2872-
2873-
lastTouchDelta = 0;
2874-
2875-
goto ALLOC_DONE;
2876-
}
2877-
2878-
#ifdef TARGET_X86
2879-
bool needRegCntRegister = true;
2880-
#else // !TARGET_X86
2881-
bool needRegCntRegister = initMemOrLargeAlloc;
2882-
#endif // !TARGET_X86
2883-
2884-
if (needRegCntRegister)
2885-
{
2886-
// If compInitMem=true, we can reuse targetReg as regcnt.
2887-
// Since size is a constant, regCnt is not yet initialized.
2888-
assert(regCnt == REG_NA);
2889-
if (compiler->info.compInitMem)
2890-
{
2891-
assert(tree->AvailableTempRegCount() == 0);
2892-
regCnt = targetReg;
2893-
}
2894-
else
2895-
{
2896-
regCnt = tree->GetSingleTempReg();
2897-
}
2867+
regCnt = tree->GetSingleTempReg();
2868+
instGen_Set_Reg_To_Imm(EA_PTRSIZE, regCnt, -(ssize_t)amount);
2869+
genStackPointerDynamicAdjustmentWithProbe(regCnt);
2870+
// lastTouchDelta is dynamic, and can be up to a page. So if we have outgoing arg space,
2871+
// we're going to assume the worst and probe.
28982872
}
2899-
2900-
if (!initMemOrLargeAlloc)
2873+
else
29012874
{
29022875
// Since the size is less than a page, and we don't need to zero init memory, simply adjust ESP.
2903-
// ESP might already be in the guard page, so we must touch it BEFORE
2904-
// the alloc, not after.
2905-
2906-
assert(amount < compiler->eeGetPageSize()); // must be < not <=
2876+
// ESP might already be in the guard page, so we must touch it BEFORE the alloc, not after.
29072877
lastTouchDelta = genStackPointerConstantAdjustmentLoopWithProbe(-(ssize_t)amount,
2908-
/* trackSpAdjustments */ regCnt == REG_NA);
2909-
goto ALLOC_DONE;
2878+
/* trackSpAdjustments */ true);
29102879
}
2911-
2912-
// else, "mov regCnt, amount"
2913-
2914-
if (compiler->info.compInitMem)
2915-
{
2916-
// When initializing memory, we want 'amount' to be the loop count.
2917-
assert((amount % STACK_ALIGN) == 0);
2918-
amount /= STACK_ALIGN;
2919-
}
2920-
2921-
instGen_Set_Reg_To_Imm(((size_t)(int)amount == amount) ? EA_4BYTE : EA_8BYTE, regCnt, amount);
2880+
goto ALLOC_DONE;
29222881
}
29232882

29242883
// We should not have any temp registers at this point.
@@ -2996,8 +2955,6 @@ void CodeGen::genLclHeap(GenTree* tree)
29962955
genDefineTempLabel(endLabel);
29972956
}
29982957

2999-
BAILOUT:
3000-
30012958
#ifdef JIT32_GCENCODER
30022959
if (compiler->lvaLocAllocSPvar != BAD_VAR_NUM)
30032960
{

src/coreclr/jit/lower.cpp

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -580,7 +580,7 @@ GenTree* Lowering::LowerNode(GenTree* node)
580580
break;
581581

582582
case GT_LCLHEAP:
583-
ContainCheckLclHeap(node->AsOp());
583+
LowerLclHeap(node);
584584
break;
585585

586586
#ifdef TARGET_XARCH
@@ -7992,6 +7992,70 @@ void Lowering::TransformUnusedIndirection(GenTreeIndir* ind, Compiler* comp, Bas
79927992
}
79937993
}
79947994

7995+
//------------------------------------------------------------------------
7996+
// LowerLclHeap: a common logic to lower LCLHEAP.
7997+
//
7998+
// Arguments:
7999+
// node - the LCLHEAP node we are lowering.
8000+
//
8001+
void Lowering::LowerLclHeap(GenTree* node)
8002+
{
8003+
assert(node->OperIs(GT_LCLHEAP));
8004+
8005+
#if defined(TARGET_XARCH)
8006+
if (node->gtGetOp1()->IsCnsIntOrI())
8007+
{
8008+
GenTreeIntCon* sizeNode = node->gtGetOp1()->AsIntCon();
8009+
ssize_t size = sizeNode->IconValue();
8010+
8011+
if (size == 0)
8012+
{
8013+
// Replace with null for LCLHEAP(0)
8014+
node->BashToZeroConst(TYP_I_IMPL);
8015+
BlockRange().Remove(sizeNode);
8016+
return;
8017+
}
8018+
8019+
if (comp->info.compInitMem)
8020+
{
8021+
ssize_t alignedSize = ALIGN_UP(size, STACK_ALIGN);
8022+
if ((size > UINT_MAX) || (alignedSize > UINT_MAX))
8023+
{
8024+
// Size is too big - don't mark sizeNode as contained
8025+
return;
8026+
}
8027+
8028+
LIR::Use use;
8029+
if (BlockRange().TryGetUse(node, &use))
8030+
{
8031+
// Align LCLHEAP size for more efficient zeroing via BLK
8032+
sizeNode->SetIconValue(alignedSize);
8033+
8034+
// Emit STORE_BLK to zero it
8035+
//
8036+
// * STORE_BLK struct<alignedSize> (init) (Unroll)
8037+
// +--* LCL_VAR long V01
8038+
// \--* CNS_INT int 0
8039+
//
8040+
GenTree* heapLcl = comp->gtNewLclvNode(use.ReplaceWithLclVar(comp), TYP_I_IMPL);
8041+
GenTree* zero = comp->gtNewIconNode(0);
8042+
GenTreeBlk* storeBlk = new (comp, GT_STORE_BLK)
8043+
GenTreeBlk(GT_STORE_BLK, TYP_STRUCT, heapLcl, zero, comp->typGetBlkLayout((unsigned)alignedSize));
8044+
storeBlk->gtFlags |= (GTF_IND_UNALIGNED | GTF_ASG | GTF_EXCEPT | GTF_GLOB_REF);
8045+
BlockRange().InsertAfter(use.Def(), heapLcl, zero, storeBlk);
8046+
LowerNode(storeBlk);
8047+
}
8048+
else
8049+
{
8050+
// Value is unused and we don't mark the size node as contained
8051+
return;
8052+
}
8053+
}
8054+
}
8055+
#endif
8056+
ContainCheckLclHeap(node->AsOp());
8057+
}
8058+
79958059
//------------------------------------------------------------------------
79968060
// LowerBlockStoreCommon: a common logic to lower STORE_OBJ/BLK/DYN_BLK.
79978061
//

src/coreclr/jit/lower.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,7 @@ class Lowering final : public Phase
315315
GenTree* LowerSignedDivOrMod(GenTree* node);
316316
void LowerBlockStore(GenTreeBlk* blkNode);
317317
void LowerBlockStoreCommon(GenTreeBlk* blkNode);
318+
void LowerLclHeap(GenTree* node);
318319
void ContainBlockStoreAddress(GenTreeBlk* blkNode, unsigned size, GenTree* addr, GenTree* addrParent);
319320
void LowerPutArgStkOrSplit(GenTreePutArgStk* putArgNode);
320321
#ifdef TARGET_XARCH

src/coreclr/jit/lsraxarch.cpp

Lines changed: 5 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1843,60 +1843,17 @@ int LinearScan::BuildLclHeap(GenTree* tree)
18431843
{
18441844
int srcCount = 1;
18451845

1846-
// Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp):
1847-
// Here '-' means don't care.
1848-
//
1849-
// Size? Init Memory? # temp regs
1850-
// 0 - 0 (returns 0)
1851-
// const and <=6 reg words - 0 (pushes '0')
1852-
// const and >6 reg words Yes 0 (pushes '0')
1853-
// const and <PageSize No 0 (amd64) 1 (x86)
1854-
//
1855-
// const and >=PageSize No 1 (regCnt)
1856-
// Non-const Yes 0 (regCnt=targetReg and pushes '0')
1857-
// Non-const No 1 (regCnt)
1858-
//
1859-
// Note: Here we don't need internal register to be different from targetReg.
1860-
// Rather, require it to be different from operand's reg.
1861-
18621846
GenTree* size = tree->gtGetOp1();
1863-
if (size->IsCnsIntOrI())
1847+
if (size->IsCnsIntOrI() && size->isContained())
18641848
{
1865-
assert(size->isContained());
18661849
srcCount = 0;
1867-
size_t sizeVal = size->AsIntCon()->gtIconVal;
1850+
size_t sizeVal = AlignUp((size_t)size->AsIntCon()->gtIconVal, STACK_ALIGN);
18681851

1869-
if (sizeVal == 0)
1852+
// Explicitly zeroed LCLHEAP also needs a regCnt in case of x86 or large page
1853+
if ((TARGET_POINTER_SIZE == 4) || (sizeVal >= compiler->eeGetPageSize()))
18701854
{
1871-
// For regCnt
18721855
buildInternalIntRegisterDefForNode(tree);
18731856
}
1874-
else
1875-
{
1876-
// Compute the amount of memory to properly STACK_ALIGN.
1877-
// Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size.
1878-
// This should also help in debugging as we can examine the original size specified with localloc.
1879-
sizeVal = AlignUp(sizeVal, STACK_ALIGN);
1880-
1881-
// For small allocations up to 6 pointer sized words (i.e. 48 bytes of localloc)
1882-
// we will generate 'push 0'.
1883-
assert((sizeVal % REGSIZE_BYTES) == 0);
1884-
1885-
if (!compiler->info.compInitMem)
1886-
{
1887-
#ifdef TARGET_X86
1888-
// x86 always needs regCnt.
1889-
// For regCnt
1890-
buildInternalIntRegisterDefForNode(tree);
1891-
#else // !TARGET_X86
1892-
if (sizeVal >= compiler->eeGetPageSize())
1893-
{
1894-
// For regCnt
1895-
buildInternalIntRegisterDefForNode(tree);
1896-
}
1897-
#endif // !TARGET_X86
1898-
}
1899-
}
19001857
}
19011858
else
19021859
{
@@ -1905,7 +1862,7 @@ int LinearScan::BuildLclHeap(GenTree* tree)
19051862
// For regCnt
19061863
buildInternalIntRegisterDefForNode(tree);
19071864
}
1908-
BuildUse(size);
1865+
BuildUse(size); // could be a non-contained constant
19091866
}
19101867
buildInternalRegisterUses();
19111868
BuildDef(tree);

0 commit comments

Comments
 (0)