Skip to content

Commit c861106

Browse files
authored
Unify unroll limits in a single entry point (#83274)
1 parent e1b1478 commit c861106

13 files changed

+95
-54
lines changed

src/coreclr/jit/codegenarm64.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3124,7 +3124,7 @@ void CodeGen::genLclHeap(GenTree* tree)
31243124

31253125
if (compiler->info.compInitMem)
31263126
{
3127-
if (amount <= LCLHEAP_UNROLL_LIMIT)
3127+
if (amount <= compiler->getUnrollThreshold(Compiler::UnrollKind::Memset))
31283128
{
31293129
// The following zeroes the last 16 bytes and probes the page containing [sp, #16] address.
31303130
// stp xzr, xzr, [sp, #-16]!

src/coreclr/jit/codegenloongarch64.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2765,7 +2765,7 @@ void CodeGen::genCodeForDivMod(GenTreeOp* tree)
27652765
// Generate code for InitBlk by performing a loop unroll
27662766
// Preconditions:
27672767
// a) Both the size and fill byte value are integer constants.
2768-
// b) The size of the struct to initialize is smaller than INITBLK_UNROLL_LIMIT bytes.
2768+
// b) The size of the struct to initialize is at most getUnrollThreshold(UnrollKind::Memset) bytes.
27692769
void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
27702770
{
27712771
assert(node->OperIs(GT_STORE_BLK));
@@ -6457,7 +6457,7 @@ void CodeGen::genCodeForCpBlkHelper(GenTreeBlk* cpBlkNode)
64576457
// None
64586458
//
64596459
// Assumption:
6460-
// The size argument of the CpBlk node is a constant and <= CPBLK_UNROLL_LIMIT bytes.
6460+
// The size argument of the CpBlk node is a constant and <= getUnrollThreshold(UnrollKind::Memcpy) bytes.
64616461
//
64626462
void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* cpBlkNode)
64636463
{

src/coreclr/jit/codegenxarch.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3613,7 +3613,8 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
36133613
}
36143614

36153615
unsigned loadSize = putArgNode->GetArgLoadSize();
3616-
assert(!src->GetLayout(compiler)->HasGCPtr() && (loadSize <= CPBLK_UNROLL_LIMIT));
3616+
assert(!src->GetLayout(compiler)->HasGCPtr() &&
3617+
(loadSize <= compiler->getUnrollThreshold(Compiler::UnrollKind::Memcpy)));
36173618

36183619
unsigned offset = 0;
36193620
regNumber xmmTmpReg = REG_NA;

src/coreclr/jit/compiler.h

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8953,6 +8953,75 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
89538953
#endif // FEATURE_SIMD
89548954

89558955
public:
8956+
enum UnrollKind
8957+
{
8958+
Memset, // Initializing memory with some value
8959+
Memcpy // Copying memory from src to dst
8960+
};
8961+
8962+
//------------------------------------------------------------------------
8963+
// getUnrollThreshold: Calculates the unrolling threshold for the given operation
8964+
//
8965+
// Arguments:
8966+
// type - kind of the operation (memset/memcpy)
8967+
// canUseSimd - whether it is allowed to use SIMD or not
8968+
//
8969+
// Return Value:
8970+
// The unrolling threshold for the given operation in bytes
8971+
//
8972+
unsigned int getUnrollThreshold(UnrollKind type, bool canUseSimd = true)
8973+
{
8974+
unsigned threshold = TARGET_POINTER_SIZE;
8975+
8976+
#if defined(FEATURE_SIMD)
8977+
if (canUseSimd)
8978+
{
8979+
threshold = maxSIMDStructBytes();
8980+
#if defined(TARGET_ARM64)
8981+
// ldp/stp instructions can load/store two 16-byte vectors at once, e.g.:
8982+
//
8983+
// ldp q0, q1, [x1]
8984+
// stp q0, q1, [x0]
8985+
//
8986+
threshold *= 2;
8987+
#elif defined(TARGET_XARCH)
8988+
// TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial
8989+
threshold = max(threshold, YMM_REGSIZE_BYTES);
8990+
#endif
8991+
}
8992+
#if defined(TARGET_XARCH)
8993+
else
8994+
{
8995+
// Compatibility with previous logic: we used to allow memset:128/memcpy:64
8996+
// on AMD64 (and 64/32 on x86) for cases where we don't use SIMD
8997+
// see https://github.com/dotnet/runtime/issues/83297
8998+
threshold *= 2;
8999+
}
9000+
#endif
9001+
#endif
9002+
9003+
if (type == UnrollKind::Memset)
9004+
{
9005+
// Typically, memset-like operations require less instructions than memcpy
9006+
threshold *= 2;
9007+
}
9008+
9009+
// Use 4 as a multiplier by default, thus, the final threshold will be:
9010+
//
9011+
// | arch | memset | memcpy |
9012+
// |-------------|--------|--------|
9013+
// | x86 avx512 | 512 | 256 | (TODO-XARCH-AVX512: ignored for now)
9014+
// | x86 avx | 256 | 128 |
9015+
// | x86 sse | 128 | 64 |
9016+
// | arm64 | 256 | 128 | ldp/stp (2x128bit)
9017+
// | arm | 32 | 16 | no SIMD support
9018+
// | loongarch64 | 64 | 32 | no SIMD support
9019+
//
9020+
// We might want to use a different multiplier for trully hot/cold blocks based on PGO data
9021+
//
9022+
return threshold * 4;
9023+
}
9024+
89569025
//------------------------------------------------------------------------
89579026
// largestEnregisterableStruct: The size in bytes of the largest struct that can be enregistered.
89589027
//

src/coreclr/jit/lowerarmarch.cpp

Lines changed: 3 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -527,18 +527,8 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
527527
blkNode->SetOper(GT_STORE_BLK);
528528
}
529529

530-
unsigned initBlockUnrollLimit = INITBLK_UNROLL_LIMIT;
531-
532-
#ifdef TARGET_ARM64
533-
if (isDstAddrLocal)
534-
{
535-
// Since dstAddr points to the stack CodeGen can use more optimal
536-
// quad-word store SIMD instructions for InitBlock.
537-
initBlockUnrollLimit = INITBLK_LCL_UNROLL_LIMIT;
538-
}
539-
#endif
540-
541-
if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= initBlockUnrollLimit) && src->OperIs(GT_CNS_INT))
530+
if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memset)) &&
531+
src->OperIs(GT_CNS_INT))
542532
{
543533
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
544534

@@ -608,17 +598,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
608598
}
609599
}
610600

611-
unsigned copyBlockUnrollLimit = CPBLK_UNROLL_LIMIT;
612-
613-
#ifdef TARGET_ARM64
614-
if (isSrcAddrLocal && isDstAddrLocal)
615-
{
616-
// Since both srcAddr and dstAddr point to the stack CodeGen can use more optimal
617-
// quad-word load and store SIMD instructions for CopyBlock.
618-
copyBlockUnrollLimit = CPBLK_LCL_UNROLL_LIMIT;
619-
}
620-
#endif
621-
601+
unsigned copyBlockUnrollLimit = comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy);
622602
if (blkNode->OperIs(GT_STORE_OBJ))
623603
{
624604
if (!blkNode->AsObj()->GetLayout()->HasGCPtr())

src/coreclr/jit/lowerloongarch64.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -295,7 +295,8 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
295295
blkNode->SetOper(GT_STORE_BLK);
296296
}
297297

298-
if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= INITBLK_UNROLL_LIMIT) && src->OperIs(GT_CNS_INT))
298+
if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= getUnrollThreshold(UnrollKind::Memset)) &&
299+
src->OperIs(GT_CNS_INT))
299300
{
300301
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
301302

@@ -353,7 +354,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
353354
{
354355
blkNode->SetOper(GT_STORE_BLK);
355356
}
356-
else if (dstAddr->OperIsLocalAddr() && (size <= CPBLK_UNROLL_LIMIT))
357+
else if (dstAddr->OperIsLocalAddr() && (size <= getUnrollThreshold(UnrollKind::Memcpy)))
357358
{
358359
// If the size is small enough to unroll then we need to mark the block as non-interruptible
359360
// to actually allow unrolling. The generated code does not report GC references loaded in the
@@ -371,7 +372,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
371372
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
372373
}
373374
////////////////////////////////////////////////////////////////////////////////////////////////////////
374-
else if (blkNode->OperIs(GT_STORE_BLK) && (size <= CPBLK_UNROLL_LIMIT))
375+
else if (blkNode->OperIs(GT_STORE_BLK) && (size <= getUnrollThreshold(UnrollKind::Memcpy)))
375376
{
376377
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
377378

src/coreclr/jit/lowerxarch.cpp

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
321321
blkNode->SetOper(GT_STORE_BLK);
322322
}
323323

324-
if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= INITBLK_UNROLL_LIMIT))
324+
if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memset)))
325325
{
326326
if (!src->OperIs(GT_CNS_INT))
327327
{
@@ -332,7 +332,6 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
332332
}
333333
else
334334
{
335-
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
336335

337336
// The fill value of an initblk is interpreted to hold a
338337
// value of (unsigned int8) however a constant of any size
@@ -357,6 +356,11 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
357356
{
358357
src->SetContained();
359358
}
359+
else if (size > comp->getUnrollThreshold(Compiler::UnrollKind::Memset, /*canUseSimd*/ false))
360+
{
361+
// It turns out we can't use SIMD so the default threshold is too big
362+
goto TOO_BIG_TO_UNROLL;
363+
}
360364
}
361365
}
362366
#ifdef TARGET_AMD64
@@ -371,13 +375,15 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
371375
fill *= 0x01010101;
372376
}
373377

378+
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
374379
src->AsIntCon()->SetIconValue(fill);
375380

376381
ContainBlockStoreAddress(blkNode, size, dstAddr, nullptr);
377382
}
378383
}
379384
else
380385
{
386+
TOO_BIG_TO_UNROLL:
381387
#ifdef TARGET_AMD64
382388
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper;
383389
#else
@@ -412,7 +418,8 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
412418
blkNode->SetOper(GT_STORE_BLK);
413419
}
414420
#ifndef JIT32_GCENCODER
415-
else if (dstAddr->OperIsLocalAddr() && (size <= CPBLK_UNROLL_LIMIT))
421+
else if (dstAddr->OperIsLocalAddr() &&
422+
(size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy, false)))
416423
{
417424
// If the size is small enough to unroll then we need to mark the block as non-interruptible
418425
// to actually allow unrolling. The generated code does not report GC references loaded in the
@@ -472,7 +479,8 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
472479
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
473480
}
474481
}
475-
else if (blkNode->OperIs(GT_STORE_BLK) && (size <= CPBLK_UNROLL_LIMIT))
482+
else if (blkNode->OperIs(GT_STORE_BLK) &&
483+
(size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy, !blkNode->GetLayout()->HasGCPtr())))
476484
{
477485
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
478486

@@ -655,7 +663,7 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
655663
}
656664
else
657665
#endif // TARGET_X86
658-
if (loadSize <= CPBLK_UNROLL_LIMIT)
666+
if (loadSize <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy))
659667
{
660668
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
661669
}

src/coreclr/jit/lsraarm64.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -591,7 +591,7 @@ int LinearScan::BuildNode(GenTree* tree)
591591
// localloc.
592592
sizeVal = AlignUp(sizeVal, STACK_ALIGN);
593593

594-
if (sizeVal <= LCLHEAP_UNROLL_LIMIT)
594+
if (sizeVal <= compiler->getUnrollThreshold(Compiler::UnrollKind::Memset))
595595
{
596596
// Need no internal registers
597597
}

src/coreclr/jit/targetamd64.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,6 @@
1313
#define ROUND_FLOAT 0 // Do not round intermed float expression results
1414
#define CPU_HAS_BYTE_REGS 0
1515

16-
#define CPBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll CpBlk.
17-
#define INITBLK_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll InitBlk.
1816
#define CPOBJ_NONGC_SLOTS_LIMIT 4 // For CpObj code generation, this is the threshold of the number
1917
// of contiguous non-gc slots that trigger generating rep movsq instead of
2018
// sequences of movsq instructions

src/coreclr/jit/targetarm.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,6 @@
1414
#define ROUND_FLOAT 0 // Do not round intermed float expression results
1515
#define CPU_HAS_BYTE_REGS 0
1616

17-
#define CPBLK_UNROLL_LIMIT 32 // Upper bound to let the code generator to loop unroll CpBlk.
18-
#define INITBLK_UNROLL_LIMIT 16 // Upper bound to let the code generator to loop unroll InitBlk.
19-
2017
#define FEATURE_FIXED_OUT_ARGS 1 // Preallocate the outgoing arg area in the prolog
2118
#define FEATURE_STRUCTPROMOTE 1 // JIT Optimization to promote fields of structs into registers
2219
#define FEATURE_MULTIREG_STRUCT_PROMOTE 0 // True when we want to promote fields of a multireg struct into registers

src/coreclr/jit/targetarm64.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,6 @@
1111
#define ROUND_FLOAT 0 // Do not round intermed float expression results
1212
#define CPU_HAS_BYTE_REGS 0
1313

14-
#define CPBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll CpBlk
15-
#define CPBLK_LCL_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll CpBlk (when both srcAddr and dstAddr point to the stack)
16-
#define INITBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll InitBlk
17-
#define INITBLK_LCL_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll InitBlk (when dstAddr points to the stack)
18-
#define LCLHEAP_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll LclHeap (when zeroing is required)
19-
2014
#ifdef FEATURE_SIMD
2115
#define ALIGN_SIMD_TYPES 1 // whether SIMD type locals are to be aligned
2216
#define FEATURE_PARTIAL_SIMD_CALLEE_SAVE 1 // Whether SIMD registers are partially saved at calls

src/coreclr/jit/targetloongarch64.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,6 @@
1616
#define ROUND_FLOAT 0 // Do not round intermed float expression results
1717
#define CPU_HAS_BYTE_REGS 0
1818

19-
#define CPBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll CpBlk.
20-
#define INITBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll InitBlk.
21-
2219
#ifdef FEATURE_SIMD
2320
#pragma error("SIMD Unimplemented yet LOONGARCH")
2421
#define ALIGN_SIMD_TYPES 1 // whether SIMD type locals are to be aligned

src/coreclr/jit/targetx86.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,6 @@
1111
#define ROUND_FLOAT 1 // round intermed float expression results
1212
#define CPU_HAS_BYTE_REGS 1
1313

14-
// TODO-CQ: Fine tune the following xxBlk threshold values:
15-
16-
#define CPBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll CpBlk.
17-
#define INITBLK_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll InitBlk.
1814
#define CPOBJ_NONGC_SLOTS_LIMIT 4 // For CpObj code generation, this is the threshold of the number
1915
// of contiguous non-gc slots that trigger generating rep movsq instead of
2016
// sequences of movsq instructions

0 commit comments

Comments
 (0)