Skip to content

Commit 81d039e

Browse files
Improve the handling of ToScalar and GetElement(0) (#86209)
* Improve the handling of ToScalar and GetElement(0)
* Fix build failure
* Ensure ToScalar handling is in EvalHWIntrinsicFunUnary, not FunBinary
* Ensure we don't regress codegen for small types
1 parent f452d9c commit 81d039e

9 files changed

+261
-85
lines changed

src/coreclr/jit/codegenxarch.cpp

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5405,6 +5405,9 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
54055405

54065406
switch (intrinsicId)
54075407
{
5408+
case NI_Vector128_ToScalar:
5409+
case NI_Vector256_ToScalar:
5410+
case NI_Vector512_ToScalar:
54085411
case NI_SSE2_ConvertToInt32:
54095412
case NI_SSE2_ConvertToUInt32:
54105413
case NI_SSE2_X64_ConvertToInt64:

src/coreclr/jit/decomposelongs.cpp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1702,11 +1702,15 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsic(LIR::Use& use)
17021702
case NI_Vector128_GetElement:
17031703
case NI_Vector256_GetElement:
17041704
case NI_Vector512_GetElement:
1705+
{
17051706
return DecomposeHWIntrinsicGetElement(use, hwintrinsicTree);
1707+
}
17061708

17071709
default:
1710+
{
17081711
noway_assert(!"unexpected GT_HWINTRINSIC node in long decomposition");
17091712
break;
1713+
}
17101714
}
17111715

17121716
return nullptr;

src/coreclr/jit/gentree.cpp

Lines changed: 45 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19067,6 +19067,9 @@ bool GenTree::isContainableHWIntrinsic() const
1906719067
}
1906819068

1906919069
case NI_Vector128_GetElement:
19070+
case NI_Vector128_ToScalar:
19071+
case NI_Vector256_ToScalar:
19072+
case NI_Vector512_ToScalar:
1907019073
case NI_SSE2_ConvertToInt32:
1907119074
case NI_SSE2_ConvertToUInt32:
1907219075
case NI_SSE2_X64_ConvertToInt64:
@@ -22057,27 +22060,56 @@ GenTree* Compiler::gtNewSimdGetElementNode(
2205722060
assert(varTypeIsArithmetic(simdBaseType));
2205822061

2205922062
#if defined(TARGET_XARCH)
22063+
bool useToScalar = op2->IsIntegralConst(0);
22064+
22065+
#if defined(TARGET_X86)
22066+
// We handle decomposition via GetElement for simplicity
22067+
useToScalar &= !varTypeIsLong(simdBaseType);
22068+
#endif // TARGET_X86
22069+
22070+
if (useToScalar)
22071+
{
22072+
intrinsicId = NI_Vector128_ToScalar;
22073+
22074+
if (simdSize == 64)
22075+
{
22076+
intrinsicId = NI_Vector512_ToScalar;
22077+
}
22078+
else if (simdSize == 32)
22079+
{
22080+
intrinsicId = NI_Vector256_ToScalar;
22081+
}
22082+
22083+
return gtNewSimdHWIntrinsicNode(type, op1, intrinsicId, simdBaseJitType, simdSize);
22084+
}
22085+
2206022086
switch (simdBaseType)
2206122087
{
22062-
// Using software fallback if simdBaseType is not supported by hardware
2206322088
case TYP_BYTE:
2206422089
case TYP_UBYTE:
2206522090
case TYP_INT:
2206622091
case TYP_UINT:
2206722092
case TYP_LONG:
2206822093
case TYP_ULONG:
22094+
{
22095+
// Using software fallback if simdBaseType is not supported by hardware
2206922096
assert(compIsaSupportedDebugOnly(InstructionSet_SSE41));
2207022097
break;
22098+
}
2207122099

2207222100
case TYP_DOUBLE:
2207322101
case TYP_FLOAT:
2207422102
case TYP_SHORT:
2207522103
case TYP_USHORT:
22104+
{
2207622105
assert(compIsaSupportedDebugOnly(InstructionSet_SSE2));
2207722106
break;
22107+
}
2207822108

2207922109
default:
22110+
{
2208022111
unreached();
22112+
}
2208122113
}
2208222114

2208322115
if (simdSize == 64)
@@ -22089,6 +22121,18 @@ GenTree* Compiler::gtNewSimdGetElementNode(
2208922121
intrinsicId = NI_Vector256_GetElement;
2209022122
}
2209122123
#elif defined(TARGET_ARM64)
22124+
if (op2->IsIntegralConst(0))
22125+
{
22126+
intrinsicId = NI_Vector128_ToScalar;
22127+
22128+
if (simdSize == 8)
22129+
{
22130+
intrinsicId = NI_Vector64_ToScalar;
22131+
}
22132+
22133+
return gtNewSimdHWIntrinsicNode(type, op1, intrinsicId, simdBaseJitType, simdSize);
22134+
}
22135+
2209222136
if (simdSize == 8)
2209322137
{
2209422138
intrinsicId = NI_Vector64_GetElement;

src/coreclr/jit/hwintrinsiccodegenxarch.cpp

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1276,14 +1276,20 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node)
12761276
case NI_Vector256_ToScalar:
12771277
case NI_Vector512_ToScalar:
12781278
{
1279-
assert(varTypeIsFloating(baseType));
1280-
12811279
if (op1->isContained() || op1->isUsedFromSpillTemp())
12821280
{
1281+
if (varTypeIsIntegral(baseType))
1282+
{
1283+
// We just want to emit a standard read from memory
1284+
ins = ins_Move_Extend(baseType, false);
1285+
attr = emitTypeSize(baseType);
1286+
}
12831287
genHWIntrinsic_R_RM(node, ins, attr, targetReg, op1);
12841288
}
12851289
else
12861290
{
1291+
assert(varTypeIsFloating(baseType));
1292+
12871293
// Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
12881294
emit->emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true);
12891295
}

src/coreclr/jit/hwintrinsiclistxarch.h

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -120,7 +120,7 @@ HARDWARE_INTRINSIC(Vector128, StoreAlignedNonTemporal,
120120
HARDWARE_INTRINSIC(Vector128, StoreUnsafe, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
121121
HARDWARE_INTRINSIC(Vector128, Subtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
122122
HARDWARE_INTRINSIC(Vector128, Sum, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
123-
HARDWARE_INTRINSIC(Vector128, ToScalar, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
123+
HARDWARE_INTRINSIC(Vector128, ToScalar, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
124124
HARDWARE_INTRINSIC(Vector128, ToVector256, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
125125
HARDWARE_INTRINSIC(Vector128, ToVector256Unsafe, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
126126
HARDWARE_INTRINSIC(Vector128, ToVector512, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
@@ -226,7 +226,7 @@ HARDWARE_INTRINSIC(Vector256, StoreAlignedNonTemporal,
226226
HARDWARE_INTRINSIC(Vector256, StoreUnsafe, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible)
227227
HARDWARE_INTRINSIC(Vector256, Subtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
228228
HARDWARE_INTRINSIC(Vector256, Sum, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
229-
HARDWARE_INTRINSIC(Vector256, ToScalar, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible)
229+
HARDWARE_INTRINSIC(Vector256, ToScalar, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible)
230230
HARDWARE_INTRINSIC(Vector256, ToVector512, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg)
231231
HARDWARE_INTRINSIC(Vector256, ToVector512Unsafe, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg)
232232
HARDWARE_INTRINSIC(Vector256, WidenLower, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
@@ -328,7 +328,7 @@ HARDWARE_INTRINSIC(Vector512, StoreAligned,
328328
HARDWARE_INTRINSIC(Vector512, StoreAlignedNonTemporal, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
329329
HARDWARE_INTRINSIC(Vector512, StoreUnsafe, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
330330
HARDWARE_INTRINSIC(Vector512, Subtract, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
331-
HARDWARE_INTRINSIC(Vector512, ToScalar, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg)
331+
HARDWARE_INTRINSIC(Vector512, ToScalar, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg)
332332
HARDWARE_INTRINSIC(Vector512, WidenLower, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
333333
HARDWARE_INTRINSIC(Vector512, WidenUpper, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
334334
HARDWARE_INTRINSIC(Vector512, WithElement, 64, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)

src/coreclr/jit/hwintrinsicxarch.cpp

Lines changed: 23 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1678,34 +1678,48 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
16781678
{
16791679
assert(sig->numArgs == 2);
16801680

1681+
op2 = impStackTop(0).val;
1682+
16811683
switch (simdBaseType)
16821684
{
1683-
// Using software fallback if simdBaseType is not supported by hardware
16841685
case TYP_BYTE:
16851686
case TYP_UBYTE:
16861687
case TYP_INT:
16871688
case TYP_UINT:
16881689
case TYP_LONG:
16891690
case TYP_ULONG:
1690-
if (!compExactlyDependsOn(InstructionSet_SSE41))
1691+
{
1692+
bool useToScalar = op2->IsIntegralConst(0);
1693+
1694+
#if defined(TARGET_X86)
1695+
useToScalar &= !varTypeIsLong(simdBaseType);
1696+
#endif // TARGET_X86
1697+
1698+
if (!useToScalar && !compExactlyDependsOn(InstructionSet_SSE41))
16911699
{
1700+
// Using software fallback if simdBaseType is not supported by hardware
16921701
return nullptr;
16931702
}
16941703
break;
1704+
}
16951705

16961706
case TYP_DOUBLE:
16971707
case TYP_FLOAT:
16981708
case TYP_SHORT:
16991709
case TYP_USHORT:
1710+
{
17001711
// short/ushort/float/double is supported by SSE2
17011712
break;
1713+
}
17021714

17031715
default:
1716+
{
17041717
unreached();
1718+
}
17051719
}
17061720

1707-
GenTree* op2 = impPopStack().val;
1708-
GenTree* op1 = impSIMDPopStack();
1721+
impPopStack();
1722+
op1 = impSIMDPopStack();
17091723

17101724
retNode = gtNewSimdGetElementNode(retType, op1, op2, simdBaseJitType, simdSize);
17111725
break;
@@ -2543,16 +2557,18 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
25432557
{
25442558
assert(sig->numArgs == 1);
25452559

2560+
op1 = impSIMDPopStack();
2561+
25462562
#if defined(TARGET_X86)
25472563
if (varTypeIsLong(simdBaseType))
25482564
{
2549-
// TODO-XARCH-CQ: It may be beneficial to decompose this operation
2565+
// Create a GetElement node which handles decomposition
2566+
op2 = gtNewIconNode(0);
2567+
retNode = gtNewSimdGetElementNode(retType, op1, op2, simdBaseJitType, simdSize);
25502568
break;
25512569
}
25522570
#endif // TARGET_X86
25532571

2554-
// TODO-XARCH-CQ: It may be beneficial to import this as GetElement(0)
2555-
op1 = impSIMDPopStack();
25562572
retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize);
25572573
break;
25582574
}

0 commit comments

Comments
 (0)