Skip to content

Commit f1a4cdd

Browse files
Expose various Convert intrinsics for Avx512F, Avx512BW, and Avx512DQ (#85281)
* Expose various Convert intrinsics for Avx512F and Avx512DQ
* Expose various integer conversion APIs for Avx512F and Avx512BW
* Ensure special instructions are handled in codegen
* Apply formatting patch
* Ensure the AVX512F_VL variant is picked for simdSize=16/32
* Ensure conversion instructions are handled in PERFSCORE
* Ensure instructions use the right tuple type
* Remove an invalid API and fix more PERFSCORE entries
* Resolve additional failures masked by #85056
* Ensure TieredCompilation=0 is also passing
* Apply formatting patch
* Fixing some more test edge cases
* Ensure uint64->double and uint64->float mask the input
1 parent 5b9848f commit f1a4cdd

20 files changed

+2963
-302
lines changed

src/coreclr/jit/codegenxarch.cpp

Lines changed: 35 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -5658,18 +5658,47 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
56585658
break;
56595659
}
56605660

5661+
case NI_AVX512F_ConvertToVector256Int32:
5662+
case NI_AVX512F_ConvertToVector256UInt32:
5663+
case NI_AVX512F_VL_ConvertToVector128UInt32:
5664+
case NI_AVX512F_VL_ConvertToVector128UInt32WithSaturation:
5665+
{
5666+
assert(!varTypeIsFloating(baseType));
5667+
FALLTHROUGH;
5668+
}
5669+
5670+
case NI_AVX512F_ConvertToVector128Byte:
5671+
case NI_AVX512F_ConvertToVector128ByteWithSaturation:
56615672
case NI_AVX512F_ConvertToVector128Int16:
5662-
case NI_AVX512F_ConvertToVector128Int32:
5673+
case NI_AVX512F_ConvertToVector128Int16WithSaturation:
5674+
case NI_AVX512F_ConvertToVector128SByte:
5675+
case NI_AVX512F_ConvertToVector128SByteWithSaturation:
56635676
case NI_AVX512F_ConvertToVector128UInt16:
5664-
case NI_AVX512F_ConvertToVector128UInt32:
5677+
case NI_AVX512F_ConvertToVector128UInt16WithSaturation:
56655678
case NI_AVX512F_ConvertToVector256Int16:
5666-
case NI_AVX512F_ConvertToVector256Int32:
5679+
case NI_AVX512F_ConvertToVector256Int16WithSaturation:
5680+
case NI_AVX512F_ConvertToVector256Int32WithSaturation:
56675681
case NI_AVX512F_ConvertToVector256UInt16:
5668-
case NI_AVX512F_ConvertToVector256UInt32:
5669-
case NI_AVX512BW_ConvertToVector128Byte:
5670-
case NI_AVX512BW_ConvertToVector128SByte:
5682+
case NI_AVX512F_ConvertToVector256UInt16WithSaturation:
5683+
case NI_AVX512F_ConvertToVector256UInt32WithSaturation:
5684+
case NI_AVX512F_VL_ConvertToVector128Byte:
5685+
case NI_AVX512F_VL_ConvertToVector128ByteWithSaturation:
5686+
case NI_AVX512F_VL_ConvertToVector128Int16:
5687+
case NI_AVX512F_VL_ConvertToVector128Int16WithSaturation:
5688+
case NI_AVX512F_VL_ConvertToVector128Int32:
5689+
case NI_AVX512F_VL_ConvertToVector128Int32WithSaturation:
5690+
case NI_AVX512F_VL_ConvertToVector128SByte:
5691+
case NI_AVX512F_VL_ConvertToVector128SByteWithSaturation:
5692+
case NI_AVX512F_VL_ConvertToVector128UInt16:
5693+
case NI_AVX512F_VL_ConvertToVector128UInt16WithSaturation:
56715694
case NI_AVX512BW_ConvertToVector256Byte:
5695+
case NI_AVX512BW_ConvertToVector256ByteWithSaturation:
56725696
case NI_AVX512BW_ConvertToVector256SByte:
5697+
case NI_AVX512BW_ConvertToVector256SByteWithSaturation:
5698+
case NI_AVX512BW_VL_ConvertToVector128Byte:
5699+
case NI_AVX512BW_VL_ConvertToVector128ByteWithSaturation:
5700+
case NI_AVX512BW_VL_ConvertToVector128SByte:
5701+
case NI_AVX512BW_VL_ConvertToVector128SByteWithSaturation:
56735702
{
56745703
// These intrinsics are "ins reg/mem, xmm"
56755704
ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

src/coreclr/jit/emit.h

Lines changed: 61 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1905,7 +1905,7 @@ class emitter
19051905
ssize_t emitGetInsCIdisp(instrDesc* id);
19061906
unsigned emitGetInsCIargs(instrDesc* id);
19071907

1908-
inline static emitAttr emitGetMemOpSize(instrDesc* id);
1908+
inline emitAttr emitGetMemOpSize(instrDesc* id) const;
19091909

19101910
// Return the argument count for a direct call "id".
19111911
int emitGetInsCDinfo(instrDesc* id);
@@ -3456,11 +3456,12 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id)
34563456
// Arguments:
34573457
// id - Instruction descriptor
34583458
//
3459-
/* static */ emitAttr emitter::emitGetMemOpSize(instrDesc* id)
3459+
emitAttr emitter::emitGetMemOpSize(instrDesc* id) const
34603460
{
3461-
emitAttr defaultSize = id->idOpSize();
3461+
emitAttr defaultSize = id->idOpSize();
3462+
instruction ins = id->idIns();
34623463

3463-
switch (id->idIns())
3464+
switch (ins)
34643465
{
34653466
case INS_pextrb:
34663467
case INS_pinsrb:
@@ -3570,9 +3571,6 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id)
35703571

35713572
case INS_cvtdq2pd:
35723573
case INS_cvtps2pd:
3573-
case INS_vpmovdw:
3574-
case INS_vpmovqd:
3575-
case INS_vpmovwb:
35763574
{
35773575
if (defaultSize == 64)
35783576
{
@@ -3589,6 +3587,57 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id)
35893587
}
35903588
}
35913589

3590+
case INS_vpmovdb:
3591+
case INS_vpmovdw:
3592+
case INS_vpmovqb:
3593+
case INS_vpmovqd:
3594+
case INS_vpmovqw:
3595+
case INS_vpmovwb:
3596+
case INS_vpmovsdb:
3597+
case INS_vpmovsdw:
3598+
case INS_vpmovsqb:
3599+
case INS_vpmovsqd:
3600+
case INS_vpmovsqw:
3601+
case INS_vpmovswb:
3602+
case INS_vpmovusdb:
3603+
case INS_vpmovusdw:
3604+
case INS_vpmovusqb:
3605+
case INS_vpmovusqd:
3606+
case INS_vpmovusqw:
3607+
case INS_vpmovuswb:
3608+
{
3609+
insTupleType tupleType = insTupleTypeInfo(ins);
3610+
unsigned memSize = 0;
3611+
3612+
switch (tupleType)
3613+
{
3614+
case INS_TT_HALF_MEM:
3615+
{
3616+
memSize = defaultSize / 2;
3617+
break;
3618+
}
3619+
3620+
case INS_TT_QUARTER_MEM:
3621+
{
3622+
memSize = defaultSize / 4;
3623+
break;
3624+
}
3625+
3626+
case INS_TT_EIGHTH_MEM:
3627+
{
3628+
memSize = defaultSize / 8;
3629+
break;
3630+
}
3631+
3632+
default:
3633+
{
3634+
unreached();
3635+
}
3636+
}
3637+
3638+
return EA_ATTR(memSize);
3639+
}
3640+
35923641
case INS_vbroadcastf128:
35933642
case INS_vbroadcasti128:
35943643
case INS_vextractf128:
@@ -3613,7 +3662,11 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id)
36133662

36143663
case INS_movddup:
36153664
{
3616-
if (defaultSize == 32)
3665+
if (defaultSize == 64)
3666+
{
3667+
return EA_64BYTE;
3668+
}
3669+
else if (defaultSize == 32)
36173670
{
36183671
return EA_32BYTE;
36193672
}

src/coreclr/jit/emitxarch.cpp

Lines changed: 88 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1362,6 +1362,10 @@ bool emitter::TakesRexWPrefix(const instrDesc* id) const
13621362
case INS_shlx:
13631363
case INS_shrx:
13641364
#endif // TARGET_AMD64
1365+
case INS_vcvtsd2usi:
1366+
case INS_vcvtss2usi:
1367+
case INS_vcvttsd2usi:
1368+
case INS_vcvttss2usi:
13651369
{
13661370
if (attr == EA_8BYTE)
13671371
{
@@ -2582,6 +2586,10 @@ bool emitter::emitInsCanOnlyWriteSSE2OrAVXReg(instrDesc* id)
25822586
case INS_sarx:
25832587
case INS_shrx:
25842588
#endif
2589+
case INS_vcvtsd2usi:
2590+
case INS_vcvtss2usi:
2591+
case INS_vcvttsd2usi:
2592+
case INS_vcvttss2usi:
25852593
{
25862594
// These SSE instructions write to a general purpose integer register.
25872595
return false;
@@ -3010,7 +3018,7 @@ inline bool hasTupleTypeInfo(instruction ins)
30103018
// Return Value:
30113019
// the tuple type info for a given CPU instruction.
30123020
//
3013-
inline insTupleType insTupleTypeInfo(instruction ins)
3021+
insTupleType emitter::insTupleTypeInfo(instruction ins) const
30143022
{
30153023
assert((unsigned)ins < ArrLen(insTupleTypeInfos));
30163024
assert(insTupleTypeInfos[ins] != INS_TT_NONE);
@@ -3020,9 +3028,9 @@ inline insTupleType insTupleTypeInfo(instruction ins)
30203028
// Return true if the instruction uses the SSE38 or SSE3A macro in instrsXArch.h.
30213029
bool emitter::EncodedBySSE38orSSE3A(instruction ins) const
30223030
{
3023-
const size_t SSE38 = 0x0F660038;
3024-
const size_t SSE3A = 0x0F66003A;
3025-
const size_t MASK = 0xFFFF00FF;
3031+
const size_t SSE38 = 0x0F000038;
3032+
const size_t SSE3A = 0x0F00003A;
3033+
const size_t MASK = 0xFF0000FF;
30263034

30273035
size_t insCode = 0;
30283036

@@ -3044,8 +3052,19 @@ bool emitter::EncodedBySSE38orSSE3A(instruction ins) const
30443052
insCode = insCodeMR(ins);
30453053
}
30463054

3047-
insCode &= MASK;
3048-
return insCode == SSE38 || insCode == SSE3A;
3055+
size_t mskCode = insCode & MASK;
3056+
3057+
if ((mskCode != SSE38) && (mskCode != SSE3A))
3058+
{
3059+
return false;
3060+
}
3061+
3062+
#if defined(DEBUG)
3063+
insCode = (insCode >> 16) & 0xFF;
3064+
assert((insCode == 0x66) || (insCode == 0xF2) || (insCode == 0xF3));
3065+
#endif // DEBUG
3066+
3067+
return true;
30493068
}
30503069

30513070
/*****************************************************************************
@@ -11214,6 +11233,10 @@ void emitter::emitDispIns(
1121411233
case INS_cvtss2si:
1121511234
case INS_cvtsd2si:
1121611235
case INS_cvttss2si:
11236+
case INS_vcvtsd2usi:
11237+
case INS_vcvtss2usi:
11238+
case INS_vcvttsd2usi:
11239+
case INS_vcvttss2usi:
1121711240
{
1121811241
printf(" %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), EA_16BYTE));
1121911242
break;
@@ -15528,9 +15551,9 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI
1552815551
disp8Compression = inputSize * 4;
1552915552
break;
1553015553
case INS_TT_TUPLE8:
15531-
// N = input size in bytes * 4, 32bit for 512 only
15554+
// N = input size in bytes * 8, 32bit for 512 only
1553215555
assert((inputSize == 4 && vectorLength >= 64));
15533-
disp8Compression = inputSize * 4;
15556+
disp8Compression = inputSize * 8;
1553415557
break;
1553515558
case INS_TT_HALF_MEM:
1553615559
// N = vector length in bytes / 2
@@ -17825,11 +17848,39 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
1782517848
case INS_cvttps2dq:
1782617849
case INS_cvtps2dq:
1782717850
case INS_cvtdq2ps:
17851+
case INS_vcvtpd2qq:
17852+
case INS_vcvtpd2uqq:
17853+
case INS_vcvtps2udq:
17854+
case INS_vcvtqq2pd:
17855+
case INS_vcvttps2udq:
17856+
case INS_vcvtudq2ps:
17857+
case INS_vcvttpd2qq:
17858+
case INS_vcvttpd2uqq:
17859+
case INS_vcvtuqq2pd:
17860+
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
17861+
result.insLatency += PERFSCORE_LATENCY_4C;
17862+
break;
17863+
17864+
case INS_vpmovdb:
1782817865
case INS_vpmovdw:
17866+
case INS_vpmovqb:
1782917867
case INS_vpmovqd:
17868+
case INS_vpmovqw:
17869+
case INS_vpmovsdb:
17870+
case INS_vpmovsdw:
17871+
case INS_vpmovsqb:
17872+
case INS_vpmovsqd:
17873+
case INS_vpmovsqw:
17874+
case INS_vpmovswb:
17875+
case INS_vpmovusdb:
17876+
case INS_vpmovusdw:
17877+
case INS_vpmovusqb:
17878+
case INS_vpmovusqd:
17879+
case INS_vpmovusqw:
17880+
case INS_vpmovuswb:
1783017881
case INS_vpmovwb:
17831-
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
17832-
result.insLatency += PERFSCORE_LATENCY_4C;
17882+
result.insThroughput = PERFSCORE_THROUGHPUT_2C;
17883+
result.insLatency += (opSize == EA_16BYTE) ? PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_4C;
1783317884
break;
1783417885

1783517886
case INS_haddps:
@@ -17892,12 +17943,20 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
1789217943
case INS_cvtsi2ss32:
1789317944
case INS_cvtsi2sd64:
1789417945
case INS_cvtsi2ss64:
17946+
case INS_vcvtsd2usi:
17947+
case INS_vcvttsd2usi:
17948+
case INS_vcvtusi2sd32:
17949+
case INS_vcvtusi2sd64:
17950+
case INS_vcvtusi2ss32:
17951+
case INS_vcvtusi2ss64:
1789517952
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
1789617953
result.insLatency += PERFSCORE_LATENCY_7C;
1789717954
break;
1789817955

1789917956
case INS_cvttss2si:
1790017957
case INS_cvtss2si:
17958+
case INS_vcvtss2usi:
17959+
case INS_vcvttss2usi:
1790117960
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
1790217961
result.insLatency += opSize == EA_8BYTE ? PERFSCORE_LATENCY_8C : PERFSCORE_LATENCY_7C;
1790317962
break;
@@ -18241,6 +18300,15 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
1824118300
case INS_cvtdq2pd:
1824218301
case INS_cvtpd2ps:
1824318302
case INS_cvttpd2dq:
18303+
case INS_vcvtpd2udq:
18304+
case INS_vcvtps2qq:
18305+
case INS_vcvtps2uqq:
18306+
case INS_vcvtqq2ps:
18307+
case INS_vcvttpd2udq:
18308+
case INS_vcvttps2qq:
18309+
case INS_vcvttps2uqq:
18310+
case INS_vcvtudq2pd:
18311+
case INS_vcvtuqq2ps:
1824418312
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
1824518313
result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_7C : PERFSCORE_LATENCY_5C;
1824618314
break;
@@ -18282,17 +18350,25 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
1828218350
case INS_vpbroadcastq_gpr:
1828318351
case INS_vbroadcasti128:
1828418352
case INS_vbroadcastf128:
18353+
case INS_vbroadcastf64x2:
18354+
case INS_vbroadcasti64x2:
18355+
case INS_vbroadcastf64x4:
18356+
case INS_vbroadcasti64x4:
18357+
case INS_vbroadcastf32x2:
18358+
case INS_vbroadcasti32x2:
18359+
case INS_vbroadcastf32x8:
18360+
case INS_vbroadcasti32x8:
1828518361
case INS_vbroadcastss:
1828618362
case INS_vbroadcastsd:
1828718363
if (memAccessKind == PERFSCORE_MEMORY_NONE)
1828818364
{
1828918365
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
18290-
result.insLatency = opSize == EA_32BYTE ? PERFSCORE_LATENCY_3C : PERFSCORE_LATENCY_1C;
18366+
result.insLatency = opSize == EA_16BYTE ? PERFSCORE_LATENCY_1C : PERFSCORE_LATENCY_3C;
1829118367
}
1829218368
else
1829318369
{
1829418370
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
18295-
result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_3C : PERFSCORE_LATENCY_2C;
18371+
result.insLatency += opSize == EA_16BYTE ? PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_3C;
1829618372
if (ins == INS_vpbroadcastb || ins == INS_vpbroadcastw)
1829718373
{
1829818374
result.insLatency += PERFSCORE_LATENCY_1C;

src/coreclr/jit/emitxarch.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -183,6 +183,8 @@ code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr
183183
return code;
184184
}
185185

186+
insTupleType insTupleTypeInfo(instruction ins) const;
187+
186188
//------------------------------------------------------------------------
187189
// HasKMaskRegisterDest: Temporary check to identify instructions that can
188190
// be Evex encoded but require Opmask(KMask) register support.

0 commit comments

Comments
 (0)