Skip to content

Commit b3b53fe

Browse files
committed
[AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions to make use of
64-bit wide instructions

Make use of s_or_b64/s_and_b64/s_xor_b64 for v2i32. Legalising these causes a number of test regressions, so extra work in the combiner and TableGen patterns was necessary.
- Use Custom lowering for v2i32 rotr instead of additional patterns. Modify performOrCombine() to remove some identity or operations.
- Fix the rotr regression by adding lowerROTR() on the legalizer codepath.
- Add a test case to rotr.ll.
- Extend performFNegCombine() for the SELECT case.
- Modify performSelectCombine() and foldFreeOpFromSelect() to prevent the performFNegCombine() changes from being unwound.
- Add cases to or.ll and xor.ll to demonstrate the generation of the s_or_b64 and s_xor_b64 instructions for the v2i32 cases. Previously this was inhibited by "-amdgpu-scalarize-global-loads=false".
1 parent 034eaed commit b3b53fe

18 files changed

+1728
-222
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4654,8 +4654,27 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
46544654
if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
46554655
return SDValue();
46564656

4657-
return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4658-
SDLoc(N), Cond, LHS, RHS);
4657+
// select c, (fneg (f32 bitcast i32 x)), (fneg (f32 bitcast i32 y)) can be
4658+
// lowered directly to a V_CNDMASK_B32. So prevent the fneg from being pulled
4659+
// out in this case. For now I've made the logic as specific to the case as
4660+
// possible, hopefully this can be relaxed in future.
4661+
if (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG) {
4662+
SDValue LHSB = LHS.getOperand(0);
4663+
SDValue RHSB = RHS.getOperand(0);
4664+
if (LHSB.getOpcode() == ISD::BITCAST &&
4665+
RHSB->getOpcode() == ISD::BITCAST) {
4666+
EVT LHSBOpTy = LHSB->getOperand(0).getValueType();
4667+
EVT RHSBOpTy = RHSB->getOperand(0).getValueType();
4668+
if (LHSB.getValueType() == MVT::f32 &&
4669+
RHSB.getValueType() == MVT::f32 && LHSBOpTy == MVT::i32 &&
4670+
RHSBOpTy == MVT::i32) {
4671+
return SDValue();
4672+
}
4673+
}
4674+
}
4675+
4676+
return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS,
4677+
RHS);
46594678
}
46604679

46614680
bool Inv = false;
@@ -4708,8 +4727,8 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
47084727
if (Inv)
47094728
std::swap(NewLHS, NewRHS);
47104729

4711-
SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4712-
Cond, NewLHS, NewRHS);
4730+
SDValue NewSelect =
4731+
DAG.getNode(ISD::SELECT, SL, VT, Cond, NewLHS, NewRHS);
47134732
DCI.AddToWorklist(NewSelect.getNode());
47144733
return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
47154734
}
@@ -5047,8 +5066,25 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
50475066
}
50485067
case ISD::SELECT: {
50495068
// fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5069+
// This combine became necessary recently to prevent a regression in
5070+
// fneg-modifier-casting.ll caused by this patch legalising v2i32 xor.
5071+
// Specifically, additional instructions were added to the final codegen.
5072+
// When adding this combine a case was added to foldFreeOpFromSelect to
5073+
// prevent this combine from being undone under certain conditions.
50505074
// TODO: Invert conditions of foldFreeOpFromSelect
5051-
return SDValue();
5075+
SDValue Cond = N0.getOperand(0);
5076+
SDValue LHS = N0.getOperand(1);
5077+
SDValue RHS = N0.getOperand(2);
5078+
EVT LHVT = LHS.getValueType();
5079+
EVT RHVT = RHS.getValueType();
5080+
// The regression was limited to i32 and v2i32.
5081+
if (RHVT != MVT::i32 && LHVT != MVT::i32)
5082+
return SDValue();
5083+
5084+
SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, LHVT, LHS);
5085+
SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, RHVT, RHS);
5086+
SDValue Op = DAG.getNode(Opc, SL, LHVT, Cond, LFNeg, RFNeg);
5087+
return Op;
50525088
}
50535089
case ISD::BITCAST: {
50545090
SDLoc SL(N);

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 95 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
430430
setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
431431
}
432432

433+
setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, MVT::v2i32, Legal);
434+
// Prevent SELECT v2i32 from being implemented with the above bitwise ops and
435+
// instead lower to cndmask in SITargetLowering::LowerSELECT().
436+
setOperationAction(ISD::SELECT, MVT::v2i32, Custom);
437+
// Enable MatchRotate to produce ISD::ROTR, which is later transformed to
438+
// alignbit.
439+
setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
440+
433441
setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
434442
Custom);
435443

@@ -5929,6 +5937,19 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
59295937
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
59305938
}
59315939

5940+
// Enable lowering of ROTR for vxi32 types. This is a workaround for a
5941+
// regression in rotr.ll, whereby extra unnecessary instructions were added to
5942+
// the final codegen caused by legalising v2i32 or.
5943+
SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
5944+
[[maybe_unused]] EVT VT = Op.getValueType();
5945+
5946+
assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
5947+
VT == MVT::v16i32) &&
5948+
"Unexpected ValueType.");
5949+
5950+
return DAG.UnrollVectorOp(Op.getNode());
5951+
}
5952+
59325953
// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
59335954
// wider vector type is legal.
59345955
SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
@@ -6115,6 +6136,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
61156136
return lowerGET_FPENV(Op, DAG);
61166137
case ISD::SET_FPENV:
61176138
return lowerSET_FPENV(Op, DAG);
6139+
case ISD::ROTR:
6140+
return lowerROTR(Op, DAG);
61186141
}
61196142
return SDValue();
61206143
}
@@ -12872,6 +12895,50 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
1287212895
}
1287312896
}
1287412897

12898+
// Detect identity v2i32 OR and replace with identity source node.
12899+
// Specifically an Or that has operands constructed from the same source node
12900+
// via extract_vector_elt and build_vector. I.E.
12901+
// v2i32 or(
12902+
// v2i32 build_vector(
12903+
// i32 extract_elt(%IdentitySrc, 0),
12904+
// i32 0
12905+
// ),
12906+
// v2i32 build_vector(
12907+
// i32 0,
12908+
// i32 extract_elt(%IdentitySrc, 1)
12909+
// )
12910+
// )
12911+
// =>
12912+
// v2i32 %IdentitySrc
12913+
if (VT == MVT::v2i32) {
12914+
if (LHS->getOpcode() == ISD::BUILD_VECTOR &&
12915+
RHS->getOpcode() == ISD::BUILD_VECTOR) {
12916+
12917+
if (auto *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1)))
12918+
if (auto *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0))) {
12919+
12920+
// Test for and normalise build vectors.
12921+
if (LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
12922+
12923+
// Get the extract_vector_element operands.
12924+
SDValue LEVE = LHS->getOperand(0);
12925+
SDValue REVE = RHS->getOperand(1);
12926+
12927+
if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
12928+
REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
12929+
// Check that different elements from the same vector are
12930+
// extracted.
12931+
if (LEVE->getOperand(0) == REVE->getOperand(0) &&
12932+
LEVE->getOperand(1) != REVE->getOperand(1)) {
12933+
SDValue IdentitySrc = LEVE.getOperand(0);
12934+
return IdentitySrc;
12935+
}
12936+
}
12937+
}
12938+
}
12939+
}
12940+
}
12941+
1287512942
if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
1287612943
return SDValue();
1287712944

@@ -12916,13 +12983,39 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
1291612983
if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
1291712984
return RV;
1291812985

12986+
SelectionDAG &DAG = DCI.DAG;
12987+
EVT VT = N->getValueType(0);
1291912988
SDValue LHS = N->getOperand(0);
1292012989
SDValue RHS = N->getOperand(1);
1292112990

12991+
if (VT == MVT::v2i32 && LHS.getNumOperands() > 1) {
12992+
12993+
const ConstantSDNode *CRHS_0 = dyn_cast<ConstantSDNode>(RHS.getOperand(0));
12994+
const ConstantSDNode *CRHS_1 = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12995+
SDValue LHS_0 = LHS.getOperand(0);
12996+
SDValue LHS_1 = LHS.getOperand(1);
12997+
12998+
if (LHS.getOpcode() == ISD::VSELECT) {
12999+
if (CRHS_0 && CRHS_0->getAPIntValue().isSignMask() &&
13000+
shouldFoldFNegIntoSrc(N, LHS_0))
13001+
if (CRHS_1 && CRHS_1->getAPIntValue().isSignMask() &&
13002+
shouldFoldFNegIntoSrc(N, LHS_1)) {
13003+
SDLoc DL(N);
13004+
SDValue CastLHS =
13005+
DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(1));
13006+
SDValue CastRHS =
13007+
DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(2));
13008+
SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastLHS);
13009+
SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastRHS);
13010+
SDValue NewSelect = DAG.getNode(ISD::VSELECT, DL, MVT::v2f32,
13011+
LHS->getOperand(0), FNegLHS, FNegRHS);
13012+
return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
13013+
}
13014+
}
13015+
}
13016+
1292213017
const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12923-
SelectionDAG &DAG = DCI.DAG;
1292413018

12925-
EVT VT = N->getValueType(0);
1292613019
if (CRHS && VT == MVT::i64) {
1292713020
if (SDValue Split =
1292813021
splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
437437
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
438438
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const;
439439
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
440+
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const;
440441

441442
Register getRegisterByName(const char* RegName, LLT VT,
442443
const MachineFunction &MF) const override;

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 21 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1793,7 +1793,6 @@ def : GCNPat <
17931793
>;
17941794
}
17951795

1796-
17971796
/********** ================================ **********/
17981797
/********** Floating point absolute/negative **********/
17991798
/********** ================================ **********/
@@ -2334,9 +2333,9 @@ def : AMDGPUPatIgnoreCopies <
23342333
(COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32))
23352334
>;
23362335

2337-
// 64-bit version
2336+
foreach vt = [i64, v2i32] in {
23382337
def : AMDGPUPatIgnoreCopies <
2339-
(DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
2338+
(DivergentBinFrag<xor> vt:$z, (and vt:$x, (xor vt:$y, vt:$z))),
23402339
(REG_SEQUENCE VReg_64,
23412340
(V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
23422341
(i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
@@ -2345,6 +2344,7 @@ def : AMDGPUPatIgnoreCopies <
23452344
(i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
23462345
(i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
23472346
>;
2347+
}
23482348

23492349
def : AMDGPUPat <
23502350
(fcopysign f32:$src0, f32:$src1),
@@ -2388,30 +2388,25 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
23882388
} // end True16Predicate = NotHasTrue16BitInsts
23892389

23902390
let True16Predicate = UseRealTrue16Insts in {
2391-
def : GCNPat <
2392-
(rotr i32:$src0, i32:$src1),
2393-
(V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
2394-
/* src1_modifiers */ 0, $src0,
2395-
/* src2_modifiers */ 0,
2396-
(EXTRACT_SUBREG $src1, lo16),
2397-
/* clamp */ 0, /* op_sel */ 0)
2398-
>;
2399-
2400-
def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
2401-
(V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */
2402-
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
2403-
0, /* src1_modifiers */
2404-
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)),
2405-
0, /* src2_modifiers */
2406-
(i16 (EXTRACT_SUBREG VGPR_32:$src1, lo16)),
2407-
/* clamp */ 0, /* op_sel */ 0)>;
2391+
def : GCNPat<(rotr i32:$src0, i32:$src1),
2392+
(V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
2393+
/* src1_modifiers */ 0, $src0,
2394+
/* src2_modifiers */ 0, (EXTRACT_SUBREG $src1, lo16),
2395+
/* clamp */ 0, /* op_sel */ 0)>;
24082396

2409-
def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
2410-
(V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
2411-
/* src1_modifiers */ 0, $src1,
2412-
/* src2_modifiers */ 0,
2413-
(EXTRACT_SUBREG VGPR_32:$src2, lo16),
2414-
/* clamp */ 0, /* op_sel */ 0)>;
2397+
def : GCNPat<
2398+
(i32(trunc(srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
2399+
(V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */
2400+
(i32(EXTRACT_SUBREG(i64 $src0), sub1)), 0, /* src1_modifiers */
2401+
(i32(EXTRACT_SUBREG(i64 $src0), sub0)), 0, /* src2_modifiers */
2402+
(i16(EXTRACT_SUBREG VGPR_32:$src1, lo16)),
2403+
/* clamp */ 0, /* op_sel */ 0)>;
2404+
2405+
def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
2406+
(V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
2407+
/* src1_modifiers */ 0, $src1,
2408+
/* src2_modifiers */ 0, (EXTRACT_SUBREG VGPR_32:$src2, lo16),
2409+
/* clamp */ 0, /* op_sel */ 0)>;
24152410
} // end True16Predicate = UseRealTrue16Insts
24162411

24172412
let True16Predicate = UseFakeTrue16Insts in {

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1779,6 +1779,21 @@ def : GCNPat <
17791779
(S_MOV_B32 imm:$imm)
17801780
>;
17811781

1782+
def : GCNPat <
1783+
(v2i32 (UniformBinFrag<and> v2i32:$x, v2i32:$y)),
1784+
(S_AND_B64 SReg_64:$x, SReg_64:$y)
1785+
>;
1786+
1787+
def : GCNPat <
1788+
(v2i32 (UniformBinFrag<or> v2i32:$x, v2i32:$y)),
1789+
(S_OR_B64 SReg_64:$x, SReg_64:$y)
1790+
>;
1791+
1792+
def : GCNPat <
1793+
(v2i32 (UniformBinFrag<xor> v2i32:$x, v2i32:$y)),
1794+
(S_XOR_B64 SReg_64:$x, SReg_64:$y)
1795+
>;
1796+
17821797
// Same as a 32-bit inreg
17831798
def : GCNPat<
17841799
(i32 (UniformUnaryFrag<sext> i16:$src)),

llvm/lib/Target/AMDGPU/VOP2Instructions.td

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -954,9 +954,9 @@ def : DivergentClampingBinOp<sub, V_SUB_CO_U32_e64>;
954954
def : DivergentBinOp<adde, V_ADDC_U32_e32>;
955955
def : DivergentBinOp<sube, V_SUBB_U32_e32>;
956956

957-
class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst> :
957+
class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst, ValueType vt = i64> :
958958
GCNPat<
959-
(DivergentBinFrag<Op> i64:$src0, i64:$src1),
959+
(DivergentBinFrag<Op> vt:$src0, vt:$src1),
960960
(REG_SEQUENCE VReg_64,
961961
(Inst
962962
(i32 (EXTRACT_SUBREG $src0, sub0)),
@@ -973,6 +973,10 @@ def : divergent_i64_BinOp <and, V_AND_B32_e64>;
973973
def : divergent_i64_BinOp <or, V_OR_B32_e64>;
974974
def : divergent_i64_BinOp <xor, V_XOR_B32_e64>;
975975

976+
def : divergent_i64_BinOp <and, V_AND_B32_e64, v2i32>;
977+
def : divergent_i64_BinOp <or, V_OR_B32_e64, v2i32>;
978+
def : divergent_i64_BinOp <xor, V_XOR_B32_e64, v2i32>;
979+
976980
// mul24 w/ 64 bit output.
977981
class mul24_64_Pat<SDPatternOperator Op, Instruction InstLo, Instruction InstHi> : GCNPat<
978982
(i64 (Op i32:$src0, i32:$src1)),

llvm/test/CodeGen/AMDGPU/and.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
88
; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
99
; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
1010

11-
; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
12-
; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
11+
; SI: s_and_b64
1312

1413
define amdgpu_kernel void @test2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1514
%b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1

llvm/test/CodeGen/AMDGPU/bf16-conversions.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -151,25 +151,25 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
151151
; GFX-950-LABEL: v_test_cvt_v2f64_v2bf16_v:
152152
; GFX-950: ; %bb.0:
153153
; GFX-950-NEXT: v_cvt_f32_f64_e32 v6, v[2:3]
154+
; GFX-950-NEXT: v_and_b32_e32 v4, 1, v6
155+
; GFX-950-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
154156
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
155-
; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6
156157
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, |v[4:5]|
157-
; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[4:5]
158-
; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7
158+
; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], v[2:3], v[4:5]
159+
; GFX-950-NEXT: v_cvt_f32_f64_e32 v7, v[0:1]
159160
; GFX-950-NEXT: v_cndmask_b32_e64 v2, -1, 1, s[2:3]
160161
; GFX-950-NEXT: v_add_u32_e32 v2, v6, v2
161-
; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1]
162-
; GFX-950-NEXT: v_cvt_f32_f64_e32 v5, v[0:1]
162+
; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc
163163
; GFX-950-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
164-
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
165-
; GFX-950-NEXT: v_and_b32_e32 v6, 1, v5
164+
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v7
165+
; GFX-950-NEXT: v_and_b32_e32 v8, 1, v7
166166
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[2:3]|
167-
; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[2:3]
168-
; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v6
167+
; GFX-950-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
168+
; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], v[0:1], v[2:3]
169169
; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
170-
; GFX-950-NEXT: v_add_u32_e32 v0, v5, v0
171-
; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1]
172-
; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
170+
; GFX-950-NEXT: v_add_u32_e32 v0, v7, v0
171+
; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc
172+
; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
173173
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v4
174174
; GFX-950-NEXT: ; return to shader part epilog
175175
%res = fptrunc <2 x double> %src to <2 x bfloat>

0 commit comments

Comments
 (0)