Skip to content

Commit 995e7fb

Browse files
committed
[AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions to make use of
64-bit wide instructions. Make use of s_or_b64/s_and_b64/s_xor_b64 for v2i32. Legalising these causes a number of test regressions, so extra work in the combiner and TableGen patterns was necessary. - Use Custom for v2i32 rotr instead of additional patterns; modify performOrCombine() to remove some identity OR operations - Fix the rotr regression by adding lowerROTR() on the legalizer codepath - Add a test case to rotr.ll
1 parent 034eaed commit 995e7fb

16 files changed

+376
-176
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 103 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
430430
setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
431431
}
432432

433+
setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, MVT::v2i32, Legal);
434+
// Prevent SELECT v2i32 from being implemented with the above bitwise ops and
435+
// instead lower to cndmask in SITargetLowering::LowerSELECT().
436+
setOperationAction(ISD::SELECT, MVT::v2i32, Custom);
437+
// Enable MatchRotate to produce ISD::ROTR, which is later transformed to
438+
// alignbit.
439+
setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
440+
433441
setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
434442
Custom);
435443

@@ -5929,6 +5937,20 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
59295937
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
59305938
}
59315939

5940+
// Enable lowering of ROTR for vxi32 types. This is a workaround for a
5941+
// regression caused by legalising v2i32 or.
5942+
SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
5943+
unsigned Opc = Op.getOpcode();
5944+
EVT VT = Op.getValueType();
5945+
assert(Opc == ISD::ROTR && "Expected ROTR Opcode for lowerROTR.");
5946+
5947+
assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
5948+
VT == MVT::v16i32) &&
5949+
"Unexpected ValueType.");
5950+
5951+
return DAG.UnrollVectorOp(Op.getNode());
5952+
}
5953+
59325954
// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
59335955
// wider vector type is legal.
59345956
SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
@@ -6115,6 +6137,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
61156137
return lowerGET_FPENV(Op, DAG);
61166138
case ISD::SET_FPENV:
61176139
return lowerSET_FPENV(Op, DAG);
6140+
case ISD::ROTR:
6141+
return lowerROTR(Op, DAG);
61186142
}
61196143
return SDValue();
61206144
}
@@ -12872,6 +12896,53 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
1287212896
}
1287312897
}
1287412898

12899+
// Detect identity v2i32 OR and replace with identity source node.
12900+
// Specifically an Or that has operands constructed from the same source node
12901+
// via extract_vector_elt and build_vector. I.E.
12902+
// v2i32 or(
12903+
// v2i32 build_vector(
12904+
// i32 extract_elt(%IdentitySrc, 0),
12905+
// i32 0
12906+
// ),
12907+
// v2i32 build_vector(
12908+
// i32 0,
12909+
// i32 extract_elt(%IdentitySrc, 1)
12910+
// )
12911+
// )
12912+
// =>
12913+
// v2i32 %IdentitySrc
12914+
if (VT == MVT::v2i32) {
12915+
if (LHS->getOpcode() == ISD::BUILD_VECTOR &&
12916+
RHS->getOpcode() == ISD::BUILD_VECTOR) {
12917+
LLVM_DEBUG(dbgs() << "### Performing v2i32 SIISelLowering "
12918+
"DAGCombine::CombineOR\n";);
12919+
12920+
if (auto *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1)))
12921+
if (auto *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0))) {
12922+
12923+
// Test for and normalise build vectors.
12924+
if (LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
12925+
12926+
// Get the extract_vector_element operands.
12927+
SDValue LEVE = LHS->getOperand(0);
12928+
SDValue REVE = RHS->getOperand(1);
12929+
12930+
if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
12931+
REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
12932+
// Check that different elements from the same vector are
12933+
// extracted.
12934+
if (LEVE->getOperand(0) == REVE->getOperand(0) &&
12935+
LEVE->getOperand(1) != REVE->getOperand(1)) {
12936+
LLVM_DEBUG(dbgs() << "### Found identity OR, folding...\n";);
12937+
SDValue IdentitySrc = LEVE.getOperand(0);
12938+
return IdentitySrc;
12939+
}
12940+
}
12941+
}
12942+
}
12943+
}
12944+
}
12945+
1287512946
if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
1287612947
return SDValue();
1287712948

@@ -12916,13 +12987,43 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
1291612987
if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
1291712988
return RV;
1291812989

12990+
SelectionDAG &DAG = DCI.DAG;
12991+
EVT VT = N->getValueType(0);
1291912992
SDValue LHS = N->getOperand(0);
1292012993
SDValue RHS = N->getOperand(1);
1292112994

12995+
if (VT == MVT::v2i32 && LHS.getNumOperands() > 1) {
12996+
12997+
const ConstantSDNode *CRHS_0 = dyn_cast<ConstantSDNode>(RHS.getOperand(0));
12998+
const ConstantSDNode *CRHS_1 = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12999+
SDValue LHS_0 = LHS.getOperand(0);
13000+
SDValue LHS_1 = LHS.getOperand(1);
13001+
13002+
if (LHS.getOpcode() == ISD::VSELECT && VT == MVT::v2i32) {
13003+
if (CRHS_0 && CRHS_0->getAPIntValue().isSignMask() &&
13004+
shouldFoldFNegIntoSrc(N, LHS_0))
13005+
if (CRHS_1 && CRHS_1->getAPIntValue().isSignMask() &&
13006+
shouldFoldFNegIntoSrc(N, LHS_1)) {
13007+
SDLoc DL(N);
13008+
SDValue CastLHS =
13009+
DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(1));
13010+
SDValue CastRHS =
13011+
DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(2));
13012+
SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastLHS);
13013+
SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastRHS);
13014+
SDValue NewSelect = DAG.getNode(ISD::VSELECT, DL, MVT::v2f32,
13015+
LHS->getOperand(0), FNegLHS, FNegRHS);
13016+
return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
13017+
}
13018+
}
13019+
// Possibly split vector here if one side does have a constant RHS.
13020+
}
13021+
13022+
// Add test for when only one of the RHS vector elements is a const. Might be
13023+
// possible to optimise this case.
13024+
1292213025
const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12923-
SelectionDAG &DAG = DCI.DAG;
1292413026

12925-
EVT VT = N->getValueType(0);
1292613027
if (CRHS && VT == MVT::i64) {
1292713028
if (SDValue Split =
1292813029
splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
366366
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
367367
Type *Ty) const override;
368368

369+
// bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
370+
// EVT VT) const override;
371+
369372
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
370373
unsigned Index) const override;
371374
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override;
@@ -437,6 +440,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
437440
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
438441
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const;
439442
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
443+
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const;
440444

441445
Register getRegisterByName(const char* RegName, LLT VT,
442446
const MachineFunction &MF) const override;

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2334,9 +2334,9 @@ def : AMDGPUPatIgnoreCopies <
23342334
(COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32))
23352335
>;
23362336

2337-
// 64-bit version
2337+
foreach vt = [i64, v2i32] in {
23382338
def : AMDGPUPatIgnoreCopies <
2339-
(DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
2339+
(DivergentBinFrag<xor> vt:$z, (and vt:$x, (xor vt:$y, vt:$z))),
23402340
(REG_SEQUENCE VReg_64,
23412341
(V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
23422342
(i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
@@ -2345,6 +2345,7 @@ def : AMDGPUPatIgnoreCopies <
23452345
(i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
23462346
(i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
23472347
>;
2348+
}
23482349

23492350
def : AMDGPUPat <
23502351
(fcopysign f32:$src0, f32:$src1),
@@ -2375,9 +2376,14 @@ def : AMDGPUPat <
23752376
$src1), sub1)
23762377
>;
23772378

2379+
def : AMDGPUPat <
2380+
(fneg (select i1:$src0, (f32 (bitconvert i32:$src1)), (f32 (bitconvert i32:$src2)))),
2381+
(V_CNDMASK_B32_e64 (i32 1), $src2, (i32 1), $src1, $src0)>;
2382+
23782383
let True16Predicate = NotHasTrue16BitInsts in {
23792384
def : ROTRPattern <V_ALIGNBIT_B32_e64>;
23802385

2386+
23812387
def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
23822388
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
23832389
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
@@ -2388,6 +2394,12 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
23882394
} // end True16Predicate = NotHasTrue16BitInsts
23892395

23902396
let True16Predicate = UseRealTrue16Insts in {
2397+
2398+
// Prevents regression in fneg-modifier-casting.ll along with modifications to XorCombine() when v2i32 or is legal.
2399+
def : AMDGPUPat <
2400+
(fneg (select i1:$src0, (f32 (bitconvert i32:$src1)), (f32 (bitconvert i32:$src2)))),
2401+
(V_CNDMASK_B32_e64 (i32 1), $src2, (i32 1), $src1, $src0)>;
2402+
23912403
def : GCNPat <
23922404
(rotr i32:$src0, i32:$src1),
23932405
(V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
@@ -2449,6 +2461,7 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
24492461
>;
24502462
} // end True16Predicate = UseFakeTrue16Insts
24512463

2464+
24522465
/********** ====================== **********/
24532466
/********** Indirect addressing **********/
24542467
/********** ====================== **********/

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1779,6 +1779,21 @@ def : GCNPat <
17791779
(S_MOV_B32 imm:$imm)
17801780
>;
17811781

1782+
def : GCNPat <
1783+
(v2i32 (UniformBinFrag<and> v2i32:$x, v2i32:$y)),
1784+
(S_AND_B64 SReg_64:$x, SReg_64:$y)
1785+
>;
1786+
1787+
def : GCNPat <
1788+
(v2i32 (UniformBinFrag<or> v2i32:$x, v2i32:$y)),
1789+
(S_OR_B64 SReg_64:$x, SReg_64:$y)
1790+
>;
1791+
1792+
def : GCNPat <
1793+
(v2i32 (UniformBinFrag<xor> v2i32:$x, v2i32:$y)),
1794+
(S_XOR_B64 SReg_64:$x, SReg_64:$y)
1795+
>;
1796+
17821797
// Same as a 32-bit inreg
17831798
def : GCNPat<
17841799
(i32 (UniformUnaryFrag<sext> i16:$src)),

llvm/lib/Target/AMDGPU/VOP2Instructions.td

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -954,9 +954,9 @@ def : DivergentClampingBinOp<sub, V_SUB_CO_U32_e64>;
954954
def : DivergentBinOp<adde, V_ADDC_U32_e32>;
955955
def : DivergentBinOp<sube, V_SUBB_U32_e32>;
956956

957-
class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst> :
957+
class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst, ValueType vt = i64> :
958958
GCNPat<
959-
(DivergentBinFrag<Op> i64:$src0, i64:$src1),
959+
(DivergentBinFrag<Op> vt:$src0, vt:$src1),
960960
(REG_SEQUENCE VReg_64,
961961
(Inst
962962
(i32 (EXTRACT_SUBREG $src0, sub0)),
@@ -973,6 +973,10 @@ def : divergent_i64_BinOp <and, V_AND_B32_e64>;
973973
def : divergent_i64_BinOp <or, V_OR_B32_e64>;
974974
def : divergent_i64_BinOp <xor, V_XOR_B32_e64>;
975975

976+
def : divergent_i64_BinOp <and, V_AND_B32_e64, v2i32>;
977+
def : divergent_i64_BinOp <or, V_OR_B32_e64, v2i32>;
978+
def : divergent_i64_BinOp <xor, V_XOR_B32_e64, v2i32>;
979+
976980
// mul24 w/ 64 bit output.
977981
class mul24_64_Pat<SDPatternOperator Op, Instruction InstLo, Instruction InstHi> : GCNPat<
978982
(i64 (Op i32:$src0, i32:$src1)),

llvm/test/CodeGen/AMDGPU/and.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
88
; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
99
; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
1010

11-
; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
12-
; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
11+
; SI: s_and_b64
1312

1413
define amdgpu_kernel void @test2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1514
%b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1

llvm/test/CodeGen/AMDGPU/bf16-conversions.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -151,25 +151,25 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
151151
; GFX-950-LABEL: v_test_cvt_v2f64_v2bf16_v:
152152
; GFX-950: ; %bb.0:
153153
; GFX-950-NEXT: v_cvt_f32_f64_e32 v6, v[2:3]
154+
; GFX-950-NEXT: v_and_b32_e32 v4, 1, v6
155+
; GFX-950-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
154156
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
155-
; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6
156157
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, |v[4:5]|
157-
; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[4:5]
158-
; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7
158+
; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], v[2:3], v[4:5]
159+
; GFX-950-NEXT: v_cvt_f32_f64_e32 v7, v[0:1]
159160
; GFX-950-NEXT: v_cndmask_b32_e64 v2, -1, 1, s[2:3]
160161
; GFX-950-NEXT: v_add_u32_e32 v2, v6, v2
161-
; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1]
162-
; GFX-950-NEXT: v_cvt_f32_f64_e32 v5, v[0:1]
162+
; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc
163163
; GFX-950-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
164-
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
165-
; GFX-950-NEXT: v_and_b32_e32 v6, 1, v5
164+
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v7
165+
; GFX-950-NEXT: v_and_b32_e32 v8, 1, v7
166166
; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[2:3]|
167-
; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[2:3]
168-
; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v6
167+
; GFX-950-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
168+
; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], v[0:1], v[2:3]
169169
; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
170-
; GFX-950-NEXT: v_add_u32_e32 v0, v5, v0
171-
; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1]
172-
; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
170+
; GFX-950-NEXT: v_add_u32_e32 v0, v7, v0
171+
; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc
172+
; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
173173
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v4
174174
; GFX-950-NEXT: ; return to shader part epilog
175175
%res = fptrunc <2 x double> %src to <2 x bfloat>

llvm/test/CodeGen/AMDGPU/bfi_int.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -582,15 +582,15 @@ define <2 x i32> @v_bitselect_v2i32_pat1(<2 x i32> %a, <2 x i32> %b, <2 x i32> %
582582
; GFX7-LABEL: v_bitselect_v2i32_pat1:
583583
; GFX7: ; %bb.0:
584584
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
585-
; GFX7-NEXT: v_bfi_b32 v0, v2, v0, v4
586585
; GFX7-NEXT: v_bfi_b32 v1, v3, v1, v5
586+
; GFX7-NEXT: v_bfi_b32 v0, v2, v0, v4
587587
; GFX7-NEXT: s_setpc_b64 s[30:31]
588588
;
589589
; GFX8-LABEL: v_bitselect_v2i32_pat1:
590590
; GFX8: ; %bb.0:
591591
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
592-
; GFX8-NEXT: v_bfi_b32 v0, v2, v0, v4
593592
; GFX8-NEXT: v_bfi_b32 v1, v3, v1, v5
593+
; GFX8-NEXT: v_bfi_b32 v0, v2, v0, v4
594594
; GFX8-NEXT: s_setpc_b64 s[30:31]
595595
;
596596
; GFX10-LABEL: v_bitselect_v2i32_pat1:

llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ define <2 x half> @test_pown_reduced_fast_v2f16_known_odd(<2 x half> %x, <2 x i3
3131
; GFX9-LABEL: test_pown_reduced_fast_v2f16_known_odd:
3232
; GFX9: ; %bb.0:
3333
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34-
; GFX9-NEXT: v_or_b32_e32 v1, 1, v1
3534
; GFX9-NEXT: v_or_b32_e32 v2, 1, v2
35+
; GFX9-NEXT: v_or_b32_e32 v1, 1, v1
3636
; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2
3737
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
3838
; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0

llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,15 @@ define amdgpu_ps <2 x i32> @s_or_v2i32_disjoint(<2 x i32> inreg %a, <2 x i32> in
2828
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr2
2929
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
3030
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
31-
; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY2]], [[COPY]], implicit-def dead $scc
32-
; CHECK-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY3]], [[COPY1]], implicit-def dead $scc
33-
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_1]]
34-
; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec
35-
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]]
36-
; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec
31+
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
32+
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
33+
; CHECK-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = disjoint S_OR_B64 killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], implicit-def dead $scc
34+
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub0
35+
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
36+
; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec
37+
; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1
38+
; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
39+
; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY7]], implicit $exec
3740
; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
3841
; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
3942
; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
@@ -64,10 +67,23 @@ define <2 x i32> @v_or_v2i32_disjoint(<2 x i32> %a, <2 x i32> %b) {
6467
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
6568
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
6669
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
67-
; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY3]], [[COPY1]], implicit $exec
68-
; CHECK-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY2]], [[COPY]], implicit $exec
69-
; CHECK-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
70-
; CHECK-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
70+
; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
71+
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
72+
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
73+
; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
74+
; CHECK-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
75+
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
76+
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
77+
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
78+
; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed [[COPY5]], killed [[COPY4]], implicit $exec
79+
; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
80+
; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
81+
; CHECK-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed [[COPY7]], killed [[COPY6]], implicit $exec
82+
; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_OR_B32_e64_1]], %subreg.sub0, killed [[V_OR_B32_e64_]], %subreg.sub1
83+
; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0
84+
; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1
85+
; CHECK-NEXT: $vgpr0 = COPY [[COPY8]]
86+
; CHECK-NEXT: $vgpr1 = COPY [[COPY9]]
7187
; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
7288
%result = or disjoint <2 x i32> %a, %b
7389
ret <2 x i32> %result

llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1759,4 +1759,4 @@ bb5: ; preds = %bb, %.entry
17591759

17601760
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg) #0
17611761

1762-
attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
1762+
attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }

0 commit comments

Comments
 (0)