
Commit d77c537

[AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions to make use of 64-bit wide instructions

Make use of s_or_b64/s_and_b64/s_xor_b64 for v2i32. Legalising these causes a number of test regressions, so extra work in the combiner and TableGen patterns was necessary.

- Use Custom lowering for v2i32 rotr instead of additional patterns. Modify performOrCombine() to remove some identity OR operations.
- Fix the rotr regression by adding lowerROTR() on the legalizer codepath.
- Add a test case to rotr.ll.
- Extend performFNegCombine() for the SELECT case.
- Modify performSelectCombine() and foldFreeOpFromSelect() to prevent the performFNegCombine() changes from being unwound.
- Add cases to or.ll and xor.ll to demonstrate the generation of the s_or_b64 and s_xor_b64 instructions for the v2i32 cases. Previously this was inhibited by "-amdgpu-scalarize-global-loads=false".
- Fix the shl/srl64_reduce regression by performing the scalarisation previously performed by the vector legaliser in the combiner.
1 parent 09e794c commit d77c537

18 files changed: +1809 −223 lines
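For orientation, here is a minimal sketch, not taken from the patch, of the shape of operation the new v2i32 cases in or.ll and xor.ll exercise, written as C++ with Clang/GCC vector extensions (the function names are illustrative). With the legalisation below, the bitwise op survives as a single v2i32 node and can be selected as one 64-bit scalar instruction such as s_or_b64 or s_xor_b64 when the operands end up uniform.

// Not from the patch: C++-level shape of the new v2i32 cases in or.ll/xor.ll,
// using Clang/GCC vector extensions. The names are hypothetical.
typedef int v2i32_t __attribute__((vector_size(8)));

v2i32_t or_v2i32(v2i32_t a, v2i32_t b) { return a | b; }   // candidate for s_or_b64
v2i32_t xor_v2i32(v2i32_t a, v2i32_t b) { return a ^ b; }  // candidate for s_xor_b64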

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 136 additions & 10 deletions
@@ -4025,9 +4025,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
 /// binary operation \p Opc to it with the corresponding constant operands.
 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
-    DAGCombinerInfo &DCI, const SDLoc &SL,
-    unsigned Opc, SDValue LHS,
-    uint32_t ValLo, uint32_t ValHi) const {
+    DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
+    uint32_t ValLo, uint32_t ValHi) const {
   SelectionDAG &DAG = DCI.DAG;
   SDValue Lo, Hi;
   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
@@ -4056,6 +4055,53 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
 
+  // When the shl64_reduce optimisation code is passed through vector
+  // legalization, some scalarising occurs. After ISD::AND was legalised, this
+  // resulted in the AND instructions no longer being elided, as mentioned
+  // below. The following code should make sure this takes place.
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      if (VAND->getOpcode() == ISD::AND && CRRHS) {
+        SDValue LHSAND = VAND.getOperand(0);
+        SDValue RHSAND = VAND.getOperand(1);
+        if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+          // Part of shlcombine is to optimise for the case where it's
+          // possible to reduce shl64 to shl32 if the shift range is [63-32].
+          // This transforms: DST = shl i64 X, Y to [0, shl i32 X, (Y & 31)].
+          // The '&' is then elided by ISel. The vector code for this was
+          // being completely scalarised by the vector legalizer, but now that
+          // v2i32 is legal the vector legaliser only partially scalarises the
+          // vector operations and the AND was not elided. This check enables
+          // us to locate and scalarise the v2i32 AND and re-enable ISel to
+          // elide the AND instruction.
+          ConstantSDNode *CANDL =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+          ConstantSDNode *CANDR =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+          if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+              RHSAND->getConstantOperandVal(1) == 0x1f) {
+            // Get the non-const AND operands and produce the scalar AND.
+            const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+            const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+            SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                                     LHSAND, Zero);
+            SDValue Hi =
+                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+            SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+            SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+            SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+            SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+            if (AndIndex == 0 || AndIndex == 1)
+              return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc,
+                                 AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+          }
+        }
+      }
+    }
+  }
+
   unsigned RHSVal;
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
@@ -4097,8 +4143,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   if (VT.getScalarType() != MVT::i64)
     return SDValue();
 
-  // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
-
   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
   // common case, splitting this into a move and a 32-bit shift is faster and
   // the same code size.
@@ -4189,6 +4233,53 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
   SDLoc SL(N);
   unsigned RHSVal;
 
+  // When the shl64_reduce optimisation code is passed through vector
+  // legalization, some scalarising occurs. After ISD::AND was legalised, this
+  // resulted in the AND instructions no longer being elided, as mentioned
+  // below. The following code should make sure this takes place.
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      if (VAND->getOpcode() == ISD::AND && CRRHS) {
+        SDValue LHSAND = VAND.getOperand(0);
+        SDValue RHSAND = VAND.getOperand(1);
+        if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+          // Part of srlcombine is to optimise for the case where it's
+          // possible to reduce srl64 to srl32 if the shift range is [63-32].
+          // This transforms: DST = srl i64 X, Y to [0, srl i32 X, (Y & 31)].
+          // The '&' is then elided by ISel. The vector code for this was
+          // being completely scalarised by the vector legalizer, but now that
+          // v2i32 is legal the vector legaliser only partially scalarises the
+          // vector operations and the AND was not elided. This check enables
+          // us to locate and scalarise the v2i32 AND and re-enable ISel to
+          // elide the AND instruction.
+          ConstantSDNode *CANDL =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+          ConstantSDNode *CANDR =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+          if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+              RHSAND->getConstantOperandVal(1) == 0x1f) {
+            // Get the non-const AND operands and produce the scalar AND.
+            const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+            const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+            SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                                     LHSAND, Zero);
+            SDValue Hi =
+                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+            SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+            SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+            SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+            SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+            if (AndIndex == 0 || AndIndex == 1)
+              return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc,
+                                 AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+          }
+        }
+      }
+    }
+  }
+
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
 
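The comments in the two hunks above rely on an arithmetic identity for shift amounts in [32, 63]: a 64-bit shl keeps only a 32-bit shl of the low word in its high half, and a 64-bit srl keeps only a 32-bit srl of the high word in its low half, with the amount reduced modulo 32. A self-contained C++ check of that identity (illustrative only, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t samples[] = {0x00000001deadbeefULL, 0xffffffff00000001ULL,
                              0x8000000000000000ULL};
  for (uint64_t x : samples) {
    for (unsigned y = 32; y < 64; ++y) {
      // shl64 -> [0, shl32(lo(x), y & 31)]
      uint64_t shl = x << y;
      assert(uint32_t(shl) == 0);
      assert(uint32_t(shl >> 32) == (uint32_t(x) << (y & 31)));
      // srl64 -> [srl32(hi(x), y & 31), 0]
      uint64_t srl = x >> y;
      assert(uint32_t(srl >> 32) == 0);
      assert(uint32_t(srl) == (uint32_t(x >> 32) >> (y & 31)));
    }
  }
  return 0;
}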
@@ -4701,8 +4792,26 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
     if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
       return SDValue();
 
-    return distributeOpThroughSelect(DCI, LHS.getOpcode(),
-                                     SDLoc(N), Cond, LHS, RHS);
+    // select c, (fneg (f32 bitcast i32 x)), (fneg (f32 bitcast i32 y)) can be
+    // lowered directly to a V_CNDMASK_. So prevent the fneg from being pulled
+    // out in this case. For now I've made the logic as specific to the case
+    // as possible, hopefully this can be relaxed in future.
+    if (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG) {
+      SDValue LHSB = LHS.getOperand(0);
+      SDValue RHSB = RHS.getOperand(0);
+      if (LHSB.getOpcode() == ISD::BITCAST &&
+          RHSB->getOpcode() == ISD::BITCAST) {
+        EVT LHSBOpTy = LHSB->getOperand(0).getValueType();
+        EVT RHSBOpTy = RHSB->getOperand(0).getValueType();
+        if (LHSB.getValueType() == MVT::f32 &&
+            RHSB.getValueType() == MVT::f32 && LHSBOpTy == MVT::i32 &&
+            RHSBOpTy == MVT::i32)
+          return SDValue();
+      }
+    }
+
+    return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS,
+                                     RHS);
   }
 
   bool Inv = false;
@@ -4755,8 +4864,8 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
   if (Inv)
     std::swap(NewLHS, NewRHS);
 
-  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
-                                  Cond, NewLHS, NewRHS);
+  SDValue NewSelect =
+      DAG.getNode(ISD::SELECT, SL, VT, Cond, NewLHS, NewRHS);
   DCI.AddToWorklist(NewSelect.getNode());
   return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
 }
@@ -5094,8 +5203,25 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
   }
   case ISD::SELECT: {
     // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
+    // This combine became necessary recently to prevent a regression in
+    // fneg-modifier-casting.ll caused by this patch legalising v2i32 xor.
+    // Specifically, additional instructions were added to the final codegen.
+    // When adding this combine a case was added to performFNEGCombine to
+    // prevent this combine from being undone under certain conditions.
     // TODO: Invert conditions of foldFreeOpFromSelect
-    return SDValue();
+    SDValue Cond = N0.getOperand(0);
+    SDValue LHS = N0.getOperand(1);
+    SDValue RHS = N0.getOperand(2);
+    EVT LHVT = LHS.getValueType();
+    EVT RHVT = RHS.getValueType();
+    // The regression was limited to i32 and v2i32.
+    if (RHVT != MVT::i32 && LHVT != MVT::i32)
+      return SDValue();
+
+    SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, LHVT, LHS);
+    SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, RHVT, RHS);
+    SDValue Op = DAG.getNode(Opc, SL, LHVT, Cond, LFNeg, RFNeg);
+    return Op;
   }
   case ISD::BITCAST: {
     SDLoc SL(N);

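The new SELECT case applies fneg (select c, a, b) -> select c, (fneg a), (fneg b). At the value level this is an identity: negating the selected result is the same as selecting between negated operands, since only one branch is ever produced. A tiny C++ illustration (assumed example, not from the patch):

#include <cassert>

// fneg(select(c, a, b)) == select(c, fneg(a), fneg(b)): the negation can be
// pushed into both operands without changing which value is produced.
static float select_then_neg(bool c, float a, float b) { return -(c ? a : b); }
static float neg_then_select(bool c, float a, float b) { return c ? -a : -b; }

int main() {
  const float vals[] = {0.0f, -0.0f, 1.5f, -2.25f};
  for (bool c : {false, true})
    for (float a : vals)
      for (float b : vals)
        assert(select_then_neg(c, a, b) == neg_then_select(c, a, b));
  return 0;
}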
llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 93 additions & 2 deletions
@@ -438,6 +438,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
   }
 
+  setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, MVT::v2i32, Legal);
+  // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
+  // instead lower to cndmask in SITargetLowering::LowerSELECT().
+  setOperationAction(ISD::SELECT, MVT::v2i32, Custom);
+  // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
+  // alignbit.
+  setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
+
   setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
                      Custom);
 
@@ -5934,6 +5942,20 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
 }
 
+// Enable lowering of ROTR for vxi32 types. This is a workaround for a
+// regression whereby extra unnecessary instructions were added to codegen
+// for rotr operations, caused by legalising v2i32 or. This resulted in extra
+// instructions to extract the result from the vector.
+SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
+  [[maybe_unused]] EVT VT = Op.getValueType();
+
+  assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
+          VT == MVT::v16i32) &&
+         "Unexpected ValueType.");
+
+  return DAG.UnrollVectorOp(Op.getNode());
+}
+
 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
 // wider vector type is legal.
 SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
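lowerROTR() just unrolls the vector operation, so each lane becomes an independent 32-bit rotate-right that the existing patterns can turn into an alignbit-style instruction, per the comment in the constructor above. A scalar C++ sketch of the per-lane semantics it falls back to (illustrative only, not backend code):

#include <cstdint>

// Rotate-right of one 32-bit lane; a v2i32 ROTR is just this applied per lane.
static uint32_t rotr32(uint32_t x, uint32_t amt) {
  amt &= 31;                       // rotate amount is taken modulo 32
  return amt ? (x >> amt) | (x << (32 - amt)) : x;
}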
@@ -6125,6 +6147,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return lowerGET_FPENV(Op, DAG);
   case ISD::SET_FPENV:
     return lowerSET_FPENV(Op, DAG);
+  case ISD::ROTR:
+    return lowerROTR(Op, DAG);
   }
   return SDValue();
 }
@@ -13018,6 +13042,47 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
     }
   }
 
+  // Detect identity v2i32 OR and replace with identity source node.
+  // Specifically an Or that has operands constructed from the same source node
+  // via extract_vector_elt and build_vector. I.E.
+  // v2i32 or(
+  //   v2i32 build_vector(
+  //     i32 extract_elt(%IdentitySrc, 0),
+  //     i32 0
+  //   ),
+  //   v2i32 build_vector(
+  //     i32 0,
+  //     i32 extract_elt(%IdentitySrc, 1)
+  //   ) )
+  // =>
+  // v2i32 %IdentitySrc
+
+  if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
+      RHS->getOpcode() == ISD::BUILD_VECTOR) {
+
+    ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+    ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
+
+    // Test for and normalise build vectors.
+    if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
+
+      // Get the extract_vector_element operands.
+      SDValue LEVE = LHS->getOperand(0);
+      SDValue REVE = RHS->getOperand(1);
+
+      if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+        // Check that different elements from the same vector are
+        // extracted.
+        if (LEVE->getOperand(0) == REVE->getOperand(0) &&
+            LEVE->getOperand(1) != REVE->getOperand(1)) {
+          SDValue IdentitySrc = LEVE.getOperand(0);
+          return IdentitySrc;
+        }
+      }
+    }
+  }
+
   if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
     return SDValue();
 
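Stated at the value level, the pattern in the comment above ORs two copies of the source vector that each keep one lane and zero the other, so the OR simply reassembles %IdentitySrc. A small C++ sketch of that identity, using a hypothetical two-field struct in place of v2i32 (illustrative only):

#include <cassert>
#include <cstdint>

struct V2i32 { uint32_t lane0, lane1; };

int main() {
  V2i32 src{0xdeadbeefu, 0x12345678u};
  V2i32 lhs{src.lane0, 0u};               // build_vector(extract(src, 0), 0)
  V2i32 rhs{0u, src.lane1};               // build_vector(0, extract(src, 1))
  V2i32 ored{lhs.lane0 | rhs.lane0, lhs.lane1 | rhs.lane1};
  assert(ored.lane0 == src.lane0 && ored.lane1 == src.lane1); // == src
  return 0;
}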
@@ -13062,13 +13127,39 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
   if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
     return RV;
 
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
 
+  if (VT == MVT::v2i32 && LHS.getNumOperands() > 1) {
+
+    const ConstantSDNode *CRHS0 = dyn_cast<ConstantSDNode>(RHS.getOperand(0));
+    const ConstantSDNode *CRHS1 = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+    SDValue LHS_0 = LHS.getOperand(0);
+    SDValue LHS_1 = LHS.getOperand(1);
+
+    if (LHS.getOpcode() == ISD::VSELECT && CRHS0 &&
+        CRHS0->getAPIntValue().isSignMask() &&
+        shouldFoldFNegIntoSrc(N, LHS_0) && CRHS1 &&
+        CRHS1->getAPIntValue().isSignMask() &&
+        shouldFoldFNegIntoSrc(N, LHS_1)) {
+
+      SDLoc DL(N);
+      SDValue CastLHS =
+          DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(1));
+      SDValue CastRHS =
+          DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(2));
+      SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastLHS);
+      SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastRHS);
+      SDValue NewSelect = DAG.getNode(ISD::VSELECT, DL, MVT::v2f32,
+                                      LHS->getOperand(0), FNegLHS, FNegRHS);
+      return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
+    }
+  }
+
   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
-  SelectionDAG &DAG = DCI.DAG;
 
-  EVT VT = N->getValueType(0);
   if (CRHS && VT == MVT::i64) {
     if (SDValue Split =
             splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))

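The v2i32 XOR combine above turns xor(vselect(c, a, b), splat(sign mask)) into a vselect of fnegs on v2f32 operands. The per-lane fact it relies on: XOR-ing an i32 lane with 0x80000000 flips exactly the sign bit of the f32 value sharing that bit pattern, i.e. it is fneg at the bit level. A standalone C++20 check (illustrative, not part of the patch):

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  const float vals[] = {0.0f, -0.0f, 1.0f, -3.5f, 1e-38f};
  for (float f : vals) {
    uint32_t bits = std::bit_cast<uint32_t>(f);
    float negged = std::bit_cast<float>(bits ^ 0x80000000u); // xor sign bit
    assert(std::bit_cast<uint32_t>(negged) ==
           std::bit_cast<uint32_t>(-f));                     // same as fneg
  }
  return 0;
}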
llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 0 deletions
@@ -442,6 +442,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const;
 
   Register getRegisterByName(const char* RegName, LLT VT,
                              const MachineFunction &MF) const override;
