Skip to content

Commit 6245978

Browse files
committed
[AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions to make use of
64-bit wide instructions Make use of s_or_b64/s_and_b64/s_xor_b64 for v2i32. Legalising these causes a number of test regressions, so extra work in the combiner and Tablegen patterns was necessary. - Use custom for v2i32 rotr instead of additional patterns. Modify PerformOrCombine() to remove some identity or operations - Fix rotr regression by adding lowerRotr() on the legalizer codepath. - Add test case to rotr.ll - Extend performFNEGCombine() for the SELECT case. - Modify performSelectCombine() and foldFreeOpFromSelect to prevent the performFNEGCombine() changes from being unwound. - Add cases to or.ll and xor.ll to demonstrate the generation of the s_or_64 and s_xor_64 instructions for the v2i32 cases. Previously this was inhibited by "-amdgpu-scalarize-global-loads=false". - Fix shl/srl64_reduce regression by performing the scalarisation previously performewd by the vector legaliser in the combiner.
1 parent 595a273 commit 6245978

18 files changed

+1815
-223
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 142 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4025,9 +4025,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
40254025
/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
40264026
/// binary operation \p Opc to it with the corresponding constant operands.
40274027
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
4028-
DAGCombinerInfo &DCI, const SDLoc &SL,
4029-
unsigned Opc, SDValue LHS,
4030-
uint32_t ValLo, uint32_t ValHi) const {
4028+
DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
4029+
uint32_t ValLo, uint32_t ValHi) const {
40314030
SelectionDAG &DAG = DCI.DAG;
40324031
SDValue Lo, Hi;
40334032
std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
@@ -4056,6 +4055,56 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
40564055
SDLoc SL(N);
40574056
SelectionDAG &DAG = DCI.DAG;
40584057

4058+
// When the shl64_reduce optimisation code is passed through vector
4059+
// legalization some scalarising occurs. After ISD::AND was legalised, this
4060+
// resulted in the AND instructions no longer being elided, as mentioned
4061+
// below. The following code should make sure this takes place.
4062+
if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
4063+
SDValue VAND = RHS.getOperand(0);
4064+
ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
4065+
uint64_t AndIndex = RHS->getConstantOperandVal(1);
4066+
if (VAND->getOpcode() == ISD::AND && CRRHS) {
4067+
SDValue LHSAND = VAND.getOperand(0);
4068+
SDValue RHSAND = VAND.getOperand(1);
4069+
if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
4070+
// Part of shlcombine is to optimise for the case where it's possible
4071+
// to reduce shl64 to shl32 if shift range is [63-32]. This
4072+
// transforms: DST = shl i64 X, Y to [0, shl i32 X, (Y & 31) ]. The
4073+
// '&' is then elided by ISel. The vector code for this was being
4074+
// completely scalarised by the vector legalizer, but now v2i32 is
4075+
// made legal the vector legaliser only partially scalarises the
4076+
// vector operations and the and was not elided. This check enables us
4077+
// to locate and scalarise the v2i32 and and re-enable ISel to elide
4078+
// the and instruction.
4079+
ConstantSDNode *CANDL = dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
4080+
ConstantSDNode *CANDR = dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
4081+
if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
4082+
RHSAND->getConstantOperandVal(1) == 0x1f) {
4083+
// Get the non-const AND operands and produce scalar AND
4084+
const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4085+
const SDValue One = DAG.getConstant(1, SL, MVT::i32);
4086+
SDValue Lo =
4087+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, Zero);
4088+
SDValue Hi =
4089+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
4090+
SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
4091+
SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
4092+
SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
4093+
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
4094+
if (AndIndex == 0) {
4095+
return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc, LoAnd,
4096+
N->getFlags());
4097+
} else if (AndIndex == 1) {
4098+
return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc, HiAnd,
4099+
N->getFlags());
4100+
} else {
4101+
// Intentionally fall-through to the other combines.
4102+
}
4103+
}
4104+
}
4105+
}
4106+
}
4107+
40594108
unsigned RHSVal;
40604109
if (CRHS) {
40614110
RHSVal = CRHS->getZExtValue();
@@ -4097,8 +4146,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
40974146
if (VT.getScalarType() != MVT::i64)
40984147
return SDValue();
40994148

4100-
// i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
4101-
41024149
// On some subtargets, 64-bit shift is a quarter rate instruction. In the
41034150
// common case, splitting this into a move and a 32-bit shift is faster and
41044151
// the same code size.
@@ -4189,6 +4236,56 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
41894236
SDLoc SL(N);
41904237
unsigned RHSVal;
41914238

4239+
// When the shl64_reduce optimisation code is passed through vector
4240+
// legalization some scalarising occurs. After ISD::AND was legalised, this
4241+
// resulted in the AND instructions no longer being elided, as mentioned
4242+
// below. The following code should make sure this takes place.
4243+
if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
4244+
SDValue VAND = RHS.getOperand(0);
4245+
ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
4246+
uint64_t AndIndex = RHS->getConstantOperandVal(1);
4247+
if (VAND->getOpcode() == ISD::AND && CRRHS) {
4248+
SDValue LHSAND = VAND.getOperand(0);
4249+
SDValue RHSAND = VAND.getOperand(1);
4250+
if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
4251+
// Part of srlcombine is to optimise for the case where it's possible
4252+
// to reduce srl64 to srl32 if shift range is [63-32]. This
4253+
// transforms: DST = srl i64 X, Y to [0, srl i32 X, (Y & 31) ]. The
4254+
// '&' is then elided by ISel. The vector code for this was being
4255+
// completely scalarised by the vector legalizer, but now v2i32 is
4256+
// made legal the vector legaliser only partially scalarises the
4257+
// vector operations and the and was not elided. This check enables us
4258+
// to locate and scalarise the v2i32 and and re-enable ISel to elide
4259+
// the and instruction.
4260+
ConstantSDNode *CANDL = dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
4261+
ConstantSDNode *CANDR = dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
4262+
if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
4263+
RHSAND->getConstantOperandVal(1) == 0x1f) {
4264+
// Get the non-const AND operands and produce scalar AND
4265+
const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4266+
const SDValue One = DAG.getConstant(1, SL, MVT::i32);
4267+
SDValue Lo =
4268+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, Zero);
4269+
SDValue Hi =
4270+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
4271+
SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
4272+
SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
4273+
SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
4274+
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
4275+
if (AndIndex == 0) {
4276+
return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc, LoAnd,
4277+
N->getFlags());
4278+
} else if (AndIndex == 1) {
4279+
return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc, HiAnd,
4280+
N->getFlags());
4281+
} else {
4282+
// Intentionally fall-through to the other combines.
4283+
}
4284+
}
4285+
}
4286+
}
4287+
}
4288+
41924289
if (CRHS) {
41934290
RHSVal = CRHS->getZExtValue();
41944291

@@ -4701,8 +4798,26 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
47014798
if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
47024799
return SDValue();
47034800

4704-
return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4705-
SDLoc(N), Cond, LHS, RHS);
4801+
// select c, (fneg (f32 bitcast i32 x)), (fneg (f32 bitcast i32 y)) can be
4802+
// lowered directly to a V_CNDMASK_. So prevent the fneg from being pulled
4803+
// out in this case. For now I've made the logic as specific to the case as
4804+
// possible, hopefully this can be relaxed in future.
4805+
if (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG) {
4806+
SDValue LHSB = LHS.getOperand(0);
4807+
SDValue RHSB = RHS.getOperand(0);
4808+
if (LHSB.getOpcode() == ISD::BITCAST &&
4809+
RHSB->getOpcode() == ISD::BITCAST) {
4810+
EVT LHSBOpTy = LHSB->getOperand(0).getValueType();
4811+
EVT RHSBOpTy = RHSB->getOperand(0).getValueType();
4812+
if (LHSB.getValueType() == MVT::f32 &&
4813+
RHSB.getValueType() == MVT::f32 && LHSBOpTy == MVT::i32 &&
4814+
RHSBOpTy == MVT::i32)
4815+
return SDValue();
4816+
}
4817+
}
4818+
4819+
return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS,
4820+
RHS);
47064821
}
47074822

47084823
bool Inv = false;
@@ -4755,8 +4870,8 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
47554870
if (Inv)
47564871
std::swap(NewLHS, NewRHS);
47574872

4758-
SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4759-
Cond, NewLHS, NewRHS);
4873+
SDValue NewSelect =
4874+
DAG.getNode(ISD::SELECT, SL, VT, Cond, NewLHS, NewRHS);
47604875
DCI.AddToWorklist(NewSelect.getNode());
47614876
return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
47624877
}
@@ -5094,8 +5209,25 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
50945209
}
50955210
case ISD::SELECT: {
50965211
// fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5212+
// This combine became necessary recently to prevent a regression in
5213+
// fneg-modifier-casting.ll caused by this patch legalising v2i32 xor.
5214+
// Specifically, additional instructions were added to the final codegen.
5215+
// When adding this combine a case was added to performFNEGCombine to
5216+
// prevent this combine from being undone under certain conditions.
50975217
// TODO: Invert conditions of foldFreeOpFromSelect
5098-
return SDValue();
5218+
SDValue Cond = N0.getOperand(0);
5219+
SDValue LHS = N0.getOperand(1);
5220+
SDValue RHS = N0.getOperand(2);
5221+
EVT LHVT = LHS.getValueType();
5222+
EVT RHVT = RHS.getValueType();
5223+
// The regression was limited to i32 and v2i32.
5224+
if (RHVT != MVT::i32 && LHVT != MVT::i32)
5225+
return SDValue();
5226+
5227+
SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, LHVT, LHS);
5228+
SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, RHVT, RHS);
5229+
SDValue Op = DAG.getNode(Opc, SL, LHVT, Cond, LFNeg, RFNeg);
5230+
return Op;
50995231
}
51005232
case ISD::BITCAST: {
51015233
SDLoc SL(N);

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 93 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
438438
setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
439439
}
440440

441+
setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, MVT::v2i32, Legal);
442+
// Prevent SELECT v2i32 from being implemented with the above bitwise ops and
443+
// instead lower to cndmask in SITargetLowering::LowerSELECT().
444+
setOperationAction(ISD::SELECT, MVT::v2i32, Custom);
445+
// Enable MatchRotate to produce ISD::ROTR, which is later transformed to
446+
// alignbit.
447+
setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
448+
441449
setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
442450
Custom);
443451

@@ -5930,6 +5938,20 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
59305938
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
59315939
}
59325940

5941+
// Enable lowering of ROTR for vxi32 types. This is a workaround for a
5942+
// regression whereby extra unnecessary instructions were added to codegen
5943+
// for rotr operations, casued by legalising v2i32 or. This resulted in extra
5944+
// instructions to extract the result from the vector.
5945+
SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
5946+
[[maybe_unused]] EVT VT = Op.getValueType();
5947+
5948+
assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
5949+
VT == MVT::v16i32) &&
5950+
"Unexpected ValueType.");
5951+
5952+
return DAG.UnrollVectorOp(Op.getNode());
5953+
}
5954+
59335955
// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
59345956
// wider vector type is legal.
59355957
SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
@@ -6120,6 +6142,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
61206142
return lowerGET_FPENV(Op, DAG);
61216143
case ISD::SET_FPENV:
61226144
return lowerSET_FPENV(Op, DAG);
6145+
case ISD::ROTR:
6146+
return lowerROTR(Op, DAG);
61236147
}
61246148
return SDValue();
61256149
}
@@ -12996,6 +13020,47 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
1299613020
}
1299713021
}
1299813022

13023+
// Detect identity v2i32 OR and replace with identity source node.
13024+
// Specifically an Or that has operands constructed from the same source node
13025+
// via extract_vector_elt and build_vector. I.E.
13026+
// v2i32 or(
13027+
// v2i32 build_vector(
13028+
// i32 extract_elt(%IdentitySrc, 0),
13029+
// i32 0
13030+
// ),
13031+
// v2i32 build_vector(
13032+
// i32 0,
13033+
// i32 extract_elt(%IdentitySrc, 1)
13034+
// ) )
13035+
// =>
13036+
// v2i32 %IdentitySrc
13037+
13038+
if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
13039+
RHS->getOpcode() == ISD::BUILD_VECTOR) {
13040+
13041+
ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
13042+
ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
13043+
13044+
// Test for and normalise build vectors.
13045+
if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
13046+
13047+
// Get the extract_vector_element operands.
13048+
SDValue LEVE = LHS->getOperand(0);
13049+
SDValue REVE = RHS->getOperand(1);
13050+
13051+
if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
13052+
REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
13053+
// Check that different elements from the same vector are
13054+
// extracted.
13055+
if (LEVE->getOperand(0) == REVE->getOperand(0) &&
13056+
LEVE->getOperand(1) != REVE->getOperand(1)) {
13057+
SDValue IdentitySrc = LEVE.getOperand(0);
13058+
return IdentitySrc;
13059+
}
13060+
}
13061+
}
13062+
}
13063+
1299913064
if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
1300013065
return SDValue();
1300113066

@@ -13040,13 +13105,39 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
1304013105
if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
1304113106
return RV;
1304213107

13108+
SelectionDAG &DAG = DCI.DAG;
13109+
EVT VT = N->getValueType(0);
1304313110
SDValue LHS = N->getOperand(0);
1304413111
SDValue RHS = N->getOperand(1);
1304513112

13113+
if (VT == MVT::v2i32 && LHS.getNumOperands() > 1) {
13114+
13115+
const ConstantSDNode *CRHS0 = dyn_cast<ConstantSDNode>(RHS.getOperand(0));
13116+
const ConstantSDNode *CRHS1 = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13117+
SDValue LHS_0 = LHS.getOperand(0);
13118+
SDValue LHS_1 = LHS.getOperand(1);
13119+
13120+
if (LHS.getOpcode() == ISD::VSELECT && CRHS0 &&
13121+
CRHS0->getAPIntValue().isSignMask() &&
13122+
shouldFoldFNegIntoSrc(N, LHS_0) && CRHS1 &&
13123+
CRHS1->getAPIntValue().isSignMask() &&
13124+
shouldFoldFNegIntoSrc(N, LHS_1)) {
13125+
13126+
SDLoc DL(N);
13127+
SDValue CastLHS =
13128+
DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(1));
13129+
SDValue CastRHS =
13130+
DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(2));
13131+
SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastLHS);
13132+
SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastRHS);
13133+
SDValue NewSelect = DAG.getNode(ISD::VSELECT, DL, MVT::v2f32,
13134+
LHS->getOperand(0), FNegLHS, FNegRHS);
13135+
return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
13136+
}
13137+
}
13138+
1304613139
const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13047-
SelectionDAG &DAG = DCI.DAG;
1304813140

13049-
EVT VT = N->getValueType(0);
1305013141
if (CRHS && VT == MVT::i64) {
1305113142
if (SDValue Split =
1305213143
splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
441441
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
442442
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const;
443443
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
444+
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const;
444445

445446
Register getRegisterByName(const char* RegName, LLT VT,
446447
const MachineFunction &MF) const override;

0 commit comments

Comments
 (0)