@@ -4025,9 +4025,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
/// binary operation \p Opc on each half with the corresponding constant
/// operands.
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
-    DAGCombinerInfo &DCI, const SDLoc &SL,
-    unsigned Opc, SDValue LHS,
-    uint32_t ValLo, uint32_t ValHi) const {
+    DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
+    uint32_t ValLo, uint32_t ValHi) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
@@ -4056,6 +4055,56 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;

+  // When the shl64_reduce optimisation code is passed through vector
+  // legalization, some scalarising occurs. After ISD::AND was legalised, this
+  // resulted in the AND instructions no longer being elided, as mentioned
+  // below. The following code makes sure the elision still takes place.
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
+    if (VAND->getOpcode() == ISD::AND && CRRHS) {
+      // Only read the element index once we know it is a constant.
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      SDValue LHSAND = VAND.getOperand(0);
+      SDValue RHSAND = VAND.getOperand(1);
+      if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+        // Part of shlcombine is to optimise for the case where it's possible
+        // to reduce shl64 to shl32 if the shift range is [63-32]. This
+        // transforms: DST = shl i64 X, Y to [0, shl i32 X, (Y & 31)]. The
+        // '&' is then elided by ISel. The vector code for this was being
+        // completely scalarised by the vector legalizer, but now that v2i32
+        // is legal the vector legaliser only partially scalarises the vector
+        // operations and the AND was no longer elided. This check enables us
+        // to locate and scalarise the v2i32 AND and re-enable ISel to elide
+        // the AND instruction.
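+        // Illustrative sketch of the rewrite performed below (the node
+        // shorthand here is informal, not taken from the source or a test):
+        //   (shl i64 X,
+        //        (extract_vector_elt
+        //            (and v2i32 Y, (build_vector 0x1f, 0x1f)), Idx))
+        //   --> (shl i32 (trunc X),
+        //            (and i32 (extract_vector_elt Y, Idx), 0x1f))
+        // leaving a scalar (and x, 0x1f) that ISel already knows to elide.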
+        ConstantSDNode *CANDL =
+            dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+        ConstantSDNode *CANDR =
+            dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+        if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+            RHSAND->getConstantOperandVal(1) == 0x1f) {
+          // Get the non-constant AND operands and produce scalar ANDs.
+          const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+          const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+          SDValue Lo =
+              DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, Zero);
+          SDValue Hi =
+              DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+          SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+          SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+          SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+          if (AndIndex == 0) {
+            return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc, LoAnd,
+                               N->getFlags());
+          } else if (AndIndex == 1) {
+            return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc, HiAnd,
+                               N->getFlags());
+          } else {
+            // Intentionally fall through to the other combines.
+          }
+        }
+      }
+    }
+  }
+
  unsigned RHSVal;
  if (CRHS) {
    RHSVal = CRHS->getZExtValue();
@@ -4097,8 +4146,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
  if (VT.getScalarType() != MVT::i64)
    return SDValue();

-  // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
-
  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
@@ -4189,6 +4236,56 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
  SDLoc SL(N);
  unsigned RHSVal;

+  // When the srl64_reduce optimisation code is passed through vector
+  // legalization, some scalarising occurs. After ISD::AND was legalised, this
+  // resulted in the AND instructions no longer being elided, as mentioned
+  // below. The following code makes sure the elision still takes place.
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
+    if (VAND->getOpcode() == ISD::AND && CRRHS) {
+      // Only read the element index once we know it is a constant.
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      SDValue LHSAND = VAND.getOperand(0);
+      SDValue RHSAND = VAND.getOperand(1);
+      if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+        // Part of srlcombine is to optimise for the case where it's possible
+        // to reduce srl64 to srl32 if the shift range is [63-32]. This
+        // transforms: DST = srl i64 X, Y to [srl i32 X, (Y & 31), 0]. The
+        // '&' is then elided by ISel. The vector code for this was being
+        // completely scalarised by the vector legalizer, but now that v2i32
+        // is legal the vector legaliser only partially scalarises the vector
+        // operations and the AND was no longer elided. This check enables us
+        // to locate and scalarise the v2i32 AND and re-enable ISel to elide
+        // the AND instruction.
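+        // Illustrative sketch of the rewrite performed below (the node
+        // shorthand here is informal, not taken from the source or a test):
+        //   (srl i64 X,
+        //        (extract_vector_elt
+        //            (and v2i32 Y, (build_vector 0x1f, 0x1f)), Idx))
+        //   --> (srl i32 (trunc X),
+        //            (and i32 (extract_vector_elt Y, Idx), 0x1f))
+        // leaving a scalar (and x, 0x1f) that ISel already knows to elide.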
+        ConstantSDNode *CANDL =
+            dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+        ConstantSDNode *CANDR =
+            dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+        if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+            RHSAND->getConstantOperandVal(1) == 0x1f) {
+          // Get the non-constant AND operands and produce scalar ANDs.
+          const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+          const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+          SDValue Lo =
+              DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, Zero);
+          SDValue Hi =
+              DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+          SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+          SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+          SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+          if (AndIndex == 0) {
+            return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc, LoAnd,
+                               N->getFlags());
+          } else if (AndIndex == 1) {
+            return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc, HiAnd,
+                               N->getFlags());
+          } else {
+            // Intentionally fall through to the other combines.
+          }
+        }
+      }
+    }
+  }
+
  if (CRHS) {
    RHSVal = CRHS->getZExtValue();
@@ -4701,8 +4798,26 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
    if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
      return SDValue();

-    return distributeOpThroughSelect(DCI, LHS.getOpcode(),
-                                     SDLoc(N), Cond, LHS, RHS);
+    // select c, (fneg (f32 bitcast i32 x)), (fneg (f32 bitcast i32 y)) can be
+    // lowered directly to a V_CNDMASK_, so prevent the fneg from being pulled
+    // out in this case. For now the logic is kept as specific to this case as
+    // possible; hopefully it can be relaxed in the future.
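+    // An illustrative example of the guarded pattern (hypothetical values,
+    // not from a test case):
+    //   %fx = bitcast i32 %x to float
+    //   %fy = bitcast i32 %y to float
+    //   %nx = fneg float %fx
+    //   %ny = fneg float %fy
+    //   %r  = select i1 %c, float %nx, float %ny
+    // Both fnegs can be absorbed as source modifiers of the V_CNDMASK, so
+    // distributing the fneg through the select would only add instructions.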
+    if (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG) {
+      SDValue LHSB = LHS.getOperand(0);
+      SDValue RHSB = RHS.getOperand(0);
+      if (LHSB.getOpcode() == ISD::BITCAST &&
+          RHSB.getOpcode() == ISD::BITCAST) {
+        EVT LHSBOpTy = LHSB.getOperand(0).getValueType();
+        EVT RHSBOpTy = RHSB.getOperand(0).getValueType();
+        if (LHSB.getValueType() == MVT::f32 &&
+            RHSB.getValueType() == MVT::f32 && LHSBOpTy == MVT::i32 &&
+            RHSBOpTy == MVT::i32)
+          return SDValue();
+      }
+    }
+
+    return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS,
+                                     RHS);
  }

  bool Inv = false;
@@ -4755,8 +4870,8 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
    if (Inv)
      std::swap(NewLHS, NewRHS);

-    SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
-                                    Cond, NewLHS, NewRHS);
+    SDValue NewSelect =
+        DAG.getNode(ISD::SELECT, SL, VT, Cond, NewLHS, NewRHS);
    DCI.AddToWorklist(NewSelect.getNode());
    return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
  }
@@ -5094,8 +5209,25 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
  }
  case ISD::SELECT: {
    // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
+    // This combine became necessary recently to prevent a regression in
+    // fneg-modifier-casting.ll caused by this patch legalising v2i32 xor.
+    // Specifically, additional instructions were added to the final codegen.
+    // When adding this combine, a special case was added to
+    // foldFreeOpFromSelect to prevent this combine from being undone under
+    // certain conditions.
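+    // Sketch of the interplay (paraphrased from the comments above, not
+    // verbatim from the source): without that special case the two combines
+    // could undo each other:
+    //   fneg (select c, a, b)        -> select c, (fneg a), (fneg b)
+    //   select c, (fneg a), (fneg b) -> fneg (select c, a, b)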
    // TODO: Invert conditions of foldFreeOpFromSelect
-    return SDValue();
+    SDValue Cond = N0.getOperand(0);
+    SDValue LHS = N0.getOperand(1);
+    SDValue RHS = N0.getOperand(2);
+    EVT LHVT = LHS.getValueType();
+    EVT RHVT = RHS.getValueType();
+    // The regression was limited to i32 and v2i32.
+    if (RHVT != MVT::i32 && LHVT != MVT::i32)
+      return SDValue();
+
+    SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, LHVT, LHS);
+    SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, RHVT, RHS);
+    SDValue Op = DAG.getNode(Opc, SL, LHVT, Cond, LFNeg, RFNeg);
+    return Op;
  }
  case ISD::BITCAST: {
    SDLoc SL(N);