@@ -4025,9 +4025,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
 /// binary operation \p Opc to it with the corresponding constant operands.
 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
-    DAGCombinerInfo &DCI, const SDLoc &SL,
-    unsigned Opc, SDValue LHS,
-    uint32_t ValLo, uint32_t ValHi) const {
+    DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
+    uint32_t ValLo, uint32_t ValHi) const {
   SelectionDAG &DAG = DCI.DAG;
   SDValue Lo, Hi;
   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
@@ -4056,6 +4055,59 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
 
+  // When the shl64_reduce optimisation pattern goes through vector
+  // legalization, some scalarisation occurs. Once ISD::AND was made legal,
+  // the AND instructions were no longer elided, as described below. The
+  // following code ensures that the elision still takes place.
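+  // The pattern matched here is:
+  //   (shl X, (extract_vector_elt (and v2i32 Y, (build_vector 31, 31)), Idx))
+  // where only lane Idx of the masked shift amount is used.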
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      if (VAND->getOpcode() == ISD::AND && CRRHS) {
+        SDValue LHSAND = VAND.getOperand(0);
+        SDValue RHSAND = VAND.getOperand(1);
+        if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+          // Part of the shl combine is to optimise for the case where it is
+          // possible to reduce shl64 to shl32 if the shift range is [32, 63].
+          // This transforms DST = shl i64 X, Y into [0, shl i32 X, (Y & 31)],
+          // and the '&' is then elided by ISel. The vector code for this was
+          // previously scalarised completely by the vector legalizer, but now
+          // that v2i32 is legal the legalizer only partially scalarises the
+          // vector operations, so the AND was no longer elided. This check
+          // locates and scalarises the v2i32 AND, re-enabling ISel to elide
+          // the AND instruction.
+          ConstantSDNode *CANDL =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+          ConstantSDNode *CANDR =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+          if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+              RHSAND->getConstantOperandVal(1) == 0x1f) {
+            // Get the non-const AND operands and produce scalar ANDs.
+            const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+            const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+            SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                                     LHSAND, Zero);
+            SDValue Hi =
+                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+            SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+            SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+            SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
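+            // Rebuild the shift against the scalarised mask of the extracted
+            // lane; ISel can then elide the scalar AND, since the 32-bit
+            // shift instructions only read the low five bits of the amount.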
+            SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+            if (AndIndex == 0 || AndIndex == 1)
+              return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc,
+                                 AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+          }
+        }
+      }
+    }
+  }
+
   unsigned RHSVal;
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
@@ -4097,8 +4149,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   if (VT.getScalarType() != MVT::i64)
     return SDValue();
 
-  // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
-
   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
   // common case, splitting this into a move and a 32-bit shift is faster and
   // the same code size.
@@ -4189,6 +4239,59 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
   SDLoc SL(N);
   unsigned RHSVal;
 
+  // When the srl64_reduce optimisation pattern goes through vector
+  // legalization, some scalarisation occurs. Once ISD::AND was made legal,
+  // the AND instructions were no longer elided, as described below. The
+  // following code ensures that the elision still takes place.
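+  // The pattern matched here is:
+  //   (srl X, (extract_vector_elt (and v2i32 Y, (build_vector 31, 31)), Idx))
+  // where only lane Idx of the masked shift amount is used.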
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      if (VAND->getOpcode() == ISD::AND && CRRHS) {
+        SDValue LHSAND = VAND.getOperand(0);
+        SDValue RHSAND = VAND.getOperand(1);
+        if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+          // Part of the srl combine is to optimise for the case where it is
+          // possible to reduce srl64 to srl32 if the shift range is [32, 63].
+          // This transforms DST = srl i64 X, Y into [srl i32 X, (Y & 31), 0],
+          // and the '&' is then elided by ISel. The vector code for this was
+          // previously scalarised completely by the vector legalizer, but now
+          // that v2i32 is legal the legalizer only partially scalarises the
+          // vector operations, so the AND was no longer elided. This check
+          // locates and scalarises the v2i32 AND, re-enabling ISel to elide
+          // the AND instruction.
+          ConstantSDNode *CANDL =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+          ConstantSDNode *CANDR =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+          if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+              RHSAND->getConstantOperandVal(1) == 0x1f) {
+            // Get the non-const AND operands and produce scalar ANDs.
+            const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+            const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+            SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                                     LHSAND, Zero);
+            SDValue Hi =
+                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+            SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+            SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+            SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
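+            // Rebuild the shift against the scalarised mask of the extracted
+            // lane; ISel can then elide the scalar AND, since the 32-bit
+            // shift instructions only read the low five bits of the amount.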
+            SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+            if (AndIndex == 0 || AndIndex == 1)
+              return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc,
+                                 AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+          }
+        }
+      }
+    }
+  }
+
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
 
@@ -4701,8 +4804,27 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
     if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
       return SDValue();
 
-    return distributeOpThroughSelect(DCI, LHS.getOpcode(),
-                                     SDLoc(N), Cond, LHS, RHS);
+    // select c, (fneg (f32 bitcast i32 x)), (fneg (f32 bitcast i32 y)) can be
+    // lowered directly to a V_CNDMASK_, so prevent the fneg from being pulled
+    // out in this case. For now the logic is as specific to this case as
+    // possible; hopefully it can be relaxed in future.
+    if (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG) {
+      SDValue LHSB = LHS.getOperand(0);
+      SDValue RHSB = RHS.getOperand(0);
+      if (LHSB.getOpcode() == ISD::BITCAST &&
+          RHSB->getOpcode() == ISD::BITCAST) {
+        EVT LHSBOpTy = LHSB->getOperand(0).getValueType();
+        EVT RHSBOpTy = RHSB->getOperand(0).getValueType();
+        if (LHSB.getValueType() == MVT::f32 &&
+            RHSB.getValueType() == MVT::f32 && LHSBOpTy == MVT::i32 &&
+            RHSBOpTy == MVT::i32)
+          return SDValue();
+      }
+    }
+
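+    // Otherwise pull the free op out through the select as before.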
+    return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS,
+                                     RHS);
   }
 
   bool Inv = false;
@@ -4755,8 +4877,8 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
   if (Inv)
     std::swap(NewLHS, NewRHS);
 
-  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
-                                  Cond, NewLHS, NewRHS);
+  SDValue NewSelect =
+      DAG.getNode(ISD::SELECT, SL, VT, Cond, NewLHS, NewRHS);
   DCI.AddToWorklist(NewSelect.getNode());
   return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
 }
@@ -5094,8 +5216,27 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
   }
   case ISD::SELECT: {
     // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
+    // This combine became necessary recently to prevent a regression in
+    // fneg-modifier-casting.ll caused by this patch legalising v2i32 xor.
+    // Specifically, additional instructions were added to the final codegen.
+    // When this combine was added, a case was added to foldFreeOpFromSelect
+    // to prevent the combine from being undone under certain conditions.
     // TODO: Invert conditions of foldFreeOpFromSelect
-    return SDValue();
+    SDValue Cond = N0.getOperand(0);
+    SDValue LHS = N0.getOperand(1);
+    SDValue RHS = N0.getOperand(2);
+    EVT LHVT = LHS.getValueType();
+    EVT RHVT = RHS.getValueType();
+    // The regression was limited to i32 / v2i32.
+    if (RHVT != MVT::i32 && LHVT != MVT::i32)
+      return SDValue();
+
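+    // Push the fneg into both select operands, matching the pattern
+    // documented at the top of this case.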
+    SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, LHVT, LHS);
+    SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, RHVT, RHS);
+    SDValue Op = DAG.getNode(Opc, SL, LHVT, Cond, LFNeg, RFNeg);
+    return Op;
   }
   case ISD::BITCAST: {
     SDLoc SL(N);
0 commit comments