
Commit d77c537

[AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions to make use of 64-bit wide instructions

Make use of s_or_b64/s_and_b64/s_xor_b64 for v2i32. Legalising these causes a number of test regressions, so extra work in the combiner and TableGen patterns was necessary.

- Use Custom lowering for v2i32 rotr instead of additional patterns. Modify performOrCombine() to remove some identity OR operations.
- Fix the rotr regression by adding lowerROTR() on the legalizer codepath.
- Add a test case to rotr.ll.
- Extend performFNegCombine() for the SELECT case.
- Modify performSelectCombine() and foldFreeOpFromSelect() to prevent the performFNegCombine() changes from being unwound.
- Add cases to or.ll and xor.ll to demonstrate the generation of the s_or_b64 and s_xor_b64 instructions for the v2i32 cases. Previously this was inhibited by "-amdgpu-scalarize-global-loads=false".
- Fix the shl/srl64_reduce regression by performing the scalarisation previously performed by the vector legaliser in the combiner.
1 parent 09e794c commit d77c537

18 files changed: +1809 −223 lines
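For orientation, here is a minimal sketch, not taken from the patch, of the shape of operation the new v2i32 cases in or.ll and xor.ll exercise, written as C++ with Clang/GCC vector extensions (the function names are illustrative). With the legalisation below, the bitwise op survives as a single v2i32 node and can be selected as one 64-bit scalar instruction such as s_or_b64 or s_xor_b64 when the operands end up uniform.

// Not from the patch: C++-level shape of the new v2i32 cases in or.ll/xor.ll,
// using Clang/GCC vector extensions. The names are hypothetical.
typedef int v2i32_t __attribute__((vector_size(8)));

v2i32_t or_v2i32(v2i32_t a, v2i32_t b) { return a | b; }   // candidate for s_or_b64
v2i32_t xor_v2i32(v2i32_t a, v2i32_t b) { return a ^ b; }  // candidate for s_xor_b64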

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 136 additions & 10 deletions
@@ -4025,9 +4025,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
 /// binary operation \p Opc to it with the corresponding constant operands.
 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
-    DAGCombinerInfo &DCI, const SDLoc &SL,
-    unsigned Opc, SDValue LHS,
-    uint32_t ValLo, uint32_t ValHi) const {
+    DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
+    uint32_t ValLo, uint32_t ValHi) const {
   SelectionDAG &DAG = DCI.DAG;
   SDValue Lo, Hi;
   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
@@ -4056,6 +4055,53 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
 
+  // When the shl64_reduce optimisation code is passed through vector
+  // legalization, some scalarising occurs. After ISD::AND was legalised, this
+  // resulted in the AND instructions no longer being elided, as mentioned
+  // below. The following code should make sure this takes place.
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      if (VAND->getOpcode() == ISD::AND && CRRHS) {
+        SDValue LHSAND = VAND.getOperand(0);
+        SDValue RHSAND = VAND.getOperand(1);
+        if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+          // Part of shlcombine is to optimise for the case where it's
+          // possible to reduce shl64 to shl32 if the shift range is [63-32].
+          // This transforms: DST = shl i64 X, Y to [0, shl i32 X, (Y & 31)].
+          // The '&' is then elided by ISel. The vector code for this was
+          // being completely scalarised by the vector legalizer, but now that
+          // v2i32 is legal the vector legaliser only partially scalarises the
+          // vector operations and the AND was not elided. This check enables
+          // us to locate and scalarise the v2i32 AND and re-enable ISel to
+          // elide the AND instruction.
+          ConstantSDNode *CANDL =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+          ConstantSDNode *CANDR =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+          if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+              RHSAND->getConstantOperandVal(1) == 0x1f) {
+            // Get the non-const AND operands and produce the scalar AND.
+            const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+            const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+            SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                                     LHSAND, Zero);
+            SDValue Hi =
+                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+            SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+            SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+            SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+            SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+            if (AndIndex == 0 || AndIndex == 1)
+              return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc,
+                                 AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+          }
+        }
+      }
+    }
+  }
+
   unsigned RHSVal;
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
@@ -4097,8 +4143,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   if (VT.getScalarType() != MVT::i64)
     return SDValue();
 
-  // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
-
   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
   // common case, splitting this into a move and a 32-bit shift is faster and
   // the same code size.
@@ -4189,6 +4233,53 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
   SDLoc SL(N);
   unsigned RHSVal;
 
+  // When the shl64_reduce optimisation code is passed through vector
+  // legalization, some scalarising occurs. After ISD::AND was legalised, this
+  // resulted in the AND instructions no longer being elided, as mentioned
+  // below. The following code should make sure this takes place.
+  if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue VAND = RHS.getOperand(0);
+    if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
+      uint64_t AndIndex = RHS->getConstantOperandVal(1);
+      if (VAND->getOpcode() == ISD::AND && CRRHS) {
+        SDValue LHSAND = VAND.getOperand(0);
+        SDValue RHSAND = VAND.getOperand(1);
+        if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
+          // Part of srlcombine is to optimise for the case where it's
+          // possible to reduce srl64 to srl32 if the shift range is [63-32].
+          // This transforms: DST = srl i64 X, Y to [0, srl i32 X, (Y & 31)].
+          // The '&' is then elided by ISel. The vector code for this was
+          // being completely scalarised by the vector legalizer, but now that
+          // v2i32 is legal the vector legaliser only partially scalarises the
+          // vector operations and the AND was not elided. This check enables
+          // us to locate and scalarise the v2i32 AND and re-enable ISel to
+          // elide the AND instruction.
+          ConstantSDNode *CANDL =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+          ConstantSDNode *CANDR =
+              dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+          if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
+              RHSAND->getConstantOperandVal(1) == 0x1f) {
+            // Get the non-const AND operands and produce the scalar AND.
+            const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+            const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+            SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                                     LHSAND, Zero);
+            SDValue Hi =
+                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+            SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+            SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+            SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+            SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+            if (AndIndex == 0 || AndIndex == 1)
+              return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc,
+                                 AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
+          }
+        }
+      }
+    }
+  }
+
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
 
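The comments in the two hunks above rely on an arithmetic identity for shift amounts in [32, 63]: a 64-bit shl keeps only a 32-bit shl of the low word in its high half, and a 64-bit srl keeps only a 32-bit srl of the high word in its low half, with the amount reduced modulo 32. A self-contained C++ check of that identity (illustrative only, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t samples[] = {0x00000001deadbeefULL, 0xffffffff00000001ULL,
                              0x8000000000000000ULL};
  for (uint64_t x : samples) {
    for (unsigned y = 32; y < 64; ++y) {
      // shl64 -> [0, shl32(lo(x), y & 31)]
      uint64_t shl = x << y;
      assert(uint32_t(shl) == 0);
      assert(uint32_t(shl >> 32) == (uint32_t(x) << (y & 31)));
      // srl64 -> [srl32(hi(x), y & 31), 0]
      uint64_t srl = x >> y;
      assert(uint32_t(srl >> 32) == 0);
      assert(uint32_t(srl) == (uint32_t(x >> 32) >> (y & 31)));
    }
  }
  return 0;
}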
@@ -4701,8 +4792,26 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
     if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
       return SDValue();
 
-    return distributeOpThroughSelect(DCI, LHS.getOpcode(),
-                                     SDLoc(N), Cond, LHS, RHS);
+    // select c, (fneg (f32 bitcast i32 x)), (fneg (f32 bitcast i32 y)) can be
+    // lowered directly to a V_CNDMASK_. So prevent the fneg from being pulled
+    // out in this case. For now I've made the logic as specific to the case
+    // as possible, hopefully this can be relaxed in future.
+    if (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG) {
+      SDValue LHSB = LHS.getOperand(0);
+      SDValue RHSB = RHS.getOperand(0);
+      if (LHSB.getOpcode() == ISD::BITCAST &&
+          RHSB->getOpcode() == ISD::BITCAST) {
+        EVT LHSBOpTy = LHSB->getOperand(0).getValueType();
+        EVT RHSBOpTy = RHSB->getOperand(0).getValueType();
+        if (LHSB.getValueType() == MVT::f32 &&
+            RHSB.getValueType() == MVT::f32 && LHSBOpTy == MVT::i32 &&
+            RHSBOpTy == MVT::i32)
+          return SDValue();
+      }
+    }
+
+    return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS,
+                                     RHS);
   }
 
   bool Inv = false;
@@ -4755,8 +4864,8 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
   if (Inv)
     std::swap(NewLHS, NewRHS);
 
-  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
-                                  Cond, NewLHS, NewRHS);
+  SDValue NewSelect =
+      DAG.getNode(ISD::SELECT, SL, VT, Cond, NewLHS, NewRHS);
   DCI.AddToWorklist(NewSelect.getNode());
   return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
 }
@@ -5094,8 +5203,25 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
   }
   case ISD::SELECT: {
     // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
+    // This combine became necessary recently to prevent a regression in
+    // fneg-modifier-casting.ll caused by this patch legalising v2i32 xor.
+    // Specifically, additional instructions were added to the final codegen.
+    // When adding this combine a case was added to performFNEGCombine to
+    // prevent this combine from being undone under certain conditions.
     // TODO: Invert conditions of foldFreeOpFromSelect
-    return SDValue();
+    SDValue Cond = N0.getOperand(0);
+    SDValue LHS = N0.getOperand(1);
+    SDValue RHS = N0.getOperand(2);
+    EVT LHVT = LHS.getValueType();
+    EVT RHVT = RHS.getValueType();
+    // The regression was limited to i32 and v2i32.
+    if (RHVT != MVT::i32 && LHVT != MVT::i32)
+      return SDValue();
+
+    SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, LHVT, LHS);
+    SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, RHVT, RHS);
+    SDValue Op = DAG.getNode(Opc, SL, LHVT, Cond, LFNeg, RFNeg);
+    return Op;
   }
   case ISD::BITCAST: {
     SDLoc SL(N);

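The new SELECT case applies fneg (select c, a, b) -> select c, (fneg a), (fneg b). At the value level this is an identity: negating the selected result is the same as selecting between negated operands, since only one branch is ever produced. A tiny C++ illustration (assumed example, not from the patch):

#include <cassert>

// fneg(select(c, a, b)) == select(c, fneg(a), fneg(b)): the negation can be
// pushed into both operands without changing which value is produced.
static float select_then_neg(bool c, float a, float b) { return -(c ? a : b); }
static float neg_then_select(bool c, float a, float b) { return c ? -a : -b; }

int main() {
  const float vals[] = {0.0f, -0.0f, 1.5f, -2.25f};
  for (bool c : {false, true})
    for (float a : vals)
      for (float b : vals)
        assert(select_then_neg(c, a, b) == neg_then_select(c, a, b));
  return 0;
}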
llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 93 additions & 2 deletions
@@ -438,6 +438,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
   }
 
+  setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, MVT::v2i32, Legal);
+  // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
+  // instead lower to cndmask in SITargetLowering::LowerSELECT().
+  setOperationAction(ISD::SELECT, MVT::v2i32, Custom);
+  // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
+  // alignbit.
+  setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
+
   setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
                      Custom);
 
@@ -5934,6 +5942,20 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
 }
 
+// Enable lowering of ROTR for vxi32 types. This is a workaround for a
+// regression whereby extra unnecessary instructions were added to codegen
+// for rotr operations, caused by legalising v2i32 or. This resulted in extra
+// instructions to extract the result from the vector.
+SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
+  [[maybe_unused]] EVT VT = Op.getValueType();
+
+  assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
+          VT == MVT::v16i32) &&
+         "Unexpected ValueType.");
+
+  return DAG.UnrollVectorOp(Op.getNode());
+}
+
 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
 // wider vector type is legal.
 SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
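lowerROTR() just unrolls the vector operation, so each lane becomes an independent 32-bit rotate-right that the existing patterns can turn into an alignbit-style instruction, per the comment in the constructor above. A scalar C++ sketch of the per-lane semantics it falls back to (illustrative only, not backend code):

#include <cstdint>

// Rotate-right of one 32-bit lane; a v2i32 ROTR is just this applied per lane.
static uint32_t rotr32(uint32_t x, uint32_t amt) {
  amt &= 31;                       // rotate amount is taken modulo 32
  return amt ? (x >> amt) | (x << (32 - amt)) : x;
}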
@@ -6125,6 +6147,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return lowerGET_FPENV(Op, DAG);
   case ISD::SET_FPENV:
     return lowerSET_FPENV(Op, DAG);
+  case ISD::ROTR:
+    return lowerROTR(Op, DAG);
   }
   return SDValue();
 }
@@ -13018,6 +13042,47 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
     }
   }
 
+  // Detect identity v2i32 OR and replace with identity source node.
+  // Specifically an Or that has operands constructed from the same source node
+  // via extract_vector_elt and build_vector. I.E.
+  // v2i32 or(
+  //   v2i32 build_vector(
+  //     i32 extract_elt(%IdentitySrc, 0),
+  //     i32 0
+  //   ),
+  //   v2i32 build_vector(
+  //     i32 0,
+  //     i32 extract_elt(%IdentitySrc, 1)
+  //   ) )
+  // =>
+  // v2i32 %IdentitySrc
+
+  if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
+      RHS->getOpcode() == ISD::BUILD_VECTOR) {
+
+    ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+    ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
+
+    // Test for and normalise build vectors.
+    if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
+
+      // Get the extract_vector_element operands.
+      SDValue LEVE = LHS->getOperand(0);
+      SDValue REVE = RHS->getOperand(1);
+
+      if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+        // Check that different elements from the same vector are
+        // extracted.
+        if (LEVE->getOperand(0) == REVE->getOperand(0) &&
+            LEVE->getOperand(1) != REVE->getOperand(1)) {
+          SDValue IdentitySrc = LEVE.getOperand(0);
+          return IdentitySrc;
+        }
+      }
+    }
+  }
+
   if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
     return SDValue();
 
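Stated at the value level, the pattern in the comment above ORs two copies of the source vector that each keep one lane and zero the other, so the OR simply reassembles %IdentitySrc. A small C++ sketch of that identity, using a hypothetical two-field struct in place of v2i32 (illustrative only):

#include <cassert>
#include <cstdint>

struct V2i32 { uint32_t lane0, lane1; };

int main() {
  V2i32 src{0xdeadbeefu, 0x12345678u};
  V2i32 lhs{src.lane0, 0u};               // build_vector(extract(src, 0), 0)
  V2i32 rhs{0u, src.lane1};               // build_vector(0, extract(src, 1))
  V2i32 ored{lhs.lane0 | rhs.lane0, lhs.lane1 | rhs.lane1};
  assert(ored.lane0 == src.lane0 && ored.lane1 == src.lane1); // == src
  return 0;
}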
@@ -13062,13 +13127,39 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
   if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
     return RV;
 
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
 
+  if (VT == MVT::v2i32 && LHS.getNumOperands() > 1) {
+
+    const ConstantSDNode *CRHS0 = dyn_cast<ConstantSDNode>(RHS.getOperand(0));
+    const ConstantSDNode *CRHS1 = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+    SDValue LHS_0 = LHS.getOperand(0);
+    SDValue LHS_1 = LHS.getOperand(1);
+
+    if (LHS.getOpcode() == ISD::VSELECT && CRHS0 &&
+        CRHS0->getAPIntValue().isSignMask() &&
+        shouldFoldFNegIntoSrc(N, LHS_0) && CRHS1 &&
+        CRHS1->getAPIntValue().isSignMask() &&
+        shouldFoldFNegIntoSrc(N, LHS_1)) {
+
+      SDLoc DL(N);
+      SDValue CastLHS =
+          DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(1));
+      SDValue CastRHS =
+          DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(2));
+      SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastLHS);
+      SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastRHS);
+      SDValue NewSelect = DAG.getNode(ISD::VSELECT, DL, MVT::v2f32,
+                                      LHS->getOperand(0), FNegLHS, FNegRHS);
+      return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
+    }
+  }
+
   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
-  SelectionDAG &DAG = DCI.DAG;
 
-  EVT VT = N->getValueType(0);
   if (CRHS && VT == MVT::i64) {
     if (SDValue Split =
             splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))

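The v2i32 XOR combine above turns xor(vselect(c, a, b), splat(sign mask)) into a vselect of fnegs on v2f32 operands. The per-lane fact it relies on: XOR-ing an i32 lane with 0x80000000 flips exactly the sign bit of the f32 value sharing that bit pattern, i.e. it is fneg at the bit level. A standalone C++20 check (illustrative, not part of the patch):

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  const float vals[] = {0.0f, -0.0f, 1.0f, -3.5f, 1e-38f};
  for (float f : vals) {
    uint32_t bits = std::bit_cast<uint32_t>(f);
    float negged = std::bit_cast<float>(bits ^ 0x80000000u); // xor sign bit
    assert(std::bit_cast<uint32_t>(negged) ==
           std::bit_cast<uint32_t>(-f));                     // same as fneg
  }
  return 0;
}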
llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 0 deletions
@@ -442,6 +442,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const;
 
   Register getRegisterByName(const char* RegName, LLT VT,
                              const MachineFunction &MF) const override;
