
[AMDGPU][SDAG] Legalise v2i32 or/xor/and instructions to make use of 64-bit wide instructions #140694
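As context for the diff below, here is a minimal standalone C++ sketch (illustrative only, not taken from the patch) of the equivalence the legalisation exploits: a bitwise op on an i64 value behaves exactly like the same op applied independently to its two i32 halves, which is why a v2i32 and/or/xor can be selected as a single 64-bit wide instruction.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Lo0 = 0x12345678, Hi0 = 0x9ABCDEF0;   // lanes of the first v2i32
  uint32_t Lo1 = 0x0F0F0F0F, Hi1 = 0xF0F0F0F0;   // lanes of the second v2i32
  uint64_t A = ((uint64_t)Hi0 << 32) | Lo0;      // pack the lanes into one i64
  uint64_t B = ((uint64_t)Hi1 << 32) | Lo1;
  uint64_t Wide = A & B;                         // one 64-bit AND
  assert((uint32_t)Wide == (Lo0 & Lo1));         // low lane matches the lane-wise AND
  assert((uint32_t)(Wide >> 32) == (Hi0 & Hi1)); // high lane matches too
  return 0;
}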

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open. Wants to merge 18 commits into base branch main. Changes from all commits are shown below.
107 changes: 103 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -721,6 +721,18 @@ static bool selectSupportsSourceMods(const SDNode *N) {
return N->getValueType(0) == MVT::f32;
}

LLVM_READONLY
static bool buildVectorSupportsSourceMods(const SDNode *N) {
if (N->getValueType(0) != MVT::v2f32)
return true;

if (N->getOperand(0)->getOpcode() != ISD::SELECT ||
N->getOperand(1)->getOpcode() != ISD::SELECT)
return true;

return false;
}
Comment from a Contributor on lines +725 to +734:

BUILD_VECTOR doesn't support source modifiers. It wouldn't be specific to this case either, it would be for all types and all opcodes that support source modifiers in the input. But instead of pretending build_vector supports these, there should be combines involving build_vector to expose the modifiers to the real use/def instructions


// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READONLY
@@ -754,6 +766,8 @@ static bool hasSourceMods(const SDNode *N) {
return true;
}
}
case ISD::BUILD_VECTOR:
return buildVectorSupportsSourceMods(N);
case ISD::SELECT:
return selectSupportsSourceMods(N);
default:
@@ -4056,6 +4070,50 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
SDLoc SL(N);
SelectionDAG &DAG = DCI.DAG;

if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
SDValue VAND = RHS.getOperand(0);
if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
uint64_t AndIndex = RHS->getConstantOperandVal(1);
if (VAND->getOpcode() == ISD::AND && CRRHS) {
SDValue LHSAND = VAND.getOperand(0);
SDValue RHSAND = VAND.getOperand(1);
if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
// Part of shlcombine is to optimise for the case where it's possible
// to reduce shl64 to shl32 if the shift range is [32, 63]. This
// transforms: DST = shl i64 X, Y to [0, shl i32 X, (Y & 31)]. The
// '&' is then elided by ISel. The vector code for this was being
// completely scalarised by the vector legalizer, but now that v2i32
// is made legal the vector legalizer only partially scalarises the
// vector operations and the AND was not elided. This check enables us
// to locate and scalarise the v2i32 AND and re-enable ISel to elide
// the AND instruction.
ConstantSDNode *CANDL =
dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
ConstantSDNode *CANDR =
dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
RHSAND->getConstantOperandVal(1) == 0x1f) {
// Get the non-const AND operands and produce scalar AND
const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
const SDValue One = DAG.getConstant(1, SL, MVT::i32);
SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
LHSAND, Zero);
SDValue Hi =
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
SDValue LoAnd =
DAG.getNode(ISD::AND, SL, MVT::i32, Lo, RHSAND->getOperand(0));
SDValue HiAnd =
DAG.getNode(ISD::AND, SL, MVT::i32, Hi, RHSAND->getOperand(0));
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
if (AndIndex == 0 || AndIndex == 1)
return DAG.getNode(ISD::SHL, SL, MVT::i32, Trunc,
AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
}
}
}
}
}
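For reference, a standalone C++ check (illustrative only, not part of the patch) of the shift identity the comment above relies on: for amounts in [32, 63], a 64-bit shl equals a 32-bit shl of the low half by (amount & 31) placed into the high half, with a zero low half.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0x89ABCDEF01234567ULL;
  for (unsigned S = 32; S <= 63; ++S) {
    uint64_t Wide = X << S;                                      // 64-bit shift
    uint64_t Narrow = (uint64_t)((uint32_t)X << (S & 31)) << 32; // 32-bit shift, moved to the high half
    assert(Wide == Narrow);
  }
  return 0;
}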

unsigned RHSVal;
if (CRHS) {
RHSVal = CRHS->getZExtValue();
@@ -4097,8 +4155,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
if (VT.getScalarType() != MVT::i64)
return SDValue();

// i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))

// On some subtargets, 64-bit shift is a quarter rate instruction. In the
// common case, splitting this into a move and a 32-bit shift is faster and
// the same code size.
@@ -4189,6 +4245,49 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
SDLoc SL(N);
unsigned RHSVal;

if (RHS->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
SDValue VAND = RHS.getOperand(0);
if (ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1))) {
uint64_t AndIndex = RHS->getConstantOperandVal(1);
if (VAND->getOpcode() == ISD::AND && CRRHS) {
SDValue LHSAND = VAND.getOperand(0);
SDValue RHSAND = VAND.getOperand(1);
if (RHSAND->getOpcode() == ISD::BUILD_VECTOR) {
// Part of srlcombine is to optimise for the case where it's possible
// to reduce srl64 to srl32 if the shift range is [32, 63]. This
// transforms: DST = srl i64 X, Y to [srl i32 X, (Y & 31), 0]. The
// '&' is then elided by ISel. The vector code for this was being
// completely scalarised by the vector legalizer, but now that v2i32
// is made legal the vector legalizer only partially scalarises the
// vector operations and the AND was not elided. This check enables us
// to locate and scalarise the v2i32 AND and re-enable ISel to elide
// the AND instruction.
ConstantSDNode *CANDL =
dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
ConstantSDNode *CANDR =
dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
if (CANDL && CANDR && RHSAND->getConstantOperandVal(0) == 0x1f &&
RHSAND->getConstantOperandVal(1) == 0x1f) {
// Get the non-const AND operands and produce scalar AND
const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
const SDValue One = DAG.getConstant(1, SL, MVT::i32);
SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
LHSAND, Zero);
SDValue Hi =
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
if (AndIndex == 0 || AndIndex == 1)
return DAG.getNode(ISD::SRL, SL, MVT::i32, Trunc,
AndIndex == 0 ? LoAnd : HiAnd, N->getFlags());
}
}
}
}
}
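The analogous standalone check (again illustrative only, not part of the patch) for the srl case: for amounts in [32, 63], a 64-bit srl equals a 32-bit srl of the high half by (amount & 31), with a zero high half.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0x89ABCDEF01234567ULL;
  for (unsigned S = 32; S <= 63; ++S) {
    uint64_t Wide = X >> S;                                         // 64-bit shift
    uint64_t Narrow = (uint64_t)((uint32_t)(X >> 32) >> (S & 31));  // 32-bit shift of the high half
    assert(Wide == Narrow);
  }
  return 0;
}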

if (CRHS) {
RHSVal = CRHS->getZExtValue();

@@ -4701,8 +4800,8 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
return SDValue();

return distributeOpThroughSelect(DCI, LHS.getOpcode(),
SDLoc(N), Cond, LHS, RHS);
return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS,
RHS);
}

bool Inv = false;
98 changes: 96 additions & 2 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -438,6 +438,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
}

setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, MVT::v2i32, Legal);
// Prevent SELECT v2i32 from being implemented with the above bitwise ops and
// instead lower to cndmask in SITargetLowering::LowerSELECT().
setOperationAction(ISD::SELECT, MVT::v2i32, Custom);
// Enable MatchRotate to produce ISD::ROTR, which is later transformed to
// alignbit.
setOperationAction(ISD::ROTR, MVT::v2i32, Custom);

setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
Custom);

@@ -6041,6 +6049,20 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}

// Enable lowering of ROTR for vxi32 types. This is a workaround for a
// regression whereby extra unnecessary instructions were added to codegen
// for rotr operations, caused by legalising v2i32 OR. This resulted in extra
// instructions to extract the result from the vector.
SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
[[maybe_unused]] EVT VT = Op.getValueType();

assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
VT == MVT::v16i32) &&
"Unexpected ValueType.");

return DAG.UnrollVectorOp(Op.getNode());
}
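A small standalone C++ illustration of why a per-element rotr maps cleanly onto an alignbit-style funnel shift once the vector op is unrolled; the funnel-shift reading of v_alignbit_b32 is my assumption from the ISA documentation, not something stated in this patch.

#include <cassert>
#include <cstdint>

// Funnel-shift right of the 64-bit value {Hi:Lo} by N, keeping the low 32 bits.
static uint32_t FunnelShiftRight(uint32_t Hi, uint32_t Lo, unsigned N) {
  N &= 31;
  return N ? (Lo >> N) | (Hi << (32 - N)) : Lo;
}

int main() {
  uint32_t X = 0xDEADBEEF;
  for (unsigned N = 0; N < 64; ++N) {
    unsigned K = N & 31;
    uint32_t Rot = K ? (X >> K) | (X << (32 - K)) : X; // rotate-right by K
    assert(Rot == FunnelShiftRight(X, X, N));          // rotr(x, n) == funnel shift of x with itself
  }
  return 0;
}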

// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
// wider vector type is legal.
SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
@@ -6232,6 +6254,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerGET_FPENV(Op, DAG);
case ISD::SET_FPENV:
return lowerSET_FPENV(Op, DAG);
case ISD::ROTR:
return lowerROTR(Op, DAG);
}
return SDValue();
}
@@ -13136,6 +13160,47 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
}
}

// Detect an identity v2i32 OR and replace it with the identity source node.
// Specifically an OR whose operands are constructed from the same source node
// via extract_vector_elt and build_vector, i.e.
// v2i32 or(
// v2i32 build_vector(
// i32 extract_elt(%IdentitySrc, 0),
// i32 0
// ),
// v2i32 build_vector(
// i32 0,
// i32 extract_elt(%IdentitySrc, 1)
// ) )
// =>
// v2i32 %IdentitySrc

if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
RHS->getOpcode() == ISD::BUILD_VECTOR) {

ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));

// Check that the constant lanes of both build vectors are zero.
if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {

// Get the extract_vector_element operands.
SDValue LEVE = LHS->getOperand(0);
SDValue REVE = RHS->getOperand(1);

if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
// Check that different elements from the same vector are
// extracted.
if (LEVE->getOperand(0) == REVE->getOperand(0) &&
LEVE->getOperand(1) != REVE->getOperand(1)) {
SDValue IdentitySrc = LEVE.getOperand(0);
return IdentitySrc;
}
}
}
}
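A standalone C++ analogue (illustrative only, not part of the patch) of the identity being detected: OR-ing two half-populated copies of the same vector, one holding lane 0 and one holding lane 1, reproduces the original vector, so the whole expression can be folded to the identity source.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Src[2] = {0xAAAA5555, 0x12348765}; // %IdentitySrc
  uint32_t LHS[2] = {Src[0], 0};              // build_vector(extract_elt(Src, 0), 0)
  uint32_t RHS[2] = {0, Src[1]};              // build_vector(0, extract_elt(Src, 1))
  uint32_t Or[2] = {LHS[0] | RHS[0], LHS[1] | RHS[1]};
  assert(Or[0] == Src[0] && Or[1] == Src[1]); // the OR is just the identity source
  return 0;
}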

if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
return SDValue();

@@ -13180,13 +13245,42 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
return RV;

SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);

// Fold the fneg of a vselect into the two vselect operands.
// xor (vselect c, a, b), 0x80000000 ->
// bitcast (vselect c, (fneg (bitcast a)), (fneg (bitcast b)))
if (VT == MVT::v2i32 && LHS.getNumOperands() > 1) {

const ConstantSDNode *CRHS0 = dyn_cast<ConstantSDNode>(RHS.getOperand(0));
const ConstantSDNode *CRHS1 = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
SDValue LHS_0 = LHS.getOperand(0);
SDValue LHS_1 = LHS.getOperand(1);

if (LHS.getOpcode() == ISD::VSELECT && CRHS0 &&
CRHS0->getAPIntValue().isSignMask() &&
shouldFoldFNegIntoSrc(N, LHS_0) && CRHS1 &&
CRHS1->getAPIntValue().isSignMask() &&
shouldFoldFNegIntoSrc(N, LHS_1)) {

SDLoc DL(N);
SDValue CastLHS =
DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(1));
SDValue CastRHS =
DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(2));
SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastLHS);
SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastRHS);
SDValue NewSelect = DAG.getNode(ISD::VSELECT, DL, MVT::v2f32,
LHS->getOperand(0), FNegLHS, FNegRHS);
return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
}
Comment from a Contributor on lines +13269 to +13279:

I'd prefer to handle using fneg/fabs on integer select as a separate patch

Reply from the Contributor Author:

How do you suggest we prevent the test regression without including this code in this diff?

Reply from the Contributor:

https://godbolt.org/z/9aocMa381 directly do the work to support source modifiers as integer on select

}
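The bit-level fact behind this fold, shown as a standalone C++ check (illustrative only, not part of the patch): XOR-ing the IEEE-754 sign bit of a float is a bitwise fneg, so xor (vselect c, a, b), 0x80000000 can instead negate the selected operands.

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  float F = 3.5f;
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits)); // reinterpret the float's bits
  Bits ^= 0x80000000u;                  // flip only the sign bit
  float Neg;
  std::memcpy(&Neg, &Bits, sizeof(Neg));
  assert(Neg == -3.5f);                 // identical to fneg
  return 0;
}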

const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
SelectionDAG &DAG = DCI.DAG;

EVT VT = N->getValueType(0);
if (CRHS && VT == MVT::i64) {
if (SDValue Split =
splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -442,6 +442,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const;

Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
6 changes: 3 additions & 3 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1869,7 +1869,6 @@ def : GCNPat <
>;
}


/********** ================================ **********/
/********** Floating point absolute/negative **********/
/********** ================================ **********/
@@ -2423,9 +2422,9 @@ def : AMDGPUPatIgnoreCopies <
(COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32))
>;

// 64-bit version
foreach vt = [i64, v2i32] in {
def : AMDGPUPatIgnoreCopies <
(DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
(DivergentBinFrag<xor> vt:$z, (and vt:$x, (xor vt:$y, vt:$z))),
(REG_SEQUENCE VReg_64,
(V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
(i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
@@ -2434,6 +2433,7 @@ def : AMDGPUPatIgnoreCopies <
(i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
(i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
>;
}

def : AMDGPUPat <
(fcopysign f32:$src0, f32:$src1),
15 changes: 15 additions & 0 deletions llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1792,6 +1792,21 @@ def : GCNPat <
(S_MOV_B32 imm:$imm)
>;

def : GCNPat <
(v2i32 (UniformBinFrag<and> v2i32:$x, v2i32:$y)),
(S_AND_B64 SReg_64:$x, SReg_64:$y)
>;

def : GCNPat <
(v2i32 (UniformBinFrag<or> v2i32:$x, v2i32:$y)),
(S_OR_B64 SReg_64:$x, SReg_64:$y)
>;

def : GCNPat <
(v2i32 (UniformBinFrag<xor> v2i32:$x, v2i32:$y)),
(S_XOR_B64 SReg_64:$x, SReg_64:$y)
>;

// Same as a 32-bit inreg
def : GCNPat<
(i32 (UniformUnaryFrag<sext> i16:$src)),
8 changes: 6 additions & 2 deletions llvm/lib/Target/AMDGPU/VOP2Instructions.td
Expand Up @@ -954,9 +954,9 @@ def : DivergentClampingBinOp<sub, V_SUB_CO_U32_e64>;
def : DivergentBinOp<adde, V_ADDC_U32_e32>;
def : DivergentBinOp<sube, V_SUBB_U32_e32>;

class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst> :
class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst, ValueType vt = i64> :
GCNPat<
(DivergentBinFrag<Op> i64:$src0, i64:$src1),
(DivergentBinFrag<Op> vt:$src0, vt:$src1),
(REG_SEQUENCE VReg_64,
(Inst
(i32 (EXTRACT_SUBREG $src0, sub0)),
@@ -973,6 +973,10 @@ def : divergent_i64_BinOp <and, V_AND_B32_e64>;
def : divergent_i64_BinOp <or, V_OR_B32_e64>;
def : divergent_i64_BinOp <xor, V_XOR_B32_e64>;

def : divergent_i64_BinOp <and, V_AND_B32_e64, v2i32>;
def : divergent_i64_BinOp <or, V_OR_B32_e64, v2i32>;
def : divergent_i64_BinOp <xor, V_XOR_B32_e64, v2i32>;

// mul24 w/ 64 bit output.
class mul24_64_Pat<SDPatternOperator Op, Instruction InstLo, Instruction InstHi> : GCNPat<
(i64 (Op i32:$src0, i32:$src1)),