@@ -187,7 +187,15 @@ class AArch64InstructionSelector : public InstructionSelector {
   ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
     return selectAddrModeIndexed(Root, Width / 8);
   }
+
+  bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
+                                     const MachineRegisterInfo &MRI) const;
+  ComplexRendererFns
+  selectAddrModeShiftedExtendXReg(MachineOperand &Root,
+                                  unsigned SizeInBytes) const;
   ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
+  ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
+                                       unsigned SizeInBytes) const;
 
   void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const;
 
@@ -1238,8 +1246,8 @@ bool AArch64InstructionSelector::earlySelectLoad(
   if (DstSize != 64)
     return false;
 
-  // Check if we can do any folding from GEPs etc. into the load.
-  auto ImmFn = selectAddrModeRegisterOffset(I.getOperand(1));
+  // Check if we can do any folding from GEPs/shifts etc. into the load.
+  auto ImmFn = selectAddrModeXRO(I.getOperand(1), MemBytes);
   if (!ImmFn)
     return false;
 
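For reference, a minimal sketch of how the renderer functions returned by selectAddrModeXRO are typically applied when the new load is built; the surrounding builder code is outside this hunk, so the opcode and variable names below are assumptions:

    // Assumed consumer (not part of this diff): each renderer appends one
    // operand to the instruction being built, i.e. base register, offset
    // register, extend immediate, and shift immediate for an
    // X-register-offset load.
    auto NewLoad = MIB.buildInstr(AArch64::LDRXroX, {DstReg}, {});
    for (auto &RenderFn : *ImmFn)
      RenderFn(NewLoad);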
@@ -3995,6 +4003,98 @@ AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
   }};
 }
 
+/// Return true if it is worth folding MI into an extended register. That is,
+/// if it's safe to pull it into the addressing mode of a load or store as a
+/// shift.
+bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
+    MachineInstr &MI, const MachineRegisterInfo &MRI) const {
+  // Always fold if there is one use, or if we're optimizing for size.
+  Register DefReg = MI.getOperand(0).getReg();
+  if (MRI.hasOneUse(DefReg) ||
+      MI.getParent()->getParent()->getFunction().hasMinSize())
+    return true;
+
+  // It's better to avoid folding and recomputing shifts when we don't have a
+  // fastpath.
+  if (!STI.hasLSLFast())
+    return false;
+
+  // We have a fastpath, so folding a shift in and potentially computing it
+  // many times may be beneficial. Check if this is only used in memory ops.
+  // If it is, then we should fold.
+  return all_of(MRI.use_instructions(DefReg),
+                [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
+}
+
+/// This is used for computing addresses like this:
+///
+/// ldr x1, [x2, x3, lsl #3]
+///
+/// Where x2 is the base register, and x3 is an offset register. The shift-left
+/// is a constant value specific to this load instruction. That is, we'll never
+/// see anything other than a 3 here (which corresponds to the size of the
+/// element being loaded.)
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
+    MachineOperand &Root, unsigned SizeInBytes) const {
+  if (!Root.isReg())
+    return None;
+  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+
+  // Make sure that the memory op is a valid size.
+  int64_t LegalShiftVal = Log2_32(SizeInBytes);
+  if (LegalShiftVal == 0)
+    return None;
+
+  // We want to find something like this:
+  //
+  // val = G_CONSTANT LegalShiftVal
+  // shift = G_SHL off_reg val
+  // ptr = G_GEP base_reg shift
+  // x = G_LOAD ptr
+  //
+  // And fold it into this addressing mode:
+  //
+  // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
+
+  // Check if we can find the G_GEP.
+  MachineInstr *Gep = getOpcodeDef(TargetOpcode::G_GEP, Root.getReg(), MRI);
+  if (!Gep || !isWorthFoldingIntoExtendedReg(*Gep, MRI))
+    return None;
+
+  // Now try to match the G_SHL.
+  MachineInstr *Shl =
+      getOpcodeDef(TargetOpcode::G_SHL, Gep->getOperand(2).getReg(), MRI);
+  if (!Shl || !isWorthFoldingIntoExtendedReg(*Shl, MRI))
+    return None;
+
+  // Now, try to find the specific G_CONSTANT.
+  auto ValAndVReg =
+      getConstantVRegValWithLookThrough(Shl->getOperand(2).getReg(), MRI);
+  if (!ValAndVReg)
+    return None;
+
+  // The value must fit into 3 bits, and must be positive. Make sure that is
+  // true.
+  int64_t ImmVal = ValAndVReg->Value;
+  if ((ImmVal & 0x7) != ImmVal)
+    return None;
+
+  // We are only allowed to shift by LegalShiftVal. This shift value is built
+  // into the instruction, so we can't just use whatever we want.
+  if (ImmVal != LegalShiftVal)
+    return None;
+
+  // We can use the LHS of the GEP as the base, and the LHS of the shift as an
+  // offset. Signify that we are shifting by setting the shift flag to 1.
+  return {{
+      [=](MachineInstrBuilder &MIB) { MIB.add(Gep->getOperand(1)); },
+      [=](MachineInstrBuilder &MIB) { MIB.add(Shl->getOperand(1)); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(1); },
+  }};
+}
+
 /// This is used for computing addresses like this:
 ///
 /// ldr x1, [x2, x3]
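To make the matched pattern concrete, here is a hypothetical before/after for an 8-byte load, where SizeInBytes = 8 and Log2_32(8) = 3 is therefore the only shift amount that can be folded (register names invented for illustration):

    // Generic MIR before selection:
    //   %c3 = G_CONSTANT i64 3
    //   %shl = G_SHL %off, %c3
    //   %ptr = G_GEP %base, %shl
    //   %val = G_LOAD %ptr
    // Selected AArch64 instruction:
    //   ldr x0, [x_base, x_off, lsl #3]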
@@ -4008,11 +4108,6 @@ AArch64InstructionSelector::selectAddrModeRegisterOffset(
     MachineOperand &Root) const {
   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
 
-  // If we have a constant offset, then we probably don't want to match a
-  // register offset.
-  if (isBaseWithConstantOffset(Root, MRI))
-    return None;
-
   // We need a GEP.
   MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
   if (!Gep || Gep->getOpcode() != TargetOpcode::G_GEP)
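The constant-offset bail-out deleted here is not dropped: it reappears in selectAddrModeXRO in the next hunk, where it now guards both the shifted and plain register-offset paths. The motivation is unchanged, since matching a register offset against a constant would force materializing it (hypothetical registers):

    // mov x2, #16; ldr x0, [x1, x2]   // register-offset form needs an extra mov
    // ldr x0, [x1, #16]               // the immediate form avoids it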
@@ -4033,6 +4128,28 @@ AArch64InstructionSelector::selectAddrModeRegisterOffset(
   }};
 }
 
+/// This is intended to be equivalent to selectAddrModeXRO in
+/// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
+                                              unsigned SizeInBytes) const {
+  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+
+  // If we have a constant offset, then we probably don't want to match a
+  // register offset.
+  if (isBaseWithConstantOffset(Root, MRI))
+    return None;
+
+  // Try to fold shifts into the addressing mode.
+  auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
+  if (AddrModeFns)
+    return AddrModeFns;
+
+  // If that doesn't work, see if it's possible to fold in registers from
+  // a GEP.
+  return selectAddrModeRegisterOffset(Root);
+}
+
 /// Select a "register plus unscaled signed 9-bit immediate" address. This
 /// should only match when there is an offset that is not valid for a scaled
 /// immediate addressing mode. The "Size" argument is the size in bytes of the
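Taken together, selectAddrModeXRO tries the X-register-offset forms in this order (hypothetical examples following the comments above):

    // base + (idx << 3)  ->  ldr x0, [x1, x2, lsl #3]   (shifted-extend path)
    // base + idx         ->  ldr x0, [x1, x2]           (register-offset fallback)
    // base + constant    ->  None; left to the immediate addressing modes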