
Improve Math.BigMul on x64 by adding new internal Multiply hardware intrinsic to X86Base #115966

Open · wants to merge 12 commits into base: main
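
For context, the public API this PR accelerates is Math.BigMul. A minimal illustration (values chosen here for demonstration):

    // Full 128-bit product of two 64-bit values; BigMul returns the high half.
    ulong high = Math.BigMul(ulong.MaxValue, ulong.MaxValue, out ulong low);
    // (2^64 - 1)^2 = 2^128 - 2^65 + 1, so high == 0xFFFFFFFF_FFFFFFFE and low == 1.

With this change the x64 JIT emits a single MUL (leaving the product in RDX:RAX) instead of going through Bmi2.X64.MultiplyNoFlags and its memory spills.
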
2 changes: 2 additions & 0 deletions src/coreclr/jit/gentree.cpp
@@ -28833,8 +28833,10 @@ ClassLayout* GenTreeHWIntrinsic::GetLayout(Compiler* compiler) const
{
#ifdef TARGET_XARCH
case NI_X86Base_DivRem:
case NI_X86Base_Multiply:
return compiler->typGetBlkLayout(genTypeSize(GetSimdBaseType()) * 2);
case NI_X86Base_X64_DivRem:
case NI_X86Base_X64_Multiply:
return compiler->typGetBlkLayout(16);
#endif // TARGET_XARCH
#ifdef TARGET_ARM64
2 changes: 2 additions & 0 deletions src/coreclr/jit/hwintrinsic.h
@@ -889,6 +889,8 @@ struct HWIntrinsicInfo
#ifdef TARGET_XARCH
case NI_X86Base_DivRem:
case NI_X86Base_X64_DivRem:
case NI_X86Base_Multiply:
case NI_X86Base_X64_Multiply:
return 2;
#endif // TARGET_XARCH

27 changes: 27 additions & 0 deletions src/coreclr/jit/hwintrinsiccodegenxarch.cpp
@@ -2442,6 +2442,33 @@ void CodeGen::genX86BaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)
break;
}

case NI_X86Base_Multiply:
case NI_X86Base_X64_Multiply:
{
assert(node->GetOperandCount() == 2);
assert(instOptions == INS_OPTS_NONE);

// The SIMD base type comes from the signature and distinguishes signed from unsigned
var_types targetType = node->GetSimdBaseType();
GenTree* op1 = node->Op(1);
GenTree* op2 = node->Op(2);
instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, targetType);

regNumber op1Reg = op1->GetRegNum();
regNumber op2Reg = op2->GetRegNum();

emitAttr attr = emitTypeSize(targetType);
emitter* emit = GetEmitter();

// op1: EAX, op2: reg/mem
emit->emitIns_Mov(INS_mov, attr, REG_EAX, op1Reg, /* canSkip */ true);

// emit the MUL/IMUL instruction
emit->emitInsBinary(ins, attr, node, op2);

break;
}

default:
unreached();
break;
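For reference, the sequence above mirrors the classic one-operand multiply: move op1 into EAX/RAX (skippable when the register allocator already placed it there), then MUL or IMUL with op2 as a register or memory operand, leaving the low half in EAX/RAX and the high half in EDX/RDX. A behavioral sketch of what the 64-bit intrinsic computes, expressed in managed code:

    // Semantics of X86Base.X64.Multiply (unsigned case), expressed via UInt128:
    static (ulong Lower, ulong Upper) MultiplyBehavior(ulong left, ulong right)
    {
        UInt128 product = (UInt128)left * right;
        return ((ulong)product, (ulong)(product >> 64)); // RAX = low, RDX = high
    }
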
4 changes: 3 additions & 1 deletion src/coreclr/jit/hwintrinsiclistxarch.h
@@ -397,6 +397,7 @@ HARDWARE_INTRINSIC(Vector512, op_UnsignedRightShift,
HARDWARE_INTRINSIC(X86Base, BitScanForward, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsf, INS_bsf, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, BitScanReverse, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsr, INS_bsr, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, DivRem, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_idiv, INS_div, INS_idiv, INS_div, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_BaseTypeFromSecondArg|HW_Flag_MultiReg|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_RmwIntrinsic)
HARDWARE_INTRINSIC(X86Base, Multiply, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_imulEAX, INS_mulEAX, INS_imulEAX, INS_mulEAX, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_BaseTypeFromSecondArg|HW_Flag_MultiReg|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_RmwIntrinsic|HW_Flag_Commutative)
HARDWARE_INTRINSIC(X86Base, Pause, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_SpecialSideEffect_Other)
#define LAST_NI_X86Base NI_X86Base_Pause

Expand All @@ -409,7 +410,8 @@ HARDWARE_INTRINSIC(X86Base, Pause,
HARDWARE_INTRINSIC(X86Base_X64, BitScanForward, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsf, INS_bsf, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base_X64, BitScanReverse, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsr, INS_bsr, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base_X64, DivRem, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_idiv, INS_div, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_BaseTypeFromSecondArg|HW_Flag_MultiReg|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_RmwIntrinsic)
#define LAST_NI_X86Base_X64 NI_X86Base_X64_DivRem
HARDWARE_INTRINSIC(X86Base_X64, Multiply, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_imulEAX, INS_mulEAX, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_BaseTypeFromSecondArg|HW_Flag_MultiReg|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_RmwIntrinsic|HW_Flag_Commutative)
#define LAST_NI_X86Base_X64 NI_X86Base_X64_Multiply

// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// ISA Function name SIMD size NumArg Instructions Category Flags
21 changes: 21 additions & 0 deletions src/coreclr/jit/hwintrinsicxarch.cpp
@@ -3914,6 +3914,27 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}

case NI_X86Base_Multiply:
case NI_X86Base_X64_Multiply:
{
assert(sig->numArgs == 2);
assert(HWIntrinsicInfo::IsMultiReg(intrinsic));
assert(retType == TYP_STRUCT);
assert(simdBaseJitType != CORINFO_TYPE_UNDEF);

op2 = impPopStack().val;
op1 = impPopStack().val;

GenTreeHWIntrinsic* multiplyIntrinsic = gtNewScalarHWIntrinsicNode(retType, op1, op2, intrinsic);

// Store the type from signature into SIMD base type for convenience
multiplyIntrinsic->SetSimdBaseJitType(simdBaseJitType);

retNode = impStoreMultiRegValueToVar(multiplyIntrinsic,
sig->retTypeSigClass DEBUGARG(CorInfoCallConvExtension::Managed));
break;
}

case NI_SSE_CompareScalarGreaterThan:
case NI_SSE_CompareScalarGreaterThanOrEqual:
case NI_SSE_CompareScalarNotGreaterThan:
2 changes: 2 additions & 0 deletions src/coreclr/jit/lowerxarch.cpp
@@ -10642,6 +10642,8 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)

case NI_BMI2_MultiplyNoFlags:
case NI_BMI2_X64_MultiplyNoFlags:
case NI_X86Base_Multiply:
case NI_X86Base_X64_Multiply:
Contributor Author: I am mostly guessing that the "containment" check and the swapping of operands should be the same as for MultiplyNoFlags, and the resulting code looks OK, but I am not sure.

{
bool supportsOp1RegOptional = false;
bool supportsOp2RegOptional = false;
25 changes: 23 additions & 2 deletions src/coreclr/jit/lsraxarch.cpp
@@ -2465,6 +2465,25 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCount
break;
}

case NI_X86Base_Multiply:
case NI_X86Base_X64_Multiply:
{
assert(numArgs == 2);
assert(dstCount == 2);
assert(isRMW);

// mulEAX always has op1 in EAX
srcCount += BuildOperandUses(op1, SRBM_EAX);
srcCount += BuildOperandUses(op2);
Contributor Author: I see there are a number of APX-related changes to this file since this code was written. I will merge/rebase and try to update it once I have feedback on whether this solution is in the right direction (or whether it would be better to handle BigMul itself directly). Will probably replace this with:

BuildOperandUses(op2, BuildApxIncompatibleGPRMask(op2))

Contributor Author: Does it need to add an if statement checking (!op2->isContained()), similar to DivRem? If so, what should it do differently? Should it use BuildDelayFreeUses or similar?

// result put in EAX and EDX
BuildDef(intrinsicTree, SRBM_EAX, 0);
BuildDef(intrinsicTree, SRBM_EDX, 1);

buildUses = false;
break;
}

case NI_BMI2_MultiplyNoFlags:
case NI_BMI2_X64_MultiplyNoFlags:
{
@@ -2976,9 +2995,11 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCount
}
else
{
// Currently dstCount = 2 is only used for DivRem, which has special constraints and is handled above
// Currently dstCount = 2 is only used for DivRem and Multiply, which have special constraints and are handled
// above
assert((dstCount == 0) ||
((dstCount == 2) && ((intrinsicId == NI_X86Base_DivRem) || (intrinsicId == NI_X86Base_X64_DivRem))));
((dstCount == 2) && ((intrinsicId == NI_X86Base_DivRem) || (intrinsicId == NI_X86Base_X64_DivRem) ||
(intrinsicId == NI_X86Base_Multiply) || (intrinsicId == NI_X86Base_X64_Multiply))));
}

*pDstCount = dstCount;
32 changes: 17 additions & 15 deletions src/libraries/System.Private.CoreLib/src/System/Math.cs
@@ -159,16 +159,6 @@ internal static void ThrowNegateTwosCompOverflow()
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe ulong BigMul(uint a, uint b)
{
#if false // TARGET_32BIT
// This generates slower code currently than the simple multiplication
Contributor: Nothing has changed here and this block/comment still holds, why remove it?

Contributor Author:

1. Is it fine to just add back the link?

I initially called the new Multiply here and it was better than the BMI2 code, and it seems to generate code as good as the built-in JIT support for 32*32 => 64-bit multiply, but I decided against adding it in case it produces worse code than the current JIT optimisations.

From my understanding of the MultiplyNoFlags issue, it will not be fixed for that overload, or at least not in the near future.

2. Is there a specific reason not to emit mulx for GT_MUL_LONG instead?

It would then apply to a lot more places, including those that use the standard cast+mul pattern (ulong)a*(ulong)b instead, either because it is simpler or due to the bad perf this had because of MultiplyNoFlags.

If I update Multiply with mulx support I can add that one back here, or have a look at handling it in GT_MUL_LONG instead.

// https://github.com/dotnet/runtime/issues/11782
if (Bmi2.IsSupported)
{
uint low;
uint high = Bmi2.MultiplyNoFlags(a, b, &low);
return ((ulong)high << 32) | low;
}
#endif
return ((ulong)a) * b;
}
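
As the removed block notes, the plain widening multiply is already the fast path here: the JIT has built-in support for the 32*32 => 64-bit pattern, so no intrinsic is needed. Illustration:

    // 32x32 -> 64 multiply: the JIT emits a single 32-bit mul for this pattern.
    ulong product = (ulong)0xFFFF_FFFFu * 2u; // 0x1_FFFF_FFFE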

@@ -181,7 +171,7 @@ public static long BigMul(int a, int b)
return ((long)a) * b;
}


#if !(TARGET_ARM64 || (TARGET_AMD64 && !MONO)) // BigMul 64*64 has high performance intrinsics on ARM64 and AMD64 (but not yet on MONO)
/// <summary>
/// Perform multiplication between 64 and 32 bit numbers, returning lower 64 bits in <paramref name="low"/>
/// </summary>
@@ -190,21 +180,18 @@ public static long BigMul(int a, int b)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static ulong BigMul(ulong a, uint b, out ulong low)
{
#if TARGET_64BIT
return Math.BigMul((ulong)a, (ulong)b, out low);
#else
ulong prodL = ((ulong)(uint)a) * b;
ulong prodH = (prodL >> 32) + (((ulong)(uint)(a >> 32)) * b);

low = ((prodH << 32) | (uint)prodL);
return (prodH >> 32);
#endif
}
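
For reference, the fallback splits a into 32-bit halves: mathematically, a*b = (a >> 32)*b * 2^32 + (uint)a * b. It folds the carry (prodL >> 32) into the high partial product prodH so that no intermediate step overflows 64 bits, then returns high = prodH >> 32 with low = (prodH << 32) | (uint)prodL. For example, a = 2^32 and b = 3 gives prodL = 0 and prodH = 3, hence low = 3 * 2^32 and high = 0.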

/// <inheritdoc cref="BigMul(ulong, uint, out ulong)"/>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static ulong BigMul(uint a, ulong b, out ulong low)
=> BigMul(b, a, out low);
#endif

/// <summary>Produces the full product of two unsigned 64-bit numbers.</summary>
/// <param name="a">The first number to multiply.</param>
@@ -215,13 +202,21 @@ internal static ulong BigMul(uint a, ulong b, out ulong low)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe ulong BigMul(ulong a, ulong b, out ulong low)
{
#if MONO // Multiply is not yet implemented in MONO
Contributor: This condition should be moved below (and reversed) so that the new intrinsic is used only if BMI2 isn't supported: MULX should be preferred over MUL since it has more flexible register allocation (currently wasted by the unnecessary memory spills, though).

Contributor Author (@Daniel-Svensson, May 29, 2025):

1. I can reorder the condition, but I would rather not have the slow BMI2 method called.

That would eliminate the performance improvements this PR is intended to deliver. Apart from the 0.86x execution time in the benchmark, it gives a significant improvement of nearly 2x for other cases such as FastMod, where previous workarounds for slow BigMul can now be removed and BigMul called directly (so #113352 can be "fixed" by just calling BigMul on both arm and x64).

2. I forgot to mention that I want feedback on whether it would be an acceptable solution to add mulx support to the new Multiply intrinsic when supported by hardware. (I have a separate local branch, but it is not as well tested, and I want to merge this one first so that all tests are executed against the base x64 code first.)

If the answer is no, I can remove the following comment on the internal API and toss the BMI2 version (it performs on par with mul, so it has similar performance improvements as in this PR):

/// <para>In the future it might emit mulx on compatible hardware</para>

Otherwise I am happy to push those changes as a separate PR if that is fine.
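
For reference, the FastMod mentioned above is the Lemire-style remainder, which needs exactly the high half of a 64x64 product. A sketch, assuming that is the pattern being referred to:

    // x % d via multiplication; M = ulong.MaxValue / d + 1 is precomputed.
    static uint FastModSketch(uint x, uint d, ulong M)
    {
        ulong lowbits = M * x;                       // wraps mod 2^64 by design
        return (uint)Math.BigMul(lowbits, d, out _); // high 64 bits of lowbits * d
    }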

if (Bmi2.X64.IsSupported)
{
ulong tmp;
ulong high = Bmi2.X64.MultiplyNoFlags(a, b, &tmp);
low = tmp;
return high;
}
#else
if (X86Base.X64.IsSupported)
{
(low, ulong hi) = X86Base.X64.Multiply(a, b);
return hi;
}
#endif
else if (ArmBase.Arm64.IsSupported)
{
low = a * b;
@@ -261,6 +256,13 @@ static ulong SoftwareFallback(ulong a, ulong b, out ulong low)
/// <returns>The high 64-bit of the product of the specified numbers.</returns>
public static long BigMul(long a, long b, out long low)
{
#if !MONO // Multiply is not yet implemented in MONO
if (X86Base.X64.IsSupported)
{
(low, long hi) = X86Base.X64.Multiply(a, b);
return hi;
}
#endif
if (ArmBase.Arm64.IsSupported)
{
low = a * b;
@@ -63,6 +63,23 @@ internal X64() { }
/// </summary>
[Experimental(Experimentals.X86BaseDivRemDiagId, UrlFormat = Experimentals.SharedUrlFormat)]
public static (long Quotient, long Remainder) DivRem(ulong lower, long upper, long divisor) { throw new PlatformNotSupportedException(); }

/// <summary>
/// <para>unsigned __int64 _umul128(unsigned __int64 Multiplier, unsigned __int64 Multiplicand, unsigned __int64 * HighProduct)</para>
/// <para> MUL reg/m64</para>
/// </summary>
/// <remarks>
/// <para>Its functionality is exposed in the public <see cref="Math" /> class.</para>
/// </remarks>
internal static (ulong Lower, ulong Upper) Multiply(ulong left, ulong right) { throw new PlatformNotSupportedException(); }

/// <summary>
/// <para> IMUL reg/m64</para>
/// </summary>
/// <remarks>
/// <para>Its functionality is exposed in the public <see cref="Math" /> class.</para>
/// </remarks>
internal static (long Lower, long Upper) Multiply(long left, long right) { throw new PlatformNotSupportedException(); }
}

/// <summary>
@@ -109,6 +126,28 @@ internal X64() { }
[Experimental(Experimentals.X86BaseDivRemDiagId, UrlFormat = Experimentals.SharedUrlFormat)]
public static (nint Quotient, nint Remainder) DivRem(nuint lower, nint upper, nint divisor) { throw new PlatformNotSupportedException(); }

/// <summary>
/// <para> MUL reg/m32</para>
/// </summary>
/// <remarks>
/// <para>Its functionality is exposed in the public <see cref="Math" /> class.</para>
/// </remarks>
internal static (uint Lower, uint Upper) Multiply(uint left, uint right) { throw new PlatformNotSupportedException(); }

/// <summary>
/// <para> IMUL reg/m32</para>
/// </summary>
/// <remarks>
/// <para>Its functionality is exposed in the public <see cref="Math" /> class.</para>
/// </remarks>
internal static (int Lower, int Upper) Multiply(int left, int right) { throw new PlatformNotSupportedException(); }

/// <summary> MUL reg/m</summary>
internal static (nuint Lower, nuint Upper) Multiply(nuint left, nuint right) { throw new PlatformNotSupportedException(); }

/// <summary> IMUL reg/m</summary>
internal static (nint Lower, nint Upper) Multiply(nint left, nint right) { throw new PlatformNotSupportedException(); }

/// <summary>
/// <para>void _mm_pause (void);</para>
/// <para> PAUSE</para>
@@ -66,6 +66,26 @@ internal X64() { }
/// </summary>
[Experimental(Experimentals.X86BaseDivRemDiagId, UrlFormat = Experimentals.SharedUrlFormat)]
public static (long Quotient, long Remainder) DivRem(ulong lower, long upper, long divisor) => DivRem(lower, upper, divisor);

#if !MONO
/// <summary>
/// <para>unsigned __int64 _umul128(unsigned __int64 Multiplier, unsigned __int64 Multiplicand, unsigned __int64 * HighProduct)</para>
/// <para> MUL reg/m64</para>
/// </summary>
/// <remarks>
/// <para>Its functionality is exposed by the public <see cref="Math.BigMul(ulong, ulong, out ulong)" />.</para>
/// <para>In the future it might emit mulx on compatible hardware</para>
/// </remarks>
internal static (ulong Lower, ulong Upper) Multiply(ulong left, ulong right) => Multiply(left, right);

/// <summary>
/// <para> IMUL reg/m64</para>
/// </summary>
/// <remarks>
/// <para>Its functionality is exposed by the public <see cref="Math.BigMul(long, long, out long)" />.</para>
/// </remarks>
internal static (long Lower, long Upper) Multiply(long left, long right) => Multiply(left, right);
#endif
}
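
A quick sanity check of the signed overload's tuple shape (illustrative values; IMUL sign-extends into the upper half):

    // (-1) * 2 = -2 as a 128-bit value, so Lower == -2 and Upper == -1.
    (long lower, long upper) = X86Base.X64.Multiply(-1L, 2L);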

/// <summary>
@@ -123,6 +143,26 @@ public static unsafe (int Eax, int Ebx, int Ecx, int Edx) CpuId(int functionId,
[Experimental(Experimentals.X86BaseDivRemDiagId, UrlFormat = Experimentals.SharedUrlFormat)]
public static (nint Quotient, nint Remainder) DivRem(nuint lower, nint upper, nint divisor) => DivRem(lower, upper, divisor);

#if !MONO
/// <summary>
/// <para> MUL reg/m32</para>
/// </summary>
internal static (uint Lower, uint Upper) Multiply(uint left, uint right) => Multiply(left, right);

/// <summary>
/// <para> IMUL reg/m32</para>
/// </summary>
internal static (int Lower, int Upper) Multiply(int left, int right) => Multiply(left, right);

/// <summary> MUL reg/m</summary>
/// <remarks>Intended for UIntPtr.BigMul, https://github.com/dotnet/runtime/issues/114731</remarks>
internal static (nuint Lower, nuint Upper) Multiply(nuint left, nuint right) => Multiply(left, right);

/// <summary> IMUL reg/m</summary>
/// <remarks>Intended for IntPtr.BigMul, https://github.com/dotnet/runtime/issues/114731</remarks>
internal static (nint Lower, nint Upper) Multiply(nint left, nint right) => Multiply(left, right);
#endif
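
And the 32-bit variants return the halves of the 64-bit product (illustrative):

    // (2^32 - 1) * 2 = 0x1_FFFF_FFFE: Lower == 0xFFFF_FFFE, Upper == 1.
    (uint lower, uint upper) = X86Base.Multiply(0xFFFF_FFFFu, 2u);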

/// <summary>
/// <para>void _mm_pause (void);</para>
/// <para> PAUSE</para>