-
Notifications
You must be signed in to change notification settings - Fork 5k
Improve Math.BigMul on x64 by adding new internal Multiply
hardware intrinsic to X86Base
#115966
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
b41a669
d5acd40
794d135
b93aac3
046723b
e70da88
1f8080d
81198ac
a5a5334
0ce33ea
e1c1ade
4ed65b8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2465,6 +2465,25 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou | |
break; | ||
} | ||
|
||
case NI_X86Base_Multiply: | ||
case NI_X86Base_X64_Multiply: | ||
{ | ||
assert(numArgs == 2); | ||
assert(dstCount == 2); | ||
assert(isRMW); | ||
|
||
// mulEAX always have op1 in EAX | ||
srcCount += BuildOperandUses(op1, SRBM_EAX); | ||
srcCount += BuildOperandUses(op2); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I see there are a number of APX-related changes to this file since the code was written; I will merge/rebase and try to update it once I have feedback on whether this solution is in the right direction (or if it would be better to handle BigMul itself directly). Will probably replace this with: |
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does it need to add an if statement checking |
||
// result put in EAX and EDX | ||
BuildDef(intrinsicTree, SRBM_EAX, 0); | ||
BuildDef(intrinsicTree, SRBM_EDX, 1); | ||
|
||
buildUses = false; | ||
break; | ||
} | ||
|
||
case NI_BMI2_MultiplyNoFlags: | ||
case NI_BMI2_X64_MultiplyNoFlags: | ||
{ | ||
|
@@ -2976,9 +2995,11 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou | |
} | ||
else | ||
{ | ||
// Currently dstCount = 2 is only used for DivRem, which has special constraints and is handled above | ||
// Currently dstCount = 2 is only used for DivRem and Multiply, which has special constraints and is handled | ||
// above | ||
assert((dstCount == 0) || | ||
((dstCount == 2) && ((intrinsicId == NI_X86Base_DivRem) || (intrinsicId == NI_X86Base_X64_DivRem)))); | ||
((dstCount == 2) && ((intrinsicId == NI_X86Base_DivRem) || (intrinsicId == NI_X86Base_X64_DivRem) || | ||
(intrinsicId == NI_X86Base_Multiply) || (intrinsicId == NI_X86Base_X64_Multiply)))); | ||
} | ||
|
||
*pDstCount = dstCount; | ||
|
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -159,16 +159,6 @@ internal static void ThrowNegateTwosCompOverflow() | |||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||||
public static unsafe ulong BigMul(uint a, uint b) | ||||
{ | ||||
#if false // TARGET_32BIT | ||||
// This generates slower code currently than the simple multiplication | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nothing has changed here and this block/comment still holds, why remove it? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I initially called the new Multiply and it was better than the BMI2 code, and it seems to generate as good code as the built-in JIT support for 32*32=>64-bit multiply, but I decided against adding it in case it produces worse code than the current JIT optimisations. My understanding of the MultiplyNoFlags issue is that it will not be fixed for that overload, or at least not in the near future.
It would then apply to a lot more places, including those that used the standard cast+mul pattern. If I update Multiply with mulx support I can add that one back in here, or have a look at handling it in GT_MUL_LONG instead. |
||||
// https://github.com/dotnet/runtime/issues/11782 | ||||
if (Bmi2.IsSupported) | ||||
{ | ||||
uint low; | ||||
uint high = Bmi2.MultiplyNoFlags(a, b, &low); | ||||
return ((ulong)high << 32) | low; | ||||
} | ||||
#endif | ||||
return ((ulong)a) * b; | ||||
} | ||||
|
||||
|
@@ -181,7 +171,7 @@ public static long BigMul(int a, int b) | |||
return ((long)a) * b; | ||||
} | ||||
|
||||
|
||||
#if !(TARGET_ARM64 || (TARGET_AMD64 && !MONO)) // BigMul 64*64 has high performance intrinsics on ARM64 and AMD64 (but not yet on MONO) | ||||
/// <summary> | ||||
/// Perform multiplication between 64 and 32 bit numbers, returning lower 64 bits in <paramref name="low"/> | ||||
/// </summary> | ||||
|
@@ -190,21 +180,18 @@ public static long BigMul(int a, int b) | |||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||||
internal static ulong BigMul(ulong a, uint b, out ulong low) | ||||
{ | ||||
#if TARGET_64BIT | ||||
return Math.BigMul((ulong)a, (ulong)b, out low); | ||||
#else | ||||
ulong prodL = ((ulong)(uint)a) * b; | ||||
ulong prodH = (prodL >> 32) + (((ulong)(uint)(a >> 32)) * b); | ||||
|
||||
low = ((prodH << 32) | (uint)prodL); | ||||
return (prodH >> 32); | ||||
#endif | ||||
} | ||||
|
||||
/// <inheritdoc cref="BigMul(ulong, uint, out ulong)"/> | ||||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||||
internal static ulong BigMul(uint a, ulong b, out ulong low) | ||||
=> BigMul(b, a, out low); | ||||
#endif | ||||
|
||||
/// <summary>Produces the full product of two unsigned 64-bit numbers.</summary> | ||||
/// <param name="a">The first number to multiply.</param> | ||||
|
@@ -215,13 +202,21 @@ internal static ulong BigMul(uint a, ulong b, out ulong low) | |||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||||
public static unsafe ulong BigMul(ulong a, ulong b, out ulong low) | ||||
{ | ||||
#if MONO // Multiply is not yet implemented in MONO | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This condition should be moved below (and reversed) so that the new intrinsic is used only if BMI2 isn't supported - MULX should be preferred over MUL since it has more flexible register allocation (currently wasted by the unnecessary memory spills though). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
That would eliminate the performance improvements this PR was intended to fix. Apart from the 0.86 execution time for the benchmark, it makes a significant improvement of nearly 2x for other cases such as FastMod, where previous workarounds for slow BigMul can now be removed and BigMul called directly (so #113352 can be "fixed" by just calling BigMul on both arm and x64).
If the answer is no, I can remove the following comment on the internal API and toss the BMI2 version (it performs on par with mul, so it has similar performance improvements as in this PR) runtime/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/X86/X86Base.cs Line 77 in 4ed65b8
Otherwise I am happy to push those changes as a separate PR if that is fine |
||||
if (Bmi2.X64.IsSupported) | ||||
{ | ||||
ulong tmp; | ||||
ulong high = Bmi2.X64.MultiplyNoFlags(a, b, &tmp); | ||||
low = tmp; | ||||
return high; | ||||
} | ||||
#else | ||||
if (X86Base.X64.IsSupported) | ||||
{ | ||||
(low, ulong hi) = X86Base.X64.Multiply(a, b); | ||||
return hi; | ||||
} | ||||
#endif | ||||
else if (ArmBase.Arm64.IsSupported) | ||||
{ | ||||
low = a * b; | ||||
|
@@ -261,6 +256,13 @@ static ulong SoftwareFallback(ulong a, ulong b, out ulong low) | |||
/// <returns>The high 64-bit of the product of the specified numbers.</returns> | ||||
public static long BigMul(long a, long b, out long low) | ||||
{ | ||||
#if !MONO // Multiply is not yet implemented in MONO | ||||
if (X86Base.X64.IsSupported) | ||||
{ | ||||
(low, long hi) = X86Base.X64.Multiply(a, b); | ||||
return hi; | ||||
} | ||||
#endif | ||||
if (ArmBase.Arm64.IsSupported) | ||||
{ | ||||
low = a * b; | ||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am mostly guessing that the "containment" check and switching of operations should be the same, and the resulting code looks ok, but I am not sure