Skip to content

Commit 8006e6a

Browse files
Add vectorized paths for Span<T>.Reverse (#64412)
* Adding vectorized path for Span<byte>.Reverse that uses SSSE3 and AVX2 where possible
* Added vectorized paths for Span<T>.Reverse for primitive types that are the same size as char, int, or long that use AVX2 or SSSE3 where possible
* Apply suggestions from code review
  Co-authored-by: Theodore Tsirpanis <[email protected]>
* Added vectorized paths for Span.Reverse to Array.Reverse, use one wrapper for both Span.Reverse and Array.Reverse
  Co-authored-by: Theodore Tsirpanis <[email protected]>
1 parent 6387a2d commit 8006e6a

File tree

5 files changed

+384
-24
lines changed

5 files changed

+384
-24
lines changed

src/libraries/System.Private.CoreLib/src/System/Array.cs

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1722,7 +1722,8 @@ public static void Reverse<T>(T[] array)
17221722
{
17231723
if (array == null)
17241724
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.array);
1725-
Reverse(array, 0, array.Length);
1725+
if (array.Length > 1)
1726+
SpanHelpers.Reverse(ref MemoryMarshal.GetArrayDataReference(array), (nuint)array.Length);
17261727
}
17271728

17281729
public static void Reverse<T>(T[] array, int index, int length)
@@ -1739,16 +1740,7 @@ public static void Reverse<T>(T[] array, int index, int length)
17391740
if (length <= 1)
17401741
return;
17411742

1742-
ref T first = ref Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(array), index);
1743-
ref T last = ref Unsafe.Add(ref Unsafe.Add(ref first, length), -1);
1744-
do
1745-
{
1746-
T temp = first;
1747-
first = last;
1748-
last = temp;
1749-
first = ref Unsafe.Add(ref first, 1);
1750-
last = ref Unsafe.Add(ref last, -1);
1751-
} while (Unsafe.IsAddressLessThan(ref first, ref last));
1743+
SpanHelpers.Reverse(ref Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(array), index), (nuint)length);
17521744
}
17531745

17541746
// Sorts the elements of an array. The sort compares the elements to each

src/libraries/System.Private.CoreLib/src/System/MemoryExtensions.cs

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
using System.Diagnostics;
77
using System.Runtime.CompilerServices;
88
using System.Runtime.InteropServices;
9+
using System.Runtime.Intrinsics;
10+
using System.Runtime.Intrinsics.X86;
911

1012
namespace System
1113
{
@@ -1543,21 +1545,10 @@ ref MemoryMarshal.GetReference(value),
15431545
/// </summary>
15441546
public static void Reverse<T>(this Span<T> span)
15451547
{
1546-
if (span.Length <= 1)
1548+
if (span.Length > 1)
15471549
{
1548-
return;
1550+
SpanHelpers.Reverse(ref MemoryMarshal.GetReference(span), (nuint)span.Length);
15491551
}
1550-
1551-
ref T first = ref MemoryMarshal.GetReference(span);
1552-
ref T last = ref Unsafe.Add(ref Unsafe.Add(ref first, span.Length), -1);
1553-
do
1554-
{
1555-
T temp = first;
1556-
first = last;
1557-
last = temp;
1558-
first = ref Unsafe.Add(ref first, 1);
1559-
last = ref Unsafe.Add(ref last, -1);
1560-
} while (Unsafe.IsAddressLessThan(ref first, ref last));
15611552
}
15621553

15631554
/// <summary>

src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2238,5 +2238,96 @@ private static uint FindFirstMatchedLane(Vector128<byte> compareResult)
22382238
// Find the first lane that is set inside compareResult.
22392239
return (uint)BitOperations.TrailingZeroCount(selectedLanes) >> 2;
22402240
}
2241+
2242+
/// <summary>
/// Reverses, in place, the <paramref name="length"/> bytes starting at <paramref name="buf"/>.
/// </summary>
/// <param name="buf">Reference to the first byte of the range to reverse.</param>
/// <param name="length">Number of bytes in the range.</param>
/// <remarks>
/// When the range holds at least two full vectors, blocks are swapped pairwise from both ends using
/// either 256-bit (AVX2) or 128-bit (SSSE3) vectors; only one vector width is used per call.
/// Whatever remains in the middle is swapped one element at a time.
/// </remarks>
public static void Reverse(ref byte buf, nuint length)
{
    if (Avx2.IsSupported && (nuint)Vector256<byte>.Count * 2 <= length)
    {
        Vector256<byte> reverseMask = Vector256.Create(
            (byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, // first 128-bit lane
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); // second 128-bit lane
        nuint numElements = (nuint)Vector256<byte>.Count;
        nuint numIters = (length / numElements) / 2;
        for (nuint i = 0; i < numIters; i++)
        {
            nuint firstOffset = i * numElements;
            nuint lastOffset = length - ((1 + i) * numElements);

            // Load in values from beginning and end of the array.
            Vector256<byte> tempFirst = Vector256.LoadUnsafe(ref buf, firstOffset);
            Vector256<byte> tempLast = Vector256.LoadUnsafe(ref buf, lastOffset);

            // Avx2 operates on two 128-bit lanes rather than the full 256-bit vector.
            // Perform a shuffle to reverse each 128-bit lane, then permute to finish reversing the vector:
            //     +-------------------------------------------------------------------------------+
            //     | A1 | B1 | C1 | D1 | E1 | F1 | G1 | H1 | I1 | J1 | K1 | L1 | M1 | N1 | O1 | P1 |
            //     +-------------------------------------------------------------------------------+
            //     | A2 | B2 | C2 | D2 | E2 | F2 | G2 | H2 | I2 | J2 | K2 | L2 | M2 | N2 | O2 | P2 |
            //     +-------------------------------------------------------------------------------+
            //         Shuffle --->
            //     +-------------------------------------------------------------------------------+
            //     | P1 | O1 | N1 | M1 | L1 | K1 | J1 | I1 | H1 | G1 | F1 | E1 | D1 | C1 | B1 | A1 |
            //     +-------------------------------------------------------------------------------+
            //     | P2 | O2 | N2 | M2 | L2 | K2 | J2 | I2 | H2 | G2 | F2 | E2 | D2 | C2 | B2 | A2 |
            //     +-------------------------------------------------------------------------------+
            //         Permute --->
            //     +-------------------------------------------------------------------------------+
            //     | P2 | O2 | N2 | M2 | L2 | K2 | J2 | I2 | H2 | G2 | F2 | E2 | D2 | C2 | B2 | A2 |
            //     +-------------------------------------------------------------------------------+
            //     | P1 | O1 | N1 | M1 | L1 | K1 | J1 | I1 | H1 | G1 | F1 | E1 | D1 | C1 | B1 | A1 |
            //     +-------------------------------------------------------------------------------+
            tempFirst = Avx2.Shuffle(tempFirst, reverseMask);
            tempFirst = Avx2.Permute2x128(tempFirst, tempFirst, 0b00_01);
            tempLast = Avx2.Shuffle(tempLast, reverseMask);
            tempLast = Avx2.Permute2x128(tempLast, tempLast, 0b00_01);

            // Store the reversed vectors
            tempLast.StoreUnsafe(ref buf, firstOffset);
            tempFirst.StoreUnsafe(ref buf, lastOffset);
        }

        // Narrow the range to the untouched middle section for the scalar loop below.
        buf = ref Unsafe.Add(ref buf, numIters * numElements);
        length -= numIters * numElements * 2;
    }
    // The byte shuffle used below is an SSSE3 instruction, so the guard must test Ssse3 rather than
    // Sse2; otherwise hardware with SSE2 but without SSSE3 would hit PlatformNotSupportedException.
    else if (Ssse3.IsSupported && (nuint)Vector128<byte>.Count * 2 <= length)
    {
        Vector128<byte> reverseMask = Vector128.Create((byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
        nuint numElements = (nuint)Vector128<byte>.Count;
        nuint numIters = (length / numElements) / 2;
        for (nuint i = 0; i < numIters; i++)
        {
            nuint firstOffset = i * numElements;
            nuint lastOffset = length - ((1 + i) * numElements);

            // Load in values from beginning and end of the array.
            Vector128<byte> tempFirst = Vector128.LoadUnsafe(ref buf, firstOffset);
            Vector128<byte> tempLast = Vector128.LoadUnsafe(ref buf, lastOffset);

            // Shuffle to reverse each vector:
            //     +---------------------------------------------------------------+
            //     | A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P |
            //     +---------------------------------------------------------------+
            //          --->
            //     +---------------------------------------------------------------+
            //     | P | O | N | M | L | K | J | I | H | G | F | E | D | C | B | A |
            //     +---------------------------------------------------------------+
            tempFirst = Ssse3.Shuffle(tempFirst, reverseMask);
            tempLast = Ssse3.Shuffle(tempLast, reverseMask);

            // Store the reversed vectors
            tempLast.StoreUnsafe(ref buf, firstOffset);
            tempFirst.StoreUnsafe(ref buf, lastOffset);
        }

        // Narrow the range to the untouched middle section for the scalar loop below.
        buf = ref Unsafe.Add(ref buf, numIters * numElements);
        length -= numIters * numElements * 2;
    }

    // Store any remaining values one-by-one
    for (nuint i = 0; i < (length / 2); i++)
    {
        ref byte first = ref Unsafe.Add(ref buf, i);
        ref byte last = ref Unsafe.Add(ref buf, length - 1 - i);
        (last, first) = (first, last);
    }
}
22412332
}
22422333
}

src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2015,5 +2015,93 @@ private static int FindFirstMatchedLane(Vector128<ushort> compareResult)
20152015

20162016
return BitOperations.TrailingZeroCount(selectedLanes) >> 3;
20172017
}
2018+
2019+
/// <summary>
/// Reverses, in place, the <paramref name="length"/> chars starting at <paramref name="buf"/>.
/// </summary>
/// <param name="buf">Reference to the first char of the range to reverse.</param>
/// <param name="length">Number of chars (not bytes) in the range.</param>
/// <remarks>
/// The range is viewed as bytes for the vectorized paths; the shuffle masks reverse the order of the
/// 16-bit elements while keeping the two bytes of each element in their original order.
/// When the range holds at least two full vectors, blocks are swapped pairwise from both ends using
/// either 256-bit (AVX2) or 128-bit (SSSE3) vectors; only one vector width is used per call.
/// Whatever remains in the middle is swapped one element at a time.
/// </remarks>
public static void Reverse(ref char buf, nuint length)
{
    ref byte bufByte = ref Unsafe.As<char, byte>(ref buf);
    nuint byteLength = length * sizeof(char);
    if (Avx2.IsSupported && (nuint)Vector256<short>.Count * 2 <= length)
    {
        Vector256<byte> reverseMask = Vector256.Create(
            (byte)14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, // first 128-bit lane
            14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); // second 128-bit lane
        nuint numElements = (nuint)Vector256<byte>.Count;
        nuint numIters = (byteLength / numElements) / 2;
        for (nuint i = 0; i < numIters; i++)
        {
            nuint firstOffset = i * numElements;
            nuint lastOffset = byteLength - ((1 + i) * numElements);

            // Load in values from beginning and end of the array.
            Vector256<byte> tempFirst = Vector256.LoadUnsafe(ref bufByte, firstOffset);
            Vector256<byte> tempLast = Vector256.LoadUnsafe(ref bufByte, lastOffset);

            // Avx2 operates on two 128-bit lanes rather than the full 256-bit vector.
            // Perform a shuffle to reverse each 128-bit lane, then permute to finish reversing the vector:
            //     +---------------------------------------------------------------+
            //     | A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P |
            //     +---------------------------------------------------------------+
            //         Shuffle --->
            //     +---------------------------------------------------------------+
            //     | H | G | F | E | D | C | B | A | P | O | N | M | L | K | J | I |
            //     +---------------------------------------------------------------+
            //         Permute --->
            //     +---------------------------------------------------------------+
            //     | P | O | N | M | L | K | J | I | H | G | F | E | D | C | B | A |
            //     +---------------------------------------------------------------+
            tempFirst = Avx2.Shuffle(tempFirst, reverseMask);
            tempFirst = Avx2.Permute2x128(tempFirst, tempFirst, 0b00_01);
            tempLast = Avx2.Shuffle(tempLast, reverseMask);
            tempLast = Avx2.Permute2x128(tempLast, tempLast, 0b00_01);

            // Store the reversed vectors
            tempLast.StoreUnsafe(ref bufByte, firstOffset);
            tempFirst.StoreUnsafe(ref bufByte, lastOffset);
        }

        // Narrow the range to the untouched middle section; `length` stays in units of chars.
        bufByte = ref Unsafe.Add(ref bufByte, numIters * numElements);
        length -= numIters * (nuint)Vector256<short>.Count * 2;
    }
    // The byte shuffle used below is an SSSE3 instruction, so the guard must test Ssse3 rather than
    // Sse2; otherwise hardware with SSE2 but without SSSE3 would hit PlatformNotSupportedException.
    else if (Ssse3.IsSupported && (nuint)Vector128<short>.Count * 2 <= length)
    {
        Vector128<byte> reverseMask = Vector128.Create((byte)14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
        nuint numElements = (nuint)Vector128<byte>.Count;
        nuint numIters = (byteLength / numElements) / 2;
        for (nuint i = 0; i < numIters; i++)
        {
            nuint firstOffset = i * numElements;
            nuint lastOffset = byteLength - ((1 + i) * numElements);

            // Load in values from beginning and end of the array.
            Vector128<byte> tempFirst = Vector128.LoadUnsafe(ref bufByte, firstOffset);
            Vector128<byte> tempLast = Vector128.LoadUnsafe(ref bufByte, lastOffset);

            // Shuffle to reverse each vector:
            //     +-------------------------------+
            //     | A | B | C | D | E | F | G | H |
            //     +-------------------------------+
            //          --->
            //     +-------------------------------+
            //     | H | G | F | E | D | C | B | A |
            //     +-------------------------------+
            tempFirst = Ssse3.Shuffle(tempFirst, reverseMask);
            tempLast = Ssse3.Shuffle(tempLast, reverseMask);

            // Store the reversed vectors
            tempLast.StoreUnsafe(ref bufByte, firstOffset);
            tempFirst.StoreUnsafe(ref bufByte, lastOffset);
        }

        // Narrow the range to the untouched middle section; `length` stays in units of chars.
        bufByte = ref Unsafe.Add(ref bufByte, numIters * numElements);
        length -= numIters * (nuint)Vector128<short>.Count * 2;
    }

    // Store any remaining values one-by-one
    buf = ref Unsafe.As<byte, char>(ref bufByte);
    for (nuint i = 0; i < (length / 2); i++)
    {
        ref char first = ref Unsafe.Add(ref buf, i);
        ref char last = ref Unsafe.Add(ref buf, length - 1 - i);
        (last, first) = (first, last);
    }
}
20182106
}
20192107
}

0 commit comments

Comments
 (0)