@@ -2238,5 +2238,96 @@ private static uint FindFirstMatchedLane(Vector128<byte> compareResult)
2238
2238
// Find the first lane that is set inside compareResult.
2239
2239
return ( uint ) BitOperations . TrailingZeroCount ( selectedLanes ) >> 2 ;
2240
2240
}
2241
+
2242
/// <summary>
/// Reverses, in place, the <paramref name="length"/> bytes that start at <paramref name="buf"/>.
/// </summary>
/// <param name="buf">Reference to the first byte of the region to reverse.</param>
/// <param name="length">Number of bytes in the region.</param>
/// <remarks>
/// Whole vectors are swapped from both ends toward the middle when a suitable instruction
/// set is available; whatever is left in the middle is swapped one byte at a time.
/// </remarks>
public static void Reverse(ref byte buf, nuint length)
{
    if (Avx2.IsSupported && (nuint)Vector256<byte>.Count * 2 <= length)
    {
        Vector256<byte> reverseMask = Vector256.Create(
            (byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, // first 128-bit lane
            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); // second 128-bit lane
        nuint numElements = (nuint)Vector256<byte>.Count;
        nuint numIters = (length / numElements) / 2;
        for (nuint i = 0; i < numIters; i++)
        {
            nuint firstOffset = i * numElements;
            nuint lastOffset = length - ((1 + i) * numElements);

            // Load in values from beginning and end of the array.
            Vector256<byte> tempFirst = Vector256.LoadUnsafe(ref buf, firstOffset);
            Vector256<byte> tempLast = Vector256.LoadUnsafe(ref buf, lastOffset);

            // Avx2 operates on two 128-bit lanes rather than the full 256-bit vector.
            // Perform a shuffle to reverse each 128-bit lane, then permute to finish reversing the vector:
            //     +-------------------------------------------------------------------------------+
            //     | A1 | B1 | C1 | D1 | E1 | F1 | G1 | H1 | I1 | J1 | K1 | L1 | M1 | N1 | O1 | P1 |
            //     +-------------------------------------------------------------------------------+
            //     | A2 | B2 | C2 | D2 | E2 | F2 | G2 | H2 | I2 | J2 | K2 | L2 | M2 | N2 | O2 | P2 |
            //     +-------------------------------------------------------------------------------+
            //         Shuffle --->
            //     +-------------------------------------------------------------------------------+
            //     | P1 | O1 | N1 | M1 | L1 | K1 | J1 | I1 | H1 | G1 | F1 | E1 | D1 | C1 | B1 | A1 |
            //     +-------------------------------------------------------------------------------+
            //     | P2 | O2 | N2 | M2 | L2 | K2 | J2 | I2 | H2 | G2 | F2 | E2 | D2 | C2 | B2 | A2 |
            //     +-------------------------------------------------------------------------------+
            //         Permute --->
            //     +-------------------------------------------------------------------------------+
            //     | P2 | O2 | N2 | M2 | L2 | K2 | J2 | I2 | H2 | G2 | F2 | E2 | D2 | C2 | B2 | A2 |
            //     +-------------------------------------------------------------------------------+
            //     | P1 | O1 | N1 | M1 | L1 | K1 | J1 | I1 | H1 | G1 | F1 | E1 | D1 | C1 | B1 | A1 |
            //     +-------------------------------------------------------------------------------+
            tempFirst = Avx2.Shuffle(tempFirst, reverseMask);
            tempFirst = Avx2.Permute2x128(tempFirst, tempFirst, 0b00_01);
            tempLast = Avx2.Shuffle(tempLast, reverseMask);
            tempLast = Avx2.Permute2x128(tempLast, tempLast, 0b00_01);

            // Store the reversed vectors, each at the opposite end from where it was loaded.
            tempLast.StoreUnsafe(ref buf, firstOffset);
            tempFirst.StoreUnsafe(ref buf, lastOffset);
        }

        // Narrow the region to the unprocessed middle for the scalar loop below.
        nuint processed = numIters * numElements;
        buf = ref Unsafe.Add(ref buf, processed);
        length -= processed * 2;
    }
    // The byte shuffle used below is an SSSE3 instruction, so the guard must test Ssse3
    // rather than Sse2: hardware with SSE2 but without SSSE3 would otherwise enter this
    // branch and fail on the Ssse3.Shuffle call.
    else if (Ssse3.IsSupported && (nuint)Vector128<byte>.Count * 2 <= length)
    {
        Vector128<byte> reverseMask = Vector128.Create((byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
        nuint numElements = (nuint)Vector128<byte>.Count;
        nuint numIters = (length / numElements) / 2;
        for (nuint i = 0; i < numIters; i++)
        {
            nuint firstOffset = i * numElements;
            nuint lastOffset = length - ((1 + i) * numElements);

            // Load in values from beginning and end of the array.
            Vector128<byte> tempFirst = Vector128.LoadUnsafe(ref buf, firstOffset);
            Vector128<byte> tempLast = Vector128.LoadUnsafe(ref buf, lastOffset);

            // Shuffle to reverse each vector:
            //     +---------------------------------------------------------------+
            //     | A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P |
            //     +---------------------------------------------------------------+
            //          --->
            //     +---------------------------------------------------------------+
            //     | P | O | N | M | L | K | J | I | H | G | F | E | D | C | B | A |
            //     +---------------------------------------------------------------+
            tempFirst = Ssse3.Shuffle(tempFirst, reverseMask);
            tempLast = Ssse3.Shuffle(tempLast, reverseMask);

            // Store the reversed vectors, each at the opposite end from where it was loaded.
            tempLast.StoreUnsafe(ref buf, firstOffset);
            tempFirst.StoreUnsafe(ref buf, lastOffset);
        }

        // Narrow the region to the unprocessed middle for the scalar loop below.
        nuint processed = numIters * numElements;
        buf = ref Unsafe.Add(ref buf, processed);
        length -= processed * 2;
    }

    // Swap any remaining values one-by-one; a length of 0 or 1 performs no swaps.
    for (nuint i = 0; i < (length / 2); i++)
    {
        ref byte first = ref Unsafe.Add(ref buf, i);
        ref byte last = ref Unsafe.Add(ref buf, length - 1 - i);
        (last, first) = (first, last);
    }
}
2241
2332
}
2242
2333
}
0 commit comments