47
47
#include <immintrin.h>
48
48
#endif
49
49
50
+ #ifdef __ARM_NEON
51
+ #include <arm_neon.h>
52
+ #endif
53
+
50
54
/* The HyperLogLog implementation is based on the following ideas:
51
55
*
52
56
* * The use of a 64 bit hash function as proposed in [1], in order to estimate
@@ -227,6 +231,13 @@ static int simd_enabled = 1;
227
231
#define HLL_USE_AVX2 0
228
232
#endif
229
233
234
+ #ifdef __ARM_NEON
235
+ static int simd_enabled = 1 ;
236
+ #define HLL_USE_NEON (simd_enabled)
237
+ #else
238
+ #define HLL_USE_NEON 0
239
+ #endif
240
+
230
241
/* =========================== Low level bit macros ========================= */
231
242
232
243
/* Macros to access the dense representation.
@@ -1193,6 +1204,95 @@ void hllMergeDenseAVX2(uint8_t *reg_raw, const uint8_t *reg_dense) {
1193
1204
}
1194
1205
#endif
1195
1206
1207
+ #if defined(__ARM_NEON )
1208
+ /*
1209
+ * hllMergeDenseNEON is an ARM optimized version of hllMergeDense using NEON
1210
+ *
1211
+ * This function merges HyperLogLog (HLL) dense registers using ARM NEON SIMD instructions.
1212
+ * It extracts 6 bits registers from a dense format, and stores them in raw format
1213
+ *
1214
+ * Parameters:
1215
+ * - reg_raw: Pointer to the raw register array
1216
+ * - reg_dense: Pointer to the dense register array
1217
+ */
1218
+ void hllMergeDenseNEON (uint8_t * reg_raw , const uint8_t * reg_dense ) {
1219
+ uint8_t * dense_ptr = (uint8_t * )reg_dense ;
1220
+ uint8_t * raw_ptr = (uint8_t * )reg_raw ;
1221
+
1222
+ uint8x16_t idx = {0 , 1 , 2 , 0xFF ,
1223
+ 3 , 4 , 5 , 0xFF ,
1224
+ 6 , 7 , 8 , 0xFF ,
1225
+ 9 , 10 , 11 , 0xFF };
1226
+
1227
+ // Bit masks for extracting specific bit ranges
1228
+ uint8x16_t mask1 = vreinterpretq_u8_u32 (vdupq_n_u32 (0x0000003f )); // Bits 0-5
1229
+ uint8x16_t mask2 = vreinterpretq_u8_u32 (vdupq_n_u32 (0x00000fc0 )); // Bits 6-11
1230
+ uint8x16_t mask3 = vreinterpretq_u8_u32 (vdupq_n_u32 (0x0003f000 )); // Bits 12-17
1231
+ uint8x16_t mask4 = vreinterpretq_u8_u32 (vdupq_n_u32 (0x00fc0000 )); // Bits 18-23
1232
+
1233
+ for (int i = 0 ; i < HLL_REGISTERS / 16 - 1 ; ++ i ) {
1234
+ /* Load 16 bytes from dense registers but only the first 12 bytes are processed because they contain
1235
+ * 16 registers, which is copied into 16 bytes raw registers.
1236
+ * The last 4 bytes are ignored because (1) they do not form a complete number of registers, and do not fit
1237
+ * in the 16 bytes. The unprocessed 4 bytes are processed in the next iteration.
1238
+ */
1239
+ uint8x16_t r = vld1q_u8 (dense_ptr );
1240
+
1241
+ /* Reorder bytes based on index mapping
1242
+ * Lookup indices
1243
+ *From: {AAAB|BBCC|CDDD}
1244
+ *To: {AAA0|BBB0|CCC0|DDD0}
1245
+ */
1246
+ uint8x16_t x = vqtbl1q_u8 (r , idx );
1247
+
1248
+ // Extract and isolate registers
1249
+ uint8x16_t a1 = vandq_u8 (x , mask1 );
1250
+ uint8x16_t a2 = vandq_u8 (x , mask2 );
1251
+ uint8x16_t a3 = vandq_u8 (x , mask3 );
1252
+ uint8x16_t a4 = vandq_u8 (x , mask4 );
1253
+
1254
+ // Align extracted values by shifting left
1255
+ uint32x4_t a2_32 = vreinterpretq_u32_u8 (a2 );
1256
+ a2_32 = vshlq_n_u32 (a2_32 , 2 );
1257
+ a2 = vreinterpretq_u8_u32 (a2_32 );
1258
+
1259
+ uint32x4_t a3_32 = vreinterpretq_u32_u8 (a3 );
1260
+ a3_32 = vshlq_n_u32 (a3_32 , 4 );
1261
+ a3 = vreinterpretq_u8_u32 (a3_32 );
1262
+
1263
+ uint32x4_t a4_32 = vreinterpretq_u32_u8 (a4 );
1264
+ a4_32 = vshlq_n_u32 (a4_32 , 6 );
1265
+ a4 = vreinterpretq_u8_u32 (a4_32 );
1266
+
1267
+ // Combine extracted values
1268
+ uint8x16_t y1 = vorrq_u8 (a1 , a2 );
1269
+ uint8x16_t y2 = vorrq_u8 (a3 , a4 );
1270
+ uint8x16_t y = vorrq_u8 (y1 , y2 );
1271
+
1272
+ // Load current raw register values
1273
+ uint8x16_t z = vld1q_u8 (raw_ptr );
1274
+
1275
+ // Update raw registers with max values
1276
+ z = vmaxq_u8 (z , y );
1277
+
1278
+ // Store updated values
1279
+ vst1q_u8 (raw_ptr , z );
1280
+
1281
+ raw_ptr += 16 ;
1282
+ dense_ptr += 12 ;
1283
+ }
1284
+
1285
+ /* Process remaining registers, we do this manually because we don't want to over-read 4 bytes */
1286
+ uint8_t val ;
1287
+ for (int i = HLL_REGISTERS - 16 ; i < HLL_REGISTERS ; i ++ ) {
1288
+ HLL_DENSE_GET_REGISTER (val , reg_dense , i );
1289
+ if (val > reg_raw [i ]) {
1290
+ reg_raw [i ] = val ; // Update raw register if new value is greater
1291
+ }
1292
+ }
1293
+ }
1294
+ #endif // __ARM_NEON__
1295
+
1196
1296
/* Merge dense-encoded registers to raw registers array. */
1197
1297
void hllMergeDense (uint8_t * reg_raw , const uint8_t * reg_dense ) {
1198
1298
#ifdef HAVE_AVX2
@@ -1203,6 +1303,14 @@ void hllMergeDense(uint8_t *reg_raw, const uint8_t *reg_dense) {
1203
1303
}
1204
1304
}
1205
1305
#endif
1306
+ #ifdef __ARM_NEON
1307
+ if (HLL_REGISTERS == 16384 && HLL_BITS == 6 ) {
1308
+ if (HLL_USE_NEON ) {
1309
+ hllMergeDenseNEON (reg_raw , reg_dense );
1310
+ return ;
1311
+ }
1312
+ }
1313
+ #endif
1206
1314
1207
1315
uint8_t val ;
1208
1316
for (int i = 0 ; i < HLL_REGISTERS ; i ++ ) {
@@ -1357,6 +1465,74 @@ void hllDenseCompressAVX2(uint8_t *reg_dense, const uint8_t *reg_raw) {
1357
1465
}
1358
1466
#endif
1359
1467
1468
+ #if defined(__ARM_NEON )
1469
+ /*
1470
+ * hllDenseCompressNEON is ARM optimzied version of hllDenseCompress using NEON.
1471
+ *
1472
+ * This function takes a raw register (`reg_raw`) and compresses it into a dense representation (`reg_dense`).
1473
+ * It uses NEON SIMD instructions to process multiple values at once.
1474
+ *
1475
+ * - The first loop processes most of the registers in 16-element blocks using NEON instructions.
1476
+ * - The second loop handles the remaining registers using a direct assignment macro.
1477
+ *
1478
+ */
1479
+ void hllDenseCompressNEON (uint8_t * reg_dense , const uint8_t * reg_raw ) {
1480
+ /* Shuffle indices for packing bytes of dense registers
1481
+ * From: {AAA0|BBB0|CCC0|DDD0}
1482
+ * To: {AAAB|BBCC|CDDD|0000}
1483
+ */
1484
+ uint8x16_t idx = {
1485
+ 0 , 1 , 2 , // Extract bytes from lane 0
1486
+ 4 , 5 , 6 , // Extract bytes from lane 1
1487
+ 8 , 9 , 10 , // Extract bytes from lane 2
1488
+ 12 , 13 , 14 , // Extract bytes from lane 3
1489
+ 0xFF , 0xFF , 0xFF , 0xFF // Zero out last 4 elements (padding)
1490
+ };
1491
+
1492
+ // Bit masks for extracting first 6 bits from every byte within 32-bit lanes
1493
+ uint32x4_t mask1 = vdupq_n_u32 (0x0000003F ); // Extract bits 0-5
1494
+ uint32x4_t mask2 = vdupq_n_u32 (0x00003F00 ); // Extract bits 8-13
1495
+ uint32x4_t mask3 = vdupq_n_u32 (0x003F0000 ); // Extract bits 16-21
1496
+ uint32x4_t mask4 = vdupq_n_u32 (0x3F000000 ); // Extract bits 24-29
1497
+
1498
+ uint8_t * r = (uint8_t * )reg_raw ; // Input pointer
1499
+ uint8_t * t = (uint8_t * )reg_dense ; // Output pointer
1500
+
1501
+ // Process registers in blocks of 16 using NEON instructions
1502
+ // The last 16 registers are processed separately to avoid overwriting, as the final write is 12 bytes.
1503
+ for (int i = 0 ; i < HLL_REGISTERS / 16 - 1 ; i ++ ) {
1504
+ // Load 16 bytes as 4x 32-bit values
1505
+ uint32x4_t x = vld1q_u32 ((uint32_t * )r );
1506
+
1507
+ // Apply masks to extract a single register from every 4 registers, for every lane
1508
+ uint32x4_t a1 = vandq_u32 (x , mask1 );
1509
+ uint32x4_t a2 = vandq_u32 (x , mask2 );
1510
+ uint32x4_t a3 = vandq_u32 (x , mask3 );
1511
+ uint32x4_t a4 = vandq_u32 (x , mask4 );
1512
+
1513
+ // Shift extracted bits to align them properly
1514
+ a2 = vshrq_n_u32 (a2 , 2 );
1515
+ a3 = vshrq_n_u32 (a3 , 4 );
1516
+ a4 = vshrq_n_u32 (a4 , 6 );
1517
+
1518
+ uint32x4_t y1 = vorrq_u32 (a1 , a2 );
1519
+ uint32x4_t y2 = vorrq_u32 (a3 , a4 );
1520
+ uint32x4_t y = vorrq_u32 (y1 , y2 );
1521
+
1522
+ // Perform a table lookup to shuffle extracted values and align them in 12 bytes
1523
+ vst1q_u8 (t , vqtbl1q_u8 (vreinterpretq_u8_u32 (y ), idx ));
1524
+
1525
+ t += 12 ;
1526
+ r += 16 ;
1527
+ }
1528
+
1529
+ // Handle the remaining registers individually (12 bytes)
1530
+ for (int i = HLL_REGISTERS - 16 ; i < HLL_REGISTERS ; i ++ ) {
1531
+ HLL_DENSE_SET_REGISTER (reg_dense , i , reg_raw [i ]);
1532
+ }
1533
+ }
1534
+ #endif // __ARM_NEON__
1535
+
1360
1536
/* Compress raw registers to dense representation. */
1361
1537
void hllDenseCompress (uint8_t * reg_dense , const uint8_t * reg_raw ) {
1362
1538
#ifdef HAVE_AVX2
@@ -1366,6 +1542,16 @@ void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) {
1366
1542
return ;
1367
1543
}
1368
1544
}
1545
+
1546
+ #endif
1547
+
1548
+ #ifdef __ARM_NEON
1549
+ if (HLL_REGISTERS == 16384 && HLL_BITS == 6 ) {
1550
+ if (HLL_USE_NEON ) {
1551
+ hllDenseCompressNEON (reg_dense , reg_raw );
1552
+ return ;
1553
+ }
1554
+ }
1369
1555
#endif
1370
1556
1371
1557
for (int i = 0 ; i < HLL_REGISTERS ; i ++ ) {
@@ -1772,16 +1958,22 @@ void pfdebugCommand(client *c) {
1772
1958
if (!strcasecmp (c -> argv [2 ]-> ptr , "on" )) {
1773
1959
#ifdef HAVE_AVX2
1774
1960
simd_enabled = 1 ;
1961
+ #endif
1962
+ #ifdef __ARM_NEON
1963
+ simd_enabled = 1 ;
1775
1964
#endif
1776
1965
} else if (!strcasecmp (c -> argv [2 ]-> ptr , "off" )) {
1777
1966
#ifdef HAVE_AVX2
1778
1967
simd_enabled = 0 ;
1968
+ #endif
1969
+ #ifdef __ARM_NEON
1970
+ simd_enabled = 0 ;
1779
1971
#endif
1780
1972
} else {
1781
1973
addReplyError (c , "Argument must be ON or OFF" );
1782
1974
}
1783
1975
1784
- if (HLL_USE_AVX2 ) {
1976
+ if (HLL_USE_AVX2 || HLL_USE_NEON ) {
1785
1977
addReplyStatus (c , "enabled" );
1786
1978
} else {
1787
1979
addReplyStatus (c , "disabled" );
0 commit comments