Skip to content

Commit b2c857e

Browse files
committed
Add ARM NEON optimization for HyperLogLog
- Implement two NEON optmized functions for converting between raw and dense representations in HyperLogLog: 1. hllMergeDenseNEON 2. hllDenseCompressNEON These functions process 16 registers in each iteration. - Utilize existing SIMD test in hyperloglog.tcl (previously added for AVX2 optimization) to validate NEON implementation Test: valkey-benchmark -n 1000000 --dbnum 9 -p 21111 PFMERGE z hll1{t} hll2{t} +-------------------+-----------+-----------+---------------+ | Metric | Before | After | Improvement % | +-------------------+-----------+-----------+---------------+ | Throughput (k rps)| 7.42 | 76.98 | 937.47% | +-------------------+-----------+-----------+---------------+ | Latency (msec) | | | | | avg | 6.686 | 0.595 | 91.10% | | min | 0.520 | 0.152 | 70.77% | | p50 | 7.799 | 0.599 | 92.32% | | p95 | 8.039 | 0.767 | 90.46% | | p99 | 8.111 | 0.807 | 90.05% | | max | 9.263 | 1.463 | 84.21% | +-------------------+-----------+-----------+---------------+ Hardware: CPU: Graviton 3 Architecture: aarch64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 64 On-line CPU(s) list: 0-63 NUMA: NUMA node(s): 1 NUMA node0 CPU(s): 0-63 Memory: 256 GB Signed-off-by: xbasel <[email protected]>
1 parent aa88453 commit b2c857e

File tree

1 file changed

+193
-1
lines changed

1 file changed

+193
-1
lines changed

src/hyperloglog.c

+193-1
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@
4747
#include <immintrin.h>
4848
#endif
4949

50+
#ifdef __ARM_NEON
51+
#include <arm_neon.h>
52+
#endif
53+
5054
/* The HyperLogLog implementation is based on the following ideas:
5155
*
5256
* * The use of a 64 bit hash function as proposed in [1], in order to estimate
@@ -227,6 +231,13 @@ static int simd_enabled = 1;
227231
#define HLL_USE_AVX2 0
228232
#endif
229233

234+
#ifdef __ARM_NEON
235+
static int simd_enabled = 1;
236+
#define HLL_USE_NEON (simd_enabled)
237+
#else
238+
#define HLL_USE_NEON 0
239+
#endif
240+
230241
/* =========================== Low level bit macros ========================= */
231242

232243
/* Macros to access the dense representation.
@@ -1193,6 +1204,95 @@ void hllMergeDenseAVX2(uint8_t *reg_raw, const uint8_t *reg_dense) {
11931204
}
11941205
#endif
11951206

1207+
#if defined(__ARM_NEON)
1208+
/*
1209+
* hllMergeDenseNEON is an ARM optimized version of hllMergeDense using NEON
1210+
*
1211+
* This function merges HyperLogLog (HLL) dense registers using ARM NEON SIMD instructions.
1212+
* It extracts 6 bits registers from a dense format, and stores them in raw format
1213+
*
1214+
* Parameters:
1215+
* - reg_raw: Pointer to the raw register array
1216+
* - reg_dense: Pointer to the dense register array
1217+
*/
1218+
void hllMergeDenseNEON(uint8_t *reg_raw, const uint8_t *reg_dense) {
1219+
uint8_t *dense_ptr = (uint8_t *)reg_dense;
1220+
uint8_t *raw_ptr = (uint8_t *)reg_raw;
1221+
1222+
uint8x16_t idx = {0, 1, 2, 0xFF,
1223+
3, 4, 5, 0xFF,
1224+
6, 7, 8, 0xFF,
1225+
9, 10, 11, 0xFF};
1226+
1227+
// Bit masks for extracting specific bit ranges
1228+
uint8x16_t mask1 = vreinterpretq_u8_u32(vdupq_n_u32(0x0000003f)); // Bits 0-5
1229+
uint8x16_t mask2 = vreinterpretq_u8_u32(vdupq_n_u32(0x00000fc0)); // Bits 6-11
1230+
uint8x16_t mask3 = vreinterpretq_u8_u32(vdupq_n_u32(0x0003f000)); // Bits 12-17
1231+
uint8x16_t mask4 = vreinterpretq_u8_u32(vdupq_n_u32(0x00fc0000)); // Bits 18-23
1232+
1233+
for (int i = 0; i < HLL_REGISTERS / 16 - 1; ++i) {
1234+
/* Load 16 bytes from dense registers but only the first 12 bytes are processed because they contain
1235+
* 16 registers, which is copied into 16 bytes raw registers.
1236+
* The last 4 bytes are ignored because (1) they do not form a complete number of registers, and do not fit
1237+
* in the 16 bytes. The unprocessed 4 bytes are processed in the next iteration.
1238+
*/
1239+
uint8x16_t r = vld1q_u8(dense_ptr);
1240+
1241+
/* Reorder bytes based on index mapping
1242+
* Lookup indices
1243+
*From: {AAAB|BBCC|CDDD}
1244+
*To: {AAA0|BBB0|CCC0|DDD0}
1245+
*/
1246+
uint8x16_t x = vqtbl1q_u8(r, idx);
1247+
1248+
// Extract and isolate registers
1249+
uint8x16_t a1 = vandq_u8(x, mask1);
1250+
uint8x16_t a2 = vandq_u8(x, mask2);
1251+
uint8x16_t a3 = vandq_u8(x, mask3);
1252+
uint8x16_t a4 = vandq_u8(x, mask4);
1253+
1254+
// Align extracted values by shifting left
1255+
uint32x4_t a2_32 = vreinterpretq_u32_u8(a2);
1256+
a2_32 = vshlq_n_u32(a2_32, 2);
1257+
a2 = vreinterpretq_u8_u32(a2_32);
1258+
1259+
uint32x4_t a3_32 = vreinterpretq_u32_u8(a3);
1260+
a3_32 = vshlq_n_u32(a3_32, 4);
1261+
a3 = vreinterpretq_u8_u32(a3_32);
1262+
1263+
uint32x4_t a4_32 = vreinterpretq_u32_u8(a4);
1264+
a4_32 = vshlq_n_u32(a4_32, 6);
1265+
a4 = vreinterpretq_u8_u32(a4_32);
1266+
1267+
// Combine extracted values
1268+
uint8x16_t y1 = vorrq_u8(a1, a2);
1269+
uint8x16_t y2 = vorrq_u8(a3, a4);
1270+
uint8x16_t y = vorrq_u8(y1, y2);
1271+
1272+
// Load current raw register values
1273+
uint8x16_t z = vld1q_u8(raw_ptr);
1274+
1275+
// Update raw registers with max values
1276+
z = vmaxq_u8(z, y);
1277+
1278+
// Store updated values
1279+
vst1q_u8(raw_ptr, z);
1280+
1281+
raw_ptr += 16;
1282+
dense_ptr += 12;
1283+
}
1284+
1285+
/* Process remaining registers, we do this manually because we don't want to over-read 4 bytes */
1286+
uint8_t val;
1287+
for (int i = HLL_REGISTERS - 16; i < HLL_REGISTERS; i++) {
1288+
HLL_DENSE_GET_REGISTER(val, reg_dense, i);
1289+
if (val > reg_raw[i]) {
1290+
reg_raw[i] = val; // Update raw register if new value is greater
1291+
}
1292+
}
1293+
}
1294+
#endif // __ARM_NEON__
1295+
11961296
/* Merge dense-encoded registers to raw registers array. */
11971297
void hllMergeDense(uint8_t *reg_raw, const uint8_t *reg_dense) {
11981298
#ifdef HAVE_AVX2
@@ -1203,6 +1303,14 @@ void hllMergeDense(uint8_t *reg_raw, const uint8_t *reg_dense) {
12031303
}
12041304
}
12051305
#endif
1306+
#ifdef __ARM_NEON
1307+
if (HLL_REGISTERS == 16384 && HLL_BITS == 6) {
1308+
if (HLL_USE_NEON) {
1309+
hllMergeDenseNEON(reg_raw, reg_dense);
1310+
return;
1311+
}
1312+
}
1313+
#endif
12061314

12071315
uint8_t val;
12081316
for (int i = 0; i < HLL_REGISTERS; i++) {
@@ -1357,6 +1465,74 @@ void hllDenseCompressAVX2(uint8_t *reg_dense, const uint8_t *reg_raw) {
13571465
}
13581466
#endif
13591467

1468+
#if defined(__ARM_NEON)
1469+
/*
1470+
* hllDenseCompressNEON is ARM optimzied version of hllDenseCompress using NEON.
1471+
*
1472+
* This function takes a raw register (`reg_raw`) and compresses it into a dense representation (`reg_dense`).
1473+
* It uses NEON SIMD instructions to process multiple values at once.
1474+
*
1475+
* - The first loop processes most of the registers in 16-element blocks using NEON instructions.
1476+
* - The second loop handles the remaining registers using a direct assignment macro.
1477+
*
1478+
*/
1479+
void hllDenseCompressNEON(uint8_t *reg_dense, const uint8_t *reg_raw) {
1480+
/* Shuffle indices for packing bytes of dense registers
1481+
* From: {AAA0|BBB0|CCC0|DDD0}
1482+
* To: {AAAB|BBCC|CDDD|0000}
1483+
*/
1484+
uint8x16_t idx = {
1485+
0, 1, 2, // Extract bytes from lane 0
1486+
4, 5, 6, // Extract bytes from lane 1
1487+
8, 9, 10, // Extract bytes from lane 2
1488+
12, 13, 14, // Extract bytes from lane 3
1489+
0xFF, 0xFF, 0xFF, 0xFF // Zero out last 4 elements (padding)
1490+
};
1491+
1492+
// Bit masks for extracting first 6 bits from every byte within 32-bit lanes
1493+
uint32x4_t mask1 = vdupq_n_u32(0x0000003F); // Extract bits 0-5
1494+
uint32x4_t mask2 = vdupq_n_u32(0x00003F00); // Extract bits 8-13
1495+
uint32x4_t mask3 = vdupq_n_u32(0x003F0000); // Extract bits 16-21
1496+
uint32x4_t mask4 = vdupq_n_u32(0x3F000000); // Extract bits 24-29
1497+
1498+
uint8_t *r = (uint8_t *)reg_raw; // Input pointer
1499+
uint8_t *t = (uint8_t *)reg_dense; // Output pointer
1500+
1501+
// Process registers in blocks of 16 using NEON instructions
1502+
// The last 16 registers are processed separately to avoid overwriting, as the final write is 12 bytes.
1503+
for (int i = 0; i < HLL_REGISTERS / 16 - 1; i++) {
1504+
// Load 16 bytes as 4x 32-bit values
1505+
uint32x4_t x = vld1q_u32((uint32_t *)r);
1506+
1507+
// Apply masks to extract a single register from every 4 registers, for every lane
1508+
uint32x4_t a1 = vandq_u32(x, mask1);
1509+
uint32x4_t a2 = vandq_u32(x, mask2);
1510+
uint32x4_t a3 = vandq_u32(x, mask3);
1511+
uint32x4_t a4 = vandq_u32(x, mask4);
1512+
1513+
// Shift extracted bits to align them properly
1514+
a2 = vshrq_n_u32(a2, 2);
1515+
a3 = vshrq_n_u32(a3, 4);
1516+
a4 = vshrq_n_u32(a4, 6);
1517+
1518+
uint32x4_t y1 = vorrq_u32(a1, a2);
1519+
uint32x4_t y2 = vorrq_u32(a3, a4);
1520+
uint32x4_t y = vorrq_u32(y1, y2);
1521+
1522+
// Perform a table lookup to shuffle extracted values and align them in 12 bytes
1523+
vst1q_u8(t, vqtbl1q_u8(vreinterpretq_u8_u32(y), idx));
1524+
1525+
t += 12;
1526+
r += 16;
1527+
}
1528+
1529+
// Handle the remaining registers individually (12 bytes)
1530+
for (int i = HLL_REGISTERS - 16; i < HLL_REGISTERS; i++) {
1531+
HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]);
1532+
}
1533+
}
1534+
#endif // __ARM_NEON__
1535+
13601536
/* Compress raw registers to dense representation. */
13611537
void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) {
13621538
#ifdef HAVE_AVX2
@@ -1366,6 +1542,16 @@ void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) {
13661542
return;
13671543
}
13681544
}
1545+
1546+
#endif
1547+
1548+
#ifdef __ARM_NEON
1549+
if (HLL_REGISTERS == 16384 && HLL_BITS == 6) {
1550+
if (HLL_USE_NEON) {
1551+
hllDenseCompressNEON(reg_dense, reg_raw);
1552+
return;
1553+
}
1554+
}
13691555
#endif
13701556

13711557
for (int i = 0; i < HLL_REGISTERS; i++) {
@@ -1772,16 +1958,22 @@ void pfdebugCommand(client *c) {
17721958
if (!strcasecmp(c->argv[2]->ptr, "on")) {
17731959
#ifdef HAVE_AVX2
17741960
simd_enabled = 1;
1961+
#endif
1962+
#ifdef __ARM_NEON
1963+
simd_enabled = 1;
17751964
#endif
17761965
} else if (!strcasecmp(c->argv[2]->ptr, "off")) {
17771966
#ifdef HAVE_AVX2
17781967
simd_enabled = 0;
1968+
#endif
1969+
#ifdef __ARM_NEON
1970+
simd_enabled = 0;
17791971
#endif
17801972
} else {
17811973
addReplyError(c, "Argument must be ON or OFF");
17821974
}
17831975

1784-
if (HLL_USE_AVX2) {
1976+
if (HLL_USE_AVX2 || HLL_USE_NEON) {
17851977
addReplyStatus(c, "enabled");
17861978
} else {
17871979
addReplyStatus(c, "disabled");

0 commit comments

Comments
 (0)