Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hyperloglog ARM NEON SIMD optimization #1859

Open
wants to merge 1 commit into
base: unstable
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
194 changes: 193 additions & 1 deletion src/hyperloglog.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@
#include <immintrin.h>
#endif

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif

/* The HyperLogLog implementation is based on the following ideas:
*
* * The use of a 64 bit hash function as proposed in [1], in order to estimate
Expand Down Expand Up @@ -227,6 +231,13 @@ static int simd_enabled = 1;
#define HLL_USE_AVX2 0
#endif

#ifdef __ARM_NEON
static int simd_enabled = 1;
#define HLL_USE_NEON (simd_enabled)
#else
#define HLL_USE_NEON 0
#endif

/* =========================== Low level bit macros ========================= */

/* Macros to access the dense representation.
Expand Down Expand Up @@ -1193,6 +1204,95 @@ void hllMergeDenseAVX2(uint8_t *reg_raw, const uint8_t *reg_dense) {
}
#endif

#if defined(__ARM_NEON)
/*
* hllMergeDenseNEON is an ARM optimized version of hllMergeDense using NEON
*
* This function merges HyperLogLog (HLL) dense registers using ARM NEON SIMD instructions.
* It extracts 6 bits registers from a dense format, and stores them in raw format
*
* Parameters:
* - reg_raw: Pointer to the raw register array
* - reg_dense: Pointer to the dense register array
*/
void hllMergeDenseNEON(uint8_t *reg_raw, const uint8_t *reg_dense) {
uint8_t *dense_ptr = (uint8_t *)reg_dense;
uint8_t *raw_ptr = (uint8_t *)reg_raw;

uint8x16_t idx = {0, 1, 2, 0xFF,
3, 4, 5, 0xFF,
6, 7, 8, 0xFF,
9, 10, 11, 0xFF};

// Bit masks for extracting specific bit ranges
uint8x16_t mask1 = vreinterpretq_u8_u32(vdupq_n_u32(0x0000003f)); // Bits 0-5
uint8x16_t mask2 = vreinterpretq_u8_u32(vdupq_n_u32(0x00000fc0)); // Bits 6-11
uint8x16_t mask3 = vreinterpretq_u8_u32(vdupq_n_u32(0x0003f000)); // Bits 12-17
uint8x16_t mask4 = vreinterpretq_u8_u32(vdupq_n_u32(0x00fc0000)); // Bits 18-23

for (int i = 0; i < HLL_REGISTERS / 16 - 1; ++i) {
/* Load 16 bytes from dense registers but only the first 12 bytes are processed because they contain
* 16 registers, which is copied into 16 bytes raw registers.
* The last 4 bytes are ignored because (1) they do not form a complete number of registers, and do not fit
* in the 16 bytes. The unprocessed 4 bytes are processed in the next iteration.
*/
uint8x16_t r = vld1q_u8(dense_ptr);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

May crash on ARMv7-A if the address isn't 16-byte aligned.

TODO


/* Reorder bytes based on index mapping
* Lookup indices
*From: {AAAB|BBCC|CDDD}
*To: {AAA0|BBB0|CCC0|DDD0}
*/
uint8x16_t x = vqtbl1q_u8(r, idx);

// Extract and isolate registers
uint8x16_t a1 = vandq_u8(x, mask1);
uint8x16_t a2 = vandq_u8(x, mask2);
uint8x16_t a3 = vandq_u8(x, mask3);
uint8x16_t a4 = vandq_u8(x, mask4);

// Align extracted values by shifting left
uint32x4_t a2_32 = vreinterpretq_u32_u8(a2);
a2_32 = vshlq_n_u32(a2_32, 2);
a2 = vreinterpretq_u8_u32(a2_32);

uint32x4_t a3_32 = vreinterpretq_u32_u8(a3);
a3_32 = vshlq_n_u32(a3_32, 4);
a3 = vreinterpretq_u8_u32(a3_32);

uint32x4_t a4_32 = vreinterpretq_u32_u8(a4);
a4_32 = vshlq_n_u32(a4_32, 6);
a4 = vreinterpretq_u8_u32(a4_32);

// Combine extracted values
uint8x16_t y1 = vorrq_u8(a1, a2);
uint8x16_t y2 = vorrq_u8(a3, a4);
uint8x16_t y = vorrq_u8(y1, y2);

// Load current raw register values
uint8x16_t z = vld1q_u8(raw_ptr);

// Update raw registers with max values
z = vmaxq_u8(z, y);

// Store updated values
vst1q_u8(raw_ptr, z);

raw_ptr += 16;
dense_ptr += 12;
}

/* Process remaining registers, we do this manually because we don't want to over-read 4 bytes */
uint8_t val;
for (int i = HLL_REGISTERS - 16; i < HLL_REGISTERS; i++) {
HLL_DENSE_GET_REGISTER(val, reg_dense, i);
if (val > reg_raw[i]) {
reg_raw[i] = val; // Update raw register if new value is greater
}
}
}
#endif // __ARM_NEON__

/* Merge dense-encoded registers to raw registers array. */
void hllMergeDense(uint8_t *reg_raw, const uint8_t *reg_dense) {
#ifdef HAVE_AVX2
Expand All @@ -1203,6 +1303,14 @@ void hllMergeDense(uint8_t *reg_raw, const uint8_t *reg_dense) {
}
}
#endif
#ifdef __ARM_NEON
if (HLL_REGISTERS == 16384 && HLL_BITS == 6) {
if (HLL_USE_NEON) {
hllMergeDenseNEON(reg_raw, reg_dense);
return;
}
}
#endif

uint8_t val;
for (int i = 0; i < HLL_REGISTERS; i++) {
Expand Down Expand Up @@ -1357,6 +1465,74 @@ void hllDenseCompressAVX2(uint8_t *reg_dense, const uint8_t *reg_raw) {
}
#endif

#if defined(__ARM_NEON)
/*
* hllDenseCompressNEON is ARM optimized version of hllDenseCompress using NEON.
*
* This function takes a raw register (`reg_raw`) and compresses it into a dense representation (`reg_dense`).
* It uses NEON SIMD instructions to process multiple values at once.
*
* - The first loop processes most of the registers in 16-element blocks using NEON instructions.
* - The second loop handles the remaining registers using a direct assignment macro.
*
*/
void hllDenseCompressNEON(uint8_t *reg_dense, const uint8_t *reg_raw) {
/* Shuffle indices for packing bytes of dense registers
* From: {AAA0|BBB0|CCC0|DDD0}
* To: {AAAB|BBCC|CDDD|0000}
*/
uint8x16_t idx = {
0, 1, 2, // Extract bytes from lane 0
4, 5, 6, // Extract bytes from lane 1
8, 9, 10, // Extract bytes from lane 2
12, 13, 14, // Extract bytes from lane 3
0xFF, 0xFF, 0xFF, 0xFF // Zero out last 4 elements (padding)
};

// Bit masks for extracting first 6 bits from every byte within 32-bit lanes
uint32x4_t mask1 = vdupq_n_u32(0x0000003F); // Extract bits 0-5
uint32x4_t mask2 = vdupq_n_u32(0x00003F00); // Extract bits 8-13
uint32x4_t mask3 = vdupq_n_u32(0x003F0000); // Extract bits 16-21
uint32x4_t mask4 = vdupq_n_u32(0x3F000000); // Extract bits 24-29

uint8_t *r = (uint8_t *)reg_raw; // Input pointer
uint8_t *t = (uint8_t *)reg_dense; // Output pointer

// Process registers in blocks of 16 using NEON instructions
// The last 16 registers are processed separately to avoid overwriting, as the final write is 12 bytes.
for (int i = 0; i < HLL_REGISTERS / 16 - 1; i++) {
// Load 16 bytes as 4x 32-bit values
uint32x4_t x = vld1q_u32((uint32_t *)r);

// Apply masks to extract a single register from every 4 registers, for every lane
uint32x4_t a1 = vandq_u32(x, mask1);
uint32x4_t a2 = vandq_u32(x, mask2);
uint32x4_t a3 = vandq_u32(x, mask3);
uint32x4_t a4 = vandq_u32(x, mask4);

// Shift extracted bits to align them properly
a2 = vshrq_n_u32(a2, 2);
a3 = vshrq_n_u32(a3, 4);
a4 = vshrq_n_u32(a4, 6);

uint32x4_t y1 = vorrq_u32(a1, a2);
uint32x4_t y2 = vorrq_u32(a3, a4);
uint32x4_t y = vorrq_u32(y1, y2);

// Perform a table lookup to shuffle extracted values and align them in 12 bytes
vst1q_u8(t, vqtbl1q_u8(vreinterpretq_u8_u32(y), idx));

t += 12;
r += 16;
}

// Handle the remaining registers individually (12 bytes)
for (int i = HLL_REGISTERS - 16; i < HLL_REGISTERS; i++) {
HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]);
}
}
#endif // __ARM_NEON__

/* Compress raw registers to dense representation. */
void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) {
#ifdef HAVE_AVX2
Expand All @@ -1366,6 +1542,16 @@ void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) {
return;
}
}

#endif

#ifdef __ARM_NEON
if (HLL_REGISTERS == 16384 && HLL_BITS == 6) {
if (HLL_USE_NEON) {
hllDenseCompressNEON(reg_dense, reg_raw);
return;
}
}
#endif

for (int i = 0; i < HLL_REGISTERS; i++) {
Expand Down Expand Up @@ -1772,16 +1958,22 @@ void pfdebugCommand(client *c) {
if (!strcasecmp(c->argv[2]->ptr, "on")) {
#ifdef HAVE_AVX2
simd_enabled = 1;
#endif
#ifdef __ARM_NEON
simd_enabled = 1;
#endif
Comment on lines 1959 to 1964
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#if defined(HAVE_AVX) || defined(__ARM_NEON)

or we can consider some macro like HLL_HAVE_SIMD to avoid this repetition, also to avoid defining the variable twice.

} else if (!strcasecmp(c->argv[2]->ptr, "off")) {
#ifdef HAVE_AVX2
simd_enabled = 0;
#endif
#ifdef __ARM_NEON
simd_enabled = 0;
#endif
} else {
addReplyError(c, "Argument must be ON or OFF");
}

if (HLL_USE_AVX2) {
if (HLL_USE_AVX2 || HLL_USE_NEON) {
addReplyStatus(c, "enabled");
} else {
addReplyStatus(c, "disabled");
Expand Down
Loading