-
Notifications
You must be signed in to change notification settings - Fork 758
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Hyperloglog ARM NEON SIMD optimization #1859
Open
xbasel
wants to merge
1
commit into
valkey-io:unstable
Choose a base branch
from
xbasel:hll_neon
base: unstable
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -47,6 +47,10 @@ | |
#include <immintrin.h> | ||
#endif | ||
|
||
#ifdef __ARM_NEON | ||
#include <arm_neon.h> | ||
#endif | ||
|
||
/* The HyperLogLog implementation is based on the following ideas: | ||
* | ||
* * The use of a 64 bit hash function as proposed in [1], in order to estimate | ||
|
@@ -227,6 +231,13 @@ static int simd_enabled = 1; | |
#define HLL_USE_AVX2 0 | ||
#endif | ||
|
||
#ifdef __ARM_NEON | ||
static int simd_enabled = 1; | ||
#define HLL_USE_NEON (simd_enabled) | ||
#else | ||
#define HLL_USE_NEON 0 | ||
#endif | ||
|
||
/* =========================== Low level bit macros ========================= */ | ||
|
||
/* Macros to access the dense representation. | ||
|
@@ -1193,6 +1204,95 @@ void hllMergeDenseAVX2(uint8_t *reg_raw, const uint8_t *reg_dense) { | |
} | ||
#endif | ||
|
||
#if defined(__ARM_NEON) | ||
/* | ||
* hllMergeDenseNEON is an ARM optimized version of hllMergeDense using NEON | ||
* | ||
* This function merges HyperLogLog (HLL) dense registers using ARM NEON SIMD instructions. | ||
* It extracts 6 bits registers from a dense format, and stores them in raw format | ||
* | ||
* Parameters: | ||
* - reg_raw: Pointer to the raw register array | ||
* - reg_dense: Pointer to the dense register array | ||
*/ | ||
void hllMergeDenseNEON(uint8_t *reg_raw, const uint8_t *reg_dense) { | ||
uint8_t *dense_ptr = (uint8_t *)reg_dense; | ||
uint8_t *raw_ptr = (uint8_t *)reg_raw; | ||
|
||
uint8x16_t idx = {0, 1, 2, 0xFF, | ||
3, 4, 5, 0xFF, | ||
6, 7, 8, 0xFF, | ||
9, 10, 11, 0xFF}; | ||
|
||
// Bit masks for extracting specific bit ranges | ||
uint8x16_t mask1 = vreinterpretq_u8_u32(vdupq_n_u32(0x0000003f)); // Bits 0-5 | ||
uint8x16_t mask2 = vreinterpretq_u8_u32(vdupq_n_u32(0x00000fc0)); // Bits 6-11 | ||
uint8x16_t mask3 = vreinterpretq_u8_u32(vdupq_n_u32(0x0003f000)); // Bits 12-17 | ||
uint8x16_t mask4 = vreinterpretq_u8_u32(vdupq_n_u32(0x00fc0000)); // Bits 18-23 | ||
|
||
for (int i = 0; i < HLL_REGISTERS / 16 - 1; ++i) { | ||
/* Load 16 bytes from dense registers but only the first 12 bytes are processed because they contain | ||
* 16 registers, which is copied into 16 bytes raw registers. | ||
* The last 4 bytes are ignored because (1) they do not form a complete number of registers, and do not fit | ||
* in the 16 bytes. The unprocessed 4 bytes are processed in the next iteration. | ||
*/ | ||
uint8x16_t r = vld1q_u8(dense_ptr); | ||
|
||
/* Reorder bytes based on index mapping | ||
* Lookup indices | ||
*From: {AAAB|BBCC|CDDD} | ||
*To: {AAA0|BBB0|CCC0|DDD0} | ||
*/ | ||
uint8x16_t x = vqtbl1q_u8(r, idx); | ||
|
||
// Extract and isolate registers | ||
uint8x16_t a1 = vandq_u8(x, mask1); | ||
uint8x16_t a2 = vandq_u8(x, mask2); | ||
uint8x16_t a3 = vandq_u8(x, mask3); | ||
uint8x16_t a4 = vandq_u8(x, mask4); | ||
|
||
// Align extracted values by shifting left | ||
uint32x4_t a2_32 = vreinterpretq_u32_u8(a2); | ||
a2_32 = vshlq_n_u32(a2_32, 2); | ||
a2 = vreinterpretq_u8_u32(a2_32); | ||
|
||
uint32x4_t a3_32 = vreinterpretq_u32_u8(a3); | ||
a3_32 = vshlq_n_u32(a3_32, 4); | ||
a3 = vreinterpretq_u8_u32(a3_32); | ||
|
||
uint32x4_t a4_32 = vreinterpretq_u32_u8(a4); | ||
a4_32 = vshlq_n_u32(a4_32, 6); | ||
a4 = vreinterpretq_u8_u32(a4_32); | ||
|
||
// Combine extracted values | ||
uint8x16_t y1 = vorrq_u8(a1, a2); | ||
uint8x16_t y2 = vorrq_u8(a3, a4); | ||
uint8x16_t y = vorrq_u8(y1, y2); | ||
|
||
// Load current raw register values | ||
uint8x16_t z = vld1q_u8(raw_ptr); | ||
|
||
// Update raw registers with max values | ||
z = vmaxq_u8(z, y); | ||
|
||
// Store updated values | ||
vst1q_u8(raw_ptr, z); | ||
|
||
raw_ptr += 16; | ||
dense_ptr += 12; | ||
} | ||
|
||
/* Process remaining registers, we do this manually because we don't want to over-read 4 bytes */ | ||
uint8_t val; | ||
for (int i = HLL_REGISTERS - 16; i < HLL_REGISTERS; i++) { | ||
HLL_DENSE_GET_REGISTER(val, reg_dense, i); | ||
if (val > reg_raw[i]) { | ||
reg_raw[i] = val; // Update raw register if new value is greater | ||
} | ||
} | ||
} | ||
#endif // __ARM_NEON__ | ||
|
||
/* Merge dense-encoded registers to raw registers array. */ | ||
void hllMergeDense(uint8_t *reg_raw, const uint8_t *reg_dense) { | ||
#ifdef HAVE_AVX2 | ||
|
@@ -1203,6 +1303,14 @@ void hllMergeDense(uint8_t *reg_raw, const uint8_t *reg_dense) { | |
} | ||
} | ||
#endif | ||
#ifdef __ARM_NEON | ||
if (HLL_REGISTERS == 16384 && HLL_BITS == 6) { | ||
if (HLL_USE_NEON) { | ||
hllMergeDenseNEON(reg_raw, reg_dense); | ||
return; | ||
} | ||
} | ||
#endif | ||
|
||
uint8_t val; | ||
for (int i = 0; i < HLL_REGISTERS; i++) { | ||
|
@@ -1357,6 +1465,74 @@ void hllDenseCompressAVX2(uint8_t *reg_dense, const uint8_t *reg_raw) { | |
} | ||
#endif | ||
|
||
#if defined(__ARM_NEON) | ||
/* | ||
* hllDenseCompressNEON is ARM optimized version of hllDenseCompress using NEON. | ||
* | ||
* This function takes a raw register (`reg_raw`) and compresses it into a dense representation (`reg_dense`). | ||
* It uses NEON SIMD instructions to process multiple values at once. | ||
* | ||
* - The first loop processes most of the registers in 16-element blocks using NEON instructions. | ||
* - The second loop handles the remaining registers using a direct assignment macro. | ||
* | ||
*/ | ||
void hllDenseCompressNEON(uint8_t *reg_dense, const uint8_t *reg_raw) { | ||
/* Shuffle indices for packing bytes of dense registers | ||
* From: {AAA0|BBB0|CCC0|DDD0} | ||
* To: {AAAB|BBCC|CDDD|0000} | ||
*/ | ||
uint8x16_t idx = { | ||
0, 1, 2, // Extract bytes from lane 0 | ||
4, 5, 6, // Extract bytes from lane 1 | ||
8, 9, 10, // Extract bytes from lane 2 | ||
12, 13, 14, // Extract bytes from lane 3 | ||
0xFF, 0xFF, 0xFF, 0xFF // Zero out last 4 elements (padding) | ||
}; | ||
|
||
// Bit masks for extracting first 6 bits from every byte within 32-bit lanes | ||
uint32x4_t mask1 = vdupq_n_u32(0x0000003F); // Extract bits 0-5 | ||
uint32x4_t mask2 = vdupq_n_u32(0x00003F00); // Extract bits 8-13 | ||
uint32x4_t mask3 = vdupq_n_u32(0x003F0000); // Extract bits 16-21 | ||
uint32x4_t mask4 = vdupq_n_u32(0x3F000000); // Extract bits 24-29 | ||
|
||
uint8_t *r = (uint8_t *)reg_raw; // Input pointer | ||
uint8_t *t = (uint8_t *)reg_dense; // Output pointer | ||
|
||
// Process registers in blocks of 16 using NEON instructions | ||
// The last 16 registers are processed separately to avoid overwriting, as the final write is 12 bytes. | ||
for (int i = 0; i < HLL_REGISTERS / 16 - 1; i++) { | ||
// Load 16 bytes as 4x 32-bit values | ||
uint32x4_t x = vld1q_u32((uint32_t *)r); | ||
|
||
// Apply masks to extract a single register from every 4 registers, for every lane | ||
uint32x4_t a1 = vandq_u32(x, mask1); | ||
uint32x4_t a2 = vandq_u32(x, mask2); | ||
uint32x4_t a3 = vandq_u32(x, mask3); | ||
uint32x4_t a4 = vandq_u32(x, mask4); | ||
|
||
// Shift extracted bits to align them properly | ||
a2 = vshrq_n_u32(a2, 2); | ||
a3 = vshrq_n_u32(a3, 4); | ||
a4 = vshrq_n_u32(a4, 6); | ||
|
||
uint32x4_t y1 = vorrq_u32(a1, a2); | ||
uint32x4_t y2 = vorrq_u32(a3, a4); | ||
uint32x4_t y = vorrq_u32(y1, y2); | ||
|
||
// Perform a table lookup to shuffle extracted values and align them in 12 bytes | ||
vst1q_u8(t, vqtbl1q_u8(vreinterpretq_u8_u32(y), idx)); | ||
|
||
t += 12; | ||
r += 16; | ||
} | ||
|
||
// Handle the remaining registers individually (12 bytes) | ||
for (int i = HLL_REGISTERS - 16; i < HLL_REGISTERS; i++) { | ||
HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]); | ||
} | ||
} | ||
#endif // __ARM_NEON__ | ||
|
||
/* Compress raw registers to dense representation. */ | ||
void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) { | ||
#ifdef HAVE_AVX2 | ||
|
@@ -1366,6 +1542,16 @@ void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) { | |
return; | ||
} | ||
} | ||
|
||
#endif | ||
|
||
#ifdef __ARM_NEON | ||
if (HLL_REGISTERS == 16384 && HLL_BITS == 6) { | ||
if (HLL_USE_NEON) { | ||
hllDenseCompressNEON(reg_dense, reg_raw); | ||
return; | ||
} | ||
} | ||
#endif | ||
|
||
for (int i = 0; i < HLL_REGISTERS; i++) { | ||
|
@@ -1772,16 +1958,22 @@ void pfdebugCommand(client *c) { | |
if (!strcasecmp(c->argv[2]->ptr, "on")) { | ||
#ifdef HAVE_AVX2 | ||
simd_enabled = 1; | ||
#endif | ||
#ifdef __ARM_NEON | ||
simd_enabled = 1; | ||
#endif | ||
Comment on lines
1959
to
1964
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
or we can consider some macro like |
||
} else if (!strcasecmp(c->argv[2]->ptr, "off")) { | ||
#ifdef HAVE_AVX2 | ||
simd_enabled = 0; | ||
#endif | ||
#ifdef __ARM_NEON | ||
simd_enabled = 0; | ||
#endif | ||
} else { | ||
addReplyError(c, "Argument must be ON or OFF"); | ||
} | ||
|
||
if (HLL_USE_AVX2) { | ||
if (HLL_USE_AVX2 || HLL_USE_NEON) { | ||
addReplyStatus(c, "enabled"); | ||
} else { | ||
addReplyStatus(c, "disabled"); | ||
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
May crash on ARMv7-A if the address isn't 16-byte aligned.
TODO