-
Notifications
You must be signed in to change notification settings - Fork 9.2k
HADOOP-19724. [RISC-V] Add rv64 Zbc (CLMUL) bulk CRC32 (CRC32C not optimized) #8031
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: trunk
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,24 +16,200 @@ | |
| * limitations under the License. | ||
| */ | ||
|
|
||
| #include <assert.h> | ||
| #include <stddef.h> // for size_t | ||
| #include <stdio.h> | ||
| #include <string.h> | ||
|
|
||
| #include "bulk_crc32.h" | ||
| #include "gcc_optimizations.h" | ||
|
|
||
| /** | ||
| * Hardware-accelerated CRC32 calculation using RISC-V Zbc extension. | ||
| * Uses carry-less multiply instructions (clmul/clmulh) for CRC32 (zlib | ||
| * polynomial). | ||
| */ | ||
|
|
||
/*
 * Signature of a pipelined CRC routine: three in/out checksum slots, the
 * base of the data buffer, the per-block size in bytes and the number of
 * blocks to process.
 */
typedef void (*crc_pipelined_func_t)(uint32_t *crc1, uint32_t *crc2,
                                     uint32_t *crc3, const uint8_t *p_buf,
                                     size_t block_size, int num_blocks);

/*
 * Dispatch pointer for the zlib-polynomial CRC32. It is only declared here;
 * the definition lives in another translation unit.
 */
extern crc_pipelined_func_t pipelined_crc32_zlib_func;
|
|
||
| #if defined(__riscv) && (__riscv_xlen == 64) | ||
|
|
||
/*
 * Constants for the carry-less-multiply CRC32 (zlib polynomial) routine.
 *
 * R3/R4 are the multipliers applied to the two 64-bit halves of the running
 * state while folding, R5 and RU are used in the final reduction, and
 * POLY_TRUE_LE_FULL is the 33-bit form of the reflected polynomial:
 * (0xEDB88320 << 1) | 1.
 *
 * NOTE(review): R3/R4/R5/RU look like the precomputed folding / Barrett
 * constants used by other CLMUL-based CRC32 implementations; confirm the
 * derivation and cite its source here.
 */
#define RV_CRC32_CONST_R3          0x01751997D0ULL
#define RV_CRC32_CONST_R4          0x00CCAA009EULL
#define RV_CRC32_CONST_R5          0x0163CD6124ULL
#define RV_CRC32_CONST_RU          0x01F7011641ULL
#define RV_CRC32_POLY_TRUE_LE_FULL 0x01DB710641ULL
#define RV_CRC32_MASK32            0xFFFFFFFFULL /* low 32 bits */
|
|
||
/**
 * Carry-less multiply, low half.
 *
 * Executes the RISC-V Zbc `clmul` instruction on a and b and returns the
 * destination register, i.e. the low 64 bits of the 128-bit carry-less
 * (XOR instead of add) product of the two operands.
 *
 * The `.option arch, +zbc` wrapper lets the assembler accept the instruction
 * even when the translation unit is not built with Zbc enabled, so callers
 * must make sure the CPU really implements Zbc before reaching this code.
 *
 * The asm is deliberately not `volatile`: it has no side effects, and its
 * result depends only on its inputs, so the compiler is free to schedule,
 * combine or drop it like any other pure computation.
 */
static inline uint64_t rv_clmul(uint64_t a, uint64_t b) {
  uint64_t r;
  __asm__(
      ".option push\n\t"
      ".option arch, +zbc\n\t"
      "clmul %0, %1, %2\n\t"
      ".option pop\n\t"
      : "=r"(r)
      : "r"(a), "r"(b));
  return r;
}
|
|
||
/**
 * Carry-less multiply, high half.
 *
 * Executes the RISC-V Zbc `clmulh` instruction on a and b and returns the
 * destination register, i.e. the upper 64 bits of the 128-bit carry-less
 * (XOR instead of add) product of the two operands.
 *
 * The `.option arch, +zbc` wrapper lets the assembler accept the instruction
 * even when the translation unit is not built with Zbc enabled, so callers
 * must make sure the CPU really implements Zbc before reaching this code.
 *
 * The asm is deliberately not `volatile`: it has no side effects, and its
 * result depends only on its inputs, so the compiler is free to schedule,
 * combine or drop it like any other pure computation.
 */
static inline uint64_t rv_clmulh(uint64_t a, uint64_t b) {
  uint64_t r;
  __asm__(
      ".option push\n\t"
      ".option arch, +zbc\n\t"
      "clmulh %0, %1, %2\n\t"
      ".option pop\n\t"
      : "=r"(r)
      : "r"(a), "r"(b));
  return r;
}
|
|
||
/**
 * Bit-at-a-time CRC32 update for the zlib polynomial (reflected form
 * 0xEDB88320).
 *
 * The routine performs no pre- or post-inversion: `crc` is used as the
 * starting register value and the raw register is returned, so callers
 * apply any initial/final XOR themselves. With len == 0 the input value is
 * returned unchanged.
 */
static inline uint32_t rv_crc32_zlib_bitwise(uint32_t crc, const uint8_t *buf,
                                             size_t len) {
  uint32_t state = crc;
  const uint8_t *cur = buf;

  for (size_t left = len; left != 0; --left) {
    state ^= *cur++;
    for (int bit = 8; bit > 0; --bit) {
      /* Shift one bit out; fold the polynomial in when that bit was set. */
      const uint32_t lsb = state & 1U;
      state >>= 1;
      if (lsb) {
        state ^= 0xEDB88320U;
      }
    }
  }
  return state;
}
|
|
||
| static uint32_t rv_crc32_zlib_clmul(uint32_t crc, const uint8_t *buf, | ||
| size_t len) { | ||
| const uint8_t *p = buf; | ||
| size_t n = len; | ||
|
|
||
| if (n < 32) { | ||
| return rv_crc32_zlib_bitwise(crc, p, n); | ||
| } | ||
|
|
||
| uintptr_t mis = (uintptr_t)p & 0xF; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. comment that this is to handle misaligned data and that this is considered unlikely |
||
| if (unlikely(mis)) { | ||
| size_t pre = 16 - mis; | ||
| if (pre > n) pre = n; | ||
| crc = rv_crc32_zlib_bitwise(crc, p, pre); | ||
| p += pre; | ||
| n -= pre; | ||
| } | ||
|
|
||
| uint64_t x0 = *(const uint64_t *)(const void *)(p + 0); | ||
| uint64_t x1 = *(const uint64_t *)(const void *)(p + 8); | ||
| x0 ^= (uint64_t)crc; | ||
| p += 16; | ||
| n -= 16; | ||
|
|
||
| const uint64_t C1 = RV_CRC32_CONST_R3; | ||
| const uint64_t C2 = RV_CRC32_CONST_R4; | ||
|
|
||
| while (likely(n >= 16)) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. and explain this is going through 16 bytes of aligned data |
||
| uint64_t tL = rv_clmul(C2, x1); | ||
| uint64_t tH = rv_clmulh(C2, x1); | ||
| uint64_t yL = rv_clmul(C1, x0); | ||
| uint64_t yH = rv_clmulh(C1, x0); | ||
| x0 = yL ^ tL; | ||
| x1 = yH ^ tH; | ||
|
|
||
| uint64_t d0 = *(const uint64_t *)(const void *)(p + 0); | ||
| uint64_t d1 = *(const uint64_t *)(const void *)(p + 8); | ||
| x0 ^= d0; | ||
| x1 ^= d1; | ||
| p += 16; | ||
| n -= 16; | ||
| } | ||
|
|
||
| { | ||
| uint64_t tH = rv_clmulh(x0, C2); | ||
| uint64_t tL = rv_clmul(x0, C2); | ||
| x0 = x1 ^ tL; | ||
| x1 = tH; | ||
| } | ||
|
|
||
| uint64_t hi = x1; | ||
| uint64_t lo = x0; | ||
| uint64_t t2 = (lo >> 32) | (hi << 32); | ||
| lo &= RV_CRC32_MASK32; | ||
|
|
||
| lo = rv_clmul(RV_CRC32_CONST_R5, lo) ^ t2; | ||
| uint64_t tmp = lo; | ||
| lo &= RV_CRC32_MASK32; | ||
| lo = rv_clmul(lo, RV_CRC32_CONST_RU); | ||
| lo &= RV_CRC32_MASK32; | ||
| lo = rv_clmul(lo, RV_CRC32_POLY_TRUE_LE_FULL) ^ tmp; | ||
|
|
||
| uint32_t c = (uint32_t)(lo >> 32); | ||
|
|
||
| if (n) { | ||
| c = rv_crc32_zlib_bitwise(c, p, n); | ||
| } | ||
| return c; | ||
| } | ||
|
|
||
/**
 * CRC32 (zlib polynomial) over up to three equally sized, consecutive
 * blocks, using the RISC-V Zbc carry-less multiply implementation.
 *
 * The blocks are laid out back to back starting at p_buf; block k (k = 0..2)
 * starts at p_buf + k * block_size and updates the k-th checksum slot. Each
 * block is checksummed independently, one after the other.
 *
 *   crc1, crc2, crc3 : Store initial checksum for each block before
 *                      calling. When it returns, updated checksums are
 *                      stored. Only the slots for blocks that are actually
 *                      processed are read or written.
 *   p_buf            : The base address of the data buffer. The buffer
 *                      should be at least as big as block_size * num_blocks.
 *   block_size       : The size of each block in bytes.
 *   num_blocks       : The number of blocks to work on, 0 to 3. 0 is a
 *                      no-op. Any other value trips an assert(); in builds
 *                      compiled with NDEBUG the assert is compiled out and
 *                      the call silently does nothing.
 */
static void pipelined_crc32_zlib(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3,
                                 const uint8_t *p_buf, size_t block_size,
                                 int num_blocks) {
  // Block addresses are formed inside each case so that no pointer past the
  // caller's buffer is computed when fewer than three blocks are supplied.
  switch (num_blocks) {
    case 3:
      *crc3 = rv_crc32_zlib_clmul(*crc3, p_buf + 2 * block_size, block_size);
      // fall through
    case 2:
      *crc2 = rv_crc32_zlib_clmul(*crc2, p_buf + block_size, block_size);
      // fall through
    case 1:
      *crc1 = rv_crc32_zlib_clmul(*crc1, p_buf, block_size);
      break;
    case 0:
      return;
    default:
      assert(0 && "BUG: Invalid number of checksum blocks");
  }
}
|
|
||
| #include "bulk_crc32.h" | ||
| #include "gcc_optimizations.h" | ||
| #endif // __riscv && __riscv_xlen==64 | ||
|
|
||
/**
 * On library load, determine what sort of crc we are going to do
 * and set crc function pointers appropriately.
 *
 * On 64-bit RISC-V this scans /proc/cpuinfo for a line that contains both
 * a key ("isa" or "extensions") and the substring "zbc". If such a line
 * exists, pipelined_crc32_zlib_func is pointed at the Zbc implementation.
 * If the file cannot be opened, or no such line is found, the pointer is
 * left untouched. On every other target this function does nothing.
 *
 * ISA strings can be longer than the read buffer, so a line is processed in
 * chunks: the key/"zbc" matches are accumulated across all chunks of one
 * logical line, and a short overlap from the end of each chunk is carried
 * into the next one so a token straddling a chunk boundary is still seen.
 */
void __attribute__((constructor)) init_cpu_support_flag(void) {
#if defined(__riscv) && (__riscv_xlen == 64)
  FILE *f = fopen("/proc/cpuinfo", "r");
  if (!f) {
    return;  // cannot probe: keep whatever implementation is already set
  }

  enum { OVERLAP = 9 };  // strlen("extensions") - 1, the longest token - 1
  char chunk[256];
  size_t kept = 0;       // bytes carried over from the previous chunk
  int key_seen = 0;      // "isa"/"extensions" seen on the current line
  int zbc_seen = 0;      // "zbc" seen on the current line
  int has_zbc = 0;

  while (fgets(chunk + kept, (int)(sizeof(chunk) - kept), f)) {
    size_t used = strlen(chunk);
    int line_done = (used > 0 && chunk[used - 1] == '\n');

    if (strstr(chunk, "isa") || strstr(chunk, "extensions")) {
      key_seen = 1;
    }
    if (strstr(chunk, "zbc")) {
      zbc_seen = 1;
    }
    if (key_seen && zbc_seen) {
      has_zbc = 1;
      break;
    }

    if (line_done) {
      // Next read starts a fresh logical line.
      key_seen = 0;
      zbc_seen = 0;
      kept = 0;
    } else {
      // Line continues: keep the tail so split tokens can still match.
      kept = used < (size_t)OVERLAP ? used : (size_t)OVERLAP;
      memmove(chunk, chunk + used - kept, kept);
    }
  }
  fclose(f);

  if (has_zbc) {
    pipelined_crc32_zlib_func = pipelined_crc32_zlib;
  }
#endif
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Explain what these are. Assuming they're all defined in the CRC spec, say so.