Skip to content

Optional SIMD strchrnul #594

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions libc-top-half/musl/src/string/strchrnul.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
#include <stdint.h>
#include <limits.h>

#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#endif

#define ALIGN (sizeof(size_t))
#define ONES ((size_t)-1/UCHAR_MAX)
#define HIGHS (ONES * (UCHAR_MAX/2+1))
Expand All @@ -12,6 +16,49 @@ char *__strchrnul(const char *s, int c)
c = (unsigned char)c;
if (!c) return (char *)s + strlen(s);

#if defined(__wasm_simd128__) && defined(__wasilibc_simd_string)
// Skip Clang 19 and Clang 20 which have a bug (llvm/llvm-project#146574)
// which results in an ICE when inline assembly is used with a vector result.
#if __clang_major__ != 19 && __clang_major__ != 20
// Note that reading before/after the allocation of a pointer is UB in
// C, so inline assembly is used to generate the exact machine
// instruction we want with opaque semantics to the compiler to avoid
// the UB.
uintptr_t align = (uintptr_t)s % sizeof(v128_t);
uintptr_t addr = (uintptr_t)s - align;
v128_t vc = wasm_i8x16_splat(c);

for (;;) {
v128_t v;
__asm__ (
"local.get %1\n"
"v128.load 0\n"
"local.set %0\n"
: "=r"(v)
: "r"(addr)
: "memory");
const v128_t cmp = wasm_i8x16_eq(v, (v128_t){}) | wasm_i8x16_eq(v, vc);
// Bitmask is slow on AArch64, any_true is much faster.
if (wasm_v128_any_true(cmp)) {
// Clear the bits corresponding to align (little-endian)
// so we can count trailing zeros.
int mask = wasm_i8x16_bitmask(cmp) >> align << align;
// At least one bit will be set, unless align cleared them.
// Knowing this helps the compiler if it unrolls the loop.
__builtin_assume(mask || align);
// If the mask became zero because of align,
// it's as if we didn't find anything.
if (mask) {
// Find the offset of the first one bit (little-endian).
return (char *)s + (addr - (uintptr_t)s + __builtin_ctz(mask));
}
}
align = 0;
addr += sizeof(v128_t);
}
#endif
#endif

#ifdef __GNUC__
typedef size_t __attribute__((__may_alias__)) word;
const word *w;
Expand Down
6 changes: 3 additions & 3 deletions test/src/misc/memchr.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ void test(char *ptr, size_t length, void *want) {
int main(void) {
char *const LIMIT = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE);

for (size_t length = 0; length < 64; length++) {
for (size_t alignment = 0; alignment < 24; alignment++) {
for (ptrdiff_t length = 0; length < 64; length++) {
for (ptrdiff_t alignment = 0; alignment < 24; alignment++) {
for (ptrdiff_t pos = -2; pos < length + 2; pos++) {
// Create a buffer with the given length, at a pointer with the given
// alignment. Using the offset LIMIT - PAGESIZE - 8 means many buffers
Expand All @@ -26,10 +26,10 @@ int main(void) {
char *ptr = LIMIT - PAGESIZE - 8 + alignment;
memset(LIMIT - 2 * PAGESIZE, 0, 2 * PAGESIZE);
memset(ptr, 5, length);
ptr[pos] = 7;

// The first instance of the character is found.
if (pos >= 0) ptr[pos + 2] = 7;
ptr[pos] = 7;

// The character is found if it's within range.
test(ptr, length, 0 <= pos && pos < length ? &ptr[pos] : NULL);
Expand Down
58 changes: 58 additions & 0 deletions test/src/misc/strchrnul.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
//! add-flags.py(LDFLAGS): -Wl,--stack-first -Wl,--initial-memory=327680

#define _GNU_SOURCE

#include <__macro_PAGESIZE.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

// Calls strchrnul(ptr, 7) and compares the returned pointer against `want`.
// A matching result is silent; a mismatch prints one diagnostic line to
// stdout showing the input pointer, the pointer returned, and the expectation.
void test(char *ptr, char *want) {
  char *const actual = strchrnul(ptr, 7);

  // Nothing to report when the result is the expected one.
  if (actual == want) {
    return;
  }

  printf("strchrnul(%p, 7) = %p, want %p\n", ptr, actual, want);
}

// Drives strchrnul over buffers of every length in [0, 64), at 24 different
// starting offsets near a page boundary, with the searched-for byte (7)
// placed at every position from two bytes before the buffer to one byte past
// its terminator. Afterwards, for each non-zero length, the same function is
// checked on a buffer that ends exactly at the end of memory.
int main(void) {
  // Current memory size in bytes, used as the address just past the end of
  // memory.
  char *const memory_end = (char *)(__builtin_wasm_memory_size(0) * PAGESIZE);

  for (ptrdiff_t length = 0; length < 64; length++) {
    for (ptrdiff_t alignment = 0; alignment < 24; alignment++) {
      for (ptrdiff_t pos = -2; pos < length + 2; pos++) {
        // Build a buffer of `length` bytes starting 8 bytes before the last
        // page plus `alignment`, so that many buffers straddle a (Wasm, and
        // likely OS) page boundary.
        char *buf = memory_end - PAGESIZE - 8 + alignment;

        // Reset the last two pages, then fill the buffer with a byte that is
        // neither the needle nor the terminator.
        memset(memory_end - 2 * PAGESIZE, 0, 2 * PAGESIZE);
        memset(buf, 5, length);

        // Plant a second needle two bytes after the first one so that the
        // test verifies the *first* instance is the one reported.
        if (pos >= 0) {
          buf[pos + 2] = 7;
        }
        buf[pos] = 7;
        // The terminator is written last, so it overrides any needle that
        // landed on buf[length].
        buf[length] = 0;

        // A needle inside [0, length) is found; otherwise the terminator
        // must be returned.
        char *want = &buf[length];
        if (0 <= pos && pos < length) {
          want = &buf[pos];
        }
        test(buf, want);
      }
    }

    // The end-of-memory case needs at least one byte to hold either the
    // needle or the terminator.
    if (length > 0) {
      // Place the buffer so that its last byte is the last byte of memory,
      // to make sure nothing is read past the end of memory.
      char *tail = memory_end - length;
      char *const last = &tail[length - 1];

      memset(memory_end - 2 * PAGESIZE, 0, 2 * PAGESIZE);
      memset(tail, 5, length);

      // Needle in the final byte.
      *last = 7;
      test(tail, last);

      // Terminator in the final byte.
      *last = 0;
      test(tail, last);
    }
  }

  return 0;
}