-
-
Notifications
You must be signed in to change notification settings - Fork 106
Closed
Labels
help wantedExtra attention is neededExtra attention is needed
Description
All find_next_host_delimiter_special
and find_next_host_delimiter
are doing is a simple search which could've been done using a lookup table.
Lines 203 to 325 in a9dd0d4
ada_really_inline size_t find_next_host_delimiter_special( | |
std::string_view view, size_t location) noexcept { | |
// performance: if you plan to call find_next_host_delimiter more than once, | |
// you *really* want find_next_host_delimiter to be inlined, because | |
// otherwise, the constants may get reloaded each time (bad). | |
auto has_zero_byte = [](uint64_t v) { | |
return ((v - 0x0101010101010101) & ~(v)&0x8080808080808080); | |
}; | |
auto index_of_first_set_byte = [](uint64_t v) { | |
return ((((v - 1) & 0x101010101010101) * 0x101010101010101) >> 56) - 1; | |
}; | |
auto broadcast = [](uint8_t v) -> uint64_t { | |
return 0x101010101010101ull * v; | |
}; | |
size_t i = location; | |
uint64_t mask1 = broadcast(':'); | |
uint64_t mask2 = broadcast('/'); | |
uint64_t mask3 = broadcast('\\'); | |
uint64_t mask4 = broadcast('?'); | |
uint64_t mask5 = broadcast('['); | |
// This loop will get autovectorized under many optimizing compilers, | |
// so you get actually SIMD! | |
for (; i + 7 < view.size(); i += 8) { | |
uint64_t word{}; | |
// performance: the next memcpy translates into a single CPU instruction. | |
memcpy(&word, view.data() + i, sizeof(word)); | |
// performance: on little-endian systems (most systems), this next line is | |
// free. | |
word = swap_bytes_if_big_endian(word); | |
uint64_t xor1 = word ^ mask1; | |
uint64_t xor2 = word ^ mask2; | |
uint64_t xor3 = word ^ mask3; | |
uint64_t xor4 = word ^ mask4; | |
uint64_t xor5 = word ^ mask5; | |
uint64_t is_match = has_zero_byte(xor1) | has_zero_byte(xor2) | | |
has_zero_byte(xor3) | has_zero_byte(xor4) | | |
has_zero_byte(xor5); | |
if (is_match) { | |
return size_t(i + index_of_first_set_byte(is_match)); | |
} | |
} | |
if (i < view.size()) { | |
uint64_t word{}; | |
// performance: the next memcpy translates into a function call, but | |
// that is difficult to avoid. Might be a bit expensive. | |
memcpy(&word, view.data() + i, view.size() - i); | |
word = swap_bytes_if_big_endian(word); | |
uint64_t xor1 = word ^ mask1; | |
uint64_t xor2 = word ^ mask2; | |
uint64_t xor3 = word ^ mask3; | |
uint64_t xor4 = word ^ mask4; | |
uint64_t xor5 = word ^ mask5; | |
uint64_t is_match = has_zero_byte(xor1) | has_zero_byte(xor2) | | |
has_zero_byte(xor3) | has_zero_byte(xor4) | | |
has_zero_byte(xor5); | |
if (is_match) { | |
return size_t(i + index_of_first_set_byte(is_match)); | |
} | |
} | |
return view.size(); | |
} | |
// starting at index location, this finds the next location of a character | |
// :, /, ? or [. If none is found, view.size() is returned. | |
// For use within get_host_delimiter_location. | |
ada_really_inline size_t find_next_host_delimiter(std::string_view view, | |
size_t location) noexcept { | |
// performance: if you plan to call find_next_host_delimiter more than once, | |
// you *really* want find_next_host_delimiter to be inlined, because | |
// otherwise, the constants may get reloaded each time (bad). | |
auto has_zero_byte = [](uint64_t v) { | |
return ((v - 0x0101010101010101) & ~(v)&0x8080808080808080); | |
}; | |
auto index_of_first_set_byte = [](uint64_t v) { | |
return ((((v - 1) & 0x101010101010101) * 0x101010101010101) >> 56) - 1; | |
}; | |
auto broadcast = [](uint8_t v) -> uint64_t { | |
return 0x101010101010101ull * v; | |
}; | |
size_t i = location; | |
uint64_t mask1 = broadcast(':'); | |
uint64_t mask2 = broadcast('/'); | |
uint64_t mask4 = broadcast('?'); | |
uint64_t mask5 = broadcast('['); | |
// This loop will get autovectorized under many optimizing compilers, | |
// so you get actually SIMD! | |
for (; i + 7 < view.size(); i += 8) { | |
uint64_t word{}; | |
// performance: the next memcpy translates into a single CPU instruction. | |
memcpy(&word, view.data() + i, sizeof(word)); | |
// performance: on little-endian systems (most systems), this next line is | |
// free. | |
word = swap_bytes_if_big_endian(word); | |
uint64_t xor1 = word ^ mask1; | |
uint64_t xor2 = word ^ mask2; | |
uint64_t xor4 = word ^ mask4; | |
uint64_t xor5 = word ^ mask5; | |
uint64_t is_match = has_zero_byte(xor1) | has_zero_byte(xor2) | | |
has_zero_byte(xor4) | has_zero_byte(xor5); | |
if (is_match) { | |
return size_t(i + index_of_first_set_byte(is_match)); | |
} | |
} | |
if (i < view.size()) { | |
uint64_t word{}; | |
// performance: the next memcpy translates into a function call, but | |
// that is difficult to avoid. Might be a bit expensive. | |
memcpy(&word, view.data() + i, view.size() - i); | |
// performance: on little-endian systems (most systems), this next line is | |
// free. | |
word = swap_bytes_if_big_endian(word); | |
uint64_t xor1 = word ^ mask1; | |
uint64_t xor2 = word ^ mask2; | |
uint64_t xor4 = word ^ mask4; | |
uint64_t xor5 = word ^ mask5; | |
uint64_t is_match = has_zero_byte(xor1) | has_zero_byte(xor2) | | |
has_zero_byte(xor4) | has_zero_byte(xor5); | |
if (is_match) { | |
return size_t(i + index_of_first_set_byte(is_match)); | |
} | |
} | |
return view.size(); | |
} |
Trying to make it easy for the compiler to SIMDIfy it has backfired.
Ada's version assembly (which by the way, it won't vectorize until you specify -march=native
or a-likes)
Table Lookup generated assembly
Benchmarks
--------------------------------------------------------------------
Benchmark Time CPU Iterations
--------------------------------------------------------------------
AdaAutoVectorizedSolution 25.7 ns 25.7 ns 174780063
WebppSimpleCharsetSearch 28.3 ns 28.2 ns 148920655
WebppSimpleCharmapSearch 15.9 ns 15.9 ns 265948817
Is there a reason why you have gone into such a trouble of optimizing it this way? Am I missing something or is this a clear premature optimization situation?
Metadata
Metadata
Assignees
Labels
help wantedExtra attention is neededExtra attention is needed