
Questionable premature optimizations? #547

Closed
@the-moisrex

Description


All that find_next_host_delimiter_special and find_next_host_delimiter do is a simple search, which could have been done with a lookup table (see the sketch after the benchmarks below).

ada/src/helpers.cpp

Lines 203 to 325 in a9dd0d4

ada_really_inline size_t find_next_host_delimiter_special(
    std::string_view view, size_t location) noexcept {
  // performance: if you plan to call find_next_host_delimiter more than once,
  // you *really* want find_next_host_delimiter to be inlined, because
  // otherwise, the constants may get reloaded each time (bad).
  auto has_zero_byte = [](uint64_t v) {
    return ((v - 0x0101010101010101) & ~(v)&0x8080808080808080);
  };
  auto index_of_first_set_byte = [](uint64_t v) {
    return ((((v - 1) & 0x101010101010101) * 0x101010101010101) >> 56) - 1;
  };
  auto broadcast = [](uint8_t v) -> uint64_t {
    return 0x101010101010101ull * v;
  };
  size_t i = location;
  uint64_t mask1 = broadcast(':');
  uint64_t mask2 = broadcast('/');
  uint64_t mask3 = broadcast('\\');
  uint64_t mask4 = broadcast('?');
  uint64_t mask5 = broadcast('[');
  // This loop will get autovectorized under many optimizing compilers,
  // so you get actually SIMD!
  for (; i + 7 < view.size(); i += 8) {
    uint64_t word{};
    // performance: the next memcpy translates into a single CPU instruction.
    memcpy(&word, view.data() + i, sizeof(word));
    // performance: on little-endian systems (most systems), this next line is
    // free.
    word = swap_bytes_if_big_endian(word);
    uint64_t xor1 = word ^ mask1;
    uint64_t xor2 = word ^ mask2;
    uint64_t xor3 = word ^ mask3;
    uint64_t xor4 = word ^ mask4;
    uint64_t xor5 = word ^ mask5;
    uint64_t is_match = has_zero_byte(xor1) | has_zero_byte(xor2) |
                        has_zero_byte(xor3) | has_zero_byte(xor4) |
                        has_zero_byte(xor5);
    if (is_match) {
      return size_t(i + index_of_first_set_byte(is_match));
    }
  }
  if (i < view.size()) {
    uint64_t word{};
    // performance: the next memcpy translates into a function call, but
    // that is difficult to avoid. Might be a bit expensive.
    memcpy(&word, view.data() + i, view.size() - i);
    word = swap_bytes_if_big_endian(word);
    uint64_t xor1 = word ^ mask1;
    uint64_t xor2 = word ^ mask2;
    uint64_t xor3 = word ^ mask3;
    uint64_t xor4 = word ^ mask4;
    uint64_t xor5 = word ^ mask5;
    uint64_t is_match = has_zero_byte(xor1) | has_zero_byte(xor2) |
                        has_zero_byte(xor3) | has_zero_byte(xor4) |
                        has_zero_byte(xor5);
    if (is_match) {
      return size_t(i + index_of_first_set_byte(is_match));
    }
  }
  return view.size();
}

// starting at index location, this finds the next location of a character
// :, /, ? or [. If none is found, view.size() is returned.
// For use within get_host_delimiter_location.
ada_really_inline size_t find_next_host_delimiter(std::string_view view,
                                                  size_t location) noexcept {
  // performance: if you plan to call find_next_host_delimiter more than once,
  // you *really* want find_next_host_delimiter to be inlined, because
  // otherwise, the constants may get reloaded each time (bad).
  auto has_zero_byte = [](uint64_t v) {
    return ((v - 0x0101010101010101) & ~(v)&0x8080808080808080);
  };
  auto index_of_first_set_byte = [](uint64_t v) {
    return ((((v - 1) & 0x101010101010101) * 0x101010101010101) >> 56) - 1;
  };
  auto broadcast = [](uint8_t v) -> uint64_t {
    return 0x101010101010101ull * v;
  };
  size_t i = location;
  uint64_t mask1 = broadcast(':');
  uint64_t mask2 = broadcast('/');
  uint64_t mask4 = broadcast('?');
  uint64_t mask5 = broadcast('[');
  // This loop will get autovectorized under many optimizing compilers,
  // so you get actually SIMD!
  for (; i + 7 < view.size(); i += 8) {
    uint64_t word{};
    // performance: the next memcpy translates into a single CPU instruction.
    memcpy(&word, view.data() + i, sizeof(word));
    // performance: on little-endian systems (most systems), this next line is
    // free.
    word = swap_bytes_if_big_endian(word);
    uint64_t xor1 = word ^ mask1;
    uint64_t xor2 = word ^ mask2;
    uint64_t xor4 = word ^ mask4;
    uint64_t xor5 = word ^ mask5;
    uint64_t is_match = has_zero_byte(xor1) | has_zero_byte(xor2) |
                        has_zero_byte(xor4) | has_zero_byte(xor5);
    if (is_match) {
      return size_t(i + index_of_first_set_byte(is_match));
    }
  }
  if (i < view.size()) {
    uint64_t word{};
    // performance: the next memcpy translates into a function call, but
    // that is difficult to avoid. Might be a bit expensive.
    memcpy(&word, view.data() + i, view.size() - i);
    // performance: on little-endian systems (most systems), this next line is
    // free.
    word = swap_bytes_if_big_endian(word);
    uint64_t xor1 = word ^ mask1;
    uint64_t xor2 = word ^ mask2;
    uint64_t xor4 = word ^ mask4;
    uint64_t xor5 = word ^ mask5;
    uint64_t is_match = has_zero_byte(xor1) | has_zero_byte(xor2) |
                        has_zero_byte(xor4) | has_zero_byte(xor5);
    if (is_match) {
      return size_t(i + index_of_first_set_byte(is_match));
    }
  }
  return view.size();
}
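
For context, the excerpt leans on two classic SWAR (SIMD-within-a-register) tricks: XOR-ing the word with a broadcast delimiter zeroes exactly the matching bytes, has_zero_byte then sets the high bit of every zero byte, and index_of_first_set_byte converts that mask back into the byte index of the first match. A minimal, self-contained demonstration (the buffer and printout are mine, not from the ada source, and it assumes a little-endian machine, which is why the real code calls swap_bytes_if_big_endian):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  auto has_zero_byte = [](uint64_t v) {
    return ((v - 0x0101010101010101) & ~(v)&0x8080808080808080);
  };
  auto index_of_first_set_byte = [](uint64_t v) {
    return ((((v - 1) & 0x101010101010101) * 0x101010101010101) >> 56) - 1;
  };
  auto broadcast = [](uint8_t v) -> uint64_t {
    return 0x101010101010101ull * v;
  };
  // Eight input bytes with a ':' at index 4 (hypothetical test data).
  const char buf[8] = {'h', 'o', 's', 't', ':', '8', '0', '/'};
  uint64_t word;
  memcpy(&word, buf, sizeof(word));  // on little-endian, buf[0] is the low byte
  // Bytes equal to ':' become zero after the XOR, then get flagged as 0x80.
  uint64_t is_match = has_zero_byte(word ^ broadcast(':'));
  printf("%d\n", (int)index_of_first_set_byte(is_match));  // prints 4
}

The point of the construction is that all five delimiter comparisons run on eight input bytes at a time using ordinary 64-bit registers, even without vector instructions.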

Trying to make it easy for the compiler to SIMDify this has backfired.

Ada's version's assembly (which, by the way, won't vectorize unless you specify -march=native or the like)
Table Lookup generated assembly
Benchmarks

--------------------------------------------------------------------
Benchmark                          Time             CPU   Iterations
--------------------------------------------------------------------
AdaAutoVectorizedSolution       25.7 ns         25.7 ns    174780063
WebppSimpleCharsetSearch        28.3 ns         28.2 ns    148920655
WebppSimpleCharmapSearch        15.9 ns         15.9 ns    265948817
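
For reference, WebppSimpleCharmapSearch is essentially the lookup-table approach mentioned at the top: a 256-entry table indexed by the byte value, plus a plain loop. A minimal sketch (the actual webpp implementation differs in its details):

#include <array>
#include <cstddef>
#include <initializer_list>
#include <string_view>

// 256-entry table marking the special-host delimiters ':', '/', '\\', '?'
// and '['; built once at compile time.
static constexpr std::array<bool, 256> is_host_delimiter = [] {
  std::array<bool, 256> table{};
  for (char c : {':', '/', '\\', '?', '['}) {
    table[static_cast<unsigned char>(c)] = true;
  }
  return table;
}();

// Same contract as find_next_host_delimiter_special: returns the index of
// the next delimiter at or after `location`, or view.size() if none exists.
size_t find_next_host_delimiter_charmap(std::string_view view,
                                        size_t location) noexcept {
  for (size_t i = location; i < view.size(); ++i) {
    if (is_host_delimiter[static_cast<unsigned char>(view[i])]) {
      return i;
    }
  }
  return view.size();
}

The table is 256 bytes, there is no tail-handling code, and the compiler remains free to unroll or vectorize the loop however it sees fit.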

Is there a reason you have gone to such trouble optimizing it this way? Am I missing something, or is this a clear case of premature optimization?
