Skip to content

Commit f28eb43

Browse files
authored
Implementation of atol using SWAR of 128 bits (#83)
* 128 bit Lemire, atol draft * Introduces atol test case as requested in PR review * Completes addressing PR review lgtm, merging --sbruce
1 parent b80783c commit f28eb43

File tree

8 files changed

+198
-44
lines changed

8 files changed

+198
-44
lines changed

benchmark/CMakeLists.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ macro(set_xcode_properties TARGET_NAME)
2121
set_target_properties(${TARGET_NAME} PROPERTIES
2222
XCODE_ATTRIBUTE_ENABLE_AVX YES
2323
XCODE_ATTRIBUTE_ENABLE_AVX2 YES
24-
XCODE_ATTRIBUTE_OTHER_CPLUSPLUSFLAGS "-mavx -mavx2"
25-
XCODE_ATTRIBUTE_OTHER_CFLAGS "-mavx -mavx2"
24+
XCODE_ATTRIBUTE_OTHER_CPLUSPLUSFLAGS "-mavx -mavx2 -mbmi2"
25+
XCODE_ATTRIBUTE_OTHER_CFLAGS "-mavx -mavx2 -mbmi2"
2626
)
2727
endif()
2828
endmacro()
@@ -44,7 +44,7 @@ add_subdirectory(dependencies/google_benchmark)
4444
add_executable(
4545
catch2Benchmark
4646
catch2BenchmarkMain.cpp catch2Functions.cpp catch2swar-demo.cpp
47-
atoi.cpp
47+
atoi.cpp atoi-catch2-tests.cpp
4848
egyptian.cpp
4949
# RobinHood.benchmark.cpp
5050
)

benchmark/atoi-catch2-tests.cpp

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#include "catch2/catch.hpp"
2+
3+
#include "zoo/swar/SWAR.h"
4+
5+
uint64_t calculateBase10(zoo::swar::SWAR<8, __uint128_t>) noexcept;
6+
namespace zoo {
7+
int64_t c_strToL(const char *str) noexcept;
8+
}
9+
10+
template<int RequiredAlignment>
11+
void requireAlignment(const void *p) {
12+
std::uintptr_t asNumber = reinterpret_cast<std::uintptr_t>(p);
13+
REQUIRE(asNumber % RequiredAlignment == 0);
14+
}
15+
16+
TEST_CASE("Calculate Base10", "[pure-test][swar][atoi]") {
17+
alignas(16) char inputString[] = "1234567898765432";
18+
requireAlignment<16>(inputString);
19+
using S = zoo::swar::SWAR<8, __uint128_t>;
20+
S input;
21+
memcpy(&input.m_v, inputString, 16);
22+
constexpr S CharZeros{zoo::meta::BitmaskMaker<__uint128_t, '0', 8>::value};
23+
input = input - CharZeros;
24+
25+
constexpr auto expected = __int128_t(12345678) * 100'000'000 + 98765432;
26+
auto calculated = calculateBase10(input);
27+
REQUIRE(expected == calculated);
28+
}
29+
30+
TEST_CASE("Atol", "[pure-test][swar][atoi]") {
31+
// Because of the code sharing between the implementation of atoi and atol,
32+
// and because atoi has been thoroughly tested in the benchmarking,
33+
// there isn't much to test of atol, other than being able to deal with
34+
// numbers higher than representable with an integer and 16-byte
35+
// misalignment.
36+
alignas(16) char inputString[] = "0123456789-\f9123456789987654321";
37+
// notice 91...99...1 is very close to 2^63 - 1, max for 64 signed
38+
// log2(9123456789987654321) ~= 62.984
39+
requireAlignment<16>(inputString);
40+
// Notice the second number, 91...99...1 is misaligned by 12 bytes
41+
SECTION("Aligned atol") {
42+
auto expected = 123456789;
43+
auto converted = zoo::c_strToL(inputString);
44+
REQUIRE(converted == expected);
45+
}
46+
SECTION("Misaligned atol") {
47+
auto expected = 9123456789987654321ll;
48+
auto secondNumber = inputString + 11;
49+
REQUIRE(std::string("\f9123456789987654321") == secondNumber);
50+
auto converted = zoo::c_strToL(secondNumber);
51+
REQUIRE(converted == expected);
52+
}
53+
}

benchmark/atoi-corpus.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,9 @@ struct CorpusAtoi {
267267
}
268268
int number = exp(logBase10 * M_LN10);
269269
auto n = sprintf(conversionBuffer, "%d%c", number, postNumber(generator));
270-
if(n < 0) { throw 0; }
270+
if(n < 0) {
271+
throw 0;
272+
}
271273
allCharacters.append(conversionBuffer);
272274
sizes.push_back(count + negativeSign + iz + n);
273275
consumeStrPtr(allCharacters.c_str() + currentLength, count + negativeSign + iz + n);
@@ -302,7 +304,9 @@ struct CorpusAtoi {
302304
};
303305

304306
#define ATOI_CORPUS_X_LIST \
305-
X(GLIBC_atoi, atoi) X(ZOO_ATOI, zoo::c_strToI) X(COMPARE_ATOI, zoo::compareAtoi)
307+
X(GLIBC_atoi, atoi) X(ZOO_ATOI, zoo::c_strToI)\
308+
X(COMPARE_ATOI, zoo::compareAtoi) \
309+
X(COMPARE_ATOL, zoo::compareAtol)
306310

307311
#define X(Typename, FunctionToCall) \
308312
struct Invoke##Typename { int operator()(const char *p) { return FunctionToCall(p); } };

benchmark/atoi.cpp

Lines changed: 73 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include "atoi.h"
2+
#include "atoi_impl.h"
23

34
#include "zoo/swar/associative_iteration.h"
45

@@ -53,6 +54,21 @@ uint32_t calculateBase10(zoo::swar::SWAR<8, uint64_t> convertedToIntegers) noexc
5354
return uint32_t(by10001base2to32.value() >> 32);
5455
}
5556

57+
uint64_t calculateBase10(zoo::swar::SWAR<8, __uint128_t> convertedToIntegers) noexcept {
58+
auto by11base256 = convertedToIntegers.multiply(256*10 + 1);
59+
auto bytePairs = zoo::swar::doublePrecision(by11base256).odd;
60+
auto by101base2to16 = bytePairs.multiply(1 + (100 << 16));
61+
auto byteQuads = zoo::swar::doublePrecision(by101base2to16).odd;
62+
auto by10001base2to32 = byteQuads.multiply(1 + (10000ull << 32));
63+
// Now, truly work with 128 bits: combine two 32 bit results, each
64+
// corresponding to 8 bytes of inputs, into the 64 bit result by
65+
// scaling one by 10^8
66+
auto byteOcts = zoo::swar::doublePrecision(by10001base2to32).odd;
67+
auto byHundredMillionBase2to64 =
68+
byteOcts.multiply(1 + (__uint128_t(100'000'000) << 64));
69+
return uint64_t(byHundredMillionBase2to64.value() >> 64);
70+
}
71+
5672
// Note: eight digits can represent from 0 to (10^8) - 1, the logarithm base 2
5773
// of 10^8 is slightly less than 27, thus, only 27 bits are needed.
5874
uint32_t lemire_as_zoo_swar(const char *chars) noexcept {
@@ -104,25 +120,6 @@ std::size_t leadingSpacesCountAligned(S bytes) noexcept {
104120
return rv;
105121
}
106122

107-
/// @brief Loads the "block" containing the pointer, by proper alignment
108-
/// @tparam PtrT Pointer type for loading
109-
/// @tparam Block as the name indicates
110-
/// @param pointerInsideBlock the potentially misaligned pointer
111-
/// @param b where the loaded bytes will be put
112-
/// @return a pair to indicate the aligned pointer to the base of the block
113-
/// and the misalignment, in bytes, of the source pointer
114-
template<typename PtrT, typename Block>
115-
std::tuple<PtrT *, int>
116-
blockAlignedLoad(PtrT *pointerInsideBlock, Block *b) {
117-
uintptr_t asUint = reinterpret_cast<uintptr_t>(pointerInsideBlock);
118-
constexpr auto Alignment = alignof(Block), Size = sizeof(Block);
119-
static_assert(Alignment == Size);
120-
auto misalignment = asUint % Alignment;
121-
auto *base = reinterpret_cast<PtrT *>(asUint - misalignment);
122-
memcpy(b, base, Size);
123-
return { base, misalignment };
124-
}
125-
126123
std::size_t leadingSpacesCount(const char *p) noexcept {
127124
using S = swar::SWAR<8, uint64_t>;
128125
S bytes;
@@ -173,11 +170,33 @@ auto leadingDigitsCount(const char *p) noexcept {
173170
}
174171
}
175172

176-
int c_strToI(const char *str) noexcept {
177-
constexpr static std::array<int, 8> LastFactor = {
178-
1, 10, 100, 1000,
179-
10'000, 100'000, 1000'000, 10'000'000
180-
};
173+
namespace impl {
174+
175+
template<typename> struct ConversionTraits;
176+
template<> struct ConversionTraits<int32_t>{
177+
constexpr static auto NPositions = 9; // from 10^0 to 10^8
178+
using PowersOf10Array = std::array<int32_t, NPositions>;
179+
using DoublePrecision = uint64_t;
180+
};
181+
template<> struct ConversionTraits<int64_t>{
182+
constexpr static auto NPositions = 17; // from 10^0 to 10^16
183+
using PowersOf10Array = std::array<int64_t, NPositions>;
184+
using DoublePrecision = __uint128_t;
185+
};
186+
187+
template<typename Result>
188+
auto PowersOf10Array() {
189+
using Traits = ConversionTraits<Result>;
190+
typename Traits::PowersOf10Array rv{1};
191+
for (std::size_t i = 1; i < Traits::NPositions; ++i) {
192+
rv[i] = rv[i - 1] * 10;
193+
}
194+
return rv;
195+
};
196+
197+
template<typename Return>
198+
Return c_strToIntegral(const char *str) noexcept {
199+
auto LastFactor = PowersOf10Array<Return>();
181200
auto leadingSpaces = leadingSpacesCount(str);
182201
auto s = str + leadingSpaces;
183202
auto sign = 1;
@@ -187,14 +206,19 @@ int c_strToI(const char *str) noexcept {
187206
case '+': ++s; break;
188207
default: ;
189208
}
190-
using S = swar::SWAR<8, uint64_t>;
209+
210+
using SWAR_BaseType = typename ConversionTraits<Return>::DoublePrecision;
211+
constexpr auto
212+
NBytes = sizeof(SWAR_BaseType),
213+
NBitsPerByte = 8ul; // 8 bits per byte
214+
using S = swar::SWAR<NBitsPerByte, SWAR_BaseType>;
191215
S bytes;
192216
auto [base, misalignment] = blockAlignedLoad(s, &bytes.m_v);
193-
auto bitDisplacement = 8 * misalignment;
217+
auto bitDisplacement = NBitsPerByte * misalignment;
194218
constexpr static S
195-
AllZeroCharacter{meta::BitmaskMaker<uint64_t, '0', 8>::value},
219+
AllZeroCharacter{meta::BitmaskMaker<SWAR_BaseType, '0', NBitsPerByte>::value},
196220
AllOn = ~S{0};
197-
// blit the zero-characters to the misaligned part
221+
198222
auto mask = S{AllOn.value() << bitDisplacement};
199223
auto misalignedEliminated = bytes & mask;
200224
auto zeroCharactersIntroduced = AllZeroCharacter & ~mask;
@@ -210,23 +234,35 @@ int c_strToI(const char *str) noexcept {
210234
auto nonDigitIndex = nonDigits.lsbIndex();
211235
auto asIntegers = bytes - AllZeroCharacter; // upper lanes garbage
212236
auto integersInHighLanes =
213-
// allow complete clearing of the 8 bytes by doing 2 shifts,
214-
// since it is UB to shift 64 bits.
215-
asIntegers.shiftLanesLeft(7 - nonDigitIndex).shiftLanesLeft(1);
237+
// split the shift in two steps because if nonDigitIndex is
238+
// zero, then you'd shift all bits, this would result in U.B.
239+
// for a single shift
240+
asIntegers.shiftLanesLeft(NBytes - 1 - nonDigitIndex)
241+
.shiftLanesLeft(1);
216242
auto inBase10 = calculateBase10(integersInHighLanes);
217243
auto scaledAccumulator = accumulator * LastFactor[nonDigitIndex];
218-
return int((scaledAccumulator + inBase10) * sign);
244+
return Return((scaledAccumulator + inBase10) * sign);
219245
}
220-
// all 8 bytes are digits
246+
// all bytes are digits
221247
auto asIntegers = bytes - AllZeroCharacter;
222-
accumulator *= 100'000'000;
248+
accumulator *= LastFactor.back();
223249
auto inBase10 = calculateBase10(asIntegers);
224250
accumulator += inBase10;
225-
base += 8;
226-
memcpy(&bytes.m_v, base, 8);
251+
base += NBytes;
252+
memcpy(&bytes.m_v, base, NBytes);
227253
}
228254
}
229255

256+
}
257+
258+
int c_strToI(const char *str) noexcept {
259+
return impl::c_strToIntegral<int>(str);
260+
}
261+
262+
int64_t c_strToL(const char *str) noexcept {
263+
return impl::c_strToIntegral<int64_t>(str);
264+
}
265+
230266
/// \brief Helper function to fix the non-string part of block
231267
template<typename S>
232268
S adjustMisalignmentFor_strlen(S data, int misalignment) {
@@ -252,7 +288,7 @@ std::size_t c_strLength(const char *s) {
252288

253289
auto indexOfFirstTrue = [](auto bs) { return bs.lsbIndex(); };
254290

255-
// Misalignment must be taken into account because a SWAR read is
291+
// Misalignment must be taken into account because a SWAR read is
256292
// speculative, it might read bytes outside of the actual string.
257293
// It is safe to read within the page where the string occurs, and to
258294
// guarantee that, simply make aligned reads because the size of the SWAR

benchmark/atoi.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ std::size_t leadingSpacesCount(const char *) noexcept;
1818
std::size_t c_strLength(const char *s);
1919
std::size_t c_strLength_natural(const char *s);
2020
int32_t c_strToI(const char *) noexcept;
21+
int64_t c_strToL(const char *) noexcept;
2122

2223
inline int compareAtoi(const char *s) {
2324
auto
@@ -27,6 +28,16 @@ inline int compareAtoi(const char *s) {
2728
return from_stdlib;
2829
}
2930

31+
inline int compareAtol(const char *s) {
32+
auto
33+
from_stdlib = atoll(s),
34+
from_zoo = c_strToL(s);
35+
if(from_stdlib != from_zoo) {
36+
auto recalc = c_strToL(s);
37+
throw 0; }
38+
return from_stdlib;
39+
}
40+
3041
#if ZOO_CONFIGURED_TO_USE_AVX()
3142
std::size_t avx2_strlen(const char* str);
3243
#endif

benchmark/atoi_impl.h

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#include "zoo/swar/SWAR.h"
2+
3+
#include <tuple>
4+
#include <string.h>
5+
6+
uint64_t calculateBase10(
7+
zoo::swar::SWAR<8, __uint128_t> convertedToIntegers
8+
) noexcept;
9+
10+
uint32_t calculateBase10(
11+
zoo::swar::SWAR<8, uint64_t> convertedToIntegers
12+
) noexcept;
13+
14+
namespace zoo {
15+
16+
/// @brief Loads the "block" containing the pointer, by proper alignment
17+
/// @tparam PtrT Pointer type for loading
18+
/// @tparam Block as the name indicates
19+
/// @param pointerInsideBlock the potentially misaligned pointer
20+
/// @param b where the loaded bytes will be put
21+
/// @return a pair to indicate the aligned pointer to the base of the block
22+
/// and the misalignment, in bytes, of the source pointer
23+
/// \note The misalignment is in the range [ 0, sizeof(Block) [
24+
template<typename PtrT, typename Block>
25+
std::tuple<PtrT *, int>
26+
blockAlignedLoad(PtrT *pointerInsideBlock, Block *b) {
27+
uintptr_t asUint = reinterpret_cast<uintptr_t>(pointerInsideBlock);
28+
constexpr auto Alignment = alignof(Block), Size = sizeof(Block);
29+
static_assert(Alignment == Size);
30+
auto misalignment = asUint % Alignment;
31+
auto *base = reinterpret_cast<PtrT *>(asUint - misalignment);
32+
memcpy(b, base, Size);
33+
return { base, misalignment };
34+
}
35+
36+
}

benchmark/catch2swar-demo.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ TEST_CASE("Atoi benchmarks", "[atoi][swar]") {
106106
#undef X
107107
}
108108

109-
TEST_CASE("Atoi correctness", "[swar][atoi]") {
109+
TEST_CASE("Atoi correctness", "[pure-test][swar][atoi]") {
110110
auto empty = "";
111111
REQUIRE(0 == zoo::c_strToI(empty));
112112
alignas(8) constexpr char EmptyMisaligned[8] = { 'Q', '\0', '0', '1', '2', '3', '9', '\0' };
@@ -133,4 +133,4 @@ TEST_CASE("Atoi correctness", "[swar][atoi]") {
133133
sprintf(buffer, " %d", randomNumber);
134134
auto glibc = atoi(buffer);
135135
REQUIRE(zoo::c_strToI(buffer) == glibc);
136-
}
136+
}

inc/zoo/swar/SWAR.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,14 @@ constexpr std::make_unsigned_t<T> msbIndex(T v) noexcept {
3434
}
3535

3636
/// Index into the bits of the type T that contains the LSB.
37+
///
38+
/// \todo incorporate __builtin_ctzg when it is more widely available
3739
template<typename T>
3840
constexpr std::make_unsigned_t<T> lsbIndex(T v) noexcept {
41+
// This check should be SFINAE, but supporting all sorts
42+
// of base types is an ongoing task, we put a bare-minimum
43+
// temporary preventive measure with static_assert
44+
static_assert(sizeof(T) <= 8, "Unsupported");
3945
#ifdef _MSC_VER
4046
// ~v & (v - 1) turns on all trailing zeroes, zeroes the rest
4147
return meta::logFloor(1 + (~v & (v - 1)));
@@ -44,6 +50,14 @@ constexpr std::make_unsigned_t<T> lsbIndex(T v) noexcept {
4450
#endif
4551
}
4652

53+
#ifndef _MSC_VER
54+
constexpr __uint128_t lsbIndex(__uint128_t v) noexcept {
55+
auto low = (v << 64) >> 64;
56+
if(low) { return __builtin_ctzll(low); }
57+
return 64 + __builtin_ctzll(v >> 64);
58+
}
59+
#endif
60+
4761
/// Core abstraction around SIMD Within A Register (SWAR). Specifies 'lanes'
4862
/// of NBits width against a type T, and provides an abstraction for performing
4963
/// SIMD operations against that primitive type T treated as a SIMD register.

0 commit comments

Comments
 (0)