Implementation of atol using SWAR of 128 bits (#83)

thecppzoo · web-flow · commit f28eb4337976 · 2024-05-23T13:31:15.000-07:00
* 128 bit Lemire, atol draft

* Introduces atol test case as requested in PR review

* Completes addressing PR review

lgtm, merging --sbruce
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
@@ -21,8 +21,8 @@ macro(set_xcode_properties TARGET_NAME)
         set_target_properties(${TARGET_NAME} PROPERTIES
             XCODE_ATTRIBUTE_ENABLE_AVX YES
             XCODE_ATTRIBUTE_ENABLE_AVX2 YES
-            XCODE_ATTRIBUTE_OTHER_CPLUSPLUSFLAGS "-mavx -mavx2"
-            XCODE_ATTRIBUTE_OTHER_CFLAGS "-mavx -mavx2"
+            XCODE_ATTRIBUTE_OTHER_CPLUSPLUSFLAGS "-mavx -mavx2 -mbmi2"
+            XCODE_ATTRIBUTE_OTHER_CFLAGS "-mavx -mavx2 -mbmi2"
         )
     endif()
 endmacro()
@@ -44,7 +44,7 @@ add_subdirectory(dependencies/google_benchmark)
 add_executable(
   catch2Benchmark
     catch2BenchmarkMain.cpp catch2Functions.cpp catch2swar-demo.cpp
-    atoi.cpp
+    atoi.cpp atoi-catch2-tests.cpp
     egyptian.cpp
     # RobinHood.benchmark.cpp
 )
diff --git a/benchmark/atoi-catch2-tests.cpp b/benchmark/atoi-catch2-tests.cpp
@@ -0,0 +1,53 @@
+#include "catch2/catch.hpp"
+
+#include "zoo/swar/SWAR.h"
+
+uint64_t calculateBase10(zoo::swar::SWAR<8, __uint128_t>) noexcept;
+namespace zoo {
+int64_t c_strToL(const char *str) noexcept;
+}
+
+template<int RequiredAlignment> // the alignment to verify, in bytes
+void requireAlignment(const void *p) {
+    std::uintptr_t asNumber = reinterpret_cast<std::uintptr_t>(p); // address of p
+    REQUIRE(asNumber % RequiredAlignment == 0); // address must be a multiple
+}
+
+TEST_CASE("Calculate Base10", "[pure-test][swar][atoi]") {
+    // sixteen ASCII digits fill one 128 bit block; the terminator is not copied
+    using S = zoo::swar::SWAR<8, __uint128_t>;
+    constexpr S AsciiZeros{zoo::meta::BitmaskMaker<__uint128_t, '0', 8>::value};
+    alignas(16) char digits[] = "1234567898765432";
+    requireAlignment<16>(digits);
+    S block;
+    memcpy(&block.m_v, digits, 16);
+    const S asIntegers = block - AsciiZeros; // '0' is subtracted in each lane
+    constexpr auto expected = __int128_t(12345678) * 100'000'000 + 98765432;
+    auto calculated = calculateBase10(asIntegers);
+    REQUIRE(expected == calculated);
+}
+
+TEST_CASE("Atol", "[pure-test][swar][atoi]") {
+    // atol shares its implementation with atoi, which the benchmarks
+    // already exercise thoroughly; what remains to verify here is specific
+    // to atol: values beyond the range of int, and input that is not
+    // aligned to the 16-byte blocks that the 128 bit SWAR loads.
+    // Layout: "0123456789-" takes offsets 0 to 10, the form feed is at
+    // offset 11 and the digits of 9123456789987654321 begin at offset 12.
+    // That value is close to the maximum of a signed 64 bit integer,
+    // 2^63 - 1, since log2(9123456789987654321) ~= 62.984
+    alignas(16) char text[] = "0123456789-\f9123456789987654321";
+    requireAlignment<16>(text);
+    SECTION("Aligned atol") {
+        auto expected = 123456789;
+        auto converted = zoo::c_strToL(text);
+        REQUIRE(converted == expected);
+    }
+    SECTION("Misaligned atol") {
+        auto secondNumber = text + 11;
+        REQUIRE(std::string("\f9123456789987654321") == secondNumber);
+        auto expected = 9123456789987654321ll;
+        auto converted = zoo::c_strToL(secondNumber);
+        REQUIRE(converted == expected);
+    }
+}
diff --git a/benchmark/atoi-corpus.h b/benchmark/atoi-corpus.h
@@ -267,7 +267,9 @@ struct CorpusAtoi {
             }
             int number = exp(logBase10 * M_LN10);
             auto n = sprintf(conversionBuffer, "%d%c", number, postNumber(generator));
-            if(n < 0) { throw 0; }
+            if(n < 0) {
+                throw 0;
+            }
             allCharacters.append(conversionBuffer);
             sizes.push_back(count + negativeSign + iz + n);
             consumeStrPtr(allCharacters.c_str() + currentLength, count + negativeSign + iz + n);
@@ -302,7 +304,9 @@ struct CorpusAtoi {
 };
 
 #define ATOI_CORPUS_X_LIST \
-    X(GLIBC_atoi, atoi) X(ZOO_ATOI, zoo::c_strToI) X(COMPARE_ATOI, zoo::compareAtoi)
+    X(GLIBC_atoi, atoi) X(ZOO_ATOI, zoo::c_strToI)\
+    X(COMPARE_ATOI, zoo::compareAtoi) \
+    X(COMPARE_ATOL, zoo::compareAtol)
 
 #define X(Typename, FunctionToCall) \
     struct Invoke##Typename { int operator()(const char *p) { return FunctionToCall(p); } };
diff --git a/benchmark/atoi.cpp b/benchmark/atoi.cpp
@@ -1,4 +1,5 @@
 #include "atoi.h"
+#include "atoi_impl.h"
 
 #include "zoo/swar/associative_iteration.h"
 
@@ -53,6 +54,21 @@ uint32_t calculateBase10(zoo::swar::SWAR<8, uint64_t> convertedToIntegers) noexc
     return uint32_t(by10001base2to32.value() >> 32);
 }
 
+uint64_t calculateBase10(zoo::swar::SWAR<8, __uint128_t> convertedToIntegers) noexcept {
+    auto by11base256 = convertedToIntegers.multiply(256*10 + 1); // 10 * 2^8 + 1
+    auto bytePairs = zoo::swar::doublePrecision(by11base256).odd;
+    auto by101base2to16 = bytePairs.multiply(1 + (100 << 16)); // 100 * 2^16 + 1
+    auto byteQuads = zoo::swar::doublePrecision(by101base2to16).odd;
+    auto by10001base2to32 = byteQuads.multiply(1 + (10000ull << 32)); // 10^4 * 2^32 + 1
+    // Now, truly work with 128 bits: combine two 32 bit results, each
+    // corresponding to 8 bytes of input, into the 64 bit result by
+    // scaling one of them by 10^8
+    auto byteOcts = zoo::swar::doublePrecision(by10001base2to32).odd;
+    auto byHundredMillionBase2to64 =
+        byteOcts.multiply(1 + (__uint128_t(100'000'000) << 64)); // 10^8 * 2^64 + 1
+    return uint64_t(byHundredMillionBase2to64.value() >> 64); // the upper 64 bits
+}
+
 // Note: eight digits can represent from 0 to (10^9) - 1, the logarithm base 2
 // of 10^9 is slightly less than 30, thus, only 30 bits are needed.
 uint32_t lemire_as_zoo_swar(const char *chars) noexcept {
@@ -104,25 +120,6 @@ std::size_t leadingSpacesCountAligned(S bytes) noexcept {
     return rv;
 }
 
-/// @brief Loads the "block" containing the pointer, by proper alignment
-/// @tparam PtrT Pointer type for loading
-/// @tparam Block as the name indicates
-/// @param pointerInsideBlock the potentially misaligned pointer
-/// @param b where the loaded bytes will be put
-/// @return a pair to indicate the aligned pointer to the base of the block
-/// and the misalignment, in bytes, of the source pointer
-template<typename PtrT, typename Block>
-std::tuple<PtrT *, int>
-blockAlignedLoad(PtrT *pointerInsideBlock, Block *b) {
-    uintptr_t asUint = reinterpret_cast<uintptr_t>(pointerInsideBlock);
-    constexpr auto Alignment = alignof(Block), Size = sizeof(Block);
-    static_assert(Alignment == Size);
-    auto misalignment = asUint % Alignment;
-    auto *base = reinterpret_cast<PtrT *>(asUint - misalignment);
-    memcpy(b, base, Size);
-    return { base, misalignment };
-}
-
 std::size_t leadingSpacesCount(const char *p) noexcept {
     using S = swar::SWAR<8, uint64_t>;
     S bytes;
@@ -173,11 +170,33 @@ auto leadingDigitsCount(const char *p) noexcept {
     }
 }
 
-int c_strToI(const char *str) noexcept {
-    constexpr static std::array<int, 8> LastFactor = {
-        1, 10, 100, 1000,
-        10'000, 100'000, 1000'000, 10'000'000
-    };
+namespace impl {
+
+template<typename> struct ConversionTraits; // specialized per result type
+template<> struct ConversionTraits<int32_t>{
+    constexpr static auto NPositions = 9; // from 10^0 to 10^8
+    using PowersOf10Array = std::array<int32_t, NPositions>;
+    using DoublePrecision = uint64_t; // base type of the SWAR block: 8 bytes
+};
+template<> struct ConversionTraits<int64_t>{
+    constexpr static auto NPositions = 17; // from 10^0 to 10^16
+    using PowersOf10Array = std::array<int64_t, NPositions>;
+    using DoublePrecision = __uint128_t; // base type of the SWAR block: 16 bytes
+};
+
+/// Table of the powers of ten, 10^0 .. 10^(NPositions - 1), for Result
+template<typename Result>
+constexpr auto PowersOf10Array() {
+    typename ConversionTraits<Result>::PowersOf10Array rv{1}; // rv[0] = 10^0
+    for (std::size_t i = 1; i < rv.size(); ++i) {
+        rv[i] = rv[i - 1] * 10; // constexpr: foldable at compile time
+    }
+    return rv;
+}
+
+template<typename Return>
+Return c_strToIntegral(const char *str) noexcept {
+    auto LastFactor = PowersOf10Array<Return>();
     auto leadingSpaces = leadingSpacesCount(str);
     auto s = str + leadingSpaces;
     auto sign = 1;
@@ -187,14 +206,19 @@ int c_strToI(const char *str) noexcept {
         case '+': ++s; break;
         default: ;
     }
-    using S = swar::SWAR<8, uint64_t>;
+
+    using SWAR_BaseType = typename ConversionTraits<Return>::DoublePrecision;
+    constexpr auto
+        NBytes = sizeof(SWAR_BaseType),
+        NBitsPerByte = 8ul; // 8 bits per byte
+    using S = swar::SWAR<NBitsPerByte, SWAR_BaseType>;
     S bytes;
     auto [base, misalignment] = blockAlignedLoad(s, &bytes.m_v);
-    auto bitDisplacement = 8 * misalignment;
+    auto bitDisplacement = NBitsPerByte * misalignment;
     constexpr static S
-        AllZeroCharacter{meta::BitmaskMaker<uint64_t, '0', 8>::value},
+        AllZeroCharacter{meta::BitmaskMaker<SWAR_BaseType, '0', NBitsPerByte>::value},
         AllOn = ~S{0};
-    // blit the zero-characters to the misaligned part
+
     auto mask = S{AllOn.value() << bitDisplacement};
     auto misalignedEliminated = bytes & mask;
     auto zeroCharactersIntroduced = AllZeroCharacter & ~mask;
@@ -210,23 +234,35 @@ int c_strToI(const char *str) noexcept {
             auto nonDigitIndex = nonDigits.lsbIndex();
             auto asIntegers = bytes - AllZeroCharacter; // upper lanes garbage
             auto integersInHighLanes =
-                // allow complete clearing of the 8 bytes by doing 2 shifts,
-                // since it is UB to shift 64 bits.
-                asIntegers.shiftLanesLeft(7 - nonDigitIndex).shiftLanesLeft(1);
+                // split the shift in two steps because if nonDigitIndex is
+                // zero, then you'd shift all bits, this would result in U.B.
+                // for a single shift
+                asIntegers.shiftLanesLeft(NBytes - 1 - nonDigitIndex)
+                          .shiftLanesLeft(1);
             auto inBase10 = calculateBase10(integersInHighLanes);
             auto scaledAccumulator = accumulator * LastFactor[nonDigitIndex];
-            return int((scaledAccumulator + inBase10) * sign);
+            return Return((scaledAccumulator + inBase10) * sign);
         }
-        // all 8 bytes are digits
+        // all bytes are digits
         auto asIntegers = bytes - AllZeroCharacter;
-        accumulator *= 100'000'000;
+        accumulator *= LastFactor.back();
         auto inBase10 = calculateBase10(asIntegers);
         accumulator += inBase10;
-        base += 8;
-        memcpy(&bytes.m_v, base, 8);
+        base += NBytes;
+        memcpy(&bytes.m_v, base, NBytes);
     }
 }
 
+}
+
+int c_strToI(const char *str) noexcept {
+    return impl::c_strToIntegral<int>(str); // generic conversion, int result
+}
+
+int64_t c_strToL(const char *str) noexcept {
+    return impl::c_strToIntegral<int64_t>(str); // generic conversion, 64 bit result
+}
+
 /// \brief Helper function to fix the non-string part of block
 template<typename S>
 S adjustMisalignmentFor_strlen(S data, int misalignment) {
@@ -252,7 +288,7 @@ std::size_t c_strLength(const char *s) {
 
     auto indexOfFirstTrue = [](auto bs) { return bs.lsbIndex(); };
 
-     // Misalignment must be taken into account because a SWAR read is
+    // Misalignment must be taken into account because a SWAR read is
     // speculative, it might read bytes outside of the actual string.
     // It is safe to read within the page where the string occurs, and to
     // guarantee that, simply make aligned reads because the size of the SWAR
diff --git a/benchmark/atoi.h b/benchmark/atoi.h
@@ -18,6 +18,7 @@ std::size_t leadingSpacesCount(const char *) noexcept;
 std::size_t c_strLength(const char *s);
 std::size_t c_strLength_natural(const char *s);
 int32_t c_strToI(const char *) noexcept;
+int64_t c_strToL(const char *) noexcept;
 
 inline int compareAtoi(const char *s) {
     auto
@@ -27,6 +28,16 @@ inline int compareAtoi(const char *s) {
     return from_stdlib;
 }
 
+/// Cross-checks c_strToL against atoll: throws 0 when they disagree,
+/// otherwise returns the agreed value, explicitly truncated to int.
+inline int compareAtol(const char *s) {
+    // not one `auto` declaration: long long and int64_t may be distinct types
+    const long long from_stdlib = atoll(s);
+    const long long from_zoo = c_strToL(s);
+    if(from_stdlib != from_zoo) { throw 0; }
+    return static_cast<int>(from_stdlib);
+}
+
 #if ZOO_CONFIGURED_TO_USE_AVX()
 std::size_t avx2_strlen(const char* str);
 #endif
diff --git a/benchmark/atoi_impl.h b/benchmark/atoi_impl.h
@@ -0,0 +1,36 @@
+#include "zoo/swar/SWAR.h"
+
+#include <tuple>
+#include <string.h>
+
+uint64_t calculateBase10(
+    zoo::swar::SWAR<8, __uint128_t> convertedToIntegers
+) noexcept;
+
+uint32_t calculateBase10(
+    zoo::swar::SWAR<8, uint64_t> convertedToIntegers
+) noexcept;
+
+namespace zoo {
+
+/// @brief Copies into \c b the naturally aligned block that contains a pointer
+/// @tparam PtrT pointee type of the pointer to locate
+/// @tparam Block type of the block; its size must equal its alignment
+/// @param pointerInsideBlock pointer that may be misaligned for Block
+/// @param b destination of the sizeof(Block) bytes loaded
+/// @return tuple of the block-aligned base pointer and the offset, in bytes,
+/// of pointerInsideBlock from that base
+/// @note the offset is in the half-open range [0, sizeof(Block))
+template<typename PtrT, typename Block>
+std::tuple<PtrT *, int>
+blockAlignedLoad(PtrT *pointerInsideBlock, Block *b) {
+    constexpr auto Size = sizeof(Block), Alignment = alignof(Block);
+    static_assert(Alignment == Size);
+    const auto address = reinterpret_cast<uintptr_t>(pointerInsideBlock);
+    const auto offset = address % Alignment;
+    auto *blockStart = reinterpret_cast<PtrT *>(address - offset);
+    memcpy(b, blockStart, Size);
+    return { blockStart, offset };
+}
+
+}
diff --git a/benchmark/catch2swar-demo.cpp b/benchmark/catch2swar-demo.cpp
@@ -106,7 +106,7 @@ TEST_CASE("Atoi benchmarks", "[atoi][swar]") {
     #undef X
 }
 
-TEST_CASE("Atoi correctness", "[swar][atoi]") {
+TEST_CASE("Atoi correctness", "[pure-test][swar][atoi]") {
     auto empty = "";
     REQUIRE(0 == zoo::c_strToI(empty));
     alignas(8) constexpr char EmptyMisaligned[8] = { 'Q', '\0', '0', '1', '2', '3', '9', '\0' };
@@ -133,4 +133,4 @@ TEST_CASE("Atoi correctness", "[swar][atoi]") {
     sprintf(buffer, "    %d", randomNumber);
     auto glibc = atoi(buffer);
     REQUIRE(zoo::c_strToI(buffer) == glibc);
-}
+}
diff --git a/inc/zoo/swar/SWAR.h b/inc/zoo/swar/SWAR.h
@@ -34,8 +34,14 @@ constexpr std::make_unsigned_t<T> msbIndex(T v) noexcept {
 }
 
 /// Index into the bits of the type T that contains the LSB.
+///
+/// \todo incorporate __builtin_ctzg when it is more widely available
 template<typename T>
 constexpr std::make_unsigned_t<T> lsbIndex(T v) noexcept {
+    // This check should be SFINAE, but supporting all sorts
+    // of base types is an ongoing task, we put a bare-minimum
+    // temporary preventive measure with static_assert
+    static_assert(sizeof(T) <= 8, "Unsupported");
     #ifdef _MSC_VER
         // ~v & (v - 1) turns on all trailing zeroes, zeroes the rest
         return meta::logFloor(1 + (~v & (v - 1)));
@@ -44,6 +50,14 @@ constexpr std::make_unsigned_t<T> lsbIndex(T v) noexcept {
     #endif
 }
 
+#ifndef _MSC_VER
+/// 128 bit overload: the 64 bit builtin is applied to the half holding the LSB
+constexpr __uint128_t lsbIndex(__uint128_t v) noexcept {
+    const auto lowHalf = static_cast<unsigned long long>(v); // bits 0 to 63
+    return lowHalf ? __builtin_ctzll(lowHalf) : 64 + __builtin_ctzll(v >> 64);
+}
+#endif
+
 /// Core abstraction around SIMD Within A Register (SWAR).  Specifies 'lanes'
 /// of NBits width against a type T, and provides an abstraction for performing
 /// SIMD operations against that primitive type T treated as a SIMD register.