From a418c0358ad01d765625533a46577c0601646514 Mon Sep 17 00:00:00 2001 From: Michal Siedlaczek Date: Tue, 17 Dec 2024 20:35:08 -0500 Subject: [PATCH] Lookup table v1 implementation This lookup table implementation is meant to replace the current lexicon data structure. The overall concept is the same; the aim here is to improve the binary format to allow for extensions, and to improve the interaction with the table from the code, as well as naming convention. This change includes the data structure implementing the mapping concept, a CLI tool to build a table and read data from it, and a unit test suite, as well as CLI tests. In this particular change, we do not introduce any breaking changes. The new code is not used in the already existing tools and workflows. This work will be done in the future. Changelog-added: New lookup table implementation available Signed-off-by: Michal Siedlaczek --- docs/src/SUMMARY.md | 4 + docs/src/specs/lookup-table.md | 112 ++++++++++++ include/pisa/io.hpp | 2 +- include/pisa/lookup_table.hpp | 229 +++++++++++++++++++++++ include/pisa/span.hpp | 50 ++++- include/pisa/stream.hpp | 56 ++++++ src/lookup_table.cpp | 310 +++++++++++++++++++++++++++++++ src/stream.cpp | 41 +++++ test/cli/run.sh | 1 + test/cli/test_lookup_table.sh | 209 +++++++++++++++++++++ test/test_lookup_table.cpp | 321 +++++++++++++++++++++++++++++++++ test/test_span.cpp | 80 ++++++++ tools/CMakeLists.txt | 1 + tools/lookup_table.cpp | 173 ++++++++++++++++++ 14 files changed, 1585 insertions(+), 4 deletions(-) create mode 100644 docs/src/specs/lookup-table.md create mode 100644 include/pisa/lookup_table.hpp create mode 100644 include/pisa/stream.hpp create mode 100644 src/lookup_table.cpp create mode 100644 src/stream.cpp create mode 100755 test/cli/test_lookup_table.sh create mode 100644 test/test_lookup_table.cpp create mode 100644 tools/lookup_table.cpp diff --git a/docs/src/SUMMARY.md b/docs/src/SUMMARY.md index 63cc358d..ef1e4931 100644 --- a/docs/src/SUMMARY.md 
+++ b/docs/src/SUMMARY.md @@ -46,3 +46,7 @@ - [`taily-stats`](cli/taily-stats.md) - [`taily-thresholds`](cli/taily-thresholds.md) - [`thresholds`](cli/thresholds.md) + +# Specifications + +- [Lookup Table](specs/lookup-table.md) diff --git a/docs/src/specs/lookup-table.md b/docs/src/specs/lookup-table.md new file mode 100644 index 00000000..6d7d57a7 --- /dev/null +++ b/docs/src/specs/lookup-table.md @@ -0,0 +1,112 @@ +# Lookup Table Format Specification + +A lookup table is a bidirectional mapping from an index, representing an +internal ID, to a binary payload, such as a string. E.g., an `N`-element +lookup table maps values `0...N-1` to their payloads. These tables are +used for things like mapping terms to term IDs and document IDs to +titles or URLs. + +The format of a lookup table is designed to operate without having to +parse the entire structure. Once the header is parsed, it is possible to +operate directly on the binary format to access the data. In fact, a +lookup table will typically be memory mapped. Therefore, it is possible +to perform a lookup (or reverse lookup) without loading the entire +structure into memory. + +The header always begins as follows: + +``` ++--------+--------+-------- -+ +| 0x87 | Ver. | ... | ++--------+--------+-------- -+ +``` + +The first byte is a constant identifier. When reading, we can verify +whether this byte is correct to make sure we are using the correct type +of data structure. + +The second byte is equal to the version of the format. + +The remainder of the format is defined separately for each version. The +version is introduced in order to be able to update the format in the +future but still be able to read old formats for backwards +compatibility. 
+ +## v1 + +``` ++--------+--------+--------+--------+--------+--------+--------+--------+ +| 0x87 | 0x01 | Flags | 0x00 | ++--------+--------+--------+--------+--------+--------+--------+--------+ +| Length | ++--------+--------+--------+--------+--------+--------+--------+--------+ +| | +| Offsets | +| | ++-----------------------------------------------------------------------+ +| | +| Payloads | +| | ++-----------------------------------------------------------------------+ +``` + +Immediately after the version byte, we have the flags byte. + +``` + MSB LSB ++---+---+---+---+---+---+---+---+ +| 0 | 0 | 0 | 0 | 0 | 0 | W | S | ++---+---+---+---+---+---+---+---+ +``` + +The least significant bit (`S`) indicates whether the payloads are sorted (1) or not +(0). The next bit (`W`) defines the width of offsets (see below): +32-bit (0) or 64-bit (1). In most use cases, the cumulative size of the +payloads will be small enough to address it by 32-bit offsets. For +example, if we store words that are 16 bytes long on average, we can +address over 200 million of them. For this many elements, reducing the +width of the offsets would save us over 700 MB. Still, we want to +support 64-bit addressing because some payloads may be much longer +(e.g., URLs). + +The rest of the bits in the flags byte are currently not used, but +should be set to 0 to make sure that if more flags are introduced, we +know what values to expect in the older iterations, and thus we can make +sure to keep it backwards-compatible. + +The following 5 bytes are padding with values of 0. This is to help with +byte alignment. When loaded to memory, it should be loaded with 8-byte +alignment. When memory mapped, it should be already correctly aligned by +the operating system (at least on Linux). + +Following the padding, there is a 64-bit unsigned integer encoding the +number of elements in the table (`N`). 
+ +Given `N` and `W`, we can now calculate the byte range of all offsets, +and thus the address offset for the start of the payloads. The offsets +are `N+1` little-endian unsigned integers of size determined by `W` +(either 4 or 8 bytes). The offsets are associated with consecutive IDs +from 0 to `N-1`; the last of the `N+1` offsets points at the first byte +after the last payload. The offsets are relative to the beginning of the +first payload, therefore the first offset will always be 0. + +Payloads are arbitrary bytes, and must be interpreted by the software. +Although the typical use case is strings, this can be any binary +payload. Note that in case of strings, they will not be 0-terminated +unless they were specifically stored as such. Although this should be +clear from the fact that a payload is simply a sequence of bytes, it is only +prudent to point it out. Thus, one must be extremely careful when using +C-style strings, as their use is contingent on correct values being inserted +and encoded in the first place, and assuming 0-terminated strings may +easily lead to undefined behavior. Thus, it is recommended to store +strings without terminating them, and then interpret them as string +views (such as `std::string_view`) instead of a C-style string. + +The boundaries of the k-th payload are defined by the values of k-th and +(k+1)-th offsets. Note that because of the additional offset that points +to immediately after the last payload, we can read offsets `k` and `k+1` +for any index `k < N` (recall that `N` is the number of elements). + +If the payloads are sorted (S), we can find an ID of a certain payload +with a binary search. This is crucial for any application that requires +mapping from payloads to their position in the table. 
diff --git a/include/pisa/io.hpp b/include/pisa/io.hpp index 60ba83a1..5236264f 100644 --- a/include/pisa/io.hpp +++ b/include/pisa/io.hpp @@ -36,7 +36,7 @@ template void for_each_line(std::istream& is, Function fn) { std::string line; while (std::getline(is, line)) { - fn(line); + fn(std::move(line)); } } diff --git a/include/pisa/lookup_table.hpp b/include/pisa/lookup_table.hpp new file mode 100644 index 00000000..4a6af1cc --- /dev/null +++ b/include/pisa/lookup_table.hpp @@ -0,0 +1,229 @@ +// Copyright 2024 PISA developers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace pisa::lt { + +namespace detail { + + class BaseLookupTable { + public: + virtual ~BaseLookupTable() = default; + [[nodiscard]] virtual auto size() const noexcept -> std::size_t = 0; + [[nodiscard]] virtual auto operator[](std::size_t idx) const + -> std::span = 0; + [[nodiscard]] virtual auto find(std::span value) const noexcept + -> std::optional = 0; + + [[nodiscard]] virtual auto clone() -> std::unique_ptr = 0; + }; + + class BaseLookupTableEncoder { + public: + virtual ~BaseLookupTableEncoder() = default; + void virtual insert(std::span payload) = 0; + void virtual encode(std::ostream& out) = 0; + }; + +} // namespace detail + +namespace v1 { + + class Flags { + private: + std::uint8_t flags = 0; + + public: + constexpr Flags() = default; + explicit constexpr Flags(std::uint8_t bitset) : flags(bitset) {} + + [[nodiscard]] auto sorted() const noexcept -> bool; + [[nodiscard]] auto wide_offsets() const noexcept -> bool; + [[nodiscard]] auto bits() const noexcept -> std::uint8_t; + }; + + namespace flags { + inline constexpr std::uint8_t SORTED = 0b001; + inline constexpr std::uint8_t WIDE_OFFSETS = 0b010; + } // namespace flags + +}; // namespace v1 + +} // namespace pisa::lt + +namespace pisa { + +/** + * Lookup table mapping integers from a range [0, N) to binary payloads. + * + * This table assigns each _unique_ value (duplicates are not allowed) to an index in [0, N), where + * N is the size of the table. Thus, this structure is equivalent to a sequence of binary values. + * The difference between `LookupTable` and, say, `std::vector` is that its encoding supports + * reading the values without fully parsing the entire binary representation of the table. As such, + * it supports quickly initializing the structure from an external device (with random access), + * e.g., via mmap, and performing a lookup without loading the entire structure to main memory. 
+ * This is especially useful for short-lived programs that must perform a lookup without the + * unnecessary overhead of loading it to memory. + * + * If the values are sorted, and the appropriate flag is toggled in the header, a quick binary + * search lookup can be performed to find an index of a value. If the values are not sorted, then a + * linear scan will be used; therefore, one should consider having values sorted if such lookups are + * important. Getting the value at a given index is a constant-time operation, though if using + * memory mapping, each such operation may need to load multiple pages to memory. + */ +class LookupTable { + private: + std::unique_ptr<::pisa::lt::detail::BaseLookupTable> m_impl; + + explicit LookupTable(std::unique_ptr<::pisa::lt::detail::BaseLookupTable> impl); + + [[nodiscard]] static auto v1(std::span bytes) -> LookupTable; + + public: + LookupTable(LookupTable const&); + LookupTable(LookupTable&&); + LookupTable& operator=(LookupTable const&); + LookupTable& operator=(LookupTable&&); + ~LookupTable(); + + /** + * The number of elements in the table. + */ + [[nodiscard]] auto size() const noexcept -> std::size_t; + + /** + * Retrieves the value at index `idx`. + * + * If `idx >= size()`, then `std::out_of_range` exception is thrown. See `at()` if you want to + * conveniently cast the memory span to another type. + */ + [[nodiscard]] auto operator[](std::size_t idx) const -> std::span; + + /** + * Returns the position of `value` in the table or `std::nullopt` if the value does not exist. + * + * See the templated version of this function if you want to automatically cast from another + * type to byte span. + */ + [[nodiscard]] auto find(std::span value) const noexcept + -> std::optional; + + /** + * Returns the value at index `idx` cast to type `T`. 
+ * + * The type `T` must define `T::value_type` that resolves to a byte-wide type, as well as a + * constructor that takes `T::value_type const*` (pointer to the first byte) and `std::size_t` + * (the total number of bytes). If `T::value_type` is longer than 1 byte, this operation results + * in **undefined behavior**. + * + * Examples of types that can be used are: `std::string_view` or `std::span`. + */ + template + [[nodiscard]] auto at(std::size_t idx) const -> T { + auto bytes = this->operator[](idx); + return T(reinterpret_cast(bytes.data()), bytes.size()); + } + + /** + * Returns the position of `value` in the table or `std::nullopt` if the value does not exist. + * + * The type `T` of the value must be such that `std::span` is + * constructible from `T`. + */ + template + requires(std::constructible_from, T>) + [[nodiscard]] auto find(T value) const noexcept -> std::optional { + return find(std::as_bytes(std::span(value))); + } + + /** + * Constructs a lookup table from the encoded sequence of bytes. + */ + [[nodiscard]] static auto from_bytes(std::span bytes) -> LookupTable; +}; + +/** + * Lookup table encoder. + * + * This class builds and encodes a sequence of values to the binary format of lookup table. + * See `LookupTable` for more details. + * + * Note that all encoded data is accumulated in memory and only flushed to the output stream when + * `encode()` member function is called. + */ +class LookupTableEncoder { + std::unique_ptr<::pisa::lt::detail::BaseLookupTableEncoder> m_impl; + + explicit LookupTableEncoder(std::unique_ptr<::pisa::lt::detail::BaseLookupTableEncoder> impl); + + public: + /** + * Constructs an encoder for a lookup table in v1 format, with the given flag options. + * + * If sorted flag is _not_ set, then an additional hash set will be produced to keep track of + * duplicates. This will increase the memory footprint at build time. + */ + static LookupTableEncoder v1(::pisa::lt::v1::Flags flags); + + /** + * Inserts payload. 
+ * + * If sorted flag was set at construction time, it will throw if the given payload is not + * lexicographically greater than the previously inserted payload. If sorted flag was _not_ set + * and the given payload has already been inserted, it will throw as well. + */ + auto insert(std::span payload) -> LookupTableEncoder&; + + /** + * Writes the encoded table to the output stream. + */ + auto encode(std::ostream& out) -> LookupTableEncoder&; + + /** + * Inserts a payload of type `Payload`. + * + * `std::span` must be constructible from `Payload`, which + * in turn will be cast as byte span before calling the non-templated version of `insert()`. + */ + template + requires(std::constructible_from, Payload>) + auto insert(Payload const& payload) -> LookupTableEncoder& { + insert(std::as_bytes(std::span(payload))); + return *this; + } + + /** + * Inserts all payloads in the given span. + * + * It calls `insert()` for each element in the span. See `insert()` for more details. + */ + template + auto insert_span(std::span payloads) -> LookupTableEncoder& { + for (auto const& payload: payloads) { + insert(payload); + } + return *this; + } +}; + +} // namespace pisa diff --git a/include/pisa/span.hpp b/include/pisa/span.hpp index 4eb0b103..bb44dc18 100644 --- a/include/pisa/span.hpp +++ b/include/pisa/span.hpp @@ -34,12 +34,48 @@ template return span[pos]; } -} // namespace pisa +template +[[nodiscard]] constexpr auto subspan_or_throw( + std::span const& span, + typename std::span::size_type offset, + typename std::span::size_type count, + std::string const& error_msg +) -> std::span { + // Overflow-safe bounds check; `std::dynamic_extent` means "up to the end of the span". 
+ if (offset > span.size() || (count != std::dynamic_extent && count > span.size() - offset)) { + throw std::out_of_range(error_msg); + } + return span.subspan(offset, count); +} -namespace std { +template +[[nodiscard]] constexpr auto subspan_or_throw( + std::span const& span, + typename std::span::size_type offset, + typename std::span::size_type count +) -> std::span { + return subspan_or_throw(span, offset, count, "out of range subspan"); 
+} template -[[nodiscard]] auto operator==(std::span const& lhs, std::span const& rhs) -> bool { +[[nodiscard]] auto lex_lt(std::span const& lhs, std::span const& rhs) -> bool { + auto lit = lhs.begin(); + auto rit = rhs.begin(); + while (lit != lhs.end() && rit != rhs.end()) { + if (*lit < *rit) { + return true; + } + if (*lit > *rit) { + return false; + } + ++lit; + ++rit; + } + return lit == lhs.end() && rit != rhs.end(); +} + +template +[[nodiscard]] auto lex_eq(std::span const& lhs, std::span const& rhs) -> bool { if (lhs.size() != rhs.size()) { return false; } @@ -53,4 +88,13 @@ template return true; } +} // namespace pisa + +namespace std { + +template +[[nodiscard]] auto operator==(std::span const& lhs, std::span const& rhs) -> bool { + return ::pisa::lex_eq(lhs, rhs); +} + } // namespace std diff --git a/include/pisa/stream.hpp b/include/pisa/stream.hpp new file mode 100644 index 00000000..6768db49 --- /dev/null +++ b/include/pisa/stream.hpp @@ -0,0 +1,56 @@ +// Copyright 2024 PISA developers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include + +namespace pisa { + +class FileOpenError: public std::exception { + public: + explicit FileOpenError(std::string const& file); + [[nodiscard]] auto what() const noexcept -> char const*; + + private: + std::string m_message; +}; + +class WriteError: public std::exception { + public: + [[nodiscard]] auto what() const noexcept -> char const*; +}; + +auto open_file_w(std::string const& filename) -> std::ofstream; + +template +auto put(std::basic_ostream& stream, CharT ch) -> std::ostream& { + if (!stream.put(ch)) { + throw WriteError(); + } + return stream; +} + +template +auto write(std::basic_ostream& stream, CharT const* data, std::streamsize count) + -> std::basic_ostream& { + if (!stream.write(data, count)) { + throw WriteError(); + } + return stream; +} + +} // namespace pisa diff --git a/src/lookup_table.cpp b/src/lookup_table.cpp new file mode 100644 index 00000000..2c2143f5 --- /dev/null +++ b/src/lookup_table.cpp @@ -0,0 +1,310 @@ +// Copyright 2024 PISA developers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "fmt/core.h" +#include "pisa/lookup_table.hpp" +#include "pisa/span.hpp" +#include "pisa/stream.hpp" + +namespace pisa::lt { + +constexpr std::byte VERIFICATION_BYTE = std::byte(0x87); +constexpr std::size_t PADDING_LENGTH = 5; +constexpr std::array PADDING = { + std::byte{0}, std::byte{0}, std::byte{0}, std::byte{0}, std::byte{0} +}; + +auto v1::Flags::sorted() const noexcept -> bool { + return (this->flags & 0b1) > 0; +} + +auto v1::Flags::wide_offsets() const noexcept -> bool { + return (this->flags & 0b10) > 0; +} + +auto v1::Flags::bits() const noexcept -> std::uint8_t { + return this->flags; +} + +} // namespace pisa::lt + +namespace pisa { + +LookupTable::LookupTable(std::unique_ptr<::pisa::lt::detail::BaseLookupTable> impl) + : m_impl(std::move(impl)) {} + +LookupTable::LookupTable(LookupTable const& other) : m_impl(other.m_impl->clone()) {} + +LookupTable::LookupTable(LookupTable&&) = default; + +LookupTable& LookupTable::operator=(LookupTable const& other) { + m_impl = other.m_impl->clone(); + return *this; +} + +LookupTable& LookupTable::operator=(LookupTable&&) = default; + +LookupTable::~LookupTable() = default; + +template + requires(std::unsigned_integral) +[[nodiscard]] auto +read(std::span bytes, std::size_t offset, std::string const& error_msg) -> T { + auto sub = pisa::subspan_or_throw(bytes, offset, sizeof(T), error_msg); + T value; + std::memcpy(&value, bytes.data() + offset, sizeof(T)); + return value; +} + +template + requires(std::unsigned_integral) +[[nodiscard]] auto read(std::span bytes, std::size_t offset) -> T { + return read(bytes, offset, "not enough bytes"); +} + +void validate_padding(std::span bytes) { + auto padding = read(bytes, 0, "not enough bytes for header"); + padding &= 0xFFFFFFFFFF000000; + if (padding != 0) { + throw std::domain_error(fmt::format( + "bytes 3-7 must be all 0 but are {:#2x} {:#2x} {:#2x} 
{:#2x} {:#2x}", + bytes[3], + bytes[4], + bytes[5], + bytes[6], + bytes[7] + )); + } +} + +template +class LookupTableV1: public ::pisa::lt::detail::BaseLookupTable { + std::span m_offsets; + std::span m_payloads; + std::size_t m_size; + bool m_sorted; + + [[nodiscard]] auto read_offset(std::size_t idx) const -> Offset { + return read(m_offsets, idx * sizeof(Offset)); + } + + [[nodiscard]] auto read_payload(std::size_t idx) const -> std::span { + auto offset = read_offset(idx); + auto count = read_offset(idx + 1) - offset; + return pisa::subspan_or_throw(m_payloads, offset, count, "not enough bytes for payload"); + } + + public: + LookupTableV1(std::span offsets, std::span payloads, bool sorted) + : m_offsets(offsets), + m_payloads(payloads), + m_size(m_offsets.size() / sizeof(Offset) - 1), + m_sorted(sorted) {} + + ~LookupTableV1() = default; + + [[nodiscard]] virtual auto clone() -> std::unique_ptr override { + return std::make_unique>(m_offsets, m_payloads, m_sorted); + } + + [[nodiscard]] virtual auto size() const noexcept -> std::size_t override { return m_size; } + + [[nodiscard]] virtual auto operator[](std::size_t idx) const + -> std::span override { + if (idx >= m_size) { + throw std::out_of_range( + fmt::format("accessing element {} in a table of size {}", idx, m_size) + ); + } + auto offset = read_offset(idx); + auto count = read_offset(idx + 1) - offset; + return pisa::subspan_or_throw(m_payloads, offset, count, "not enough bytes for payload"); + } + + [[nodiscard]] virtual auto find_sorted(std::span value) const noexcept + -> std::optional { + if (size() == 0) { + return std::nullopt; + } + std::size_t low = 0; + std::size_t high = size() - 1; + while (low < high) { + auto mid = std::midpoint(low, high); + auto midval = read_payload(mid); + if (lex_lt(midval, value)) { + low = mid + 1; + } else { + high = mid; + } + } + return lex_eq(value, read_payload(low)) ? 
std::optional(low) : std::nullopt; + } + + [[nodiscard]] virtual auto find_unsorted(std::span value) const noexcept + -> std::optional { + for (std::size_t pos = 0; pos < size(); ++pos) { + if (read_payload(pos) == value) { + return pos; + } + } + return std::nullopt; + } + + [[nodiscard]] virtual auto find(std::span value) const noexcept + -> std::optional override { + return m_sorted ? find_sorted(value) : find_unsorted(value); + } +}; + +template +auto construct_lookup_table_v1(std::span bytes, bool sorted) + -> std::unique_ptr<::pisa::lt::detail::BaseLookupTable> { + auto length = read(bytes, 8, "not enough bytes for table length"); + std::size_t offsets_bytes_length = (length + 1) * sizeof(Offset); + auto offsets = + pisa::subspan_or_throw(bytes, 16, offsets_bytes_length, "not enough bytes for offsets"); + auto payloads = pisa::subspan_or_throw(bytes, 16 + offsets_bytes_length, std::dynamic_extent); + return std::make_unique>(offsets, payloads, sorted); +} + +auto LookupTable::v1(std::span bytes) -> LookupTable { + validate_padding(bytes); + auto flags = lt::v1::Flags(static_cast(bytes[2])); + if (flags.wide_offsets()) { + return LookupTable(construct_lookup_table_v1(bytes, flags.sorted())); + } + return LookupTable(construct_lookup_table_v1(bytes, flags.sorted())); +} + +auto LookupTable::from_bytes(std::span bytes) -> LookupTable { + auto leading_bytes = pisa::subspan_or_throw(bytes, 0, 2, "header must be at least 2 bytes"); + auto verification_byte = leading_bytes[0]; + if (verification_byte != lt::VERIFICATION_BYTE) { + throw std::domain_error(fmt::format( + "lookup table verification byte invalid: must be {:#x} but {:#x} given", + lt::VERIFICATION_BYTE, + verification_byte + )); + } + + auto version = static_cast(leading_bytes[1]); + if (version != 1) { + throw std::domain_error(fmt::format("only version 1 is valid but {} given", version)); + } + + return LookupTable::v1(bytes); +} + +auto LookupTable::size() const noexcept -> std::size_t { + return 
m_impl->size(); +} +auto LookupTable::operator[](std::size_t idx) const -> std::span { + return m_impl->operator[](idx); +} + +auto LookupTable::find(std::span value) const noexcept + -> std::optional { + return m_impl->find(value); +} + +template +class LookupTableEncoderV1: public ::pisa::lt::detail::BaseLookupTableEncoder { + ::pisa::lt::v1::Flags m_flags; + std::vector m_offsets{0}; + std::vector m_payloads{}; + std::unordered_set m_inserted{}; + std::span m_prev_payload; + + void encode_header(std::ostream& out) { + auto flag_bits = m_flags.bits(); + pisa::put(out, static_cast(lt::VERIFICATION_BYTE)); + pisa::put(out, static_cast(1)); + pisa::put(out, static_cast(flag_bits)); + pisa::write( + out, reinterpret_cast(&::pisa::lt::PADDING), ::pisa::lt::PADDING_LENGTH + ); + } + + void write_offsets(std::ostream& out) { + for (auto const& offset: m_offsets) { + pisa::write(out, reinterpret_cast(&offset), sizeof(Offset)); + } + } + + public: + explicit LookupTableEncoderV1(::pisa::lt::v1::Flags flags) : m_flags(flags) {} + + virtual ~LookupTableEncoderV1() = default; + + void virtual insert(std::span payload) { + if (m_flags.sorted()) { + if (!pisa::lex_lt(m_prev_payload, payload)) { + throw std::invalid_argument("payloads not strictly sorted in sorted table"); + } + } else { + auto payload_as_str = + std::string_view(reinterpret_cast(payload.data()), payload.size()); + if (auto pos = m_inserted.find(payload_as_str); pos != m_inserted.end()) { + throw std::invalid_argument("payload duplicate"); + } + m_inserted.insert(payload_as_str); + } + auto prev_begin = m_offsets.back(); + m_offsets.push_back(m_offsets.back() + payload.size()); + m_payloads.insert(m_payloads.end(), payload.begin(), payload.end()); + m_prev_payload = std::span(m_payloads).subspan(prev_begin, payload.size()); + } + + void virtual encode(std::ostream& out) { + encode_header(out); + std::uint64_t size = m_offsets.size() - 1; + pisa::write(out, reinterpret_cast(&size), sizeof(size)); + 
write_offsets(out); + pisa::write(out, reinterpret_cast(m_payloads.data()), m_payloads.size()); + } +}; + +LookupTableEncoder::LookupTableEncoder(std::unique_ptr<::pisa::lt::detail::BaseLookupTableEncoder> impl) + : m_impl(std::move(impl)) {} + +LookupTableEncoder LookupTableEncoder::v1(::pisa::lt::v1::Flags flags) { + if (flags.wide_offsets()) { + return LookupTableEncoder(std::make_unique>(flags)); + } + return LookupTableEncoder(std::make_unique>(flags)); +} + +auto LookupTableEncoder::insert(std::span payload) -> LookupTableEncoder& { + m_impl->insert(payload); + return *this; +} + +auto LookupTableEncoder::encode(std::ostream& out) -> LookupTableEncoder& { + m_impl->encode(out); + return *this; +} + +} // namespace pisa diff --git a/src/stream.cpp b/src/stream.cpp new file mode 100644 index 00000000..b9b14895 --- /dev/null +++ b/src/stream.cpp @@ -0,0 +1,41 @@ +// Copyright 2024 PISA developers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "fmt/core.h" +#include "pisa/stream.hpp" + +namespace pisa { + +FileOpenError::FileOpenError(std::string const& file) + : m_message(fmt::format("failed to open file: {}", file)) {} + +auto FileOpenError::what() const noexcept -> char const* { + return m_message.c_str(); +} + +auto WriteError::what() const noexcept -> char const* { + return "failed to write to stream"; +} + +auto open_file_w(std::string const& filename) -> std::ofstream { + auto stream = std::ofstream(filename); + if (!stream) { + throw FileOpenError(filename); + } + return stream; +} + +} // namespace pisa diff --git a/test/cli/run.sh b/test/cli/run.sh index fb61a5a8..6a837a23 100755 --- a/test/cli/run.sh +++ b/test/cli/run.sh @@ -5,3 +5,4 @@ bash "$DIR/setup.sh" bats "$DIR/test_taily_stats.sh" bats "$DIR/test_count_postings.sh" bats "$DIR/test_wand_data.sh" +bats "$DIR/test_lookup_table.sh" diff --git a/test/cli/test_lookup_table.sh b/test/cli/test_lookup_table.sh new file mode 100755 index 00000000..ac4cfd66 --- /dev/null +++ b/test/cli/test_lookup_table.sh @@ -0,0 +1,209 @@ +#!/usr/bin/env bats + +set +x + +sorted_values=$(cat < "$input_file" + + # build + lookup-table build -o "$lt" < "$input_file" + + # verify size + count_expected_bytes "$input_file" > "$workdir/expected_bytes" + wc -c > "$workdir/actual_bytes" < "$lt" + diff "$workdir/expected_bytes" "$workdir/actual_bytes" + + # print by index + assert_eq "$(lookup-table print "$lt" --at 0)" adipiscing + assert_eq "$(lookup-table print "$lt" --at 10)" erat + assert_eq "$(lookup-table print "$lt" --at 15)" ipsum + assert_eq "$(lookup-table print "$lt" --at 16)" lorem + assert_eq "$(lookup-table print "$lt" --at 22)" ultricies + # out of bounds exits with a failure exit code and prints out error + run lookup-table print "$lt" --at 23 + (( status != 0 )) + assert_eq "${lines[0]}" 'error: position 23 in a table of size 23 is out of bounds' + + # find + assert_eq "$(lookup-table find "$lt" adipiscing)" 0 + assert_eq 
"$(lookup-table find "$lt" erat)" 10 + assert_eq "$(lookup-table find "$lt" ipsum)" 15 + assert_eq "$(lookup-table find "$lt" lorem)" 16 + assert_eq "$(lookup-table find "$lt" ultricies)" 22 + # no element found + run lookup-table find "$lt" zonk + (( status != 0 )) + assert_eq "${lines[0]}" "error: value 'zonk' not found" + + # print + + lookup-table print "$lt" > "$workdir/printed" + diff "$workdir/printed" "$workdir/input" + + lookup-table print "$lt" --from 0 --to 22 > "$workdir/printed" + diff "$workdir/printed" "$workdir/input" + + lookup-table print "$lt" --from 0 --count 23 > "$workdir/printed" + diff "$workdir/printed" "$workdir/input" + + lookup-table print "$lt" --from 5 --to 17 > "$workdir/printed" + diff "$workdir/printed" <(head -18 "$workdir/input" | tail -13) + + lookup-table print "$lt" --from 5 --count 13 > "$workdir/printed" + diff "$workdir/printed" <(head -18 "$workdir/input" | tail -13) + + lookup-table print "$lt" --to 22 > "$workdir/printed" + diff "$workdir/printed" "$workdir/input" + + lookup-table print "$lt" --count 23 > "$workdir/printed" + diff "$workdir/printed" "$workdir/input" + + lookup-table print "$lt" --to 10 > "$workdir/printed" + diff "$workdir/printed" <(head -11 "$workdir/input") + + lookup-table print "$lt" --count 10 > "$workdir/printed" + diff "$workdir/printed" <(head -10 "$workdir/input") +} + +@test "build unsorted input" { + workdir=$(mktemp -d) + input_file="$workdir/input" + lt="$workdir/lt" + + echo "$workdir" + printf "%s\n" "$unsorted_values" > "$input_file" + + # build + lookup-table build -o "$lt" < "$input_file" + + # verify size + count_expected_bytes "$input_file" > "$workdir/expected_bytes" + wc -c > "$workdir/actual_bytes" < "$lt" + diff "$workdir/expected_bytes" "$workdir/actual_bytes" + + # print by index + assert_eq "$(lookup-table print "$lt" --at 0)" arcu + assert_eq "$(lookup-table print "$lt" --at 10)" elit + assert_eq "$(lookup-table print "$lt" --at 15)" bibendum + assert_eq "$(lookup-table print 
"$lt" --at 16)" odor + assert_eq "$(lookup-table print "$lt" --at 22)" ridiculus + # out of bounds exits with a failure exit code and prints out error + run lookup-table print "$lt" --at 23 + (( status != 0 )) + assert_eq "${lines[0]}" 'error: position 23 in a table of size 23 is out of bounds' + + # find + assert_eq "$(lookup-table find "$lt" arcu)" 0 + assert_eq "$(lookup-table find "$lt" elit)" 10 + assert_eq "$(lookup-table find "$lt" bibendum)" 15 + assert_eq "$(lookup-table find "$lt" odor)" 16 + assert_eq "$(lookup-table find "$lt" ridiculus)" 22 + # no element found + run lookup-table find "$lt" zonk + (( status != 0 )) + assert_eq "${lines[0]}" "error: value 'zonk' not found" + + # print + + lookup-table print "$lt" > "$workdir/printed" + diff "$workdir/printed" "$workdir/input" + + lookup-table print "$lt" --from 0 --to 22 > "$workdir/printed" + diff "$workdir/printed" "$workdir/input" + + lookup-table print "$lt" --from 0 --count 23 > "$workdir/printed" + diff "$workdir/printed" "$workdir/input" + + lookup-table print "$lt" --from 5 --to 17 > "$workdir/printed" + diff "$workdir/printed" <(head -18 "$workdir/input" | tail -13) + + lookup-table print "$lt" --from 5 --count 13 > "$workdir/printed" + diff "$workdir/printed" <(head -18 "$workdir/input" | tail -13) + + lookup-table print "$lt" --to 22 > "$workdir/printed" + diff "$workdir/printed" "$workdir/input" + + lookup-table print "$lt" --count 23 > "$workdir/printed" + diff "$workdir/printed" "$workdir/input" + + lookup-table print "$lt" --to 10 > "$workdir/printed" + diff "$workdir/printed" <(head -11 "$workdir/input") + + lookup-table print "$lt" --count 10 > "$workdir/printed" + diff "$workdir/printed" <(head -10 "$workdir/input") +} diff --git a/test/test_lookup_table.cpp b/test/test_lookup_table.cpp new file mode 100644 index 00000000..5952f663 --- /dev/null +++ b/test/test_lookup_table.cpp @@ -0,0 +1,321 @@ +// Copyright 2024 PISA developers +// +// Licensed under the Apache License, Version 2.0 
(the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#define CATCH_CONFIG_MAIN +#include "catch2/catch.hpp" + +#include +#include +#include +#include + +#include "pisa/lookup_table.hpp" +#include "pisa/span.hpp" + +using namespace std::literals; + +static const std::vector FLAG_COMBINATIONS{ + pisa::lt::v1::Flags(), + pisa::lt::v1::Flags(pisa::lt::v1::flags::SORTED), + pisa::lt::v1::Flags(pisa::lt::v1::flags::WIDE_OFFSETS), + pisa::lt::v1::Flags(pisa::lt::v1::flags::SORTED | pisa::lt::v1::flags::WIDE_OFFSETS) +}; + +template +auto repeat(T value, std::size_t length) -> std::vector { + std::vector bytes(length); + std::fill_n(bytes.begin(), length, value); + return bytes; +} + +auto zeroes(std::size_t length) -> std::vector { + return repeat(0, length); +} + +auto vec_of(std::byte val) -> std::vector { + return {val}; +} + +auto vec_of(int val) -> std::vector { + return {static_cast(val)}; +} + +auto vec_of(std::initializer_list vals) -> std::vector { + return {vals}; +} + +auto vec_of(std::vector const& vals) -> std::vector { + std::vector result; + std::transform(vals.begin(), vals.end(), std::back_inserter(result), [](int v) { + return std::byte(v); + }); + return result; +} + +template +auto mem(Ts... input) -> std::vector { + std::vector result; + ( + [&]() { + auto v = vec_of(input); + result.insert(result.end(), v.begin(), v.end()); + }(), + ... 
+ ); + return result; +} + +auto encode_lookup_table(std::vector payloads, pisa::lt::v1::Flags flags) { + std::ostringstream out; + pisa::LookupTableEncoder::v1(flags) + .insert_span(std::span(payloads.data(), payloads.size())) + .encode(out); + return out.str(); +} + +TEST_CASE("flags") { + SECTION("defaults") { + auto default_flags = pisa::lt::v1::Flags(); + REQUIRE_FALSE(default_flags.sorted()); + REQUIRE_FALSE(default_flags.wide_offsets()); + } + SECTION("sorted") { + auto default_flags = pisa::lt::v1::Flags(pisa::lt::v1::flags::SORTED); + REQUIRE(default_flags.sorted()); + REQUIRE_FALSE(default_flags.wide_offsets()); + } + SECTION("wide_offsets") { + auto default_flags = pisa::lt::v1::Flags(pisa::lt::v1::flags::WIDE_OFFSETS); + REQUIRE_FALSE(default_flags.sorted()); + REQUIRE(default_flags.wide_offsets()); + } + SECTION("sorted + wide_offsets") { + auto default_flags = + pisa::lt::v1::Flags(pisa::lt::v1::flags::SORTED | pisa::lt::v1::flags::WIDE_OFFSETS); + REQUIRE(default_flags.sorted()); + REQUIRE(default_flags.wide_offsets()); + } +} + +TEST_CASE("LookupTable::from") { + SECTION("wrong identifier") { + auto bytes = mem(0, 0, 0, 0); + REQUIRE_THROWS_WITH( + pisa::LookupTable::from_bytes(std::span(bytes.data(), bytes.size())), + "lookup table verification byte invalid: must be 0x87 but 0x0 given" + ); + } + SECTION("invalid version 0") { + auto bytes = mem(0x87, 0, 0, 0); + REQUIRE_THROWS_WITH( + pisa::LookupTable::from_bytes(std::span(bytes.data(), bytes.size())), + "only version 1 is valid but 0 given" + ); + } + SECTION("invalid version 2") { + auto bytes = mem(0x87, 2, 0, 0); + REQUIRE_THROWS_WITH( + pisa::LookupTable::from_bytes(std::span(bytes.data(), bytes.size())), + "only version 1 is valid but 2 given" + ); + } + SECTION("padding is invalid") { + auto bytes = mem(0x87, 1, 0, 0); + REQUIRE_THROWS_WITH( + pisa::LookupTable::from_bytes(std::span(bytes.data(), bytes.size())), + "not enough bytes for header" + ); + bytes = mem(0x87, 1, 0, 0, 0, 0, 0, 1); 
+ REQUIRE_THROWS_WITH( + pisa::LookupTable::from_bytes(std::span(bytes.data(), bytes.size())), + "bytes 3-7 must be all 0 but are 0x0 0x0 0x0 0x0 0x1" + ); + bytes = mem(0x87, 1, 0, 1, 2, 3, 4, 5); + REQUIRE_THROWS_WITH( + pisa::LookupTable::from_bytes(std::span(bytes.data(), bytes.size())), + "bytes 3-7 must be all 0 but are 0x1 0x2 0x3 0x4 0x5" + ); + } + SECTION("empty table narrow offsets") { + auto bytes = mem(0x87, 1, zeroes(18)); + auto lt = pisa::LookupTable::from_bytes(std::span(bytes.data(), bytes.size())); + REQUIRE(lt.size() == 0); + } + SECTION("empty table wide offsets") { + auto bytes = mem(0x87, 1, pisa::lt::v1::flags::WIDE_OFFSETS, zeroes(21)); + auto lt = pisa::LookupTable::from_bytes(std::span(bytes.data(), bytes.size())); + REQUIRE(lt.size() == 0); + } + SECTION("empty table must have a single offset") { + auto bytes = mem(0x87, 1, zeroes(14)); + REQUIRE_THROWS_WITH( + pisa::LookupTable::from_bytes(std::span(bytes.data(), bytes.size())), + "not enough bytes for offsets" + ); + } + SECTION("not enough bytes for offsets") { + auto bytes = mem(0x87, 1, zeroes(6), 1, zeroes(7)); + REQUIRE_THROWS_WITH( + pisa::LookupTable::from_bytes(std::span(bytes.data(), bytes.size())), + "not enough bytes for offsets" + ); + } + SECTION("12 bytes is not enough for 3 wide offsets") { + /* clang-format off */ + auto bytes = mem( + // header + 0x87, 1, pisa::lt::v1::flags::WIDE_OFFSETS, zeroes(5), + // size + 2, zeroes(7), + // offsets + zeroes(12) + ); + /* clang-format on */ + REQUIRE_THROWS_WITH( + pisa::LookupTable::from_bytes(std::span(bytes.data(), bytes.size())), + "not enough bytes for offsets" + ); + } + SECTION("12 bytes is enough for 3 narrow offsets") { + /* clang-format off */ + auto bytes = mem( + // header + 0x87, 1, 0, zeroes(5), + // size + 2, zeroes(7), + // offsets + zeroes(12) + ); + /* clang-format on */ + auto lt = pisa::LookupTable::from_bytes(std::span(bytes.data(), bytes.size())); + REQUIRE(lt.size() == 2); + } + SECTION("[a, bcd, efgh] with 
narrow offsets") { + /* clang-format off */ + auto bytes = mem( + // header + 0x87, 1, 0, zeroes(5), + // size + 3, zeroes(7), + // offsets + zeroes(4), + 1, zeroes(3), + 4, zeroes(3), + 8, zeroes(3), + // payloads + 'a', + 'b', 'c', 'd', + 'e', 'f', 'g', 'h' + ); + /* clang-format on */ + auto lt = pisa::LookupTable::from_bytes(std::span(bytes.data(), bytes.size())); + REQUIRE(lt.size() == 3); + REQUIRE( + lt[0] + == std::span(reinterpret_cast(bytes.data()) + 32, 1) + ); + REQUIRE( + lt[1] + == std::span(reinterpret_cast(bytes.data()) + 33, 3) + ); + REQUIRE( + lt[2] + == std::span(reinterpret_cast(bytes.data()) + 36, 4) + ); + } +} + +TEST_CASE("LookupTable v1") { + SECTION("encode [a, bcd, efgh]") { + /* clang-format off */ + auto expected = mem( + // header + 0x87, 1, pisa::lt::v1::flags::WIDE_OFFSETS, zeroes(5), + // size + 3, zeroes(7), + // offsets + zeroes(8), + 1, zeroes(7), + 4, zeroes(7), + 8, zeroes(7), + // payloads + 'a', + 'b', 'c', 'd', + 'e', 'f', 'g', 'h' + ); + /* clang-format on */ + std::ostringstream out; + auto encoder = + pisa::LookupTableEncoder::v1(pisa::lt::v1::Flags(pisa::lt::v1::flags::WIDE_OFFSETS)); + std::vector payloads{"a", "bcd", "efgh"}; + encoder.insert_span(std::span(payloads.data(), payloads.size())); + encoder.encode(out); + std::string bytes = out.str(); + auto actual = std::as_bytes(std::span(bytes.data(), bytes.size())); + REQUIRE(actual == std::as_bytes(std::span(expected.data(), expected.size()))); + } + SECTION("wrong order in sorted table") { + std::ostringstream out; + auto encoder = pisa::LookupTableEncoder::v1(pisa::lt::v1::Flags(pisa::lt::v1::flags::SORTED)); + std::vector payloads{"bcd", "a", "efgh"}; + REQUIRE_THROWS_WITH( + encoder.insert_span(std::span(payloads.data(), payloads.size())), + "payloads not strictly sorted in sorted table" + ); + } + SECTION("detects duplicates") { + auto flags = GENERATE_REF(from_range(FLAG_COMBINATIONS)); + std::ostringstream out; + auto encoder = 
pisa::LookupTableEncoder::v1(flags); + std::vector payloads{"a", "b", "b", "c"}; + REQUIRE_THROWS_WITH( + encoder.insert_span(std::span(payloads.data(), payloads.size())), + flags.sorted() ? "payloads not strictly sorted in sorted table" : "payload duplicate" + ); + } + SECTION("operator[]") { + auto flags = GENERATE_REF(from_range(FLAG_COMBINATIONS)); + std::string bytes = encode_lookup_table({"a", "bcd", "efgh"}, flags); + + auto lt = pisa::LookupTable::from_bytes(std::as_bytes(std::span(bytes))); + + REQUIRE(lt.at(0) == "a"); + REQUIRE(lt.at(1) == "bcd"); + REQUIRE(lt.at(2) == "efgh"); + + REQUIRE(lt.at(0) == "a"); + REQUIRE(lt.at(1) == "bcd"); + REQUIRE(lt.at(2) == "efgh"); + + auto val = lt.at>(0); + REQUIRE(std::vector(val.begin(), val.end()) == std::vector{'a'}); + val = lt.at>(1); + REQUIRE(std::vector(val.begin(), val.end()) == std::vector{'b', 'c', 'd'}); + val = lt.at>(2); + REQUIRE(std::vector(val.begin(), val.end()) == std::vector{'e', 'f', 'g', 'h'}); + } + SECTION("find()") { + auto flags = GENERATE_REF(from_range(FLAG_COMBINATIONS)); + std::string bytes = encode_lookup_table({"a", "bcd", "efgh"}, flags); + auto lt = pisa::LookupTable::from_bytes(std::as_bytes(std::span(bytes))); + + REQUIRE_FALSE(lt.find(""sv).has_value()); + REQUIRE(lt.find("a"sv) == 0); + REQUIRE_FALSE(lt.find("aa"sv).has_value()); + REQUIRE(lt.find("bcd"sv) == 1); + REQUIRE_FALSE(lt.find("bcde"sv).has_value()); + REQUIRE(lt.find("efgh"sv) == 2); + REQUIRE_FALSE(lt.find("efghi"sv).has_value()); + } +} diff --git a/test/test_span.cpp b/test/test_span.cpp index 4b0a5b9e..deffd22c 100644 --- a/test/test_span.cpp +++ b/test/test_span.cpp @@ -1,6 +1,23 @@ +// Copyright 2024 PISA developers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #define CATCH_CONFIG_MAIN #include "catch2/catch.hpp" +#include +#include + #include "span.hpp" TEST_CASE("pisa::at", "[span]") { @@ -13,6 +30,28 @@ TEST_CASE("pisa::at", "[span]") { REQUIRE_THROWS_AS(pisa::at(span, 4), std::out_of_range); } +TEST_CASE("pisa::subspan", "[span]") { + std::vector vec{0, 1, 2, 3}; + auto span = std::span{vec.data(), vec.size()}; + REQUIRE(pisa::subspan_or_throw(span, 0, 0) == std::span(vec.data(), 0)); + REQUIRE(pisa::subspan_or_throw(span, 0, 1) == std::span(vec.data(), 1)); + REQUIRE(pisa::subspan_or_throw(span, 1, 0) == std::span(vec.data() + 1, 0)); + REQUIRE(pisa::subspan_or_throw(span, 0, 4) == std::span(vec.data(), 4)); + REQUIRE(pisa::subspan_or_throw(span, 1, 3) == std::span(vec.data() + 1, 3)); + REQUIRE(pisa::subspan_or_throw(span, 0, 3) == std::span(vec.data(), 3)); + REQUIRE(pisa::subspan_or_throw(span, 2, 2) == std::span(vec.data() + 2, 2)); + REQUIRE(pisa::subspan_or_throw(span, 3, 1) == std::span(vec.data() + 3, 1)); + REQUIRE(pisa::subspan_or_throw(span, 4, 0) == std::span(vec.data() + 4, 0)); + REQUIRE_THROWS_AS(pisa::subspan_or_throw(span, 0, 6), std::out_of_range); + REQUIRE_THROWS_AS(pisa::subspan_or_throw(span, 0, 5), std::out_of_range); + REQUIRE_THROWS_AS(pisa::subspan_or_throw(span, 1, 4), std::out_of_range); + REQUIRE_THROWS_AS(pisa::subspan_or_throw(span, 2, 3), std::out_of_range); + REQUIRE_THROWS_AS(pisa::subspan_or_throw(span, 3, 2), std::out_of_range); + REQUIRE_THROWS_AS(pisa::subspan_or_throw(span, 4, 1), std::out_of_range); + REQUIRE_THROWS_AS(pisa::subspan_or_throw(span, 5, 0), 
std::out_of_range); + REQUIRE_THROWS_AS(pisa::subspan_or_throw(span, 5, 1), std::out_of_range); +} + TEST_CASE("operator== for spans", "[span]") { std::vector vec1{0, 1, 2, 3}; auto span1 = std::span(vec1.data(), vec1.size()); @@ -25,3 +64,44 @@ TEST_CASE("operator== for spans", "[span]") { REQUIRE(span2 != span3); REQUIRE(span1 == std::span(vec1.data(), vec1.size())); } + +TEST_CASE("lex_lt", "[span]") { + std::string_view aardvark = "aardvark"; + std::string_view dog = "dog"; + std::string_view zebra = "zebra"; + std::string_view empty = ""; + + REQUIRE_FALSE(pisa::lex_lt(std::span(aardvark), std::span(aardvark))); + REQUIRE(pisa::lex_lt(std::span(aardvark), std::span(dog))); + REQUIRE(pisa::lex_lt(std::span(aardvark), std::span(zebra))); + + REQUIRE_FALSE(pisa::lex_lt(std::span(dog), std::span(dog))); + REQUIRE_FALSE(pisa::lex_lt(std::span(dog), std::span(aardvark))); + REQUIRE(pisa::lex_lt(std::span(dog), std::span(zebra))); + + REQUIRE_FALSE(pisa::lex_lt(std::span(zebra), std::span(zebra))); + REQUIRE_FALSE(pisa::lex_lt(std::span(zebra), std::span(aardvark))); + REQUIRE_FALSE(pisa::lex_lt(std::span(zebra), std::span(dog))); + + REQUIRE(pisa::lex_lt(std::span(empty), std::span(aardvark))); + REQUIRE(pisa::lex_lt(std::span(empty), std::span(dog))); + REQUIRE(pisa::lex_lt(std::span(empty), std::span(zebra))); + REQUIRE_FALSE(pisa::lex_lt(std::span(aardvark), std::span(empty))); + REQUIRE_FALSE(pisa::lex_lt(std::span(dog), std::span(empty))); + REQUIRE_FALSE(pisa::lex_lt(std::span(zebra), std::span(empty))); + REQUIRE_FALSE(pisa::lex_lt(std::span(empty), std::span(empty))); +} + +TEST_CASE("lex_lt sort", "[span]") { + std::vector> animals{ + "aardvark", "dog", "zebra", "pelican", "goose", "geese", "cat" + }; + std::sort(animals.begin(), animals.end(), pisa::lex_lt); + REQUIRE(animals[0] == std::span("aardvark")); + REQUIRE(animals[1] == std::span("cat")); + REQUIRE(animals[2] == std::span("dog")); + REQUIRE(animals[3] == std::span("geese")); + REQUIRE(animals[4] 
== std::span("goose")); + REQUIRE(animals[5] == std::span("pelican")); + REQUIRE(animals[6] == std::span("zebra")); +} diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index b60dc377..c12894fb 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -32,3 +32,4 @@ add_tool(kth_threshold kth_threshold.cpp) add_tool(taily-stats taily_stats.cpp) add_tool(taily-thresholds taily_thresholds.cpp) add_tool(extract-maxscores extract_maxscores.cpp) +add_tool(lookup-table lookup_table.cpp) diff --git a/tools/lookup_table.cpp b/tools/lookup_table.cpp new file mode 100644 index 00000000..0a381ca5 --- /dev/null +++ b/tools/lookup_table.cpp @@ -0,0 +1,173 @@ +// Copyright 2024 PISA developers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include + +#include "app.hpp" +#include "pisa/io.hpp" +#include "pisa/lookup_table.hpp" + +struct Arguments { + std::string lexicon_file{}; + std::string value{}; + std::optional at{}; + std::optional from{}; + std::optional last{}; + std::optional count{}; +}; + +struct Commands { + CLI::App* build{}; + CLI::App* find{}; + CLI::App* print{}; +}; + +auto build_cmd(CLI::App& app, Arguments& args) { + auto cmd = app.add_subcommand("build", "Builds a lookup table from stdin"); + cmd->add_option("-o,--output", args.lexicon_file, "Binary output file")->required(); + return cmd; +} + +auto find_cmd(CLI::App& app, Arguments& args) { + auto cmd = app.add_subcommand("find", "Finds the given value and returns its position"); + cmd->add_option("table", args.lexicon_file, "Path to lookup table")->required(); + cmd->add_option("value", args.value, "Value to find")->required(); + return cmd; +} + +auto print_cmd(CLI::App& app, Arguments& args) { + auto cmd = app.add_subcommand("print", "Prints values"); + cmd->add_option("table", args.lexicon_file, "Path to lookup table")->required(); + auto at = cmd->add_option("--at", args.at, "Position of a single element to print"); + cmd->add_option("--from", args.from, "Starting position")->excludes(at); + auto to = cmd->add_option("--to", args.last, "Last position")->excludes(at); + cmd->add_option("--count", args.count, "Number of values to print")->excludes(at)->excludes(to); + return cmd; +} + +void build(Arguments const& args) { + std::vector values; + std::size_t payload_size = 0; + bool sorted = true; + pisa::io::for_each_line(std::cin, [&values, &payload_size, &sorted](std::string&& value) { + payload_size += value.size(); + values.push_back(std::move(value)); + if (sorted && payload_size > 0 && value <= values.back()) { + sorted = false; + } + }); + std::uint8_t flags = 0; + if (sorted) { + flags |= ::pisa::lt::v1::flags::SORTED; + } + if (payload_size >= (1UL << 32) - 1) { + flags |= 
::pisa::lt::v1::flags::WIDE_OFFSETS; + } + auto encoder = ::pisa::LookupTableEncoder::v1(::pisa::lt::v1::Flags(flags)); + for (auto& value: values) { + encoder.insert(value); + } + std::ofstream out(args.lexicon_file); + encoder.encode(out); +} + +void get(pisa::LookupTable const& table, std::size_t idx) { + if (idx >= table.size()) { + throw std::runtime_error( + fmt::format("position {} in a table of size {} is out of bounds", idx, table.size()) + ); + } + auto value = table.at(idx); + std::cout << value; +} + +void find(pisa::LookupTable const& table, std::string const& value) { + auto idx = table.find(value); + if (idx.has_value()) { + std::cout << *idx; + } else { + throw std::runtime_error(fmt::format("value '{}' not found", value)); + } +} + +void print(pisa::LookupTable const& table, Arguments const& args) { + if (args.at.has_value()) { + get(table, *args.at); + return; + } + + std::size_t first = 0; + std::size_t last = table.size() - 1; + + if (args.from.has_value()) { + first = *args.from; + } + if (args.last.has_value()) { + last = *args.last; + } + if (args.count.has_value()) { + last = *args.count + first - 1; + } + + if (first >= table.size()) { + throw std::runtime_error(fmt::format( + "starting position {} in a table of size {} is out of bounds", first, table.size() + )); + } + if (last >= table.size()) { + throw std::runtime_error(fmt::format( + "last position {} in a table of size {} is out of bounds", last, table.size() + )); + } + + for (auto pos = first; pos <= last; ++pos) { + std::cout << table.at(pos) << '\n'; + } +} + +int main(int argc, char** argv) { + Arguments args; + Commands cmds; + + pisa::App app{"Builds, prints, or queries lookup table"}; + app.require_subcommand(); + cmds.build = build_cmd(app, args); + cmds.find = find_cmd(app, args); + cmds.print = print_cmd(app, args); + CLI11_PARSE(app, argc, argv); + + try { + if (*cmds.build) { + build(args); + } else { + mio::mmap_source mem(args.lexicon_file.c_str()); + auto table = 
pisa::LookupTable::from_bytes( + std::span(reinterpret_cast(mem.data()), mem.size()) + ); + if (*cmds.find) { + find(table, args.value); + } else if (*cmds.print) { + print(table, args); + } + } + } catch (std::exception const& err) { + std::cerr << "error: " << err.what() << '\n'; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +}