From 686e5746bcbce88bba01f7a359e76ec99e1c40b5 Mon Sep 17 00:00:00 2001 From: Antonio Mallia Date: Fri, 13 Mar 2020 22:16:30 +0100 Subject: [PATCH] Fix 336 (#337) --- benchmarks/CMakeLists.txt | 5 --- benchmarks/selective_queries.cpp | 74 -------------------------------- tools/CMakeLists.txt | 7 +++ tools/selective_queries.cpp | 63 +++++++++++++++++++++++++++ 4 files changed, 70 insertions(+), 79 deletions(-) delete mode 100644 benchmarks/selective_queries.cpp create mode 100644 tools/selective_queries.cpp diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 3c8a2aaef..a39c3d6d1 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -8,11 +8,6 @@ target_link_libraries(perftest_interpolative pisa ) -add_executable(selective_queries selective_queries.cpp) -target_link_libraries(selective_queries - pisa -) - add_executable(scan_perftest scan_perftest.cpp) target_link_libraries(scan_perftest pisa diff --git a/benchmarks/selective_queries.cpp b/benchmarks/selective_queries.cpp deleted file mode 100644 index 89d3bde6a..000000000 --- a/benchmarks/selective_queries.cpp +++ /dev/null @@ -1,74 +0,0 @@ -#include - -#include "mappable/mapper.hpp" -#include "mio/mmap.hpp" - -#include "index_types.hpp" -#include "query/queries.hpp" -#include "util/util.hpp" -#include "wand_data_compressed.hpp" - -template -void selective_queries(const char* index_filename, std::string const& type) -{ - using namespace pisa; - - IndexType index; - spdlog::info("Loading index from {}", index_filename); - mio::mmap_source m(index_filename); - mapper::map(index, m, mapper::map_flags::warmup); - - spdlog::info("Performing {} queries", type); - - term_id_vec query; - - uint64_t count_taken = 0; - uint64_t count = 0; - while (read_query(query)) { - bool insert = true; - if (query.size() == 1) - insert = false; - else { - count++; - for (term_id_type term: query) { - auto t = index[term]; - if (t.size() <= configuration::get().threshold_wand_list) { - insert = false; - break; - } - } - } - - if (insert) { - count_taken++; - std::cout << query[0]; - for (size_t i = 1; i < query.size(); ++i) - std::cout << " " << query[i]; - std::cout << std::endl; - } - } - - std::cout << (float)count_taken / (float)count << std::endl; -} - -int main(int, const char** argv) -{ - using namespace pisa; - - std::string type = argv[1]; - const char* index_filename = argv[2]; - - if (false) { -#define LOOP_BODY(R, DATA, T) \ - } \ - else if (type == BOOST_PP_STRINGIZE(T)) \ - { \ - selective_queries(index_filename, type); - /**/ - - BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, PISA_INDEX_TYPES); -#undef LOOP_BODY - } else { - spdlog::error("Unknown type {}", type); - } -} diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index e35c30c29..d0b64809d 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -125,3 +125,10 @@ target_link_libraries(count_postings pisa CLI11 ) + +add_executable(selective_queries selective_queries.cpp) +target_link_libraries(selective_queries + pisa + CLI11 +) + diff --git a/tools/selective_queries.cpp b/tools/selective_queries.cpp new file mode 100644 index 000000000..112f9e5a2 --- /dev/null +++ b/tools/selective_queries.cpp @@ -0,0 +1,63 @@ +#include + +#include "mappable/mapper.hpp" +#include "mio/mmap.hpp" + +#include "CLI/CLI.hpp" +#include "app.hpp" +#include "cursor/cursor.hpp" +#include "index_types.hpp" +#include "query/algorithm.hpp" +#include "query/queries.hpp" +#include +#include + +using namespace pisa; + +template +void selective_queries( + const std::string& index_filename, std::string const& encoding, std::vector const& queries) +{ + IndexType index; + spdlog::info("Loading index from {}", index_filename); + mio::mmap_source m(index_filename.c_str()); + mapper::map(index, m, mapper::map_flags::warmup); + + spdlog::info("Performing {} queries", encoding); + + using boost::adaptors::transformed; + using boost::algorithm::join; + for (auto const& query: queries) { + size_t and_results = and_query()(make_cursors(index, query), index.num_docs()).size(); + size_t or_results = or_query()(make_cursors(index, query), index.num_docs()); + + double selectiveness = double(and_results) / double(or_results); + if (selectiveness < 0.005) { + std::cout + << join(query.terms | transformed([](auto d) { return std::to_string(d); }), " ") + << '\n'; + } + } +} + +int main(int argc, const char** argv) +{ + App> app{ + "Filters selective queries for a given index."}; + CLI11_PARSE(app, argc, argv); + + if (false) { +#define LOOP_BODY(R, DATA, T) \ + } \ + else if (app.index_encoding() == BOOST_PP_STRINGIZE(T)) \ + { \ + selective_queries( \ + app.index_filename(), app.index_encoding(), app.queries()); + /**/ + + BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, PISA_INDEX_TYPES); +#undef LOOP_BODY + } else { + spdlog::error("Unknown encoding {}", app.index_encoding()); + } +}