diff --git a/include/pisa/forward_index_builder.hpp b/include/pisa/forward_index_builder.hpp
index 8a9d2e1f9..66879190c 100644
--- a/include/pisa/forward_index_builder.hpp
+++ b/include/pisa/forward_index_builder.hpp
@@ -59,11 +59,8 @@ using process_content_function_type =
 
 void parse_plaintext_content(std::string&& content, std::function<void(std::string&&)> process)
 {
-    std::istringstream content_stream(content);
-    std::string term;
-    while (content_stream >> term) {
-        process(std::move(term));
-    }
+    TermTokenizer tokenizer(content);
+    std::for_each(tokenizer.begin(), tokenizer.end(), process);
 }
 
 [[nodiscard]] auto is_http(std::string_view content) -> bool
diff --git a/include/pisa/tokenizer.hpp b/include/pisa/tokenizer.hpp
index 7dde1fe77..1a96709ef 100644
--- a/include/pisa/tokenizer.hpp
+++ b/include/pisa/tokenizer.hpp
@@ -31,11 +31,13 @@ struct tokens: lex::lexer<Lexer> {
     }
 };
 
+using token_type =
+    lex::lexertl::token<std::string_view::const_iterator, boost::mpl::vector<>, boost::mpl::false_>;
+using lexer_type = lex::lexertl::actor_lexer<token_type>;
+
 class TermTokenizer {
   public:
-    using token_type =
-        lex::lexertl::token<std::string_view::const_iterator, boost::mpl::vector<>, boost::mpl::false_>;
-    using lexer_type = lex::lexertl::actor_lexer<token_type>;
+    static tokens<lexer_type> const LEXER;
 
     explicit TermTokenizer(std::string_view text)
        : text_(text), first_(text_.begin()), last_(text_.end())
@@ -46,17 +48,18 @@ class TermTokenizer {
         first_ = text_.begin();
         last_ = text_.end();
         return boost::make_transform_iterator(
-            boost::make_filter_iterator(is_valid, lexer_.begin(first_, last_)), transform);
+            boost::make_filter_iterator(is_valid, LEXER.begin(first_, last_)), transform);
     }
 
     [[nodiscard]] auto end()
     {
         return boost::make_transform_iterator(
-            boost::make_filter_iterator(is_valid, lexer_.end()), transform);
+            boost::make_filter_iterator(is_valid, LEXER.end()), transform);
     }
 
   private:
     static bool is_valid(token_type const& tok) { return tok.id() != TokenType::NotValid; }
+
     static std::string transform(token_type const& tok)
     {
         auto& val = tok.value();
@@ -76,7 +79,6 @@ class TermTokenizer {
     std::string_view text_;
     std::string_view::const_iterator first_;
     std::string_view::const_iterator last_;
-    tokens<lexer_type> lexer_{};
 };
 
 }  // namespace pisa
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
new file mode 100644
index 000000000..5137dabb9
--- /dev/null
+++ b/src/tokenizer.cpp
@@ -0,0 +1,7 @@
+#include "tokenizer.hpp"
+
+namespace pisa {
+
+tokens<lexer_type> const TermTokenizer::LEXER = tokens<lexer_type>{};
+
+}  // namespace pisa
diff --git a/test/test_forward_index_builder.cpp b/test/test_forward_index_builder.cpp
index 0a7397ac1..153930ec6 100644
--- a/test/test_forward_index_builder.cpp
+++ b/test/test_forward_index_builder.cpp
@@ -1,5 +1,6 @@
 #define CATCH_CONFIG_MAIN
 
+#include <algorithm>
 #include <...>
 #include <...>
 
@@ -365,7 +366,10 @@ TEST_CASE("Build forward index", "[parsing][forward_index][integration]")
         std::istringstream content_stream(record->content());
         std::string term;
         while (content_stream >> term) {
-            original_body.push_back(std::move(term));
+            TermTokenizer tok(term);
+            std::for_each(tok.begin(), tok.end(), [&original_body](auto term) {
+                original_body.push_back(std::move(term));
+            });
         }
         std::vector<std::string> produced_body;
         for (auto term_id: *seq_iter) {
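For context, a minimal sketch of how the patched tokenizer is driven. The point of the change is that every `TermTokenizer` now shares the single static `LEXER`, so the Boost.Spirit.Lex state machine is built once per process rather than once per `TermTokenizer` (previously, once per parsed document, since `parse_plaintext_content` constructs a tokenizer per content string). The include path and the `main` driver below are illustrative assumptions, not part of the patch:

```cpp
#include <algorithm>
#include <iostream>
#include <string>
#include <string_view>

#include <pisa/tokenizer.hpp>  // assumed include path for the header patched above

int main()
{
    // A TermTokenizer is now cheap to construct: it only stores a view of the
    // input text; the lexer tables live in the shared static TermTokenizer::LEXER.
    std::string_view text = "Some sample text to split into terms";
    pisa::TermTokenizer tokenizer(text);

    // Same iteration pattern as the patched parse_plaintext_content.
    std::for_each(tokenizer.begin(), tokenizer.end(), [](std::string term) {
        std::cout << term << '\n';
    });
}
```

Defining `LEXER` in the new `src/tokenizer.cpp` gives it exactly one definition across translation units, and because it is `const` and only read after its initialization completes before `main`, concurrent tokenizers can share it; the usual caveat is that no other static object's constructor should depend on it from another translation unit, since cross-TU static initialization order is unspecified.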