Skip to content

Commit 9451276

Browse files
authored
[feature](search) add variant subcolumn support for search function (#56718)
### What problem does this PR solve? Issue Number: close #xxx Related PR: #56139 Problem Summary: This PR adds support for variant subcolumn access in search functions, enabling search queries to target specific JSON paths within variant columns using dot notation (e.g., field.subcolumn). The feature extends the search DSL to handle variant data types with subcolumn paths, allowing more granular search capabilities on semi-structured data. ``` SELECT * FROM test_variant_search_subcolumn WHERE search('variantColumn.subcolumn:textMatched'); ```
1 parent 456af82 commit 9451276

File tree

15 files changed

+429
-76
lines changed

15 files changed

+429
-76
lines changed

be/src/olap/rowset/segment_v2/segment_iterator.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -605,10 +605,16 @@ Status SegmentIterator::_get_row_ranges_by_column_conditions() {
605605
{
606606
if (_opts.runtime_state &&
607607
_opts.runtime_state->query_options().enable_inverted_index_query &&
608-
has_index_in_iterators()) {
608+
(has_index_in_iterators() || !_common_expr_ctxs_push_down.empty())) {
609609
SCOPED_RAW_TIMER(&_opts.stats->inverted_index_filter_timer);
610610
size_t input_rows = _row_bitmap.cardinality();
611-
RETURN_IF_ERROR(_apply_inverted_index());
611+
// Only apply column-level inverted index if we have iterators
612+
if (has_index_in_iterators()) {
613+
RETURN_IF_ERROR(_apply_inverted_index());
614+
}
615+
// Always apply expr-level index (e.g., search expressions) if we have common_expr_pushdown
616+
// This allows search expressions with variant subcolumns to be evaluated even when
617+
// the segment doesn't have all subcolumns
612618
RETURN_IF_ERROR(_apply_index_expr());
613619
for (auto it = _common_expr_ctxs_push_down.begin();
614620
it != _common_expr_ctxs_push_down.end();) {

be/src/vec/exprs/vsearch.cpp

Lines changed: 49 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -48,39 +48,63 @@ Status collect_search_inputs(const VSearchExpr& expr, VExprContext* context,
4848

4949
auto index_context = context->get_inverted_index_context();
5050
if (index_context == nullptr) {
51-
return Status::OK();
51+
LOG(WARNING) << "collect_search_inputs: No inverted index context available";
52+
return Status::InternalError("No inverted index context available");
5253
}
5354

55+
// Get field bindings for variant subcolumn support
56+
const auto& search_param = expr.get_search_param();
57+
const auto& field_bindings = search_param.field_bindings;
58+
59+
int child_index = 0; // Index for iterating through children
5460
for (const auto& child : expr.children()) {
5561
if (child->is_slot_ref()) {
5662
auto* column_slot_ref = assert_cast<VSlotRef*>(child.get());
5763
int column_id = column_slot_ref->column_id();
5864
auto* iterator = index_context->get_inverted_index_iterator_by_column_id(column_id);
59-
if (iterator == nullptr) {
60-
continue;
65+
66+
// Determine the field_name from field_bindings (for variant subcolumns)
67+
// field_bindings and children should have the same order
68+
std::string field_name;
69+
if (child_index < field_bindings.size()) {
70+
// Use field_name from binding (may include "parent.subcolumn" for variant)
71+
field_name = field_bindings[child_index].field_name;
72+
} else {
73+
// Fallback to column_name if binding not found
74+
field_name = column_slot_ref->column_name();
6175
}
6276

63-
const auto* storage_name_type =
64-
index_context->get_storage_name_and_type_by_column_id(column_id);
65-
if (storage_name_type == nullptr) {
66-
auto err_msg = fmt::format(
67-
"storage_name_type cannot be found for column {} while in {} evaluate",
68-
column_id, expr.expr_name());
69-
LOG(ERROR) << err_msg;
70-
return Status::InternalError(err_msg);
77+
// Only collect fields that have iterators (materialized columns with indexes)
78+
if (iterator != nullptr) {
79+
const auto* storage_name_type =
80+
index_context->get_storage_name_and_type_by_column_id(column_id);
81+
if (storage_name_type == nullptr) {
82+
return Status::InternalError("storage_name_type not found for column {} in {}",
83+
column_id, expr.expr_name());
84+
}
85+
86+
bundle->iterators.emplace(field_name, iterator);
87+
bundle->field_types.emplace(field_name, *storage_name_type);
88+
bundle->column_ids.emplace_back(column_id);
7189
}
7290

73-
auto column_name = column_slot_ref->column_name();
74-
bundle->iterators.emplace(column_name, iterator);
75-
bundle->field_types.emplace(column_name, *storage_name_type);
76-
bundle->column_ids.emplace_back(column_id);
91+
child_index++;
7792
} else if (child->is_literal()) {
7893
auto* literal = assert_cast<VLiteral*>(child.get());
7994
bundle->literal_args.emplace_back(literal->get_column_ptr(), literal->get_data_type(),
8095
literal->expr_name());
8196
} else {
82-
LOG(WARNING) << "VSearchExpr: Unsupported child node type encountered";
83-
return Status::InvalidArgument("search expression child type unsupported");
97+
// Check if this is ElementAt expression (for variant subcolumn access)
98+
if (child->expr_name() == "element_at" && child_index < field_bindings.size() &&
99+
field_bindings[child_index].__isset.is_variant_subcolumn &&
100+
field_bindings[child_index].is_variant_subcolumn) {
101+
// Variant subcolumn not materialized - skip, will create empty BitmapQuery in function_search
102+
child_index++;
103+
continue;
104+
}
105+
106+
// Not a supported child type
107+
return Status::InvalidArgument("Unsupported child node type: {}", child->expr_name());
84108
}
85109
}
86110

@@ -94,16 +118,6 @@ VSearchExpr::VSearchExpr(const TExprNode& node) : VExpr(node) {
94118
_search_param = node.search_param;
95119
_original_dsl = _search_param.original_dsl;
96120
}
97-
98-
LOG(INFO) << "VSearchExpr constructor: dsl='" << _original_dsl
99-
<< "', num_children=" << node.num_children
100-
<< ", has_search_param=" << node.__isset.search_param
101-
<< ", children_size=" << _children.size();
102-
103-
for (size_t i = 0; i < _children.size(); i++) {
104-
LOG(INFO) << "VSearchExpr constructor: child[" << i
105-
<< "] expr_name=" << _children[i]->expr_name();
106-
}
107121
}
108122

109123
const std::string& VSearchExpr::expr_name() const {
@@ -120,7 +134,7 @@ Status VSearchExpr::execute(VExprContext* context, Block* block, int* result_col
120134
}
121135

122136
Status VSearchExpr::evaluate_inverted_index(VExprContext* context, uint32_t segment_num_rows) {
123-
LOG(INFO) << "VSearchExpr::evaluate_inverted_index called with DSL: " << _original_dsl;
137+
LOG(INFO) << "VSearchExpr::evaluate_inverted_index called, DSL: " << _search_param.original_dsl;
124138

125139
if (_search_param.original_dsl.empty()) {
126140
return Status::InvalidArgument("search DSL is empty");
@@ -135,8 +149,14 @@ Status VSearchExpr::evaluate_inverted_index(VExprContext* context, uint32_t segm
135149
SearchInputBundle bundle;
136150
RETURN_IF_ERROR(collect_search_inputs(*this, context, &bundle));
137151

152+
VLOG_DEBUG << "VSearchExpr: bundle.iterators.size()=" << bundle.iterators.size();
153+
138154
if (bundle.iterators.empty()) {
139-
LOG(WARNING) << "VSearchExpr: No indexed columns available for evaluation";
155+
LOG(WARNING) << "VSearchExpr: No indexed columns available for evaluation, DSL: "
156+
<< _original_dsl;
157+
auto empty_bitmap = InvertedIndexResultBitmap(std::make_shared<roaring::Roaring>(),
158+
std::make_shared<roaring::Roaring>());
159+
index_context->set_inverted_index_result_for_expr(this, std::move(empty_bitmap));
140160
return Status::OK();
141161
}
142162

@@ -155,15 +175,6 @@ Status VSearchExpr::evaluate_inverted_index(VExprContext* context, uint32_t segm
155175
index_context->set_true_for_inverted_index_status(this, column_id);
156176
}
157177

158-
const auto& data_bitmap = result_bitmap.get_data_bitmap();
159-
const uint64_t match_count = data_bitmap ? data_bitmap->cardinality() : 0;
160-
if (match_count > 0) {
161-
LOG(INFO) << "VSearchExpr: Found " << match_count
162-
<< " matching rows for DSL: " << _search_param.original_dsl;
163-
} else {
164-
LOG(INFO) << "VSearchExpr: No matches found for DSL: " << _search_param.original_dsl;
165-
}
166-
167178
return Status::OK();
168179
}
169180

be/src/vec/exprs/vsearch.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ class VSearchExpr : public VExpr {
3939

4040
bool can_push_down_to_index() const override { return true; }
4141

42+
const TSearchParam& get_search_param() const { return _search_param; }
43+
4244
private:
4345
TSearchParam _search_param;
4446
std::string _original_dsl;

be/src/vec/functions/function_search.cpp

Lines changed: 51 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#include "olap/rowset/segment_v2/index_file_reader.h"
3636
#include "olap/rowset/segment_v2/index_query_context.h"
3737
#include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h"
38+
#include "olap/rowset/segment_v2/inverted_index/query_v2/bitmap_query/bitmap_query.h"
3839
#include "olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/boolean_query.h"
3940
#include "olap/rowset/segment_v2/inverted_index/query_v2/operator.h"
4041
#include "olap/rowset/segment_v2/inverted_index/query_v2/term_query/term_query.h"
@@ -52,8 +53,21 @@ Status FieldReaderResolver::resolve(const std::string& field_name,
5253
InvertedIndexQueryType query_type,
5354
FieldReaderBinding* binding) {
5455
DCHECK(binding != nullptr);
56+
57+
// Check if this is a variant subcolumn
58+
bool is_variant_sub = is_variant_subcolumn(field_name);
59+
5560
auto data_it = _data_type_with_names.find(field_name);
5661
if (data_it == _data_type_with_names.end()) {
62+
// For variant subcolumns, not finding the index is normal (the subcolumn may not exist in this segment)
63+
// Return OK but with null binding to signal "no match"
64+
if (is_variant_sub) {
65+
VLOG_DEBUG << "Variant subcolumn '" << field_name
66+
<< "' not found in this segment, treating as no match";
67+
*binding = FieldReaderBinding();
68+
return Status::OK();
69+
}
70+
// For normal fields, this is an error
5771
return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>(
5872
"field '{}' not found in inverted index metadata", field_name);
5973
}
@@ -69,6 +83,13 @@ Status FieldReaderResolver::resolve(const std::string& field_name,
6983

7084
auto iterator_it = _iterators.find(field_name);
7185
if (iterator_it == _iterators.end() || iterator_it->second == nullptr) {
86+
// For variant subcolumns, not finding the iterator is normal
87+
if (is_variant_sub) {
88+
VLOG_DEBUG << "Variant subcolumn '" << field_name
89+
<< "' iterator not found in this segment, treating as no match";
90+
*binding = FieldReaderBinding();
91+
return Status::OK();
92+
}
7293
return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>(
7394
"iterator not found for field '{}'", field_name);
7495
}
@@ -171,27 +192,31 @@ Status FunctionSearch::evaluate_inverted_index_with_search_param(
171192
data_type_with_names,
172193
std::unordered_map<std::string, IndexIterator*> iterators, uint32_t num_rows,
173194
InvertedIndexResultBitmap& bitmap_result) const {
174-
VLOG_DEBUG << "search: Processing DSL '" << search_param.original_dsl << "' with "
175-
<< data_type_with_names.size() << " indexed columns and " << iterators.size()
176-
<< " iterators";
177-
178195
if (iterators.empty() || data_type_with_names.empty()) {
179-
LOG(INFO) << "No indexed columns or iterators available, returning empty result";
196+
LOG(INFO) << "No indexed columns or iterators available, returning empty result, dsl:"
197+
<< search_param.original_dsl;
198+
bitmap_result = InvertedIndexResultBitmap(std::make_shared<roaring::Roaring>(),
199+
std::make_shared<roaring::Roaring>());
180200
return Status::OK();
181201
}
182202

183203
auto context = std::make_shared<IndexQueryContext>();
184204
context->collection_statistics = std::make_shared<CollectionStatistics>();
185205
context->collection_similarity = std::make_shared<CollectionSimilarity>();
186206

187-
FieldReaderResolver resolver(data_type_with_names, iterators, context);
207+
// Pass field_bindings to resolver for variant subcolumn detection
208+
FieldReaderResolver resolver(data_type_with_names, iterators, context,
209+
search_param.field_bindings);
188210

189211
query_v2::QueryPtr root_query;
190212
std::string root_binding_key;
191213
RETURN_IF_ERROR(build_query_recursive(*this, search_param.root, context, resolver, &root_query,
192214
&root_binding_key));
193215
if (root_query == nullptr) {
194-
LOG(INFO) << "search: Query tree resolved to empty query";
216+
LOG(INFO) << "search: Query tree resolved to empty query, dsl:"
217+
<< search_param.original_dsl;
218+
bitmap_result = InvertedIndexResultBitmap(std::make_shared<roaring::Roaring>(),
219+
std::make_shared<roaring::Roaring>());
195220
return Status::OK();
196221
}
197222

@@ -393,9 +418,12 @@ Status FunctionSearch::build_query_recursive(const FunctionSearch& function,
393418
std::string child_binding_key;
394419
RETURN_IF_ERROR(build_query_recursive(function, child_clause, context, resolver,
395420
&child_query, &child_binding_key));
396-
if (child_query != nullptr) {
397-
builder.add(child_query, std::move(child_binding_key));
398-
}
421+
// Add all children including empty BitmapQuery
422+
// BooleanQuery will handle the logic:
423+
// - AND with empty bitmap → result is empty
424+
// - OR with empty bitmap → empty bitmap is ignored by OR logic
425+
// - NOT with empty bitmap → NOT(empty) = all rows (handled by BooleanQuery)
426+
builder.add(child_query, std::move(child_binding_key));
399427
}
400428
}
401429

@@ -429,6 +457,19 @@ Status FunctionSearch::build_leaf_query(const FunctionSearch& function, const TS
429457

430458
FieldReaderBinding binding;
431459
RETURN_IF_ERROR(resolver.resolve(field_name, query_type, &binding));
460+
461+
// Check if binding is empty (variant subcolumn not found in this segment)
462+
if (binding.lucene_reader == nullptr) {
463+
VLOG_DEBUG << "build_leaf_query: Variant subcolumn '" << field_name
464+
<< "' has no index in this segment, creating empty BitmapQuery (no matches)";
465+
// Variant subcolumn doesn't exist - create empty BitmapQuery (no matches)
466+
*out = std::make_shared<query_v2::BitmapQuery>(roaring::Roaring());
467+
if (binding_key) {
468+
binding_key->clear();
469+
}
470+
return Status::OK();
471+
}
472+
432473
if (binding_key) {
433474
*binding_key = binding.binding_key;
434475
}

be/src/vec/functions/function_search.h

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,14 +57,28 @@ class FieldReaderResolver {
5757
const std::unordered_map<std::string, vectorized::IndexFieldNameAndTypePair>&
5858
data_type_with_names,
5959
const std::unordered_map<std::string, IndexIterator*>& iterators,
60-
std::shared_ptr<IndexQueryContext> context)
60+
std::shared_ptr<IndexQueryContext> context,
61+
const std::vector<TSearchFieldBinding>& field_bindings = {})
6162
: _data_type_with_names(data_type_with_names),
6263
_iterators(iterators),
63-
_context(std::move(context)) {}
64+
_context(std::move(context)),
65+
_field_bindings(field_bindings) {
66+
// Build a lookup map for quick variant subcolumn checks
67+
for (const auto& binding : _field_bindings) {
68+
if (binding.__isset.is_variant_subcolumn && binding.is_variant_subcolumn) {
69+
_variant_subcolumn_fields.insert(binding.field_name);
70+
}
71+
}
72+
}
6473

6574
Status resolve(const std::string& field_name, InvertedIndexQueryType query_type,
6675
FieldReaderBinding* binding);
6776

77+
// Check if a field is a variant subcolumn
78+
bool is_variant_subcolumn(const std::string& field_name) const {
79+
return _variant_subcolumn_fields.count(field_name) > 0;
80+
}
81+
6882
const std::vector<std::shared_ptr<lucene::index::IndexReader>>& readers() const {
6983
return _readers;
7084
}
@@ -94,6 +108,8 @@ class FieldReaderResolver {
94108
_data_type_with_names;
95109
const std::unordered_map<std::string, IndexIterator*>& _iterators;
96110
std::shared_ptr<IndexQueryContext> _context;
111+
std::vector<TSearchFieldBinding> _field_bindings;
112+
std::unordered_set<std::string> _variant_subcolumn_fields;
97113
std::unordered_map<std::string, FieldReaderBinding> _cache;
98114
std::vector<std::shared_ptr<lucene::index::IndexReader>> _readers;
99115
std::unordered_map<std::string, std::shared_ptr<lucene::index::IndexReader>> _binding_readers;

fe/fe-core/src/main/antlr4/org/apache/doris/nereids/search/SearchLexer.g4

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ NOT : 'NOT' | 'not' | '!' ;
4848
LPAREN : '(' ;
4949
RPAREN : ')' ;
5050
COLON : ':' ;
51+
DOT : '.' ; // Support for variant subcolumn access (e.g., field.subcolumn)
5152

5253
QUOTED : '"' QUOTED_CHAR* '"' ;
5354
TERM : TERM_START_CHAR TERM_CHAR* ;

fe/fe-core/src/main/antlr4/org/apache/doris/nereids/search/SearchParser.g4

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,11 @@ orClause : andClause (OR andClause)* ;
2525
andClause : notClause (AND notClause)* ;
2626
notClause : NOT atomClause | atomClause ;
2727
atomClause : LPAREN clause RPAREN | fieldQuery ;
28-
fieldQuery : fieldName COLON searchValue ;
29-
fieldName : TERM | QUOTED ;
28+
29+
// Support for variant subcolumn paths (e.g., field.subcolumn, field.sub1.sub2)
30+
fieldQuery : fieldPath COLON searchValue ;
31+
fieldPath : fieldSegment (DOT fieldSegment)* ;
32+
fieldSegment : TERM | QUOTED ;
3033

3134
searchValue
3235
: TERM

fe/fe-core/src/main/java/org/apache/doris/analysis/SearchPredicate.java

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,29 @@ private TSearchParam buildThriftParam() {
146146
for (int i = 0; i < qsPlan.fieldBindings.size(); i++) {
147147
SearchDslParser.QsFieldBinding binding = qsPlan.fieldBindings.get(i);
148148
TSearchFieldBinding thriftBinding = new TSearchFieldBinding();
149-
thriftBinding.setFieldName(binding.fieldName);
149+
150+
String fieldPath = binding.fieldName;
151+
thriftBinding.setFieldName(fieldPath);
152+
153+
// Check if this is a variant subcolumn (contains dot)
154+
if (fieldPath.contains(".")) {
155+
// Parse variant subcolumn path
156+
int firstDotPos = fieldPath.indexOf('.');
157+
String parentField = fieldPath.substring(0, firstDotPos);
158+
String subcolumnPath = fieldPath.substring(firstDotPos + 1);
159+
160+
thriftBinding.setIsVariantSubcolumn(true);
161+
thriftBinding.setParentFieldName(parentField);
162+
thriftBinding.setSubcolumnPath(subcolumnPath);
163+
164+
LOG.info("buildThriftParam: variant subcolumn field='{}', parent='{}', subcolumn='{}'",
165+
fieldPath, parentField, subcolumnPath);
166+
} else {
167+
thriftBinding.setIsVariantSubcolumn(false);
168+
}
169+
170+
// Set slot index - this is the index in the children array, not the slotId
171+
thriftBinding.setSlotIndex(i);
150172

151173
if (i < this.children.size() && this.children.get(i) instanceof SlotRef) {
152174
SlotRef slotRef = (SlotRef) this.children.get(i);

fe/fe-core/src/main/java/org/apache/doris/nereids/jobs/executor/Rewriter.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -799,7 +799,6 @@ public class Rewriter extends AbstractBatchJobExecutor {
799799
custom(RuleType.ADJUST_CONJUNCTS_RETURN_TYPE, AdjustConjunctsReturnType::new),
800800
bottomUp(
801801
new ExpressionRewrite(CheckLegalityAfterRewrite.INSTANCE),
802-
new RewriteSearchToSlots(),
803802
new CheckMatchExpression(),
804803
new CheckMultiDistinct(),
805804
new CheckRestorePartition(),
@@ -899,6 +898,12 @@ private static List<RewriteJob> getWholeTreeRewriteJobs(
899898
rewriteJobs.addAll(jobs(topic("split multi distinct",
900899
custom(RuleType.DISTINCT_AGG_STRATEGY_SELECTOR, () -> DistinctAggStrategySelector.INSTANCE))));
901900

901+
// Rewrite search function before VariantSubPathPruning
902+
// so that ElementAt expressions from search can be processed
903+
rewriteJobs.addAll(jobs(
904+
bottomUp(new RewriteSearchToSlots())
905+
));
906+
902907
if (needSubPathPushDown) {
903908
rewriteJobs.addAll(jobs(
904909
topic("variant element_at push down",

0 commit comments

Comments
 (0)