-
Notifications
You must be signed in to change notification settings - Fork 25.3k
Use FallbackSyntheticSourceBlockLoader for text fields #126237
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3576d9a
3b1f671
197c3f7
2cb9b13
e1a3663
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
pr: 126237 | ||
summary: Use `FallbackSyntheticSourceBlockLoader` for text fields | ||
area: Mapping | ||
type: enhancement | ||
issues: [] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
/* | ||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
* or more contributor license agreements. Licensed under the "Elastic License | ||
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side | ||
* Public License v 1"; you may not use this file except in compliance with, at | ||
* your election, the "Elastic License 2.0", the "GNU Affero General Public | ||
* License v3.0 only", or the "Server Side Public License, v 1". | ||
*/ | ||
|
||
package org.elasticsearch.index.mapper.blockloader; | ||
|
||
import org.apache.lucene.util.BytesRef; | ||
import org.elasticsearch.index.mapper.BlockLoaderTestCase; | ||
import org.elasticsearch.logsdb.datageneration.FieldType; | ||
|
||
import java.util.ArrayList; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Objects; | ||
import java.util.stream.Collectors; | ||
|
||
public class TextFieldBlockLoaderTests extends BlockLoaderTestCase { | ||
public TextFieldBlockLoaderTests(Params params) { | ||
super(FieldType.TEXT.toString(), params); | ||
} | ||
|
||
@SuppressWarnings("unchecked") | ||
@Override | ||
protected Object expected(Map<String, Object> fieldMapping, Object value, TestContext testContext) { | ||
if (fieldMapping.getOrDefault("store", false).equals(true)) { | ||
return valuesInSourceOrder(value); | ||
} | ||
|
||
var fields = (Map<String, Object>) fieldMapping.get("fields"); | ||
if (fields != null) { | ||
var keywordMultiFieldMapping = (Map<String, Object>) fields.get("kwd"); | ||
boolean docValues = hasDocValues(keywordMultiFieldMapping, true); | ||
boolean index = keywordMultiFieldMapping.getOrDefault("index", true).equals(true); | ||
boolean store = keywordMultiFieldMapping.getOrDefault("store", false).equals(true); | ||
Object ignoreAbove = keywordMultiFieldMapping.get("ignore_above"); | ||
|
||
// See TextFieldMapper.SyntheticSourceHelper#usingSyntheticSourceDelegate | ||
// and TextFieldMapper#canUseSyntheticSourceDelegateForLoading(). | ||
boolean usingSyntheticSourceDelegate = docValues || store; | ||
boolean canUseSyntheticSourceDelegateForLoading = usingSyntheticSourceDelegate && ignoreAbove == null && (index || store); | ||
if (canUseSyntheticSourceDelegateForLoading) { | ||
return KeywordFieldBlockLoaderTests.expectedValue(keywordMultiFieldMapping, value, params, testContext); | ||
} | ||
|
||
// Even if multi-field is not eligible for loading it can still be used to produce synthetic source | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This whole thing is due to the fact that not all |
||
// and then we load from the synthetic source. | ||
// Synthetic source is actually different from keyword field block loader results | ||
// because synthetic source includes values exceeding ignore_above and block loader doesn't. | ||
// TODO ideally this logic should be in some kind of KeywordFieldSyntheticSourceTest that uses same infra as | ||
// KeywordFieldBlockLoaderTest | ||
// It is here since KeywordFieldBlockLoaderTest does not really need it | ||
if (params.syntheticSource() && testContext.forceFallbackSyntheticSource() == false && usingSyntheticSourceDelegate) { | ||
var nullValue = (String) keywordMultiFieldMapping.get("null_value"); | ||
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If |
||
|
||
// Due to how TextFieldMapper#blockReaderDisiLookup works this is complicated. | ||
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another inconsistency - block loader returns a |
||
// If we are using lookupMatchingAll() then we'll see all docs, generate synthetic source using syntheticSourceDelegate, | ||
// parse it and see null_value inside. | ||
// But if we are using lookupFromNorms() we will skip the document (since the text field itself does not exist). | ||
// Same goes for lookupFromFieldNames(). | ||
boolean textFieldIndexed = (boolean) fieldMapping.getOrDefault("index", true); | ||
|
||
if (value == null) { | ||
if (textFieldIndexed == false | ||
&& nullValue != null | ||
&& (ignoreAbove == null || nullValue.length() <= (int) ignoreAbove)) { | ||
return new BytesRef(nullValue); | ||
} | ||
|
||
return null; | ||
} | ||
|
||
if (value instanceof String s) { | ||
return new BytesRef(s); | ||
} | ||
|
||
var values = (List<String>) value; | ||
|
||
// See note above about TextFieldMapper#blockReaderDisiLookup. | ||
if (textFieldIndexed && values.stream().allMatch(Objects::isNull)) { | ||
return null; | ||
} | ||
|
||
var indexed = values.stream() | ||
.map(s -> s == null ? nullValue : s) | ||
.filter(Objects::nonNull) | ||
.filter(s -> ignoreAbove == null || s.length() <= (int) ignoreAbove) | ||
.map(BytesRef::new) | ||
.collect(Collectors.toList()); | ||
|
||
if (store == false) { | ||
// using doc_values for synthetic source | ||
indexed = new ArrayList<>(new HashSet<>(indexed)); | ||
indexed.sort(BytesRef::compareTo); | ||
} | ||
|
||
// ignored values always come last | ||
List<BytesRef> ignored = ignoreAbove == null | ||
? List.of() | ||
: values.stream() | ||
.map(s -> s == null ? nullValue : s) | ||
.filter(Objects::nonNull) | ||
.filter(s -> s.length() > (int) ignoreAbove) | ||
.map(BytesRef::new) | ||
.toList(); | ||
|
||
indexed.addAll(ignored); | ||
|
||
return maybeFoldList(indexed); | ||
} | ||
} | ||
|
||
// Loading from _ignored_source or stored _source | ||
return valuesInSourceOrder(value); | ||
} | ||
|
||
@SuppressWarnings("unchecked") | ||
private Object valuesInSourceOrder(Object value) { | ||
if (value == null) { | ||
return null; | ||
} | ||
|
||
if (value instanceof String s) { | ||
return new BytesRef(s); | ||
} | ||
|
||
var resultList = ((List<String>) value).stream().filter(Objects::nonNull).map(BytesRef::new).toList(); | ||
return maybeFoldList(resultList); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
/* | ||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
* or more contributor license agreements. Licensed under the "Elastic License | ||
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side | ||
* Public License v 1"; you may not use this file except in compliance with, at | ||
* your election, the "Elastic License 2.0", the "GNU Affero General Public | ||
* License v3.0 only", or the "Server Side Public License, v 1". | ||
*/ | ||
|
||
package org.elasticsearch.logsdb.datageneration.fields.leaf; | ||
|
||
import org.elasticsearch.logsdb.datageneration.FieldDataGenerator; | ||
import org.elasticsearch.logsdb.datageneration.datasource.DataSource; | ||
import org.elasticsearch.logsdb.datageneration.datasource.DataSourceRequest; | ||
|
||
import java.util.Map; | ||
import java.util.function.Supplier; | ||
|
||
public class TextFieldDataGenerator implements FieldDataGenerator { | ||
private final Supplier<Object> valueGenerator; | ||
|
||
public TextFieldDataGenerator(DataSource dataSource) { | ||
var strings = dataSource.get(new DataSourceRequest.StringGenerator()); | ||
var nulls = dataSource.get(new DataSourceRequest.NullWrapper()); | ||
var arrays = dataSource.get(new DataSourceRequest.ArrayWrapper()); | ||
|
||
this.valueGenerator = arrays.wrapper().compose(nulls.wrapper()).apply(() -> strings.generator().get()); | ||
} | ||
|
||
@Override | ||
public Object generateValue(Map<String, Object> fieldMapping) { | ||
return valueGenerator.get(); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't really understand why do we require multi field to be indexed here (the actual impl is in
TextFieldMapper
).There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, that is strange (only doc values / stored should matter). What failure occurs when we don't check for
index
here?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you'll get different results sometimes, i can't remember now.