Skip to content

Commit 0df9d1c

Browse files
authored
Text field block loader properly handles null values from delegate (#127525)
1 parent bd3e65e commit 0df9d1c

File tree

4 files changed

+70
-27
lines changed

4 files changed

+70
-27
lines changed

docs/reference/elasticsearch/mapping-reference/keyword.md

+36
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,42 @@ Will become:
232232
}
233233
```
234234

235+
If `null_value` is configured, `null` values are replaced with the `null_value` in synthetic source:
236+
237+
$$$synthetic-source-keyword-example-null-value$$$
238+
239+
```console
240+
PUT idx
241+
{
242+
"settings": {
243+
"index": {
244+
"mapping": {
245+
"source": {
246+
"mode": "synthetic"
247+
}
248+
}
249+
}
250+
},
251+
"mappings": {
252+
"properties": {
253+
"kwd": { "type": "keyword", "null_value": "NA" }
254+
}
255+
}
256+
}
257+
PUT idx/_doc/1
258+
{
259+
"kwd": ["foo", null, "bar"]
260+
}
261+
```
262+
263+
Will become:
264+
265+
```console-result
266+
{
267+
"kwd": ["NA", "bar", "foo"]
268+
}
269+
```
270+
235271

236272
## Constant keyword field type [constant-keyword-field-type]
237273

docs/reference/elasticsearch/mapping-reference/text.md

+26-13
Original file line numberDiff line numberDiff line change
@@ -104,11 +104,17 @@ Synthetic `_source` is Generally Available only for TSDB indices (indices that h
104104
::::
105105

106106

107-
`text` fields support [synthetic `_source`](/reference/elasticsearch/mapping-reference/mapping-source-field.md#synthetic-source) if they have a [`keyword`](/reference/elasticsearch/mapping-reference/keyword.md#keyword-synthetic-source) sub-field that supports synthetic `_source` or if the `text` field sets `store` to `true`. Either way, it may not have [`copy_to`](/reference/elasticsearch/mapping-reference/copy-to.md).
107+
`text` fields can use a [`keyword`](/reference/elasticsearch/mapping-reference/keyword.md#keyword-synthetic-source) sub-field to support [synthetic `_source`](/reference/elasticsearch/mapping-reference/mapping-source-field.md#synthetic-source) without storing values of the text field itself.
108108

109-
If using a sub-`keyword` field, then the values are sorted in the same way as a `keyword` field’s values are sorted. By default, that means sorted with duplicates removed. So:
109+
In this case, the synthetic source of the `text` field will have the same [modifications](/reference/elasticsearch/mapping-reference/mapping-source-field.md#synthetic-source) as a `keyword` field.
110110

111-
$$$synthetic-source-text-example-default$$$
111+
These modifications can impact usage of `text` fields:
112+
* Reordering text fields can have an effect on [phrase](/reference/query-languages/query-dsl/query-dsl-match-query-phrase.md) and [span](/reference/query-languages/query-dsl/span-queries.md) queries. See the discussion about [`position_increment_gap`](/reference/elasticsearch/mapping-reference/position-increment-gap.md) for more details. You can avoid this by making sure the `slop` parameter on the phrase queries is lower than the `position_increment_gap`. This is the default.
113+
* Handling of `null` values is different. `text` fields ignore `null` values, but `keyword` fields support replacing `null` values with a value specified in the `null_value` parameter. This replacement is represented in synthetic source.
114+
115+
For example:
116+
117+
$$$synthetic-source-text-example-multi-field$$$
112118

113119
```console
114120
PUT idx
@@ -127,8 +133,9 @@ PUT idx
127133
"text": {
128134
"type": "text",
129135
"fields": {
130-
"raw": {
131-
"type": "keyword"
136+
"kwd": {
137+
"type": "keyword",
138+
"null_value": "NA"
132139
}
133140
}
134141
}
@@ -138,9 +145,10 @@ PUT idx
138145
PUT idx/_doc/1
139146
{
140147
"text": [
148+
null,
141149
"the quick brown fox",
142150
"the quick brown fox",
143-
"jumped over the lazy dog"
151+
"jumped over the lazy dog"
144152
]
145153
}
146154
```
@@ -150,18 +158,15 @@ Will become:
150158
```console-result
151159
{
152160
"text": [
153-
"jumped over the lazy dog",
161+
"NA",
162+
"jumped over the lazy dog",
154163
"the quick brown fox"
155164
]
156165
}
157166
```
158167

159-
::::{note}
160-
Reordering text fields can have an effect on [phrase](/reference/query-languages/query-dsl/query-dsl-match-query-phrase.md) and [span](/reference/query-languages/query-dsl/span-queries.md) queries. See the discussion about [`position_increment_gap`](/reference/elasticsearch/mapping-reference/position-increment-gap.md) for more detail. You can avoid this by making sure the `slop` parameter on the phrase queries is lower than the `position_increment_gap`. This is the default.
161-
::::
162168

163-
164-
If the `text` field sets `store` to true then order and duplicates are preserved.
169+
If the `text` field sets `store` to `true` then the sub-field is not used and no modifications are applied. For example:
165170

166171
$$$synthetic-source-text-example-stored$$$
167172

@@ -179,7 +184,15 @@ PUT idx
179184
},
180185
"mappings": {
181186
"properties": {
182-
"text": { "type": "text", "store": true }
187+
"text": {
188+
"type": "text",
189+
"store": true,
190+
"fields": {
191+
"raw": {
192+
"type": "keyword"
193+
}
194+
}
195+
}
183196
}
184197
}
185198
}

server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java

+7
Original file line numberDiff line numberDiff line change
@@ -1087,6 +1087,13 @@ public Builder builder(BlockFactory factory, int expectedCount) {
10871087
* using whatever
10881088
*/
10891089
private BlockSourceReader.LeafIteratorLookup blockReaderDisiLookup(BlockLoaderContext blContext) {
1090+
if (isSyntheticSource && syntheticSourceDelegate != null) {
1091+
// Since we are using synthetic source and a delegate, we can't use this field
1092+
// to determine if the delegate has values in the document (f.e. handling of `null` is different
1093+
// between text and keyword).
1094+
return BlockSourceReader.lookupMatchingAll();
1095+
}
1096+
10901097
if (isIndexed()) {
10911098
if (getTextSearchInfo().hasNorms()) {
10921099
return BlockSourceReader.lookupFromNorms(name());

server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldBlockLoaderTests.java

+1-14
Original file line numberDiff line numberDiff line change
@@ -62,15 +62,8 @@ public static Object expectedValue(Map<String, Object> fieldMapping, Object valu
6262
if (params.syntheticSource() && testContext.forceFallbackSyntheticSource() == false && usingSyntheticSourceDelegate) {
6363
var nullValue = (String) keywordMultiFieldMapping.get("null_value");
6464

65-
// Due to how TextFieldMapper#blockReaderDisiLookup works this is complicated.
66-
// If we are using lookupMatchingAll() then we'll see all docs, generate synthetic source using syntheticSourceDelegate,
67-
// parse it and see null_value inside.
68-
// But if we are using lookupFromNorms() we will skip the document (since the text field itself does not exist).
69-
// Same goes for lookupFromFieldNames().
70-
boolean textFieldIndexed = (boolean) fieldMapping.getOrDefault("index", true);
71-
7265
if (value == null) {
73-
if (textFieldIndexed == false && nullValue != null && nullValue.length() <= (int) ignoreAbove) {
66+
if (nullValue != null && nullValue.length() <= (int) ignoreAbove) {
7467
return new BytesRef(nullValue);
7568
}
7669

@@ -82,12 +75,6 @@ public static Object expectedValue(Map<String, Object> fieldMapping, Object valu
8275
}
8376

8477
var values = (List<String>) value;
85-
86-
// See note above about TextFieldMapper#blockReaderDisiLookup.
87-
if (textFieldIndexed && values.stream().allMatch(Objects::isNull)) {
88-
return null;
89-
}
90-
9178
var indexed = values.stream()
9279
.map(s -> s == null ? nullValue : s)
9380
.filter(Objects::nonNull)

0 commit comments

Comments
 (0)