Skip to content

Commit 6d5d747

Browse files
authored
[9.0] Log stack traces on data nodes before they are cleared for transport (#125732) (#126245)
We recently cleared stack traces on data nodes before transport back to the coordinating node when error_trace=false to reduce unnecessary data transfer and memory on the coordinating node (#118266). However, all logging of exceptions happens on the coordinating node, so stack traces disappeared from any logs. This change logs stack traces directly on the data node when error_trace=false. (cherry picked from commit 9f6eb1d)
1 parent 4cd71d7 commit 6d5d747

File tree

7 files changed

+488
-6
lines changed

7 files changed

+488
-6
lines changed

docs/changelog/125732.yaml

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 125732
2+
summary: Log stack traces on data nodes before they are cleared for transport
3+
area: Search
4+
type: bug
5+
issues: []

qa/smoke-test-http/src/internalClusterTest/java/org/elasticsearch/http/SearchErrorTraceIT.java

+123
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,21 @@
1111

1212
import org.apache.http.entity.ContentType;
1313
import org.apache.http.nio.entity.NByteArrayEntity;
14+
import org.apache.logging.log4j.Level;
15+
import org.apache.logging.log4j.core.config.Configurator;
1416
import org.elasticsearch.ExceptionsHelper;
1517
import org.elasticsearch.action.search.MultiSearchRequest;
1618
import org.elasticsearch.action.search.SearchRequest;
1719
import org.elasticsearch.client.Request;
20+
import org.elasticsearch.search.ErrorTraceHelper;
21+
import org.elasticsearch.search.SearchService;
1822
import org.elasticsearch.search.builder.SearchSourceBuilder;
23+
import org.elasticsearch.test.MockLog;
1924
import org.elasticsearch.transport.TransportMessageListener;
2025
import org.elasticsearch.transport.TransportService;
2126
import org.elasticsearch.xcontent.XContentType;
2227
import org.junit.Before;
28+
import org.junit.BeforeClass;
2329

2430
import java.io.IOException;
2531
import java.nio.charset.Charset;
@@ -31,6 +37,11 @@
3137
public class SearchErrorTraceIT extends HttpSmokeTestCase {
3238
private AtomicBoolean hasStackTrace;
3339

40+
@BeforeClass
41+
public static void setDebugLogLevel() {
42+
Configurator.setLevel(SearchService.class, Level.DEBUG);
43+
}
44+
3445
@Before
3546
private void setupMessageListener() {
3647
internalCluster().getDataNodeInstances(TransportService.class).forEach(ts -> {
@@ -119,6 +130,63 @@ public void testSearchFailingQueryErrorTraceFalse() throws IOException {
119130
assertFalse(hasStackTrace.get());
120131
}
121132

133+
public void testDataNodeDoesNotLogStackTraceWhenErrorTraceTrue() throws IOException {
134+
hasStackTrace = new AtomicBoolean();
135+
setupIndexWithDocs();
136+
137+
Request searchRequest = new Request("POST", "/_search");
138+
searchRequest.setJsonEntity("""
139+
{
140+
"query": {
141+
"simple_query_string" : {
142+
"query": "foo",
143+
"fields": ["field"]
144+
}
145+
}
146+
}
147+
""");
148+
149+
String errorTriggeringIndex = "test2";
150+
int numShards = getNumShards(errorTriggeringIndex).numPrimaries;
151+
try (var mockLog = MockLog.capture(SearchService.class)) {
152+
ErrorTraceHelper.addUnseenLoggingExpectations(numShards, mockLog, errorTriggeringIndex);
153+
154+
searchRequest.addParameter("error_trace", "true");
155+
getRestClient().performRequest(searchRequest);
156+
mockLog.assertAllExpectationsMatched();
157+
}
158+
}
159+
160+
public void testDataNodeLogsStackTraceWhenErrorTraceFalseOrEmpty() throws IOException {
161+
hasStackTrace = new AtomicBoolean();
162+
setupIndexWithDocs();
163+
164+
Request searchRequest = new Request("POST", "/_search");
165+
searchRequest.setJsonEntity("""
166+
{
167+
"query": {
168+
"simple_query_string" : {
169+
"query": "foo",
170+
"fields": ["field"]
171+
}
172+
}
173+
}
174+
""");
175+
176+
String errorTriggeringIndex = "test2";
177+
int numShards = getNumShards(errorTriggeringIndex).numPrimaries;
178+
try (var mockLog = MockLog.capture(SearchService.class)) {
179+
ErrorTraceHelper.addSeenLoggingExpectations(numShards, mockLog, errorTriggeringIndex);
180+
181+
// error_trace defaults to false so we can test both cases with some randomization
182+
if (randomBoolean()) {
183+
searchRequest.addParameter("error_trace", "false");
184+
}
185+
getRestClient().performRequest(searchRequest);
186+
mockLog.assertAllExpectationsMatched();
187+
}
188+
}
189+
122190
public void testMultiSearchFailingQueryErrorTraceDefault() throws IOException {
123191
hasStackTrace = new AtomicBoolean();
124192
setupIndexWithDocs();
@@ -172,4 +240,59 @@ public void testMultiSearchFailingQueryErrorTraceFalse() throws IOException {
172240

173241
assertFalse(hasStackTrace.get());
174242
}
243+
244+
public void testDataNodeDoesNotLogStackTraceWhenErrorTraceTrueMultiSearch() throws IOException {
245+
hasStackTrace = new AtomicBoolean();
246+
setupIndexWithDocs();
247+
248+
XContentType contentType = XContentType.JSON;
249+
MultiSearchRequest multiSearchRequest = new MultiSearchRequest().add(
250+
new SearchRequest("test*").source(new SearchSourceBuilder().query(simpleQueryStringQuery("foo").field("field")))
251+
);
252+
Request searchRequest = new Request("POST", "/_msearch");
253+
byte[] requestBody = MultiSearchRequest.writeMultiLineFormat(multiSearchRequest, contentType.xContent());
254+
searchRequest.setEntity(
255+
new NByteArrayEntity(requestBody, ContentType.create(contentType.mediaTypeWithoutParameters(), (Charset) null))
256+
);
257+
258+
searchRequest.addParameter("error_trace", "true");
259+
260+
String errorTriggeringIndex = "test2";
261+
int numShards = getNumShards(errorTriggeringIndex).numPrimaries;
262+
try (var mockLog = MockLog.capture(SearchService.class)) {
263+
ErrorTraceHelper.addUnseenLoggingExpectations(numShards, mockLog, errorTriggeringIndex);
264+
265+
getRestClient().performRequest(searchRequest);
266+
mockLog.assertAllExpectationsMatched();
267+
}
268+
}
269+
270+
public void testDataNodeLogsStackTraceWhenErrorTraceFalseOrEmptyMultiSearch() throws IOException {
271+
hasStackTrace = new AtomicBoolean();
272+
setupIndexWithDocs();
273+
274+
XContentType contentType = XContentType.JSON;
275+
MultiSearchRequest multiSearchRequest = new MultiSearchRequest().add(
276+
new SearchRequest("test*").source(new SearchSourceBuilder().query(simpleQueryStringQuery("foo").field("field")))
277+
);
278+
Request searchRequest = new Request("POST", "/_msearch");
279+
byte[] requestBody = MultiSearchRequest.writeMultiLineFormat(multiSearchRequest, contentType.xContent());
280+
searchRequest.setEntity(
281+
new NByteArrayEntity(requestBody, ContentType.create(contentType.mediaTypeWithoutParameters(), (Charset) null))
282+
);
283+
284+
// error_trace defaults to false so we can test both cases with some randomization
285+
if (randomBoolean()) {
286+
searchRequest.addParameter("error_trace", "false");
287+
}
288+
289+
String errorTriggeringIndex = "test2";
290+
int numShards = getNumShards(errorTriggeringIndex).numPrimaries;
291+
try (var mockLog = MockLog.capture(SearchService.class)) {
292+
ErrorTraceHelper.addSeenLoggingExpectations(numShards, mockLog, errorTriggeringIndex);
293+
294+
getRestClient().performRequest(searchRequest);
295+
mockLog.assertAllExpectationsMatched();
296+
}
297+
}
175298
}

server/src/main/java/org/elasticsearch/search/SearchService.java

+59-5
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@
155155
import java.util.function.Supplier;
156156

157157
import static org.elasticsearch.TransportVersions.ERROR_TRACE_IN_TRANSPORT_HEADER;
158+
import static org.elasticsearch.common.Strings.format;
158159
import static org.elasticsearch.core.TimeValue.timeValueHours;
159160
import static org.elasticsearch.core.TimeValue.timeValueMillis;
160161
import static org.elasticsearch.core.TimeValue.timeValueMinutes;
@@ -519,12 +520,18 @@ protected void doClose() {
519520
* @param <T> the type of the response
520521
* @param listener the action listener to be wrapped
521522
* @param version channel version of the request
523+
* @param nodeId id of the current node
524+
* @param shardId id of the shard being searched
525+
* @param taskId id of the task being executed
522526
* @param threadPool with context where to write the new header
523527
* @return the wrapped action listener
524528
*/
525529
static <T> ActionListener<T> maybeWrapListenerForStackTrace(
526530
ActionListener<T> listener,
527531
TransportVersion version,
532+
String nodeId,
533+
ShardId shardId,
534+
long taskId,
528535
ThreadPool threadPool
529536
) {
530537
boolean header = true;
@@ -533,6 +540,18 @@ static <T> ActionListener<T> maybeWrapListenerForStackTrace(
533540
}
534541
if (header == false) {
535542
return listener.delegateResponse((l, e) -> {
543+
org.apache.logging.log4j.util.Supplier<String> messageSupplier = () -> format(
544+
"[%s]%s: failed to execute search request for task [%d]",
545+
nodeId,
546+
shardId,
547+
taskId
548+
);
549+
// Keep this logic aligned with that of SUPPRESSED_ERROR_LOGGER in RestResponse
550+
if (ExceptionsHelper.status(e).getStatus() < 500 || ExceptionsHelper.isNodeOrShardUnavailableTypeException(e)) {
551+
logger.debug(messageSupplier, e);
552+
} else {
553+
logger.warn(messageSupplier, e);
554+
}
536555
ExceptionsHelper.unwrapCausesAndSuppressed(e, err -> {
537556
err.setStackTrace(EMPTY_STACK_TRACE_ARRAY);
538557
return false;
@@ -544,7 +563,14 @@ static <T> ActionListener<T> maybeWrapListenerForStackTrace(
544563
}
545564

546565
public void executeDfsPhase(ShardSearchRequest request, SearchShardTask task, ActionListener<SearchPhaseResult> listener) {
547-
listener = maybeWrapListenerForStackTrace(listener, request.getChannelVersion(), threadPool);
566+
listener = maybeWrapListenerForStackTrace(
567+
listener,
568+
request.getChannelVersion(),
569+
clusterService.localNode().getId(),
570+
request.shardId(),
571+
task.getId(),
572+
threadPool
573+
);
548574
final IndexShard shard = getShard(request);
549575
rewriteAndFetchShardRequest(shard, request, listener.delegateFailure((l, rewritten) -> {
550576
// fork the execution in the search thread pool
@@ -582,7 +608,14 @@ private void loadOrExecuteQueryPhase(final ShardSearchRequest request, final Sea
582608
}
583609

584610
public void executeQueryPhase(ShardSearchRequest request, SearchShardTask task, ActionListener<SearchPhaseResult> listener) {
585-
ActionListener<SearchPhaseResult> finalListener = maybeWrapListenerForStackTrace(listener, request.getChannelVersion(), threadPool);
611+
ActionListener<SearchPhaseResult> finalListener = maybeWrapListenerForStackTrace(
612+
listener,
613+
request.getChannelVersion(),
614+
clusterService.localNode().getId(),
615+
request.shardId(),
616+
task.getId(),
617+
threadPool
618+
);
586619
assert request.canReturnNullResponseIfMatchNoDocs() == false || request.numberOfShards() > 1
587620
: "empty responses require more than one shard";
588621
final IndexShard shard = getShard(request);
@@ -775,9 +808,16 @@ private SearchPhaseResult executeQueryPhase(ShardSearchRequest request, SearchSh
775808
}
776809

777810
public void executeRankFeaturePhase(RankFeatureShardRequest request, SearchShardTask task, ActionListener<RankFeatureResult> listener) {
778-
listener = maybeWrapListenerForStackTrace(listener, request.getShardSearchRequest().getChannelVersion(), threadPool);
779811
final ReaderContext readerContext = findReaderContext(request.contextId(), request);
780812
final ShardSearchRequest shardSearchRequest = readerContext.getShardSearchRequest(request.getShardSearchRequest());
813+
listener = maybeWrapListenerForStackTrace(
814+
listener,
815+
shardSearchRequest.getChannelVersion(),
816+
clusterService.localNode().getId(),
817+
shardSearchRequest.shardId(),
818+
task.getId(),
819+
threadPool
820+
);
781821
final Releasable markAsUsed = readerContext.markAsUsed(getKeepAlive(shardSearchRequest));
782822
runAsync(getExecutor(readerContext.indexShard()), () -> {
783823
try (SearchContext searchContext = createContext(readerContext, shardSearchRequest, task, ResultsType.RANK_FEATURE, false)) {
@@ -822,8 +862,15 @@ public void executeQueryPhase(
822862
ActionListener<ScrollQuerySearchResult> listener,
823863
TransportVersion version
824864
) {
825-
listener = maybeWrapListenerForStackTrace(listener, version, threadPool);
826865
final LegacyReaderContext readerContext = (LegacyReaderContext) findReaderContext(request.contextId(), request);
866+
listener = maybeWrapListenerForStackTrace(
867+
listener,
868+
version,
869+
clusterService.localNode().getId(),
870+
readerContext.indexShard().shardId(),
871+
task.getId(),
872+
threadPool
873+
);
827874
final Releasable markAsUsed;
828875
try {
829876
markAsUsed = readerContext.markAsUsed(getScrollKeepAlive(request.scroll()));
@@ -864,9 +911,16 @@ public void executeQueryPhase(
864911
ActionListener<QuerySearchResult> listener,
865912
TransportVersion version
866913
) {
867-
listener = maybeWrapListenerForStackTrace(listener, version, threadPool);
868914
final ReaderContext readerContext = findReaderContext(request.contextId(), request.shardSearchRequest());
869915
final ShardSearchRequest shardSearchRequest = readerContext.getShardSearchRequest(request.shardSearchRequest());
916+
listener = maybeWrapListenerForStackTrace(
917+
listener,
918+
version,
919+
clusterService.localNode().getId(),
920+
shardSearchRequest.shardId(),
921+
task.getId(),
922+
threadPool
923+
);
870924
final Releasable markAsUsed = readerContext.markAsUsed(getKeepAlive(shardSearchRequest));
871925
rewriteAndFetchShardRequest(readerContext.indexShard(), shardSearchRequest, listener.delegateFailure((l, rewritten) -> {
872926
// fork the execution in the search thread pool

0 commit comments

Comments
 (0)