Commit b017c7b

cijothomas and lalitb authored
Fix PeriodicReader panic due to timeout (#2586)
Co-authored-by: Lalit Kumar Bhasin <[email protected]>
1 parent 5e6b3d6 commit b017c7b

2 files changed: +65 −24 lines
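The panic this commit fixes comes from the .unwrap() calls on response_sender.send(...) in the reader's background thread: when the caller of flush or shutdown gives up after its timeout and drops the receiving half of the channel, send() returns Err, and unwrapping that error panics the thread. The sketch below reproduces the failure mode and the non-panicking handling the diff switches to; it uses plain std::sync::mpsc with illustrative names rather than the SDK's actual internals.

    use std::sync::mpsc;
    use std::thread;
    use std::time::Duration;

    fn main() {
        let (response_sender, response_receiver) = mpsc::channel::<bool>();

        // Background worker standing in for the PeriodicReader thread.
        let worker = thread::spawn(move || {
            // Simulate an export that outlives the caller's patience.
            thread::sleep(Duration::from_millis(200));

            // Old behavior: response_sender.send(true).unwrap() panics here,
            // because the receiver below has already been dropped.
            // New behavior: check the result and log instead of panicking.
            if response_sender.send(true).is_err() {
                eprintln!("caller gave up waiting; result could not be delivered");
            }
        });

        // Caller standing in for flush()/shutdown(): wait briefly, then give up
        // and drop the receiver, exactly like a timed-out flush would.
        match response_receiver.recv_timeout(Duration::from_millis(50)) {
            Ok(success) => println!("worker reported: {success}"),
            Err(_) => println!("timed out waiting for the worker"),
        }
        drop(response_receiver);

        worker.join().expect("worker thread must not panic");
    }

With the .unwrap() variant in place of the is_err() check, the worker thread aborts with a SendError panic, which is the PeriodicReader behavior this commit removes.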

opentelemetry-sdk/src/metrics/periodic_reader.rs

Lines changed: 63 additions & 23 deletions
@@ -200,10 +200,36 @@ impl PeriodicReader {
                         otel_debug!(
                             name: "PeriodReaderThreadExportingDueToFlush"
                         );
-                        if let Err(_e) = cloned_reader.collect_and_export(timeout) {
-                            response_sender.send(false).unwrap();
-                        } else {
-                            response_sender.send(true).unwrap();
+
+                        let export_result = cloned_reader.collect_and_export(timeout);
+                        otel_debug!(
+                            name: "PeriodReaderInvokedExport",
+                            export_result = format!("{:?}", export_result)
+                        );
+
+                        // If response_sender is disconnected, we can't send
+                        // the result back. This occurs when the thread that
+                        // initiated flush gave up due to timeout.
+                        // Gracefully handle that with internal logs. The
+                        // internal errors are of Info level, as this is
+                        // useful for user to know whether the flush was
+                        // successful or not, when flush() itself merely
+                        // tells that it timed out.
+
+                        if export_result.is_err() {
+                            if response_sender.send(false).is_err() {
+                                otel_info!(
+                                    name: "PeriodReader.Flush.ResponseSendError",
+                                    message = "PeriodicReader's flush has failed, but unable to send this info back to caller.
+                                    This occurs when the caller has timed out waiting for the response. If you see this occuring frequently, consider increasing the flush timeout."
+                                );
+                            }
+                        } else if response_sender.send(true).is_err() {
+                            otel_info!(
+                                name: "PeriodReader.Flush.ResponseSendError",
+                                message = "PeriodicReader's flush has completed successfully, but unable to send this info back to caller.
+                                This occurs when the caller has timed out waiting for the response. If you see this occuring frequently, consider increasing the flush timeout."
+                            );
                         }
 
                         // Adjust the remaining interval after the flush
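The comment block in the hunk above refers to a caller that "gave up due to timeout". As a rough illustration of how that happens, the sketch below shows a flush call built around a response channel and recv_timeout; Message and flush_with_timeout are assumed names for this example, not the SDK's API. On the timeout branch the receiver is dropped, which is what later makes the worker's send() return Err.

    use std::sync::mpsc;
    use std::time::Duration;

    // Messages understood by the background thread (assumed shape for this sketch).
    enum Message {
        Flush(mpsc::Sender<bool>),
    }

    // Assumed caller-side helper: ask the background thread to flush and wait at
    // most `timeout` for the answer. On timeout the response receiver is dropped,
    // so a later send() from the worker fails instead of being received.
    fn flush_with_timeout(worker_tx: &mpsc::Sender<Message>, timeout: Duration) -> Result<(), String> {
        let (response_tx, response_rx) = mpsc::channel();
        worker_tx
            .send(Message::Flush(response_tx))
            .map_err(|_| "reader thread is gone".to_string())?;

        match response_rx.recv_timeout(timeout) {
            Ok(true) => Ok(()),
            Ok(false) => Err("export failed".to_string()),
            Err(_) => Err("flush timed out".to_string()),
        }
    }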
@@ -232,15 +258,39 @@ impl PeriodicReader {
                         // Perform final export and break out of loop and exit the thread
                         otel_debug!(name: "PeriodReaderThreadExportingDueToShutdown");
                         let export_result = cloned_reader.collect_and_export(timeout);
+                        otel_debug!(
+                            name: "PeriodReaderInvokedExport",
+                            export_result = format!("{:?}", export_result)
+                        );
                         let shutdown_result = exporter_arc.shutdown();
                         otel_debug!(
                             name: "PeriodReaderInvokedExporterShutdown",
                             shutdown_result = format!("{:?}", shutdown_result)
                         );
+
+                        // If response_sender is disconnected, we can't send
+                        // the result back. This occurs when the thread that
+                        // initiated shutdown gave up due to timeout.
+                        // Gracefully handle that with internal logs and
+                        // continue with shutdown (i.e exit thread) The
+                        // internal errors are of Info level, as this is
+                        // useful for user to know whether the shutdown was
+                        // successful or not, when shutdown() itself merely
+                        // tells that it timed out.
                         if export_result.is_err() || shutdown_result.is_err() {
-                            response_sender.send(false).unwrap();
-                        } else {
-                            response_sender.send(true).unwrap();
+                            if response_sender.send(false).is_err() {
+                                otel_info!(
+                                    name: "PeriodReaderThreadShutdown.ResponseSendError",
+                                    message = "PeriodicReader's shutdown has failed, but unable to send this info back to caller.
+                                    This occurs when the caller has timed out waiting for the response. If you see this occuring frequently, consider increasing the shutdown timeout."
+                                );
+                            }
+                        } else if response_sender.send(true).is_err() {
+                            otel_info!(
+                                name: "PeriodReaderThreadShutdown.ResponseSendError",
+                                message = "PeriodicReader completed its shutdown, but unable to send this info back to caller.
+                                This occurs when the caller has timed out waiting for the response. If you see this occuring frequently, consider increasing the shutdown timeout."
+                            );
                         }
 
                         otel_debug!(
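The shutdown branch follows the same pattern, with the extra point made in its comment: the background thread still completes shutdown and exits even when nobody is listening for the result anymore. A compressed sketch of that control flow, with simplified result types standing in for the SDK's own error types:

    use std::sync::mpsc::Sender;

    // Export once more, shut the exporter down, try to report the combined
    // outcome, and keep going even if the caller stopped listening, so the
    // background thread can still exit cleanly.
    fn handle_shutdown(
        export_result: Result<(), String>,
        shutdown_result: Result<(), String>,
        response_sender: &Sender<bool>,
    ) {
        let success = export_result.is_ok() && shutdown_result.is_ok();
        if response_sender.send(success).is_err() {
            // Caller timed out and dropped the receiver; log and continue.
            eprintln!("shutdown outcome {success} could not be delivered to the caller");
        }
        // The real loop breaks out here; the background thread ends either way.
    }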
@@ -255,11 +305,11 @@ impl PeriodicReader {
                             name: "PeriodReaderThreadExportingDueToTimer"
                         );
 
-                        if let Err(_e) = cloned_reader.collect_and_export(timeout) {
-                            otel_debug!(
-                                name: "PeriodReaderThreadExportingDueToTimerFailed"
-                            );
-                        }
+                        let export_result = cloned_reader.collect_and_export(timeout);
+                        otel_debug!(
+                            name: "PeriodReaderInvokedExport",
+                            export_result = format!("{:?}", export_result)
+                        );
 
                         let time_taken_for_export = export_start.elapsed();
                         if time_taken_for_export > interval {
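The timer-driven branch now records the export result the same way, and the surrounding context lines show the loop measuring how long the export took against the configured interval. The sketch below illustrates one plausible way that check can feed the next sleep; the hunk does not show what the real loop does on overrun, so the immediate re-export here is an assumption, and next_sleep is an illustrative name.

    use std::time::{Duration, Instant};

    // Illustrative only: compute how long to sleep before the next export,
    // given when the current export started and the configured interval.
    fn next_sleep(export_start: Instant, interval: Duration) -> Duration {
        let time_taken_for_export = export_start.elapsed();
        if time_taken_for_export > interval {
            // The export took longer than the interval: fire the next one
            // immediately rather than trying to sleep a negative amount.
            Duration::ZERO
        } else {
            interval - time_taken_for_export
        }
    }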
@@ -390,17 +440,7 @@ impl PeriodicReaderInner {
 
         // Relying on futures executor to execute async call.
         // TODO: Pass timeout to exporter
-        let exporter_result = futures_executor::block_on(self.exporter.export(&mut rm));
-        #[allow(clippy::question_mark)]
-        if let Err(e) = exporter_result {
-            otel_warn!(
-                name: "PeriodReaderExportError",
-                error = format!("{:?}", e)
-            );
-            return Err(e);
-        }
-
-        Ok(())
+        futures_executor::block_on(self.exporter.export(&mut rm))
     }
 
     fn force_flush(&self) -> MetricResult<()> {
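The final hunk in this file simplifies collect_and_export: rather than logging a warning and re-returning the error (which needed the clippy::question_mark allow), it returns the exporter's result directly, and the call sites above now record that result through the PeriodReaderInvokedExport debug events. Reduced to a stub, the new shape looks like this; export_stub and the String error type are placeholders for the exporter call and the SDK's MetricResult.

    use futures_executor::block_on;

    // Placeholder async call standing in for self.exporter.export(&mut rm).
    async fn export_stub() -> Result<(), String> {
        Ok(())
    }

    // block_on(...) yields the future's Result, and the function returns it
    // as-is; no intermediate logging or early return is needed here anymore.
    fn collect_and_export_stub() -> Result<(), String> {
        block_on(export_stub())
    }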

opentelemetry-sdk/src/trace/provider.rs

Lines changed: 2 additions & 1 deletion
@@ -134,7 +134,8 @@ impl Drop for TracerProviderInner {
             let _ = self.shutdown(); // errors are handled within shutdown
         } else {
             otel_debug!(
-                name: "TracerProvider.Drop.AlreadyShutdown"
+                name: "TracerProvider.Drop.AlreadyShutdown",
+                message = "TracerProvider was already shut down; drop will not attempt shutdown again."
             );
         }
     }
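This change only adds a message to an existing debug event, but the branch it lives in is the common drop-guard pattern: shut down on drop unless an explicit shutdown already ran, and otherwise just note that fact. A generic sketch of that pattern under assumed names; ProviderInner and its is_shutdown flag are illustrative, not the SDK's actual fields.

    use std::sync::atomic::{AtomicBool, Ordering};

    struct ProviderInner {
        is_shutdown: AtomicBool,
    }

    impl ProviderInner {
        fn shutdown(&self) -> Result<(), &'static str> {
            // swap returns the previous value, so a second call reports an error.
            if self.is_shutdown.swap(true, Ordering::SeqCst) {
                return Err("already shut down");
            }
            // ... flush processors, release resources ...
            Ok(())
        }
    }

    impl Drop for ProviderInner {
        fn drop(&mut self) {
            if !self.is_shutdown.load(Ordering::SeqCst) {
                let _ = self.shutdown(); // errors are handled within shutdown
            } else {
                // Already shut down explicitly; only log, never shut down twice.
                eprintln!("provider was already shut down; drop does nothing further");
            }
        }
    }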
