Skip to content

Commit fe549b8

Browse files
gautamg795Convex, Inc.
authored and
Convex, Inc.
committed
retry bumping max repeatable ts (#37458)
GitOrigin-RevId: a1546e544735ab5210fbfd1bb58aa88d03136c65
1 parent de870a5 commit fe549b8

File tree

1 file changed

+37
-10
lines changed

1 file changed

+37
-10
lines changed

crates/database/src/committer.rs

Lines changed: 37 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ use std::{
33
collections::BTreeSet,
44
ops::Bound,
55
sync::Arc,
6+
time::Duration,
67
};
78

89
use ::metrics::{
@@ -11,6 +12,7 @@ use ::metrics::{
1112
};
1213
use anyhow::Context as _;
1314
use common::{
15+
backoff::Backoff,
1416
bootstrap_model::tables::{
1517
TableMetadata,
1618
TableState,
@@ -26,7 +28,10 @@ use common::{
2628
ParsedDocument,
2729
ResolvedDocument,
2830
},
29-
errors::recapture_stacktrace,
31+
errors::{
32+
recapture_stacktrace,
33+
report_error,
34+
},
3035
fastrace_helpers::{
3136
initialize_root_from_parent,
3237
EncodedSpan,
@@ -591,27 +596,49 @@ impl<RT: Runtime> Committer<RT> {
591596
.next_max_repeatable_ts()
592597
.expect("new_max_repeatable should exist");
593598
let persistence = self.persistence.clone();
594-
let outer_span = Span::enter_with_parent("outer_bump_max_repeatable_ts", root_span);
599+
let span = Span::enter_with_parent("bump_max_repeatable_ts", root_span);
600+
let runtime = self.runtime.clone();
595601
self.persistence_writes.push_back(
596602
async move {
597-
let span = Span::enter_with_parent("inner_bump_max_repeatable_ts", &outer_span);
598603
// The MaxRepeatableTimestamp persistence global ensures all future
599604
// commits on future leaders will be after new_max_repeatable, and followers
600605
// can know this timestamp is repeatable.
601-
persistence
602-
.write_persistence_global(
603-
PersistenceGlobalKey::MaxRepeatableTimestamp,
604-
new_max_repeatable.into(),
605-
)
606-
.in_span(span)
607-
.await?;
606+
607+
// If we fail to bump the timestamp, we'll back off and retry,
608+
// which will block the committer from making forward progress until we
609+
// succeed. We don't want to kill the committer and reload the
610+
// instance if we can avoid it, as that would exacerbate any
611+
// load-related issues.
612+
let mut backoff = Backoff::new(Duration::from_secs(1), Duration::from_secs(60));
613+
loop {
614+
match persistence
615+
.write_persistence_global(
616+
PersistenceGlobalKey::MaxRepeatableTimestamp,
617+
new_max_repeatable.into(),
618+
)
619+
.await
620+
{
621+
Ok(()) => break,
622+
Err(mut e) => {
623+
let delay = backoff.fail(&mut runtime.rng());
624+
report_error(&mut e).await;
625+
tracing::error!(
626+
"Failed to bump max repeatable timestamp, retrying after {:.2}s",
627+
delay.as_secs_f32()
628+
);
629+
runtime.wait(delay).await;
630+
continue;
631+
},
632+
}
633+
}
608634
Ok(PersistenceWrite::MaxRepeatableTimestamp {
609635
new_max_repeatable,
610636
timer,
611637
result,
612638
commit_id,
613639
})
614640
}
641+
.in_span(span)
615642
.boxed(),
616643
);
617644
}

0 commit comments

Comments
 (0)