Skip to content

Commit 91061e6

Browse files
jordanhunt22 authored and Convex, Inc. committed
[Index Retention] Fix alerting for index retention (#25367)
GitOrigin-RevId: ada561b1455d30e9f2e6fb3acfafeea8f362ee82
1 parent cd5777d commit 91061e6

File tree

2 files changed: +44 -6 lines changed

crates/database/src/metrics.rs

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -402,6 +402,14 @@ pub fn log_retention_cursor_age(age_secs: f64) {
402402
log_gauge(&RETENTION_CURSOR_AGE_SECONDS, age_secs)
403403
}
404404

405+
register_convex_gauge!(
406+
RETENTION_CURSOR_LAG_SECONDS,
407+
"Lag between the retention cursor and the min index snapshot"
408+
);
409+
pub fn log_retention_cursor_lag(age_secs: f64) {
410+
log_gauge(&RETENTION_CURSOR_LAG_SECONDS, age_secs)
411+
}
412+
405413
register_convex_gauge!(
406414
DOCUMENT_RETENTION_CURSOR_AGE_SECONDS,
407415
"Age of the document retention cursor"
@@ -410,6 +418,14 @@ pub fn log_document_retention_cursor_age(age_secs: f64) {
410418
log_gauge(&DOCUMENT_RETENTION_CURSOR_AGE_SECONDS, age_secs)
411419
}
412420

421+
register_convex_gauge!(
422+
DOCUMENT_RETENTION_CURSOR_LAG_SECONDS,
423+
"Lag between the retention cursor and the min document snapshot"
424+
);
425+
pub fn log_document_retention_cursor_lag(age_secs: f64) {
426+
log_gauge(&DOCUMENT_RETENTION_CURSOR_LAG_SECONDS, age_secs)
427+
}
428+
413429
register_convex_gauge!(
414430
RETENTION_MISSING_CURSOR_INFO,
415431
"Index retention has no cursor"

crates/database/src/retention.rs

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,11 @@ use crate::{
117117
latest_min_document_snapshot_timer,
118118
latest_min_snapshot_timer,
119119
log_document_retention_cursor_age,
120+
log_document_retention_cursor_lag,
120121
log_document_retention_no_cursor,
121122
log_document_retention_scanned_document,
122123
log_retention_cursor_age,
124+
log_retention_cursor_lag,
123125
log_retention_documents_deleted,
124126
log_retention_expired_index_entry,
125127
log_retention_index_entries_deleted,
@@ -441,6 +443,7 @@ impl<RT: Runtime> LeaderRetentionManager<RT> {
441443
// even if the deletion future is stuck.
442444
Self::get_checkpoint(
443445
persistence.reader().as_ref(),
446+
bounds_writer.reader(),
444447
snapshot_reader.clone(),
445448
retention_type,
446449
)
@@ -1050,6 +1053,7 @@ impl<RT: Runtime> LeaderRetentionManager<RT> {
10501053
let _timer = retention_delete_timer();
10511054
let cursor = Self::get_checkpoint(
10521055
reader.as_ref(),
1056+
bounds_reader.clone(),
10531057
snapshot_reader.clone(),
10541058
RetentionType::Index,
10551059
)
@@ -1179,6 +1183,7 @@ impl<RT: Runtime> LeaderRetentionManager<RT> {
11791183
let _timer = retention_delete_documents_timer();
11801184
let cursor = Self::get_checkpoint(
11811185
reader.as_ref(),
1186+
bounds_reader.clone(),
11821187
snapshot_reader.clone(),
11831188
RetentionType::Document,
11841189
)
@@ -1268,6 +1273,7 @@ impl<RT: Runtime> LeaderRetentionManager<RT> {
12681273

12691274
async fn get_checkpoint(
12701275
persistence: &dyn PersistenceReader,
1276+
bounds_reader: Reader<SnapshotBounds>,
12711277
snapshot_reader: Reader<SnapshotManager>,
12721278
retention_type: RetentionType,
12731279
) -> anyhow::Result<Timestamp> {
@@ -1276,12 +1282,28 @@ impl<RT: Runtime> LeaderRetentionManager<RT> {
12761282
// Only log if the checkpoint has been written once, to avoid logging time since
12771283
// epoch when the instance is first starting up.
12781284
match retention_type {
1279-
RetentionType::Document => log_document_retention_cursor_age(
1280-
(*snapshot_reader.lock().latest_ts()).secs_since_f64(checkpoint),
1281-
),
1282-
RetentionType::Index => log_retention_cursor_age(
1283-
(*snapshot_reader.lock().latest_ts()).secs_since_f64(checkpoint),
1284-
),
1285+
RetentionType::Document => {
1286+
log_document_retention_cursor_age(
1287+
(*snapshot_reader.lock().latest_ts()).secs_since_f64(checkpoint),
1288+
);
1289+
log_document_retention_cursor_lag(
1290+
bounds_reader
1291+
.lock()
1292+
.min_document_snapshot_ts
1293+
.secs_since_f64(checkpoint),
1294+
);
1295+
},
1296+
RetentionType::Index => {
1297+
log_retention_cursor_age(
1298+
(*snapshot_reader.lock().latest_ts()).secs_since_f64(checkpoint),
1299+
);
1300+
log_retention_cursor_lag(
1301+
bounds_reader
1302+
.lock()
1303+
.min_snapshot_ts
1304+
.secs_since_f64(checkpoint),
1305+
);
1306+
},
12851307
}
12861308
} else {
12871309
match retention_type {

0 commit comments

Comments (0)