
Commit 3d5f7a0

ldanilek authored and Convex, Inc. committed
cancel snapshot imports that take >1 day (#25005)
We need to time-bound snapshot imports. Ideally they would finish quickly, but if one is taking more than a day, that's much too long, and we need to be able to delete the file from S3 and the hidden tables being written to. Reporting this as a bad request is somewhat incorrect, but it does represent what we should do with the error: send it to the customer if they're watching, and cancel the import. It makes the "import will execute to completion in the background" message somewhat false, but when this happens there must have been a system error for days that we were unable to fix.

GitOrigin-RevId: 33b2661ddbd36379b522fa72c737e48a22df5de4
1 parent: 3504812 · commit: 3d5f7a0

File tree: 2 files changed (+40 −5 lines)

crates/application/src/metrics.rs

Lines changed: 11 additions & 0 deletions
@@ -1,5 +1,8 @@
+use std::time::Duration;
+
 use metrics::{
     log_counter_with_labels,
+    log_distribution,
     log_distribution_with_labels,
     log_gauge_with_labels,
     register_convex_counter,
@@ -56,6 +59,14 @@ pub fn snapshot_import_timer() -> StatusTimer {
     StatusTimer::new(&SNAPSHOT_IMPORT_TIMER_SECONDS)
 }
 
+register_convex_histogram!(
+    SNAPSHOT_IMPORT_AGE_SECONDS,
+    "Age of in-progress snapshot import",
+);
+pub fn log_snapshot_import_age(age: Duration) {
+    log_distribution(&SNAPSHOT_IMPORT_AGE_SECONDS, age.as_secs_f64());
+}
+
 register_convex_histogram!(
     SNAPSHOT_EXPORT_TIMER_SECONDS,
     "Time taken for a snapshot export",

crates/application/src/snapshot_import.rs

Lines changed: 29 additions & 5 deletions
@@ -166,6 +166,7 @@ use value::{
 use crate::{
     export_worker::FileStorageZipMetadata,
     metrics::{
+        log_snapshot_import_age,
         log_worker_starting,
         snapshot_import_timer,
     },
@@ -177,6 +178,10 @@ static IMPORT_SIZE_LIMIT: LazyLock<String> =
 
 const INITIAL_BACKOFF: Duration = Duration::from_secs(1);
 const MAX_BACKOFF: Duration = Duration::from_secs(60);
+// If an import is taking longer than a day, it's a problem (and our fault).
+// But the customer is probably no longer waiting so we should fail the import.
+// If an import takes more than a week, the file may be deleted from S3.
+const MAX_IMPORT_AGE: Duration = Duration::from_secs(24 * 60 * 60);
 
 pub struct SnapshotImportWorker<RT: Runtime> {
     runtime: RT,
@@ -591,6 +596,25 @@ impl<RT: Runtime> SnapshotImportWorker<RT> {
         &mut self,
         snapshot_import: ParsedDocument<SnapshotImport>,
     ) -> anyhow::Result<(Timestamp, usize)> {
+        if let Some(creation_time) = snapshot_import.creation_time() {
+            let now = CreationTime::try_from(*self.database.now_ts_for_reads())?;
+            let age = Duration::from_millis((f64::from(now) - f64::from(creation_time)) as u64);
+            log_snapshot_import_age(age);
+            if age > MAX_IMPORT_AGE / 2 {
+                tracing::warn!(
+                    "SnapshotImport {} running too long ({:?})",
+                    snapshot_import.id(),
+                    age
+                );
+            }
+            if age > MAX_IMPORT_AGE {
+                anyhow::bail!(ErrorMetadata::bad_request(
+                    "ImportFailed",
+                    "Import took too long. Try again or contact Convex."
+                ));
+            }
+        }
+
         let (initial_schemas, objects) = self.parse_import(snapshot_import.id()).await?;
 
         let usage = FunctionUsageTracker::new();
@@ -1159,7 +1183,7 @@ fn wrap_import_err(e: anyhow::Error) -> anyhow::Error {
     }
 }
 
-async fn wait_for_export_worker<RT: Runtime>(
+async fn wait_for_import_worker<RT: Runtime>(
     application: &Application<RT>,
     identity: Identity,
     import_id: DocumentIdV6,
@@ -1201,7 +1225,7 @@ pub async fn do_import<RT: Runtime>(
     let import_id =
         upload_import_file(application, identity.clone(), format, mode, body_stream).await?;
 
-    let snapshot_import = wait_for_export_worker(application, identity.clone(), import_id).await?;
+    let snapshot_import = wait_for_import_worker(application, identity.clone(), import_id).await?;
     match &snapshot_import.state {
         ImportState::Uploaded | ImportState::InProgress { .. } | ImportState::Completed { .. } => {
             anyhow::bail!("should be WaitingForConfirmation, is {snapshot_import:?}")
@@ -1214,7 +1238,7 @@ pub async fn do_import<RT: Runtime>(
 
     perform_import(application, identity.clone(), import_id).await?;
 
-    let snapshot_import = wait_for_export_worker(application, identity.clone(), import_id).await?;
+    let snapshot_import = wait_for_import_worker(application, identity.clone(), import_id).await?;
     match &snapshot_import.state {
         ImportState::Uploaded
         | ImportState::WaitingForConfirmation { .. }
@@ -2337,7 +2361,7 @@ mod tests {
     use crate::{
         snapshot_import::{
             upload_import_file,
-            wait_for_export_worker,
+            wait_for_import_worker,
         },
         test_helpers::{
            ApplicationFixtureArgs,
@@ -2705,7 +2729,7 @@ a
     )
     .await?;
 
-    let snapshot_import = wait_for_export_worker(&app, new_admin_id(), import_id).await?;
+    let snapshot_import = wait_for_import_worker(&app, new_admin_id(), import_id).await?;
 
     let state = snapshot_import.state.clone();
     must_let!(let ImportState::WaitingForConfirmation {
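
The core of the change is the age check at the top of the import step: the worker derives the import's age from the document's creation time, records it in the new histogram, warns once the import is past half of MAX_IMPORT_AGE, and fails it with a bad-request error once it is past the full day. Because the diff uses two independent `if`s, an import past the full limit both logs the warning and fails. A minimal standalone sketch of that threshold logic (the enum and function names below are illustrative, not from the codebase):

    use std::time::Duration;

    // One day, mirroring MAX_IMPORT_AGE in the diff.
    const MAX_IMPORT_AGE: Duration = Duration::from_secs(24 * 60 * 60);

    #[derive(Debug, PartialEq)]
    enum ImportAgeStatus {
        Ok,
        Warn, // past half the limit: log a warning, keep going
        Fail, // past the limit: cancel the import with an "ImportFailed" error
    }

    // Illustrative classification; the real code bails with
    // ErrorMetadata::bad_request("ImportFailed", ...) in the Fail case.
    fn classify_import_age(age: Duration) -> ImportAgeStatus {
        if age > MAX_IMPORT_AGE {
            ImportAgeStatus::Fail
        } else if age > MAX_IMPORT_AGE / 2 {
            ImportAgeStatus::Warn
        } else {
            ImportAgeStatus::Ok
        }
    }

    fn main() {
        assert_eq!(classify_import_age(Duration::from_secs(60 * 60)), ImportAgeStatus::Ok);        // 1 hour
        assert_eq!(classify_import_age(Duration::from_secs(18 * 60 * 60)), ImportAgeStatus::Warn); // 18 hours
        assert_eq!(classify_import_age(Duration::from_secs(36 * 60 * 60)), ImportAgeStatus::Fail); // 36 hours
    }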

Comments (0)