
Commit 4496d46

jordanhunt22 (Convex, Inc.) authored and Convex, Inc. committed
[Fix] Shutdown + join futures on retention_manager shutdown (#24515)
It turns out that if we don't join a handle after shutdown, it can still perform work after the worker that created it has been dropped. So, I refactored the retention manager code to join all of its handles on shutdown.

It's slightly concerning that we use this same pattern of calling `.shutdown()` in other places without joining the future, because we can't guarantee that the handle stops doing work before its creator goes away. Should we try to get rid of this pattern entirely? Intuitively, calling `.shutdown()` seems like it should terminate the thread synchronously, but that is not what actually happens.

GitOrigin-RevId: db38a2e748ebd958f6be2cde8258f7ae2f1ce00e
1 parent 7318d3c commit 4496d46
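
The fix hinges on pairing the shutdown signal with a join on the spawned task. The `common::runtime::shutdown_and_join` helper imported in the retention diff below is not shown in this commit, so the following is only a minimal sketch of the idea, assuming a tokio-style join handle plus a oneshot shutdown channel; the `TaskHandle` type and this `shutdown_and_join` signature are illustrative, not Convex's actual API.

use tokio::{sync::oneshot, task::JoinHandle};

/// Hypothetical handle pairing a spawned task with its shutdown signal.
struct TaskHandle {
    shutdown_tx: oneshot::Sender<()>,
    join: JoinHandle<()>,
}

/// Signal the task to stop, then wait for it to actually exit. The join is
/// what guarantees the task performs no further work after its owner is gone.
async fn shutdown_and_join(handle: TaskHandle) -> anyhow::Result<()> {
    // The task may already have exited and dropped its receiver; ignore that.
    let _ = handle.shutdown_tx.send(());
    handle.join.await?;
    Ok(())
}

With a helper like this, an owner can drain its handles and join each one in order, which is what the retention manager's new `shutdown` below does.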

File tree

3 files changed (+21 −18 lines)


crates/application/src/lib.rs

Lines changed: 1 addition & 1 deletion
@@ -2531,7 +2531,7 @@ impl<RT: Runtime> Application<RT> {
         self.actions_isolate.shutdown().await?;
         self.database_isolate.shutdown().await?;
         self.module_cache.shutdown();
-        self.database.shutdown();
+        self.database.shutdown().await?;
         tracing::info!("Application shut down");
         Ok(())
     }

crates/database/src/database.rs

Lines changed: 3 additions & 2 deletions
@@ -912,11 +912,12 @@ impl<RT: Runtime> Database<RT> {
         )
     }

-    pub fn shutdown(&self) {
+    pub async fn shutdown(&self) -> anyhow::Result<()> {
         self.committer.shutdown();
         self.subscriptions.shutdown();
-        self.retention_manager.shutdown();
+        self.retention_manager.shutdown().await?;
         tracing::info!("Database shutdown");
+        Ok(())
     }

     pub fn retention_validator(&self) -> Arc<dyn RetentionValidator> {

crates/database/src/retention.rs

Lines changed: 17 additions & 15 deletions
@@ -68,9 +68,9 @@ use common::{
     query::Order,
     runtime::{
         new_rate_limiter,
+        shutdown_and_join,
         Runtime,
         RuntimeInstant,
-        SpawnHandle,
     },
     sha256::Sha256,
     sync::split_rw_lock::{
@@ -172,25 +172,21 @@ impl Checkpoint {
 pub struct LeaderRetentionManager<RT: Runtime> {
     rt: RT,
     bounds_reader: Reader<SnapshotBounds>,
-    advance_min_snapshot_handle: Arc<Mutex<RT::Handle>>,
-    deletion_handle: Arc<Mutex<RT::Handle>>,
-    document_deletion_handle: Arc<Mutex<RT::Handle>>,
     index_table_id: TableIdAndTableNumber,
     checkpoint_reader: Reader<Checkpoint>,
     document_checkpoint_reader: Reader<Checkpoint>,
+    handles: Arc<Mutex<Vec<RT::Handle>>>,
 }

 impl<RT: Runtime> Clone for LeaderRetentionManager<RT> {
     fn clone(&self) -> Self {
         Self {
             rt: self.rt.clone(),
             bounds_reader: self.bounds_reader.clone(),
-            advance_min_snapshot_handle: self.advance_min_snapshot_handle.clone(),
-            deletion_handle: self.deletion_handle.clone(),
-            document_deletion_handle: self.deletion_handle.clone(),
             index_table_id: self.index_table_id,
             checkpoint_reader: self.checkpoint_reader.clone(),
             document_checkpoint_reader: self.document_checkpoint_reader.clone(),
+            handles: self.handles.clone(),
         }
     }
 }
@@ -342,19 +338,25 @@ impl<RT: Runtime> LeaderRetentionManager<RT> {
         Ok(Self {
             rt,
             bounds_reader,
-            advance_min_snapshot_handle: Arc::new(Mutex::new(advance_min_snapshot_handle)),
-            deletion_handle: Arc::new(Mutex::new(deletion_handle)),
-            document_deletion_handle: Arc::new(Mutex::new(document_deletion_handle)),
             index_table_id,
             checkpoint_reader,
             document_checkpoint_reader,
+            handles: Arc::new(Mutex::new(vec![
+                // Order matters because we need to shutdown the threads that have
+                // receivers before the senders
+                deletion_handle,
+                document_deletion_handle,
+                advance_min_snapshot_handle,
+            ])),
         })
     }

-    pub fn shutdown(&self) {
-        self.deletion_handle.lock().shutdown();
-        self.document_deletion_handle.lock().shutdown();
-        self.advance_min_snapshot_handle.lock().shutdown();
+    pub async fn shutdown(&self) -> anyhow::Result<()> {
+        let handles: Vec<_> = self.handles.lock().drain(..).collect();
+        for handle in handles.into_iter() {
+            shutdown_and_join(handle).await?;
+        }
+        Ok(())
     }

     /// Returns the timestamp which we would like to use as min_snapshot_ts.
@@ -1143,7 +1145,7 @@ impl<RT: Runtime> LeaderRetentionManager<RT> {
             if !is_working {
                 min_document_snapshot_ts = match min_document_snapshot_rx.changed().await {
                     Err(err) => {
-                        tracing::warn!("Failed to receive document snapshot: {}", err);
+                        report_error(&mut err.into());
                         // Fall back to polling if the channel is closed or falls over. This should
                         // really never happen.
                         Self::wait_with_jitter(&rt, *MAX_RETENTION_DELAY_SECONDS).await;
