@@ -64,8 +64,6 @@ const ACCEPTABLE_REPLICATION_LAG: Duration = Duration::from_secs(30);
64
64
/// the lag again
65
65
const REPLICATION_SLEEP : Duration = Duration :: from_secs ( 10 ) ;
66
66
67
- type PooledPgConnection = PooledConnection < ConnectionManager < PgConnection > > ;
68
-
69
67
lazy_static ! {
70
68
static ref STATEMENT_TIMEOUT : Option <String > = ENV_VARS
71
69
. store
@@ -667,17 +665,77 @@ impl From<Result<CopyTableWorker, StoreError>> for WorkerResult {
667
665
}
668
666
}
669
667
668
+ /// We pass connections back and forth between the control loop and various
669
+ /// workers. We need to make sure that we end up with the connection that
670
+ /// was used to acquire the copy lock in the right place so we can release
671
+ /// the copy lock which is only possible with the connection that acquired
672
+ /// it.
673
+ ///
674
+ /// This struct helps us with that. It wraps a connection and tracks whether
675
+ /// the connection was used to acquire the copy lock
676
+ struct LockTrackingConnection {
677
+ inner : PooledConnection < ConnectionManager < PgConnection > > ,
678
+ has_lock : bool ,
679
+ }
680
+
681
+ impl LockTrackingConnection {
682
+ fn new ( inner : PooledConnection < ConnectionManager < PgConnection > > ) -> Self {
683
+ Self {
684
+ inner,
685
+ has_lock : false ,
686
+ }
687
+ }
688
+
689
+ fn transaction < T , F > ( & mut self , f : F ) -> Result < T , StoreError >
690
+ where
691
+ F : FnOnce ( & mut PgConnection ) -> Result < T , StoreError > ,
692
+ {
693
+ let conn = & mut self . inner ;
694
+ conn. transaction ( |conn| f ( conn) )
695
+ }
696
+
697
+ /// Put `self` into `other` if `self` has the lock.
698
+ fn extract ( self , other : & mut Option < Self > ) {
699
+ if self . has_lock {
700
+ * other = Some ( self ) ;
701
+ }
702
+ }
703
+
704
+ fn lock ( & mut self , logger : & Logger , dst : & Site ) -> Result < ( ) , StoreError > {
705
+ if self . has_lock {
706
+ warn ! ( logger, "already acquired copy lock for {}" , dst) ;
707
+ return Ok ( ( ) ) ;
708
+ }
709
+ advisory_lock:: lock_copying ( & mut self . inner , dst) ?;
710
+ self . has_lock = true ;
711
+ Ok ( ( ) )
712
+ }
713
+
714
+ fn unlock ( & mut self , logger : & Logger , dst : & Site ) -> Result < ( ) , StoreError > {
715
+ if !self . has_lock {
716
+ error ! (
717
+ logger,
718
+ "tried to release copy lock for {} even though we are not the owner" , dst
719
+ ) ;
720
+ return Ok ( ( ) ) ;
721
+ }
722
+ advisory_lock:: unlock_copying ( & mut self . inner , dst) ?;
723
+ self . has_lock = false ;
724
+ Ok ( ( ) )
725
+ }
726
+ }
727
+
670
728
/// A helper to run copying of one table. We need to thread `conn` and
671
729
/// `table` from the control loop to the background worker and back again to
672
730
/// the control loop. This worker facilitates that
673
731
struct CopyTableWorker {
674
- conn : PooledPgConnection ,
732
+ conn : LockTrackingConnection ,
675
733
table : TableState ,
676
734
result : Result < Status , StoreError > ,
677
735
}
678
736
679
737
impl CopyTableWorker {
680
- fn new ( conn : PooledPgConnection , table : TableState ) -> Self {
738
+ fn new ( conn : LockTrackingConnection , table : TableState ) -> Self {
681
739
Self {
682
740
conn,
683
741
table,
@@ -699,7 +757,7 @@ impl CopyTableWorker {
699
757
fn run_inner ( & mut self , logger : Logger , progress : & CopyProgress ) -> Result < Status , StoreError > {
700
758
use Status :: * ;
701
759
702
- let conn = & mut self . conn ;
760
+ let conn = & mut self . conn . inner ;
703
761
progress. start_table ( & self . table ) ;
704
762
while !self . table . finished ( ) {
705
763
// It is important that this check happens outside the write
@@ -855,7 +913,7 @@ pub struct Connection {
855
913
/// individual table. Except for that case, this will always be
856
914
/// `Some(..)`. Most code shouldn't access `self.conn` directly, but use
857
915
/// `self.transaction`
858
- conn : Option < PooledPgConnection > ,
916
+ conn : Option < LockTrackingConnection > ,
859
917
pool : ConnectionPool ,
860
918
primary : Primary ,
861
919
workers : usize ,
@@ -901,9 +959,9 @@ impl Connection {
901
959
}
902
960
false
903
961
} ) ?;
904
- let conn = Some ( conn) ;
905
962
let src_manifest_idx_and_name = Arc :: new ( src_manifest_idx_and_name) ;
906
963
let dst_manifest_idx_and_name = Arc :: new ( dst_manifest_idx_and_name) ;
964
+ let conn = Some ( LockTrackingConnection :: new ( conn) ) ;
907
965
Ok ( Self {
908
966
logger,
909
967
conn,
@@ -990,6 +1048,7 @@ impl Connection {
990
1048
let Some ( table) = state. unfinished . pop ( ) else {
991
1049
return None ;
992
1050
} ;
1051
+ let conn = LockTrackingConnection :: new ( conn) ;
993
1052
994
1053
let worker = CopyTableWorker :: new ( conn, table) ;
995
1054
Some ( Box :: pin (
@@ -1031,7 +1090,7 @@ impl Connection {
1031
1090
let result = workers. select ( ) . await ;
1032
1091
match result {
1033
1092
Ok ( worker) => {
1034
- self . conn = Some ( worker . conn ) ;
1093
+ worker . conn . extract ( & mut self . conn ) ;
1035
1094
}
1036
1095
Err ( e) => {
1037
1096
/* Ignore; we had an error previously */
@@ -1098,7 +1157,7 @@ impl Connection {
1098
1157
W :: Ok ( worker) => {
1099
1158
// Put the connection back into self.conn so that we can use it
1100
1159
// in the next iteration.
1101
- self . conn = Some ( worker . conn ) ;
1160
+ worker . conn . extract ( & mut self . conn ) ;
1102
1161
1103
1162
match ( worker. result , progress. is_cancelled ( ) ) {
1104
1163
( Ok ( Status :: Finished ) , false ) => {
@@ -1207,20 +1266,30 @@ impl Connection {
1207
1266
) ;
1208
1267
1209
1268
let dst_site = self . dst . site . cheap_clone ( ) ;
1210
- self . transaction ( |conn| advisory_lock:: lock_copying ( conn, & dst_site) ) ?;
1269
+ let Some ( conn) = self . conn . as_mut ( ) else {
1270
+ return Err ( constraint_violation ! (
1271
+ "copy connection went missing (copy_data)"
1272
+ ) ) ;
1273
+ } ;
1274
+ conn. lock ( & self . logger , & dst_site) ?;
1211
1275
1212
1276
let res = self . copy_data_internal ( index_list) . await ;
1213
1277
1214
- if self . conn . is_none ( ) {
1215
- // A background worker panicked and left us without our
1216
- // dedicated connection, but we still need to release the copy
1217
- // lock; get a normal connection, not from the fdw pool for that
1218
- // as that will be much less contended. We won't be holding on
1219
- // to the connection for long as `res` will be an error and we
1220
- // will abort starting this subgraph
1221
- self . conn = Some ( self . pool . get ( ) ?) ;
1278
+ match self . conn . as_mut ( ) {
1279
+ None => {
1280
+ // A background worker panicked and left us without our
1281
+ // dedicated connection; we would need to get that
1282
+ // connection to unlock the advisory lock. We can't do that,
1283
+ // so we just log an error
1284
+ warn ! (
1285
+ self . logger,
1286
+ "can't unlock copy lock since the default worker panicked; lock will linger until session ends"
1287
+ ) ;
1288
+ }
1289
+ Some ( conn) => {
1290
+ conn. unlock ( & self . logger , & dst_site) ?;
1291
+ }
1222
1292
}
1223
- self . transaction ( |conn| advisory_lock:: unlock_copying ( conn, & dst_site) ) ?;
1224
1293
1225
1294
if matches ! ( res, Ok ( Status :: Cancelled ) ) {
1226
1295
warn ! ( & self . logger, "Copying was cancelled and is incomplete" ) ;
0 commit comments