
Commit 6c34c56

Ensure partial MPP claims continue to block channels on restart
In 9cc6e08, we started using the `RAAMonitorUpdateBlockingAction` logic to block RAAs which may remove a preimage from one `ChannelMonitor` if it isn't durably stored in another that is a part of the same MPP claim.

At the time, when we claimed a payment, if we saw that the HTLC was already claimed in the channel, we'd simply drop the new RAA blocker. This can happen on reload when replaying MPP claims.

However, just because an HTLC is no longer present in `ChannelManager`'s `Channel` doesn't mean that the `ChannelMonitorUpdate` which stored the preimage actually made it durably into the `ChannelMonitor` on disk.

We could begin an MPP payment, have one channel get the preimage durably into its `ChannelMonitor`, then step forward another update with the peer. Then, we could reload, causing the MPP claims to be replayed across all channels, leading to the RAA blocker(s) being dropped and all channels being unlocked. Finally, if the first channel managed to step forward a further update with its peer before the (now-replayed) `ChannelMonitorUpdate`s for all MPP parts made it to disk, we could theoretically lose the preimage.

This is, of course, a somewhat comically unlikely scenario, but I had an old note to expand the test and it turned up the issue, so we might as well fix it.
1 parent eae2bb1 commit 6c34c56
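To make the window concrete, here is a condensed sketch of the reload step the expanded test in this commit exercises. It is lifted from the test diff below (same helpers and variable names), with the claim_funds call and commitment_signed handling between these steps elided; it only runs inside the crate's functional-test harness, not as a standalone program.

// Condensed from do_test_partial_claim_mon_update_compl_actions (below); the
// claim and message handling between these steps are elided.

// Snapshot the channel-4 monitor before it learns the preimage, for use on reload.
let chan_4_monitor_serialized = get_monitor!(nodes[3], chan_4_id).encode();

// Claim along both MPP paths, but let only the 1<->3 monitor update complete.
chanmon_cfgs[3].persister.set_update_ret(ChannelMonitorUpdateStatus::InProgress);
chanmon_cfgs[3].persister.set_update_ret(ChannelMonitorUpdateStatus::InProgress);
nodes[3].chain_monitor.complete_sole_pending_chan_update(&chan_3_id);

// Reload node 3 with the stale channel-4 monitor; the MPP claim is replayed and
// the RAA blocker must be re-added rather than dropped.
let node_ser = nodes[3].node.encode();
let chan_3_monitor_serialized = get_monitor!(nodes[3], chan_3_id).encode();
let mons = &[&chan_3_monitor_serialized[..], &chan_4_monitor_serialized[..]];
reload_node!(nodes[3], &node_ser, mons, persister, new_chain_mon, nodes_3_reload);

// The preimage must stay in the 1<->3 monitor until the replayed channel-4
// update completes.
assert!(get_monitor!(nodes[3], chan_3_id).get_stored_preimages().contains_key(&payment_hash));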

File tree: 2 files changed, +143 −15 lines

lightning/src/ln/chanmon_update_fail_tests.rs

Lines changed: 115 additions & 4 deletions
@@ -4208,13 +4208,17 @@ fn test_glacial_peer_cant_hang() {
 	do_test_glacial_peer_cant_hang(true);
 }
 
-#[test]
-fn test_partial_claim_mon_update_compl_actions() {
+fn do_test_partial_claim_mon_update_compl_actions(reload_a: bool, reload_b: bool) {
 	// Test that if we have an MPP claim that we ensure the preimage for the claim is retained in
 	// all the `ChannelMonitor`s until the preimage reaches every `ChannelMonitor` for a channel
 	// which was a part of the MPP.
 	let chanmon_cfgs = create_chanmon_cfgs(4);
 	let node_cfgs = create_node_cfgs(4, &chanmon_cfgs);
+
+	let (persister, persister_2, persister_3);
+	let (new_chain_mon, new_chain_mon_2, new_chain_mon_3);
+	let (nodes_3_reload, nodes_3_reload_2, nodes_3_reload_3);
+
 	let node_chanmgrs = create_node_chanmgrs(4, &node_cfgs, &[None, None, None, None]);
 	let mut nodes = create_network(4, &node_cfgs, &node_chanmgrs);
 
@@ -4243,6 +4247,8 @@ fn test_partial_claim_mon_update_compl_actions() {
 	let paths = &[&[&nodes[1], &nodes[3]][..], &[&nodes[2], &nodes[3]][..]];
 	send_along_route_with_secret(&nodes[0], route, paths, 200_000, payment_hash, payment_secret);
 
+	// Store the monitor for channel 4 without the preimage to use on reload
+	let chan_4_monitor_serialized = get_monitor!(nodes[3], chan_4_id).encode();
 	// Claim along both paths, but only complete one of the two monitor updates.
 	chanmon_cfgs[3].persister.set_update_ret(ChannelMonitorUpdateStatus::InProgress);
 	chanmon_cfgs[3].persister.set_update_ret(ChannelMonitorUpdateStatus::InProgress);
@@ -4254,7 +4260,13 @@ fn test_partial_claim_mon_update_compl_actions() {
 	// Complete the 1<->3 monitor update and play the commitment_signed dance forward until it
 	// blocks.
 	nodes[3].chain_monitor.complete_sole_pending_chan_update(&chan_3_id);
-	expect_payment_claimed!(&nodes[3], payment_hash, 200_000);
+	let payment_claimed = nodes[3].node.get_and_clear_pending_events();
+	assert_eq!(payment_claimed.len(), 1, "{payment_claimed:?}");
+	if let Event::PaymentClaimed { payment_hash: ev_hash, .. } = &payment_claimed[0] {
+		assert_eq!(*ev_hash, payment_hash);
+	} else {
+		panic!("{payment_claimed:?}");
+	}
 	let updates = get_htlc_update_msgs(&nodes[3], &node_b_id);
 
 	nodes[1].node.handle_update_fulfill_htlc(node_d_id, &updates.update_fulfill_htlcs[0]);
@@ -4273,15 +4285,41 @@ fn test_partial_claim_mon_update_compl_actions() {
 	check_added_monitors(&nodes[3], 0);
 	assert!(nodes[3].node.get_and_clear_pending_msg_events().is_empty());
 
+	if reload_a {
+		// After a reload (with the monitor not yet fully updated), the RAA should still be blocked
+		// waiting until the monitor update completes.
+		let node_ser = nodes[3].node.encode();
+		let chan_3_monitor_serialized = get_monitor!(nodes[3], chan_3_id).encode();
+		let mons = &[&chan_3_monitor_serialized[..], &chan_4_monitor_serialized[..]];
+		reload_node!(nodes[3], &node_ser, mons, persister, new_chain_mon, nodes_3_reload);
+		// The final update to channel 4 should be replayed.
+		persister.set_update_ret(ChannelMonitorUpdateStatus::InProgress);
+		assert!(nodes[3].node.get_and_clear_pending_msg_events().is_empty());
+		check_added_monitors(&nodes[3], 1);
+
+		// Because the HTLCs aren't yet cleared, the PaymentClaimed event will be replayed on
+		// restart.
+		let second_payment_claimed = nodes[3].node.get_and_clear_pending_events();
+		assert_eq!(payment_claimed, second_payment_claimed);
+
+		nodes[1].node.peer_disconnected(node_d_id);
+		nodes[2].node.peer_disconnected(node_d_id);
+		reconnect_nodes(ReconnectArgs::new(&nodes[1], &nodes[3]));
+		reconnect_nodes(ReconnectArgs::new(&nodes[2], &nodes[3]));
+
+		assert!(nodes[3].node.get_and_clear_pending_msg_events().is_empty());
+	}
+
 	// Now double-check that the preimage is still in the 1<->3 channel and complete the pending
 	// monitor update, allowing node 3 to claim the payment on the 2<->3 channel. This also
 	// unblocks the 1<->3 channel, allowing node 3 to release the two blocked monitor updates and
 	// respond to the final commitment_signed.
 	assert!(get_monitor!(nodes[3], chan_3_id).get_stored_preimages().contains_key(&payment_hash));
+	assert!(nodes[3].node.get_and_clear_pending_events().is_empty());
 
 	nodes[3].chain_monitor.complete_sole_pending_chan_update(&chan_4_id);
 	let mut ds_msgs = nodes[3].node.get_and_clear_pending_msg_events();
-	assert_eq!(ds_msgs.len(), 2);
+	assert_eq!(ds_msgs.len(), 2, "{ds_msgs:?}");
 	check_added_monitors(&nodes[3], 2);
 
 	match remove_first_msg_event_to_node(&node_b_id, &mut ds_msgs) {
@@ -4324,13 +4362,86 @@ fn test_partial_claim_mon_update_compl_actions() {
 	assert!(get_monitor!(nodes[3], chan_3_id).get_stored_preimages().contains_key(&payment_hash));
 	assert!(get_monitor!(nodes[3], chan_4_id).get_stored_preimages().contains_key(&payment_hash));
 
+	if reload_b {
+		// Ensure that the channel pause logic doesn't accidentally get restarted after a second
+		// reload once the HTLCs for the first payment have been removed and the monitors
+		// completed.
+		let node_ser = nodes[3].node.encode();
+		let chan_3_monitor_serialized = get_monitor!(nodes[3], chan_3_id).encode();
+		let chan_4_monitor_serialized = get_monitor!(nodes[3], chan_4_id).encode();
+		let mons = &[&chan_3_monitor_serialized[..], &chan_4_monitor_serialized[..]];
+		reload_node!(nodes[3], &node_ser, mons, persister_2, new_chain_mon_2, nodes_3_reload_2);
+		check_added_monitors(&nodes[3], 0);
+
+		nodes[1].node.peer_disconnected(node_d_id);
+		nodes[2].node.peer_disconnected(node_d_id);
+		reconnect_nodes(ReconnectArgs::new(&nodes[1], &nodes[3]));
+		reconnect_nodes(ReconnectArgs::new(&nodes[2], &nodes[3]));
+
+		assert!(nodes[3].node.get_and_clear_pending_msg_events().is_empty());
+
+		// Because the HTLCs aren't yet cleared, the PaymentClaimed event will be replayed on
+		// restart.
+		let third_payment_claimed = nodes[3].node.get_and_clear_pending_events();
+		assert_eq!(payment_claimed, third_payment_claimed);
+	}
+
 	send_payment(&nodes[1], &[&nodes[3]], 100_000);
 	assert!(!get_monitor!(nodes[3], chan_3_id).get_stored_preimages().contains_key(&payment_hash));
 
+	if reload_b {
+		// Ensure that the channel pause logic doesn't accidentally get restarted after a second
+		// reload once the HTLCs for the first payment have been removed and the monitors
+		// completed, even if only one of the two monitors still knows about the first payment.
+		let node_ser = nodes[3].node.encode();
+		let chan_3_monitor_serialized = get_monitor!(nodes[3], chan_3_id).encode();
+		let chan_4_monitor_serialized = get_monitor!(nodes[3], chan_4_id).encode();
+		let mons = &[&chan_3_monitor_serialized[..], &chan_4_monitor_serialized[..]];
+		reload_node!(nodes[3], &node_ser, mons, persister_3, new_chain_mon_3, nodes_3_reload_3);
+		check_added_monitors(&nodes[3], 0);
+
+		nodes[1].node.peer_disconnected(node_d_id);
+		nodes[2].node.peer_disconnected(node_d_id);
+		reconnect_nodes(ReconnectArgs::new(&nodes[1], &nodes[3]));
+		reconnect_nodes(ReconnectArgs::new(&nodes[2], &nodes[3]));
+
+		assert!(nodes[3].node.get_and_clear_pending_msg_events().is_empty());
+
+		// Because the HTLCs aren't yet cleared, the PaymentClaimed events for both payments will
+		// be replayed on restart.
+		// Use this as an opportunity to check the payment_ids are unique.
+		let mut events = nodes[3].node.get_and_clear_pending_events();
+		assert_eq!(events.len(), 2);
+		events.retain(|ev| *ev != payment_claimed[0]);
+		assert_eq!(events.len(), 1);
+		if let Event::PaymentClaimed { payment_id: original_payment_id, .. } = &payment_claimed[0] {
+			assert!(original_payment_id.is_some());
+			if let Event::PaymentClaimed { amount_msat, payment_id, .. } = &events[0] {
+				assert!(payment_id.is_some());
+				assert_ne!(original_payment_id, payment_id);
+				assert_eq!(*amount_msat, 100_000);
+			} else {
+				panic!("{events:?}");
+			}
+		} else {
+			panic!("{events:?}");
+		}
+
+		send_payment(&nodes[1], &[&nodes[3]], 100_000);
+	}
+
 	send_payment(&nodes[2], &[&nodes[3]], 100_000);
 	assert!(!get_monitor!(nodes[3], chan_4_id).get_stored_preimages().contains_key(&payment_hash));
 }
 
+#[test]
+fn test_partial_claim_mon_update_compl_actions() {
+	do_test_partial_claim_mon_update_compl_actions(true, true);
+	do_test_partial_claim_mon_update_compl_actions(true, false);
+	do_test_partial_claim_mon_update_compl_actions(false, true);
+	do_test_partial_claim_mon_update_compl_actions(false, false);
+}
+
 #[test]
 fn test_claim_to_closed_channel_blocks_forwarded_preimage_removal() {
 	// One of the last features for async persistence we implemented was the correct blocking of

lightning/src/ln/channelmanager.rs

Lines changed: 28 additions & 11 deletions
@@ -8079,25 +8079,42 @@ where
 		// payment claim from a `ChannelMonitor`. In some cases (MPP or
 		// if the HTLC was only recently removed) we make such claims
 		// after an HTLC has been removed from a channel entirely, and
-		// thus the RAA blocker has long since completed.
+		// thus the RAA blocker may have long since completed.
 		//
-		// In any other case, the RAA blocker must still be present and
-		// blocking RAAs.
-		debug_assert!(
-			during_init
-				|| peer_state
-					.actions_blocking_raa_monitor_updates
-					.get(&chan_id)
-					.unwrap()
-					.contains(&raa_blocker)
-		);
+		// However, its possible that the `ChannelMonitorUpdate` containing
+		// the preimage never completed and is still pending. In that case,
+		// we need to re-add the RAA blocker, which we do here. Handling
+		// the post-update action, below, will remove it again.
+		//
+		// In any other case (i.e. not during startup), the RAA blocker
+		// must still be present and blocking RAAs.
+		let actions = &mut peer_state.actions_blocking_raa_monitor_updates;
+		let actions_list = actions.entry(chan_id).or_insert_with(Vec::new);
+		if !actions_list.contains(&raa_blocker) {
+			debug_assert!(during_init);
+			actions_list.push(raa_blocker);
+		}
 	}
 	let action = if let Some(action) = action_opt {
 		action
 	} else {
 		return;
 	};
 
+	// If there are monitor updates in flight, we may be in the case
+	// described above, replaying a claim on startup which needs an RAA
+	// blocker to remain blocked. Thus, in such a case we simply push the
+	// post-update action to the blocked list and move on.
+	let in_flight_mons = peer_state.in_flight_monitor_updates.get(&chan_id);
+	if in_flight_mons.map(|(_, mons)| !mons.is_empty()).unwrap_or(false) {
+		peer_state
+			.monitor_update_blocked_actions
+			.entry(chan_id)
+			.or_insert_with(Vec::new)
+			.push(action);
+		return;
+	}
+
 	mem::drop(peer_state_opt);
 
 	log_trace!(logger, "Completing monitor update completion action for channel {} as claim was redundant: {:?}",
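The `ChannelManager` change boils down to two rules: if a replayed claim's RAA blocker is missing, re-add it (which should only happen during init), and if the channel still has `ChannelMonitorUpdate`s in flight, defer the post-update action instead of running it. Below is a minimal, self-contained toy model of that deferral rule; the type and field names here are invented for illustration and are not LDK's actual API.

use std::collections::HashMap;

// Toy model (hypothetical names, not LDK's API) of the deferral rule added above.
#[derive(Debug, PartialEq)]
enum Action {
    PaymentClaimed(u64),
}

#[derive(Default)]
struct PeerState {
    // chan_id -> identifiers of ChannelMonitorUpdates still being persisted
    in_flight_monitor_updates: HashMap<u64, Vec<u64>>,
    // chan_id -> completion actions held back until those updates finish
    monitor_update_blocked_actions: HashMap<u64, Vec<Action>>,
}

fn handle_post_update_action(peer: &mut PeerState, chan_id: u64, action: Action) {
    // If an update for this channel is still in flight, the claim may have been
    // replayed during startup: keep the action (and thus its blocking effect) queued.
    let in_flight = peer.in_flight_monitor_updates.get(&chan_id);
    if in_flight.map(|mons| !mons.is_empty()).unwrap_or(false) {
        peer.monitor_update_blocked_actions.entry(chan_id).or_default().push(action);
        return;
    }
    // Otherwise the update is durable and the action can run immediately.
    println!("running {:?} for channel {}", action, chan_id);
}

fn main() {
    let mut peer = PeerState::default();
    peer.in_flight_monitor_updates.insert(42, vec![1]);
    handle_post_update_action(&mut peer, 42, Action::PaymentClaimed(200_000));
    assert_eq!(peer.monitor_update_blocked_actions[&42], vec![Action::PaymentClaimed(200_000)]);
}

In the real code the queued actions are released once the in-flight `ChannelMonitorUpdate` completes, which is what keeps the preimage pinned in every MPP part's monitor until it is durable everywhere.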
