-
Notifications
You must be signed in to change notification settings - Fork 254
Track segment indexes correctly during DSN sync #3521
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
60b6d97
65d7f6f
bf83772
b16fc0a
8e1937f
10a7a93
db1167e
e03f74d
1524873
083b8d3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,7 @@ use sc_consensus::import_queue::ImportQueueService; | |
use sc_consensus::IncomingBlock; | ||
use sc_consensus_subspace::archiver::{decode_block, encode_block, SegmentHeadersStore}; | ||
use sc_service::Error; | ||
use sc_tracing::tracing::{debug, trace}; | ||
use sc_tracing::tracing::{debug, info, trace}; | ||
use sp_consensus::BlockOrigin; | ||
use sp_runtime::generic::SignedBlock; | ||
use sp_runtime::traits::{Block as BlockT, Header, NumberFor, One}; | ||
|
@@ -35,7 +35,7 @@ pub(super) async fn import_blocks_from_dsn<Block, AS, Client, PG, IQS>( | |
client: &Client, | ||
piece_getter: &PG, | ||
import_queue_service: &mut IQS, | ||
last_processed_segment_index: &mut SegmentIndex, | ||
last_completed_segment_index: &mut SegmentIndex, | ||
last_processed_block_number: &mut NumberFor<Block>, | ||
erasure_coding: &ErasureCoding, | ||
) -> Result<u64, Error> | ||
|
@@ -70,7 +70,7 @@ where | |
let mut imported_blocks = 0; | ||
let mut reconstructor = Arc::new(Mutex::new(Reconstructor::new(erasure_coding.clone()))); | ||
// Start from the first unprocessed segment and process all segments known so far | ||
let segment_indices_iter = (*last_processed_segment_index + SegmentIndex::ONE) | ||
let segment_indices_iter = (*last_completed_segment_index + SegmentIndex::ONE) | ||
..=segment_headers_store | ||
.max_segment_index() | ||
.expect("Exists, we have inserted segment headers above; qed"); | ||
|
@@ -105,20 +105,56 @@ where | |
// so it can't change. Resetting the reconstructor loses any partial blocks, so we | ||
// only reset if the (possibly partial) last block has been processed. | ||
if *last_processed_block_number >= last_archived_maybe_partial_block_number { | ||
*last_processed_segment_index = segment_index; | ||
debug!( | ||
target: LOG_TARGET, | ||
%segment_index, | ||
%last_processed_block_number, | ||
%last_archived_maybe_partial_block_number, | ||
%last_archived_block_partial, | ||
"Already processed last (possibly partial) block in segment, resetting reconstructor", | ||
); | ||
*last_completed_segment_index = segment_index; | ||
// Reset reconstructor instance | ||
reconstructor = Arc::new(Mutex::new(Reconstructor::new(erasure_coding.clone()))); | ||
continue; | ||
} | ||
// Just one partial unprocessed block and this was the last segment available, so nothing to | ||
// import | ||
// import. (But we also haven't finished this segment yet, because of the partial block.) | ||
if last_archived_maybe_partial_block_number == *last_processed_block_number + One::one() | ||
&& last_archived_block_partial | ||
&& segment_indices_iter.peek().is_none() | ||
{ | ||
// Reset reconstructor instance | ||
reconstructor = Arc::new(Mutex::new(Reconstructor::new(erasure_coding.clone()))); | ||
continue; | ||
if segment_indices_iter.peek().is_none() { | ||
// We haven't fully processed this segment yet, because it ends with a partial block. | ||
*last_completed_segment_index = segment_index.saturating_sub(SegmentIndex::ONE); | ||
|
||
// We don't need to reset the reconstructor here. We've finished getting blocks, so | ||
// we're about to return and drop the reconstructor and its partial block anyway. | ||
// (Normally, we'd need that partial block to avoid a block gap. But we should be close | ||
// enough to the tip that normal syncing will fill any gaps.) | ||
debug!( | ||
target: LOG_TARGET, | ||
%segment_index, | ||
%last_processed_block_number, | ||
%last_archived_maybe_partial_block_number, | ||
%last_archived_block_partial, | ||
"No more segments, snap sync is about to finish", | ||
); | ||
continue; | ||
} else { | ||
// Downloading an entire segment for one partial block should be rare, but if it | ||
// happens a lot we want to see it in the logs. | ||
// | ||
// TODO: if this happens a lot, check for network/DSN sync bugs - we should be able | ||
// to sync to near the tip reliably, so we don't have to keep reconstructor state. | ||
info!( | ||
target: LOG_TARGET, | ||
%segment_index, | ||
%last_processed_block_number, | ||
%last_archived_maybe_partial_block_number, | ||
%last_archived_block_partial, | ||
"Downloading entire segment for one partial block", | ||
); | ||
} | ||
} | ||
|
||
let segment_pieces = download_segment_pieces(segment_index, piece_getter) | ||
|
@@ -239,7 +275,12 @@ where | |
import_queue_service.import_blocks(BlockOrigin::NetworkInitialSync, blocks_to_import); | ||
} | ||
|
||
*last_processed_segment_index = segment_index; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the problem is in this line in the
// Segments are only fully processed when all their blocks are processed.
if last_archived_block_partial {
*last_processed_segment_index = segment_index.saturating_sub(1);
} else {
*last_processed_segment_index = segment_index;
} |
||
// Segments are only fully processed when all their blocks are fully processed. | ||
if last_archived_block_partial { | ||
*last_completed_segment_index = segment_index.saturating_sub(SegmentIndex::ONE); | ||
} else { | ||
*last_completed_segment_index = segment_index; | ||
} | ||
} | ||
|
||
Ok(imported_blocks) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -416,7 +416,7 @@ where | |
{ | ||
let block_number = *header.number(); | ||
|
||
const STATE_SYNC_RETRIES: u32 = 5; | ||
const STATE_SYNC_RETRIES: u32 = 20; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need such high state sync retries? I'm not opposing it, but I'd like to know the reason behind the change. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It was sufficient for some nodes, but not others. Both Jim and I got failures here during testing, and the state sync is small, so it's cheaper to retry a few more times here, than give up and start the entire process again (or require manual operator intervention). |
||
const LOOP_PAUSE: Duration = Duration::from_secs(20); | ||
const MAX_GET_PEERS_ATTEMPT_NUMBER: usize = 30; | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why was it needed to reorder this? It was at the end to logically show that we discard all the extra notifications that were issued while we were processing the previous notification. I don't see how it fixes or breaks anything being moved here, but I also don't understand why it is needed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not particularly committed to this part of the change.
The block gap was sometimes being hidden by substrate sync, which was one reason the bug was hard to diagnose and fix.
Reducing the amount of time that the atomic is set to "true" in each loop reduces the chance that substrate sync hides any bugs - if there are a lot of sync notifications being sent continuously.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see, so the goal was to move it before atomic specifically. Wasn't obvious, but makes sense now, thanks.