Skip to content

Commit 57367bb

Browse files
apollo_l1_provider: go to bootstrap when getting commit_block, not on startup (#9840)
1 parent 04199f1 commit 57367bb

File tree

6 files changed

+130
-118
lines changed

6 files changed

+130
-118
lines changed

crates/apollo_l1_provider/src/bootstrapper.rs

Lines changed: 25 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -2,30 +2,28 @@ use std::sync::atomic::{AtomicU8, Ordering};
22
use std::sync::Arc;
33
use std::time::Duration;
44

5-
use apollo_batcher_types::batcher_types::GetHeightResponse;
6-
use apollo_batcher_types::communication::SharedBatcherClient;
75
use apollo_l1_provider_types::SharedL1ProviderClient;
86
use apollo_state_sync_types::communication::SharedStateSyncClient;
97
use indexmap::IndexSet;
108
use starknet_api::block::BlockNumber;
119
use starknet_api::transaction::TransactionHash;
12-
use tokio::sync::OnceCell;
13-
use tracing::{debug, error, info};
10+
use tracing::debug;
1411

15-
pub type LazyCatchUpHeight = Arc<OnceCell<BlockNumber>>;
12+
// When the Provider gets a commit_block that is too high, it starts bootstrapping.
13+
// The commit is rejected by the provider, so it must use sync to catch up to the height of the
14+
// commit, including that height. The sync task continues until reaching the target height,
15+
// inclusive, and only after the commit_block (from sync) causes the Provider's current height to be
16+
// one above the target height, is the backlog applied. Once done with the sync+backlog, the current
17+
// height should be one above the last commit in the backlog, which makes it ready for the next
18+
// commit_block from the batcher.
1619

1720
/// Caches commits to be applied later. This flow is only relevant while the provider bootstraps.
1821
#[derive(Clone)]
1922
pub struct Bootstrapper {
20-
/// The catch-up height for the bootstrapper is the batcher height (unless overridden
21-
/// explicitly). This value, due to infra constraints as of now, is only fetchable _after_
22-
/// the provider is running, and not during its initialization, hence we are forced to
23-
/// lazily fetch it at runtime.
24-
pub catch_up_height: LazyCatchUpHeight,
23+
pub catch_up_height: BlockNumber,
2524
pub sync_retry_interval: Duration,
2625
pub commit_block_backlog: Vec<CommitBlockBacklog>,
2726
pub l1_provider_client: SharedL1ProviderClient,
28-
pub batcher_client: SharedBatcherClient,
2927
pub sync_client: SharedStateSyncClient,
3028
// Keep track of sync task for health checks and logging status.
3129
pub sync_task_handle: SyncTaskHandle,
@@ -39,16 +37,14 @@ impl Bootstrapper {
3937

4038
pub fn new(
4139
l1_provider_client: SharedL1ProviderClient,
42-
batcher_client: SharedBatcherClient,
4340
sync_client: SharedStateSyncClient,
4441
sync_retry_interval: Duration,
45-
catch_up_height: LazyCatchUpHeight,
42+
catch_up_height: BlockNumber,
4643
) -> Self {
4744
Self {
4845
sync_retry_interval,
4946
commit_block_backlog: Default::default(),
5047
l1_provider_client,
51-
batcher_client,
5248
sync_client,
5349
sync_task_handle: SyncTaskHandle::NotStartedYet,
5450
n_sync_health_check_failures: Default::default(),
@@ -57,12 +53,8 @@ impl Bootstrapper {
5753
}
5854

5955
/// Check if the caller has caught up with the bootstrapper.
60-
/// If catch_up_height is unset, the batcher isn't even ready yet.
6156
pub fn is_caught_up(&self, current_provider_height: BlockNumber) -> bool {
62-
let is_caught_up = match self.catch_up_height() {
63-
Some(catch_up_height) => current_provider_height > catch_up_height,
64-
None => current_provider_height == BlockNumber(0),
65-
};
57+
let is_caught_up = current_provider_height > self.catch_up_height;
6658

6759
self.sync_task_health_check(is_caught_up);
6860

@@ -88,25 +80,29 @@ impl Bootstrapper {
8880

8981
/// Spawns async task that produces and sends commit block messages to the provider, according
9082
/// to information from the sync client, until the provider is caught up.
91-
pub async fn start_l2_sync(&mut self, current_provider_height: BlockNumber) {
83+
pub fn start_l2_sync(
84+
&mut self,
85+
current_provider_height: BlockNumber,
86+
catch_up_height: BlockNumber,
87+
) {
88+
self.catch_up_height = catch_up_height;
9289
// FIXME: spawning a task like this is evil.
9390
// However, we aren't using the task executor, so no choice :(
9491
// Once we start using a centralized threadpool, spawn through it instead of the
9592
// tokio runtime.
9693
let sync_task_handle = tokio::spawn(l2_sync_task(
9794
self.l1_provider_client.clone(),
98-
self.batcher_client.clone(),
9995
self.sync_client.clone(),
10096
current_provider_height,
101-
self.catch_up_height.clone(),
97+
catch_up_height,
10298
self.sync_retry_interval,
10399
));
104100

105101
self.sync_task_handle = SyncTaskHandle::Started(sync_task_handle.into());
106102
}
107103

108-
pub fn catch_up_height(&self) -> Option<BlockNumber> {
109-
self.catch_up_height.get().copied()
104+
pub fn catch_up_height(&self) -> BlockNumber {
105+
self.catch_up_height
110106
}
111107

112108
pub fn sync_started(&self) -> bool {
@@ -155,36 +151,17 @@ impl std::fmt::Debug for Bootstrapper {
155151

156152
async fn l2_sync_task(
157153
l1_provider_client: SharedL1ProviderClient,
158-
batcher_client: SharedBatcherClient,
159154
sync_client: SharedStateSyncClient,
160155
mut current_height: BlockNumber,
161-
catch_up_height: LazyCatchUpHeight,
156+
catch_up_height: BlockNumber,
162157
retry_interval: Duration,
163158
) {
164-
info!("Try fetching batcher height to initialize catch up point");
165-
while !catch_up_height.initialized() {
166-
let Ok(GetHeightResponse { height: batcher_height }) = batcher_client.get_height().await
167-
else {
168-
error!("Batcher height request failed. Retrying...");
169-
tokio::time::sleep(retry_interval).await;
170-
continue;
171-
};
172-
173-
let Some(batcher_latest_block_number) = batcher_height.prev() else {
174-
info!("Batcher height is 0, no need to catch up. exiting...");
175-
return;
176-
};
177-
178-
info!("Catch up height set: {batcher_latest_block_number}");
179-
catch_up_height
180-
.set(batcher_latest_block_number)
181-
.expect("This is the only write-point, cannot fail")
182-
}
183-
let catch_up_height = *catch_up_height.get().expect("Initialized above");
184-
185159
while current_height <= catch_up_height {
186160
// TODO(Gilad): add tracing instrument.
187-
debug!("Try syncing L1Provider with L2 height: {}", current_height);
161+
debug!(
162+
"Syncing L1Provider with L2 height: {} to target height: {}",
163+
current_height, catch_up_height
164+
);
188165
let block = sync_client.get_block(current_height).await.inspect_err(|err| debug!("{err}"));
189166

190167
match block {

crates/apollo_l1_provider/src/l1_provider.rs

Lines changed: 43 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -68,15 +68,13 @@ impl L1Provider {
6868
);
6969
};
7070

71-
// The provider now goes into bootstrap state.
72-
// TODO(guyn): in the future, this will happen when batcher calls the provider with a height
73-
// bigger than the current height. This would happen either in Uninitialized or in other
74-
// non-bootstrap states. That means we will move from Uninitialized to Pending (but
75-
// on a height much lower than the batcher's).
71+
// The provider now goes into Pending state.
72+
// The current_height is set to a very old height, that doesn't include any of the events
73+
// sent now, or to be scraped in the future. The provider will begin bootstrapping when the
74+
// batcher calls commit_block with a height above the current height.
7675
self.start_height = Some(historic_l2_height);
7776
self.current_height = historic_l2_height;
78-
self.state = ProviderState::Bootstrap;
79-
self.bootstrapper.start_l2_sync(self.current_height).await;
77+
self.state = ProviderState::Pending;
8078
self.add_events(events)?;
8179

8280
Ok(())
@@ -284,16 +282,43 @@ impl L1Provider {
284282

285283
// If not historical height and not bootstrapping, must go into bootstrap state upon getting
286284
// wrong height.
287-
// TODO(guyn): for now, we go into bootstrap using panic. We should improve this.
288-
self.check_height_with_panic(height);
289-
self.apply_commit_block(committed_txs, rejected_txs);
290-
291-
self.state = self.state.transition_to_pending();
292-
Ok(())
285+
match self.check_height_with_error(height) {
286+
Ok(_) => {
287+
self.apply_commit_block(committed_txs, rejected_txs);
288+
self.state = self.state.transition_to_pending();
289+
Ok(())
290+
}
291+
Err(err) => {
292+
// We are returning an error -> not accepting the block with this height. In order
293+
// to be able to serve future requests, we must catch up to it, and finish
294+
// catching up when the provider has synced this height.
295+
if self.state.is_uninitialized() {
296+
warn!(
297+
"Provider received a block height ({height}) while it is uninitialized. \
298+
Cannot start bootstrapping until getting the historic_height from the \
299+
scraper during the initialize call."
300+
);
301+
} else {
302+
info!(
303+
"Provider received a block_height ({height}) that is higher than the \
304+
current height ({}), starting bootstrapping.",
305+
self.current_height
306+
);
307+
self.start_bootstrapping(height);
308+
}
309+
Err(err)
310+
}
311+
}
293312
}
294313

295314
// Functions called internally.
296315

316+
/// Go from current state to Bootstrap state and start the L2 sync.
317+
pub fn start_bootstrapping(&mut self, target_height: BlockNumber) {
318+
self.state = ProviderState::Bootstrap;
319+
self.bootstrapper.start_l2_sync(self.current_height, target_height);
320+
}
321+
297322
/// Commit the given transactions, and increment the current height.
298323
fn apply_commit_block(
299324
&mut self,
@@ -323,7 +348,6 @@ impl L1Provider {
323348
"Bootstrapper processing commit-block at height: {new_height}, current height is \
324349
{current_height}"
325350
);
326-
327351
match new_height.cmp(&current_height) {
328352
// This is likely a bug in the batcher/sync, it should never be _behind_ the provider.
329353
Less => {
@@ -369,6 +393,8 @@ impl L1Provider {
369393
};
370394

371395
// If caught up, apply the backlog and transition to Pending.
396+
// Note that at this point self.current_height is already incremented to the next height; it
397+
// is one more than the latest block that was committed.
372398
if self.bootstrapper.is_caught_up(self.current_height) {
373399
info!(
374400
"Bootstrapper sync completed, provider height is now {}, processing backlog...",
@@ -408,21 +434,6 @@ impl L1Provider {
408434
Ok(())
409435
}
410436

411-
fn check_height_with_panic(&mut self, height: BlockNumber) {
412-
if height > self.current_height {
413-
// TODO(shahak): Add a way to move to bootstrap mode from any point and move to
414-
// bootstrap here instead of panicking.
415-
panic!(
416-
"Batcher surpassed l1 provider. Panicking in order to restart the provider and \
417-
bootstrap again. l1 provider height: {}, batcher height: {}",
418-
self.current_height, height
419-
);
420-
}
421-
if height < self.current_height {
422-
panic!("Unexpected height: expected >= {}, got {}", self.current_height, height);
423-
}
424-
}
425-
426437
fn check_height_with_error(&mut self, height: BlockNumber) -> L1ProviderResult<()> {
427438
if height != self.current_height {
428439
return Err(L1ProviderError::UnexpectedHeight {
@@ -535,6 +546,8 @@ impl L1ProviderBuilder {
535546
})
536547
.or(self.startup_height);
537548

549+
// TODO(guyn): try to remove the input catchup_height entirely (check if it is needed for
550+
// tests/Anvil).
538551
let catchup_height = self
539552
.config
540553
.bootstrap_catch_up_height_override
@@ -546,13 +559,10 @@ impl L1ProviderBuilder {
546559
);
547560
})
548561
.or(self.catchup_height)
549-
.map(|catchup_height| Arc::new(catchup_height.into()))
550-
// When kept None, this value is fetched from the batcher by the bootstrapper at runtime.
551-
.unwrap_or_default();
562+
.unwrap_or_default(); // If bootstrapper is not running, the catchup height can be arbitrarily low.
552563

553564
let bootstrapper = Bootstrapper::new(
554565
self.l1_provider_client,
555-
self.batcher_client,
556566
self.state_sync_client,
557567
self.config.startup_sync_sleep_retry_interval_seconds,
558568
catchup_height,

0 commit comments

Comments
 (0)