Allow AwaitingDownload to be a valid in-between state (#7984)

N/A


  Extracts (3) from https://github.com/sigp/lighthouse/pull/7946.

Prior to peerdas, a batch should never have been in `AwaitingDownload` state because we immediataly try to move from `AwaitingDownload` to `Downloading` state by sending batches. This was always possible as long as we had peers in the `SyncingChain` in the pre-peerdas world.

However, this is no longer the case as a batch can be stuck waiting in `AwaitingDownload` state if we have no peers to request the columns from. This PR makes `AwaitingDownload` to be an allowable in between state. If a batch is found to be in this state, then we attempt to send the batch instead of erroring like before.
Note to reviewer: We need to make sure that this doesn't lead to a bunch of batches stuck in `AwaitingDownload` state if the chain can be progressed.

Backfill already retries all batches in AwaitingDownload state so we just need to make `AwaitingDownload` a valid state during processing and validation.

This PR explicitly adds the same logic for forward sync to download batches stuck in `AwaitingDownload`.
Apart from that, we also force download of the `processing_target` when sync stops progressing. This is required in cases where `self.batches` has > `BATCH_BUFFER_SIZE` batches that are waiting to get processed but the `processing_batch` has repeatedly failed at download/processing stage. This leads to sync getting stuck and never recovering.
This commit is contained in:
Pawan Dhananjay
2025-09-04 00:39:16 -07:00
committed by GitHub
parent c2a92f1a8c
commit 84ec209eba
2 changed files with 67 additions and 18 deletions

View File

@@ -687,11 +687,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
// Batch is not ready, nothing to process // Batch is not ready, nothing to process
} }
BatchState::Poisoned => unreachable!("Poisoned batch"), BatchState::Poisoned => unreachable!("Poisoned batch"),
BatchState::Failed | BatchState::AwaitingDownload | BatchState::Processing(_) => { // Batches can be in `AwaitingDownload` state if there weren't good data column subnet
// peers to send the request to.
BatchState::AwaitingDownload => return Ok(ProcessResult::Successful),
BatchState::Failed | BatchState::Processing(_) => {
// these are all inconsistent states: // these are all inconsistent states:
// - Failed -> non recoverable batch. Chain should have been removed // - Failed -> non recoverable batch. Chain should have been removed
// - AwaitingDownload -> A recoverable failed batch should have been
// re-requested.
// - Processing -> `self.current_processing_batch` is None // - Processing -> `self.current_processing_batch` is None
self.fail_sync(BackFillError::InvalidSyncState(String::from( self.fail_sync(BackFillError::InvalidSyncState(String::from(
"Invalid expected batch state", "Invalid expected batch state",
@@ -790,7 +791,8 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
} }
} }
BatchState::Downloading(..) => {} BatchState::Downloading(..) => {}
BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => { BatchState::AwaitingDownload => return,
BatchState::Failed | BatchState::Poisoned => {
crit!("batch indicates inconsistent chain state while advancing chain") crit!("batch indicates inconsistent chain state while advancing chain")
} }
BatchState::AwaitingProcessing(..) => {} BatchState::AwaitingProcessing(..) => {}

View File

@@ -350,7 +350,10 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
return Ok(KeepChain); return Ok(KeepChain);
} }
BatchState::Poisoned => unreachable!("Poisoned batch"), BatchState::Poisoned => unreachable!("Poisoned batch"),
BatchState::Processing(_) | BatchState::AwaitingDownload | BatchState::Failed => { // Batches can be in `AwaitingDownload` state if there weren't good data column subnet
// peers to send the request to.
BatchState::AwaitingDownload => return Ok(KeepChain),
BatchState::Processing(_) | BatchState::Failed => {
// these are all inconsistent states: // these are all inconsistent states:
// - Processing -> `self.current_processing_batch` is None // - Processing -> `self.current_processing_batch` is None
// - Failed -> non recoverable batch. For an optimistic batch, it should // - Failed -> non recoverable batch. For an optimistic batch, it should
@@ -384,7 +387,10 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// Batch is not ready, nothing to process // Batch is not ready, nothing to process
} }
BatchState::Poisoned => unreachable!("Poisoned batch"), BatchState::Poisoned => unreachable!("Poisoned batch"),
BatchState::Failed | BatchState::AwaitingDownload | BatchState::Processing(_) => { // Batches can be in `AwaitingDownload` state if there weren't good data column subnet
// peers to send the request to.
BatchState::AwaitingDownload => return Ok(KeepChain),
BatchState::Failed | BatchState::Processing(_) => {
// these are all inconsistent states: // these are all inconsistent states:
// - Failed -> non recoverable batch. Chain should have been removed // - Failed -> non recoverable batch. Chain should have been removed
// - AwaitingDownload -> A recoverable failed batch should have been // - AwaitingDownload -> A recoverable failed batch should have been
@@ -582,8 +588,8 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
BatchProcessResult::NonFaultyFailure => { BatchProcessResult::NonFaultyFailure => {
batch.processing_completed(BatchProcessingResult::NonFaultyFailure)?; batch.processing_completed(BatchProcessingResult::NonFaultyFailure)?;
// Simply re-download the batch. // Simply re-download all batches in `AwaitingDownload` state.
self.send_batch(network, batch_id) self.attempt_send_awaiting_download_batches(network, "non-faulty-failure")
} }
} }
} }
@@ -717,6 +723,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
previous_start = %old_start, previous_start = %old_start,
new_start = %self.start_epoch, new_start = %self.start_epoch,
processing_target = %self.processing_target, processing_target = %self.processing_target,
id=%self.id,
"Chain advanced" "Chain advanced"
); );
} }
@@ -753,7 +760,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
} }
// this is our robust `processing_target`. All previous batches must be awaiting // this is our robust `processing_target`. All previous batches must be awaiting
// validation // validation
let mut redownload_queue = Vec::new();
for (id, batch) in self.batches.range_mut(..batch_id) { for (id, batch) in self.batches.range_mut(..batch_id) {
if let BatchOperationOutcome::Failed { blacklist } = batch.validation_failed()? { if let BatchOperationOutcome::Failed { blacklist } = batch.validation_failed()? {
@@ -763,18 +769,14 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
failing_batch: *id, failing_batch: *id,
}); });
} }
redownload_queue.push(*id);
} }
// no batch maxed out it process attempts, so now the chain's volatile progress must be // no batch maxed out it process attempts, so now the chain's volatile progress must be
// reset // reset
self.processing_target = self.start_epoch; self.processing_target = self.start_epoch;
for id in redownload_queue { // finally, re-request the failed batch and all other batches in `AwaitingDownload` state.
self.send_batch(network, id)?; self.attempt_send_awaiting_download_batches(network, "handle_invalid_batch")
}
// finally, re-request the failed batch.
self.send_batch(network, batch_id)
} }
pub fn stop_syncing(&mut self) { pub fn stop_syncing(&mut self) {
@@ -810,6 +812,9 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// advance the chain to the new validating epoch // advance the chain to the new validating epoch
self.advance_chain(network, validating_epoch); self.advance_chain(network, validating_epoch);
// attempt to download any batches stuck in the `AwaitingDownload` state because of
// a lack of peers earlier
self.attempt_send_awaiting_download_batches(network, "start_syncing")?;
if self.optimistic_start.is_none() if self.optimistic_start.is_none()
&& optimistic_epoch > self.processing_target && optimistic_epoch > self.processing_target
&& !self.attempted_optimistic_starts.contains(&optimistic_epoch) && !self.attempted_optimistic_starts.contains(&optimistic_epoch)
@@ -939,6 +944,41 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
} }
} }
/// Attempts to send all batches that are in `AwaitingDownload` state.
///
/// Batches might get stuck in `AwaitingDownload` post peerdas because of lack of peers
/// in required subnets. We need to progress them if peers are available at a later point.
pub fn attempt_send_awaiting_download_batches(
&mut self,
network: &mut SyncNetworkContext<T>,
src: &str,
) -> ProcessingResult {
// Collect all batches in AwaitingDownload state and see if they can be sent
let awaiting_downloads: Vec<_> = self
.batches
.iter()
.filter(|(_, batch)| matches!(batch.state(), BatchState::AwaitingDownload))
.map(|(batch_id, _)| batch_id)
.copied()
.collect();
debug!(
?awaiting_downloads,
src, "Attempting to send batches awaiting downlaod"
);
for batch_id in awaiting_downloads {
if self.good_peers_on_sampling_subnets(batch_id, network) {
self.send_batch(network, batch_id)?;
} else {
debug!(
src = "attempt_send_awaiting_download_batches",
"Waiting for peers to be available on sampling column subnets"
);
}
}
Ok(KeepChain)
}
/// Requests the batch assigned to the given id from a given peer. /// Requests the batch assigned to the given id from a given peer.
pub fn send_batch( pub fn send_batch(
&mut self, &mut self,
@@ -1089,14 +1129,16 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
if !matches!(self.state, ChainSyncingState::Syncing) { if !matches!(self.state, ChainSyncingState::Syncing) {
return Ok(KeepChain); return Ok(KeepChain);
} }
// find the next pending batch and request it from the peer // find the next pending batch and request it from the peer
// check if we have the batch for our optimistic start. If not, request it first. // check if we have the batch for our optimistic start. If not, request it first.
// We wait for this batch before requesting any other batches. // We wait for this batch before requesting any other batches.
if let Some(epoch) = self.optimistic_start { if let Some(epoch) = self.optimistic_start {
if !self.good_peers_on_sampling_subnets(epoch, network) { if !self.good_peers_on_sampling_subnets(epoch, network) {
debug!("Waiting for peers to be available on sampling column subnets"); debug!(
src = "request_batches_optimistic",
"Waiting for peers to be available on sampling column subnets"
);
return Ok(KeepChain); return Ok(KeepChain);
} }
@@ -1105,6 +1147,8 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH, batch_type); let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH, batch_type);
entry.insert(optimistic_batch); entry.insert(optimistic_batch);
self.send_batch(network, epoch)?; self.send_batch(network, epoch)?;
} else {
self.attempt_send_awaiting_download_batches(network, "request_batches_optimistic")?;
} }
return Ok(KeepChain); return Ok(KeepChain);
} }
@@ -1179,7 +1223,10 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// block and data column requests are currently coupled. This can be removed once we find a // block and data column requests are currently coupled. This can be removed once we find a
// way to decouple the requests and do retries individually, see issue #6258. // way to decouple the requests and do retries individually, see issue #6258.
if !self.good_peers_on_sampling_subnets(self.to_be_downloaded, network) { if !self.good_peers_on_sampling_subnets(self.to_be_downloaded, network) {
debug!("Waiting for peers to be available on custody column subnets"); debug!(
src = "include_next_batch",
"Waiting for peers to be available on custody column subnets"
);
return None; return None;
} }