mirror of
https://github.com/sigp/lighthouse.git
synced 2026-03-03 00:31:50 +00:00
Allow AwaitingDownload to be a valid in-between state (#7984)
N/A Extracts (3) from https://github.com/sigp/lighthouse/pull/7946. Prior to peerdas, a batch should never have been in `AwaitingDownload` state because we immediataly try to move from `AwaitingDownload` to `Downloading` state by sending batches. This was always possible as long as we had peers in the `SyncingChain` in the pre-peerdas world. However, this is no longer the case as a batch can be stuck waiting in `AwaitingDownload` state if we have no peers to request the columns from. This PR makes `AwaitingDownload` to be an allowable in between state. If a batch is found to be in this state, then we attempt to send the batch instead of erroring like before. Note to reviewer: We need to make sure that this doesn't lead to a bunch of batches stuck in `AwaitingDownload` state if the chain can be progressed. Backfill already retries all batches in AwaitingDownload state so we just need to make `AwaitingDownload` a valid state during processing and validation. This PR explicitly adds the same logic for forward sync to download batches stuck in `AwaitingDownload`. Apart from that, we also force download of the `processing_target` when sync stops progressing. This is required in cases where `self.batches` has > `BATCH_BUFFER_SIZE` batches that are waiting to get processed but the `processing_batch` has repeatedly failed at download/processing stage. This leads to sync getting stuck and never recovering.
This commit is contained in:
@@ -687,11 +687,12 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
|||||||
// Batch is not ready, nothing to process
|
// Batch is not ready, nothing to process
|
||||||
}
|
}
|
||||||
BatchState::Poisoned => unreachable!("Poisoned batch"),
|
BatchState::Poisoned => unreachable!("Poisoned batch"),
|
||||||
BatchState::Failed | BatchState::AwaitingDownload | BatchState::Processing(_) => {
|
// Batches can be in `AwaitingDownload` state if there weren't good data column subnet
|
||||||
|
// peers to send the request to.
|
||||||
|
BatchState::AwaitingDownload => return Ok(ProcessResult::Successful),
|
||||||
|
BatchState::Failed | BatchState::Processing(_) => {
|
||||||
// these are all inconsistent states:
|
// these are all inconsistent states:
|
||||||
// - Failed -> non recoverable batch. Chain should have been removed
|
// - Failed -> non recoverable batch. Chain should have been removed
|
||||||
// - AwaitingDownload -> A recoverable failed batch should have been
|
|
||||||
// re-requested.
|
|
||||||
// - Processing -> `self.current_processing_batch` is None
|
// - Processing -> `self.current_processing_batch` is None
|
||||||
self.fail_sync(BackFillError::InvalidSyncState(String::from(
|
self.fail_sync(BackFillError::InvalidSyncState(String::from(
|
||||||
"Invalid expected batch state",
|
"Invalid expected batch state",
|
||||||
@@ -790,7 +791,8 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
BatchState::Downloading(..) => {}
|
BatchState::Downloading(..) => {}
|
||||||
BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => {
|
BatchState::AwaitingDownload => return,
|
||||||
|
BatchState::Failed | BatchState::Poisoned => {
|
||||||
crit!("batch indicates inconsistent chain state while advancing chain")
|
crit!("batch indicates inconsistent chain state while advancing chain")
|
||||||
}
|
}
|
||||||
BatchState::AwaitingProcessing(..) => {}
|
BatchState::AwaitingProcessing(..) => {}
|
||||||
|
|||||||
@@ -350,7 +350,10 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
|||||||
return Ok(KeepChain);
|
return Ok(KeepChain);
|
||||||
}
|
}
|
||||||
BatchState::Poisoned => unreachable!("Poisoned batch"),
|
BatchState::Poisoned => unreachable!("Poisoned batch"),
|
||||||
BatchState::Processing(_) | BatchState::AwaitingDownload | BatchState::Failed => {
|
// Batches can be in `AwaitingDownload` state if there weren't good data column subnet
|
||||||
|
// peers to send the request to.
|
||||||
|
BatchState::AwaitingDownload => return Ok(KeepChain),
|
||||||
|
BatchState::Processing(_) | BatchState::Failed => {
|
||||||
// these are all inconsistent states:
|
// these are all inconsistent states:
|
||||||
// - Processing -> `self.current_processing_batch` is None
|
// - Processing -> `self.current_processing_batch` is None
|
||||||
// - Failed -> non recoverable batch. For an optimistic batch, it should
|
// - Failed -> non recoverable batch. For an optimistic batch, it should
|
||||||
@@ -384,7 +387,10 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
|||||||
// Batch is not ready, nothing to process
|
// Batch is not ready, nothing to process
|
||||||
}
|
}
|
||||||
BatchState::Poisoned => unreachable!("Poisoned batch"),
|
BatchState::Poisoned => unreachable!("Poisoned batch"),
|
||||||
BatchState::Failed | BatchState::AwaitingDownload | BatchState::Processing(_) => {
|
// Batches can be in `AwaitingDownload` state if there weren't good data column subnet
|
||||||
|
// peers to send the request to.
|
||||||
|
BatchState::AwaitingDownload => return Ok(KeepChain),
|
||||||
|
BatchState::Failed | BatchState::Processing(_) => {
|
||||||
// these are all inconsistent states:
|
// these are all inconsistent states:
|
||||||
// - Failed -> non recoverable batch. Chain should have been removed
|
// - Failed -> non recoverable batch. Chain should have been removed
|
||||||
// - AwaitingDownload -> A recoverable failed batch should have been
|
// - AwaitingDownload -> A recoverable failed batch should have been
|
||||||
@@ -582,8 +588,8 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
|||||||
BatchProcessResult::NonFaultyFailure => {
|
BatchProcessResult::NonFaultyFailure => {
|
||||||
batch.processing_completed(BatchProcessingResult::NonFaultyFailure)?;
|
batch.processing_completed(BatchProcessingResult::NonFaultyFailure)?;
|
||||||
|
|
||||||
// Simply re-download the batch.
|
// Simply re-download all batches in `AwaitingDownload` state.
|
||||||
self.send_batch(network, batch_id)
|
self.attempt_send_awaiting_download_batches(network, "non-faulty-failure")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -717,6 +723,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
|||||||
previous_start = %old_start,
|
previous_start = %old_start,
|
||||||
new_start = %self.start_epoch,
|
new_start = %self.start_epoch,
|
||||||
processing_target = %self.processing_target,
|
processing_target = %self.processing_target,
|
||||||
|
id=%self.id,
|
||||||
"Chain advanced"
|
"Chain advanced"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -753,7 +760,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
|||||||
}
|
}
|
||||||
// this is our robust `processing_target`. All previous batches must be awaiting
|
// this is our robust `processing_target`. All previous batches must be awaiting
|
||||||
// validation
|
// validation
|
||||||
let mut redownload_queue = Vec::new();
|
|
||||||
|
|
||||||
for (id, batch) in self.batches.range_mut(..batch_id) {
|
for (id, batch) in self.batches.range_mut(..batch_id) {
|
||||||
if let BatchOperationOutcome::Failed { blacklist } = batch.validation_failed()? {
|
if let BatchOperationOutcome::Failed { blacklist } = batch.validation_failed()? {
|
||||||
@@ -763,18 +769,14 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
|||||||
failing_batch: *id,
|
failing_batch: *id,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
redownload_queue.push(*id);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// no batch maxed out it process attempts, so now the chain's volatile progress must be
|
// no batch maxed out it process attempts, so now the chain's volatile progress must be
|
||||||
// reset
|
// reset
|
||||||
self.processing_target = self.start_epoch;
|
self.processing_target = self.start_epoch;
|
||||||
|
|
||||||
for id in redownload_queue {
|
// finally, re-request the failed batch and all other batches in `AwaitingDownload` state.
|
||||||
self.send_batch(network, id)?;
|
self.attempt_send_awaiting_download_batches(network, "handle_invalid_batch")
|
||||||
}
|
|
||||||
// finally, re-request the failed batch.
|
|
||||||
self.send_batch(network, batch_id)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn stop_syncing(&mut self) {
|
pub fn stop_syncing(&mut self) {
|
||||||
@@ -810,6 +812,9 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
|||||||
|
|
||||||
// advance the chain to the new validating epoch
|
// advance the chain to the new validating epoch
|
||||||
self.advance_chain(network, validating_epoch);
|
self.advance_chain(network, validating_epoch);
|
||||||
|
// attempt to download any batches stuck in the `AwaitingDownload` state because of
|
||||||
|
// a lack of peers earlier
|
||||||
|
self.attempt_send_awaiting_download_batches(network, "start_syncing")?;
|
||||||
if self.optimistic_start.is_none()
|
if self.optimistic_start.is_none()
|
||||||
&& optimistic_epoch > self.processing_target
|
&& optimistic_epoch > self.processing_target
|
||||||
&& !self.attempted_optimistic_starts.contains(&optimistic_epoch)
|
&& !self.attempted_optimistic_starts.contains(&optimistic_epoch)
|
||||||
@@ -939,6 +944,41 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Attempts to send all batches that are in `AwaitingDownload` state.
|
||||||
|
///
|
||||||
|
/// Batches might get stuck in `AwaitingDownload` post peerdas because of lack of peers
|
||||||
|
/// in required subnets. We need to progress them if peers are available at a later point.
|
||||||
|
pub fn attempt_send_awaiting_download_batches(
|
||||||
|
&mut self,
|
||||||
|
network: &mut SyncNetworkContext<T>,
|
||||||
|
src: &str,
|
||||||
|
) -> ProcessingResult {
|
||||||
|
// Collect all batches in AwaitingDownload state and see if they can be sent
|
||||||
|
let awaiting_downloads: Vec<_> = self
|
||||||
|
.batches
|
||||||
|
.iter()
|
||||||
|
.filter(|(_, batch)| matches!(batch.state(), BatchState::AwaitingDownload))
|
||||||
|
.map(|(batch_id, _)| batch_id)
|
||||||
|
.copied()
|
||||||
|
.collect();
|
||||||
|
debug!(
|
||||||
|
?awaiting_downloads,
|
||||||
|
src, "Attempting to send batches awaiting downlaod"
|
||||||
|
);
|
||||||
|
|
||||||
|
for batch_id in awaiting_downloads {
|
||||||
|
if self.good_peers_on_sampling_subnets(batch_id, network) {
|
||||||
|
self.send_batch(network, batch_id)?;
|
||||||
|
} else {
|
||||||
|
debug!(
|
||||||
|
src = "attempt_send_awaiting_download_batches",
|
||||||
|
"Waiting for peers to be available on sampling column subnets"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(KeepChain)
|
||||||
|
}
|
||||||
|
|
||||||
/// Requests the batch assigned to the given id from a given peer.
|
/// Requests the batch assigned to the given id from a given peer.
|
||||||
pub fn send_batch(
|
pub fn send_batch(
|
||||||
&mut self,
|
&mut self,
|
||||||
@@ -1089,14 +1129,16 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
|||||||
if !matches!(self.state, ChainSyncingState::Syncing) {
|
if !matches!(self.state, ChainSyncingState::Syncing) {
|
||||||
return Ok(KeepChain);
|
return Ok(KeepChain);
|
||||||
}
|
}
|
||||||
|
|
||||||
// find the next pending batch and request it from the peer
|
// find the next pending batch and request it from the peer
|
||||||
|
|
||||||
// check if we have the batch for our optimistic start. If not, request it first.
|
// check if we have the batch for our optimistic start. If not, request it first.
|
||||||
// We wait for this batch before requesting any other batches.
|
// We wait for this batch before requesting any other batches.
|
||||||
if let Some(epoch) = self.optimistic_start {
|
if let Some(epoch) = self.optimistic_start {
|
||||||
if !self.good_peers_on_sampling_subnets(epoch, network) {
|
if !self.good_peers_on_sampling_subnets(epoch, network) {
|
||||||
debug!("Waiting for peers to be available on sampling column subnets");
|
debug!(
|
||||||
|
src = "request_batches_optimistic",
|
||||||
|
"Waiting for peers to be available on sampling column subnets"
|
||||||
|
);
|
||||||
return Ok(KeepChain);
|
return Ok(KeepChain);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1105,6 +1147,8 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
|||||||
let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH, batch_type);
|
let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH, batch_type);
|
||||||
entry.insert(optimistic_batch);
|
entry.insert(optimistic_batch);
|
||||||
self.send_batch(network, epoch)?;
|
self.send_batch(network, epoch)?;
|
||||||
|
} else {
|
||||||
|
self.attempt_send_awaiting_download_batches(network, "request_batches_optimistic")?;
|
||||||
}
|
}
|
||||||
return Ok(KeepChain);
|
return Ok(KeepChain);
|
||||||
}
|
}
|
||||||
@@ -1179,7 +1223,10 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
|||||||
// block and data column requests are currently coupled. This can be removed once we find a
|
// block and data column requests are currently coupled. This can be removed once we find a
|
||||||
// way to decouple the requests and do retries individually, see issue #6258.
|
// way to decouple the requests and do retries individually, see issue #6258.
|
||||||
if !self.good_peers_on_sampling_subnets(self.to_be_downloaded, network) {
|
if !self.good_peers_on_sampling_subnets(self.to_be_downloaded, network) {
|
||||||
debug!("Waiting for peers to be available on custody column subnets");
|
debug!(
|
||||||
|
src = "include_next_batch",
|
||||||
|
"Waiting for peers to be available on custody column subnets"
|
||||||
|
);
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user