Sync cleanups (#8230)

N/A


1. In the batch retry logic, we were failing to set the batch state to `AwaitingDownload` before attempting a retry. This PR sets it to `AwaitingDownload` before the retry and sets it back to `Downloading` only if the retry succeeds in sending out a request (see the first sketch after this list).
2. Remove all peer scoring logic from retrying and rely on just de-prioritizing the failed peer. I finally concede the point to @dapplion 😄
3. Changes `block_components_by_range_request` to accept `block_peers` and `column_peers`. This ensures that we use the full synced peer set for requesting columns, in order to avoid splitting the column peers among multiple head chains. During forward sync, the block peers are the peers from the syncing chain and the column peers are all synced peers from the peer db (see the second sketch after this list).

Also fixes a typo and calls `attempt_send_awaiting_download_batches` from more places.
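A minimal sketch of the batch state transition from point 1, assuming simplified stand-ins for the batch types (the real `BatchInfo` in the diff below carries more state and returns a `BatchOperationOutcome`):

```rust
// Hypothetical, simplified model of the retry flow in point 1.
#[derive(Debug)]
enum BatchState {
    AwaitingDownload,
    Downloading(u32), // active request id
}

struct Batch {
    state: BatchState,
}

impl Batch {
    /// Rewind `Downloading` to `AwaitingDownload` without registering a failed
    /// attempt (safe here only because column retries are capped).
    fn downloading_to_awaiting_download(&mut self) -> Result<(), String> {
        match self.state {
            BatchState::Downloading(_) => {
                self.state = BatchState::AwaitingDownload;
                Ok(())
            }
            ref other => Err(format!("wrong state {other:?}")),
        }
    }

    /// Only a batch in `AwaitingDownload` may start a new download.
    fn start_downloading(&mut self, request_id: u32) -> Result<(), String> {
        match self.state {
            BatchState::AwaitingDownload => {
                self.state = BatchState::Downloading(request_id);
                Ok(())
            }
            ref other => Err(format!("wrong state {other:?}")),
        }
    }
}

fn retry_partial_batch(batch: &mut Batch, new_request_id: u32) -> Result<(), String> {
    // Previously the batch stayed in `Downloading`, so a retry that failed to
    // send left it stuck there. Now: first rewind to `AwaitingDownload`...
    batch.downloading_to_awaiting_download()?;
    // ...and only flip back to `Downloading` once the new request is sent out.
    batch.start_downloading(new_request_id)
}
```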
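And a sketch of the peer-set split from point 3. The function name matches the PR, but the signature and selection logic are heavily simplified illustrations (`PeerId` is a stand-in, and the real selection also weighs active request counts per peer):

```rust
use std::collections::HashSet;

type PeerId = u64; // stand-in for the real libp2p peer id

/// Illustrative only: blocks come from the syncing chain's own peers, while
/// columns may come from any synced peer in the peer db.
fn block_components_by_range_request(
    block_peers: &HashSet<PeerId>,  // peers of this specific chain
    column_peers: &HashSet<PeerId>, // all synced peers from the peer db
    peers_to_deprioritize: &HashSet<PeerId>,
) -> Option<(PeerId, Vec<PeerId>)> {
    // Pick a block peer from the chain's peers, preferring ones that haven't failed.
    let block_peer = block_peers
        .iter()
        .min_by_key(|p| peers_to_deprioritize.contains(*p))
        .copied()?;
    // Columns are requested from the full synced peer set rather than this
    // chain's peers, so good column peers aren't split across head chains.
    let columns: Vec<PeerId> = column_peers
        .iter()
        .filter(|p| !peers_to_deprioritize.contains(*p))
        .copied()
        .collect();
    Some((block_peer, columns))
}
```

If the chosen block peer and the column peers turn out to be on different forks, coupling fails and only the failed columns are retried.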


Co-Authored-By: Pawan Dhananjay <pawandhananjay@gmail.com>
Pawan Dhananjay authored on 2025-10-20 04:50:00 -07:00 (committed by GitHub)
parent c012f46cb9
commit 092aaae961
6 changed files with 77 additions and 44 deletions

View File

@@ -247,23 +247,16 @@ impl<E: EthSpec> PeerDB<E> {
             .map(|(peer_id, _)| peer_id)
     }
 
-    /// Returns all the synced peers from the list of allowed peers that claim to have the block
+    /// Returns all the synced peers from the peer db that claim to have the block
     /// components for the given epoch based on `status.earliest_available_slot`.
     ///
     /// If `earliest_available_slot` info is not available, then return peer anyway assuming it has the
     /// required data.
-    ///
-    /// If `allowed_peers` is `Some`, then filters for the epoch only for those peers.
-    pub fn synced_peers_for_epoch<'a>(
-        &'a self,
-        epoch: Epoch,
-        allowed_peers: Option<&'a HashSet<PeerId>>,
-    ) -> impl Iterator<Item = &'a PeerId> {
+    pub fn synced_peers_for_epoch(&self, epoch: Epoch) -> impl Iterator<Item = &PeerId> {
         self.peers
             .iter()
-            .filter(move |(peer_id, info)| {
-                allowed_peers.is_none_or(|allowed| allowed.contains(peer_id))
-                    && info.is_connected()
+            .filter(move |(_, info)| {
+                info.is_connected()
                     && match info.sync_status() {
                         SyncStatus::Synced { info } => {
                             info.has_slot(epoch.end_slot(E::slots_per_epoch()))

View File

@@ -210,7 +210,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
             .network_globals
             .peers
             .read()
-            .synced_peers_for_epoch(self.to_be_downloaded, None)
+            .synced_peers_for_epoch(self.to_be_downloaded)
             .next()
             .is_some()
             // backfill can't progress if we do not have peers in the required subnets post peerdas.
@@ -313,7 +313,6 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
                 CouplingError::DataColumnPeerFailure {
                     error,
                     faulty_peers,
-                    action,
                     exceeded_retries,
                 } => {
                     debug!(?batch_id, error, "Block components coupling error");
@@ -325,11 +324,8 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
                         failed_columns.insert(*column);
                         failed_peers.insert(*peer);
                     }
-                    for peer in failed_peers.iter() {
-                        network.report_peer(*peer, *action, "failed to return columns");
-                    }
-                    // Only retry if peer failure **and** retries have been exceeded
+                    // Only retry if peer failure **and** retries haven't been exceeded
                     if !*exceeded_retries {
                         return self.retry_partial_batch(
                             network,
@@ -888,7 +884,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
            .network_globals
            .peers
            .read()
-            .synced_peers_for_epoch(batch_id, None)
+            .synced_peers_for_epoch(batch_id)
            .cloned()
            .collect::<HashSet<_>>();
@@ -899,6 +895,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
            request,
            RangeRequestId::BackfillSync { batch_id },
            &synced_peers,
+            &synced_peers, // All synced peers have imported up to the finalized slot so they must have their custody columns available
            &failed_peers,
        ) {
            Ok(request_id) => {
@@ -964,7 +961,7 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
            .network_globals()
            .peers
            .read()
-            .synced_peers_for_epoch(batch_id, None)
+            .synced_peers_for_epoch(batch_id)
            .cloned()
            .collect::<HashSet<_>>();

View File

@@ -2,7 +2,7 @@ use beacon_chain::{
     block_verification_types::RpcBlock, data_column_verification::CustodyDataColumn, get_block_root,
 };
 use lighthouse_network::{
-    PeerAction, PeerId,
+    PeerId,
     service::api_types::{
         BlobsByRangeRequestId, BlocksByRangeRequestId, DataColumnsByRangeRequestId,
     },
@@ -63,7 +63,6 @@ pub(crate) enum CouplingError {
     DataColumnPeerFailure {
         error: String,
         faulty_peers: Vec<(ColumnIndex, PeerId)>,
-        action: PeerAction,
         exceeded_retries: bool,
     },
     BlobPeerFailure(String),
@@ -253,7 +252,6 @@ impl<E: EthSpec> RangeBlockComponentsRequest<E> {
        if let Err(CouplingError::DataColumnPeerFailure {
            error: _,
            faulty_peers,
-            action: _,
            exceeded_retries: _,
        }) = &resp
        {
@@ -377,7 +375,6 @@ impl<E: EthSpec> RangeBlockComponentsRequest<E> {
            return Err(CouplingError::DataColumnPeerFailure {
                error: format!("No columns for block {block_root:?} with data"),
                faulty_peers: responsible_peers,
-                action: PeerAction::LowToleranceError,
                exceeded_retries,
            });
@@ -402,7 +399,6 @@ impl<E: EthSpec> RangeBlockComponentsRequest<E> {
            return Err(CouplingError::DataColumnPeerFailure {
                error: format!("Peers did not return column for block_root {block_root:?} {naughty_peers:?}"),
                faulty_peers: naughty_peers,
-                action: PeerAction::LowToleranceError,
                exceeded_retries
            });
        }
@@ -468,7 +464,7 @@ mod tests {
        NumBlobs, generate_rand_block_and_blobs, generate_rand_block_and_data_columns, test_spec,
    };
    use lighthouse_network::{
-        PeerAction, PeerId,
+        PeerId,
        service::api_types::{
            BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId,
            DataColumnsByRangeRequestId, Id, RangeRequestId,
@@ -785,7 +781,6 @@ mod tests {
        if let Err(super::CouplingError::DataColumnPeerFailure {
            error,
            faulty_peers,
-            action,
            exceeded_retries,
        }) = result
        {
@@ -793,7 +788,6 @@ mod tests {
            assert_eq!(faulty_peers.len(), 2); // columns 3 and 4 missing
            assert_eq!(faulty_peers[0].0, 3); // column index 3
            assert_eq!(faulty_peers[1].0, 4); // column index 4
-            assert!(matches!(action, PeerAction::LowToleranceError));
            assert!(!exceeded_retries); // First attempt, should be false
        } else {
            panic!("Expected PeerFailure error");
@@ -957,13 +951,11 @@ mod tests {
        if let Err(super::CouplingError::DataColumnPeerFailure {
            error: _,
            faulty_peers,
-            action,
            exceeded_retries,
        }) = result
        {
            assert_eq!(faulty_peers.len(), 1); // column 2 missing
            assert_eq!(faulty_peers[0].0, 2); // column index 2
-            assert!(matches!(action, PeerAction::LowToleranceError));
            assert!(exceeded_retries); // Should be true after max retries
        } else {
            panic!("Expected PeerFailure error with exceeded_retries=true");

View File

@@ -533,19 +533,21 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
         batch_type: ByRangeRequestType,
         request: BlocksByRangeRequest,
         requester: RangeRequestId,
-        peers: &HashSet<PeerId>,
+        block_peers: &HashSet<PeerId>,
+        column_peers: &HashSet<PeerId>,
         peers_to_deprioritize: &HashSet<PeerId>,
     ) -> Result<Id, RpcRequestSendError> {
         let range_request_span = debug_span!(
             parent: None,
             SPAN_OUTGOING_RANGE_REQUEST,
             range_req_id = %requester,
-            peers = peers.len()
+            block_peers = block_peers.len(),
+            column_peers = column_peers.len()
         );
         let _guard = range_request_span.clone().entered();
         let active_request_count_by_peer = self.active_request_count_by_peer();
-        let Some(block_peer) = peers
+        let Some(block_peer) = block_peers
             .iter()
             .map(|peer| {
                 (
@@ -579,7 +581,7 @@
             .collect();
         Some(self.select_columns_by_range_peers_to_request(
             &column_indexes,
-            peers,
+            column_peers,
             active_request_count_by_peer,
             peers_to_deprioritize,
         )?)
@@ -770,7 +772,6 @@
         let range_req = entry.get_mut();
         if let Some(blocks_result) = range_req.responses(&self.chain.spec) {
             if let Err(CouplingError::DataColumnPeerFailure {
-                action: _,
                 error,
                 faulty_peers: _,
                 exceeded_retries,

View File

@@ -334,6 +334,31 @@ impl<E: EthSpec, B: BatchConfig> BatchInfo<E, B> {
         }
     }
 
+    /// Change the batch state from `Self::Downloading` to `Self::AwaitingDownload` without
+    /// registering a failed attempt.
+    ///
+    /// Note: must use this cautiously with some level of retry protection
+    /// as not registering a failed attempt could lead to requesting in a loop.
+    #[must_use = "Batch may have failed"]
+    pub fn downloading_to_awaiting_download(
+        &mut self,
+    ) -> Result<BatchOperationOutcome, WrongState> {
+        match self.state.poison() {
+            BatchState::Downloading(_) => {
+                self.state = BatchState::AwaitingDownload;
+                Ok(self.outcome())
+            }
+            BatchState::Poisoned => unreachable!("Poisoned batch"),
+            other => {
+                self.state = other;
+                Err(WrongState(format!(
+                    "Download failed for batch in wrong state {:?}",
+                    self.state
+                )))
+            }
+        }
+    }
+
     pub fn start_downloading(&mut self, request_id: Id) -> Result<(), WrongState> {
         match self.state.poison() {
             BatchState::AwaitingDownload => {

View File

@@ -871,7 +871,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
                 CouplingError::DataColumnPeerFailure {
                     error,
                     faulty_peers,
-                    action,
                     exceeded_retries,
                 } => {
                     debug!(?batch_id, error, "Block components coupling error");
@@ -883,12 +882,22 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
                        failed_columns.insert(*column);
                        failed_peers.insert(*peer);
                    }
-                    for peer in failed_peers.iter() {
-                        network.report_peer(*peer, *action, "failed to return columns");
-                    }
+                    // Retry the failed columns if the column requests haven't exceeded the
+                    // max retries. Otherwise, treat it as a failed batch below.
                    if !*exceeded_retries {
+                        // Set the batch back to `AwaitingDownload` before retrying.
+                        // This is to ensure that the batch doesn't get stuck in the `Downloading` state.
+                        //
+                        // DataColumn retries have a retry limit, so calling `downloading_to_awaiting_download`
+                        // is safe.
+                        if let BatchOperationOutcome::Failed { blacklist } =
+                            batch.downloading_to_awaiting_download()?
+                        {
+                            return Err(RemoveChain::ChainFailed {
+                                blacklist,
+                                failing_batch: batch_id,
+                            });
+                        }
                        return self.retry_partial_batch(
                            network,
                            batch_id,
@@ -936,7 +945,10 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
                            failing_batch: batch_id,
                        });
                    }
-                    self.send_batch(network, batch_id)
+                    // The errored batch is set to `AwaitingDownload` above.
+                    // We now just attempt to download all batches stuck in the `AwaitingDownload`
+                    // state in the right order.
+                    self.attempt_send_awaiting_download_batches(network, "injecting error")
                } else {
                    debug!(
                        batch_epoch = %batch_id,
@@ -969,7 +981,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
            .collect();
        debug!(
            ?awaiting_downloads,
-            src, "Attempting to send batches awaiting downlaod"
+            src, "Attempting to send batches awaiting download"
        );
        for batch_id in awaiting_downloads {
@@ -998,11 +1010,11 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
        let (request, batch_type) = batch.to_blocks_by_range_request();
        let failed_peers = batch.failed_peers();
-        let synced_peers = network
+        let synced_column_peers = network
            .network_globals()
            .peers
            .read()
-            .synced_peers_for_epoch(batch_id, Some(&self.peers))
+            .synced_peers_for_epoch(batch_id)
            .cloned()
            .collect::<HashSet<_>>();
@@ -1013,7 +1025,13 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
                chain_id: self.id,
                batch_id,
            },
-            &synced_peers,
+            // Request blocks only from peers of this specific chain.
+            &self.peers,
+            // Request columns from all synced peers, even if they are not part of this chain.
+            // This is to avoid splitting good column peers across many head chains in a heavily
+            // forking environment. If the column peers and the block peer are on different chains,
+            // then we return a coupling error and retry only the columns that failed to couple.
+            // See `Self::retry_partial_batch`.
+            &synced_column_peers,
            &failed_peers,
        ) {
            Ok(request_id) => {
@@ -1081,7 +1099,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
            .network_globals()
            .peers
            .read()
-            .synced_peers_for_epoch(batch_id, Some(&self.peers))
+            .synced_peers_for_epoch(batch_id)
            .cloned()
            .collect::<HashSet<_>>();
@@ -1093,6 +1111,8 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
            &failed_columns,
        ) {
            Ok(_) => {
+                // Inform the batch about the new request.
+                batch.start_downloading(id)?;
                debug!(
                    ?batch_id,
                    id, "Retried column requests from different peers"
@@ -1100,6 +1120,8 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
                return Ok(KeepChain);
            }
            Err(e) => {
+                // No need to explicitly fail the batch, since it was in the `AwaitingDownload`
+                // state before we attempted the retry.
                debug!(?batch_id, id, e, "Failed to retry partial batch");
            }
        }
@@ -1123,6 +1145,9 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
    ) -> Result<KeepChain, RemoveChain> {
        let _guard = self.span.clone().entered();
        debug!("Resuming chain");
+        // Attempt to download any batches stuck in the `AwaitingDownload` state because of
+        // a lack of peers before.
+        self.attempt_send_awaiting_download_batches(network, "resume")?;
        // Request more batches if needed.
        self.request_batches(network)?;
        // If there is any batch ready for processing, send it.