Implement reliable range sync for PeerDAS

This commit is contained in:
dapplion
2025-05-21 23:34:28 -05:00
parent b014675b7a
commit 4fb2ae658a
23 changed files with 2580 additions and 701 deletions

View File

@@ -1,4 +1,5 @@
use beacon_chain::block_verification_types::RpcBlock;
use itertools::Itertools;
use lighthouse_network::rpc::methods::BlocksByRangeRequest;
use lighthouse_network::service::api_types::Id;
use lighthouse_network::PeerId;
@@ -17,15 +18,7 @@ const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5;
/// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty.
const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 3;
/// Type of expected batch: which data kinds are fetched alongside blocks for a
/// by-range request (variant names rendered in snake_case via `strum::Display`).
#[derive(Debug, Copy, Clone, Display)]
#[strum(serialize_all = "snake_case")]
pub enum ByRangeRequestType {
// Blocks together with data columns (presumably the PeerDAS path — TODO confirm).
BlocksAndColumns,
// Blocks together with blob sidecars.
BlocksAndBlobs,
// Blocks only.
Blocks,
}
// TODO(das): Consider merging with PeerGroup
#[derive(Clone, Debug)]
pub struct BatchPeers {
block_peer: PeerId,
@@ -53,6 +46,12 @@ impl BatchPeers {
/// Returns the peer assigned to serve data column `index`, if one is tracked.
pub fn column(&self, index: &ColumnIndex) -> Option<&PeerId> {
self.column_peers.get(index)
}
/// Iterates over all distinct peers involved in this batch: the block peer
/// first, then the column peers, de-duplicated via itertools' `unique`.
pub fn iter_unique_peers(&self) -> impl Iterator<Item = &PeerId> {
std::iter::once(&self.block_peer)
.chain(self.column_peers.values())
.unique()
}
}
/// Allows customisation of the above constants used in other sync methods such as BackFillSync.

View File

@@ -10,7 +10,7 @@ use itertools::Itertools;
use lighthouse_network::service::api_types::Id;
use lighthouse_network::{PeerAction, PeerId};
use logging::crit;
use std::collections::{btree_map::Entry, BTreeMap, HashSet};
use std::collections::{btree_map::Entry, BTreeMap, HashMap, HashSet};
use strum::IntoStaticStr;
use tracing::{debug, instrument, warn};
use types::{Epoch, EthSpec, Hash256, Slot};
@@ -87,9 +87,11 @@ pub struct SyncingChain<T: BeaconChainTypes> {
batches: BTreeMap<BatchId, BatchInfo<T::EthSpec>>,
/// The peers that agree on the `target_head_slot` and `target_head_root` as a canonical chain
/// and thus available to download this chain from, as well as the batches we are currently
/// requesting.
peers: HashSet<PeerId>,
/// and thus available to download this chain from.
///
/// Also, for each peer, tracks the total number of requests made as part of this SyncingChain:
/// `HashMap<peer, total_requests_per_peer>`
peers: HashMap<PeerId, usize>,
/// Starting epoch of the next batch that needs to be downloaded.
to_be_downloaded: BatchId,
@@ -121,7 +123,40 @@ pub enum ChainSyncingState {
Syncing,
}
/// Test-only, payload-free mirror of `BatchState` so that tests can compare
/// batch states with `Eq`/`PartialEq`.
#[cfg(test)]
#[derive(Debug, Eq, PartialEq)]
pub enum BatchStateSummary {
// Mirrors `BatchState::Downloading { .. }`.
Downloading,
// Mirrors `BatchState::Processing { .. }`.
Processing,
// Mirrors `BatchState::AwaitingProcessing { .. }`.
AwaitingProcessing,
// Mirrors `BatchState::AwaitingValidation { .. }`.
AwaitingValidation,
// Catch-all for states not expected during normal operation
// (`AwaitingDownload`, `Poisoned`, `Failed`); carries the state name.
Unexpected(&'static str),
}
impl<T: BeaconChainTypes> SyncingChain<T> {
/// Returns a summary of batch states for assertions in tests.
///
/// Batches are emitted in `BatchId` order (the underlying `batches` map is a
/// `BTreeMap`), each paired with a payload-free `BatchStateSummary`.
#[cfg(test)]
pub fn batches_state(&self) -> Vec<(BatchId, BatchStateSummary)> {
self.batches
.iter()
.map(|(id, batch)| {
// Collapse each `BatchState` variant into its summary; variants that
// should not be observable surface as `Unexpected` with their name.
let state = match batch.state() {
// A batch is never left in this state, it's only the initial value
BatchState::AwaitingDownload => {
BatchStateSummary::Unexpected("AwaitingDownload")
}
BatchState::Downloading { .. } => BatchStateSummary::Downloading,
BatchState::AwaitingProcessing { .. } => BatchStateSummary::AwaitingProcessing,
BatchState::Poisoned => BatchStateSummary::Unexpected("Poisoned"),
BatchState::Processing { .. } => BatchStateSummary::Processing,
BatchState::Failed => BatchStateSummary::Unexpected("Failed"),
BatchState::AwaitingValidation { .. } => BatchStateSummary::AwaitingValidation,
};
(*id, state)
})
.collect()
}
#[allow(clippy::too_many_arguments)]
pub fn new(
id: Id,
@@ -138,7 +173,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
target_head_slot,
target_head_root,
batches: BTreeMap::new(),
peers: HashSet::from_iter([peer_id]),
peers: HashMap::from_iter([(peer_id, <_>::default())]),
to_be_downloaded: start_epoch,
processing_target: start_epoch,
optimistic_start: None,
@@ -168,7 +203,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
/// Peers currently syncing this chain.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
pub fn peers(&self) -> impl Iterator<Item = PeerId> + '_ {
self.peers.iter().cloned()
self.peers.keys().cloned()
}
/// Progress in epochs made by the chain
@@ -221,6 +256,12 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
request_id: Id,
blocks: Vec<RpcBlock<T::EthSpec>>,
) -> ProcessingResult {
// Account for one more request to each peer involved in this batch
// TODO(das): this code assumes that we do a single request per peer per RpcBlock
for peer in batch_peers.iter_unique_peers() {
*self.peers.entry(*peer).or_default() += 1;
}
// check if we have this batch
let batch = match self.batches.get_mut(&batch_id) {
None => {
@@ -400,11 +441,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
self.request_batches(network)?;
}
}
} else if !self.good_peers_on_sampling_subnets(self.processing_target, network) {
// This is to handle the case where no batch was sent for the current processing
// target when there is no sampling peers available. This is a valid state and should not
// return an error.
return Ok(KeepChain);
} else {
return Err(RemoveChain::WrongChainState(format!(
"Batch not found for current processing target {}",
@@ -577,7 +613,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
"Batch failed to download. Dropping chain scoring peers"
);
for peer in self.peers.drain() {
for (peer, _) in self.peers.drain() {
network.report_peer(peer, penalty, "faulty_chain");
}
Err(RemoveChain::ChainFailed {
@@ -842,7 +878,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
network: &mut SyncNetworkContext<T>,
peer_id: PeerId,
) -> ProcessingResult {
self.peers.insert(peer_id);
self.peers.insert(peer_id, <_>::default());
self.request_batches(network)
}
@@ -854,7 +890,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
&mut self,
network: &mut SyncNetworkContext<T>,
batch_id: BatchId,
peer_id: &PeerId,
request_id: Id,
err: RpcResponseError,
) -> ProcessingResult {
@@ -869,7 +904,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
debug!(
batch_epoch = %batch_id,
batch_state = ?batch.state(),
%peer_id,
%request_id,
?batch_state,
"Batch not expecting block"
@@ -880,12 +914,13 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
batch_epoch = %batch_id,
batch_state = ?batch.state(),
error = ?err,
%peer_id,
%request_id,
"Batch download error"
);
if let BatchOperationOutcome::Failed { blacklist } =
batch.download_failed(Some(*peer_id))?
// TODO(das): Is it necessary for the batch to track failed peers? Can we make this
// mechanism compatible with PeerDAS and before PeerDAS?
batch.download_failed(None)?
{
return Err(RemoveChain::ChainFailed {
blacklist,
@@ -896,7 +931,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
} else {
debug!(
batch_epoch = %batch_id,
%peer_id,
%request_id,
batch_state,
"Batch not found"
@@ -937,6 +971,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
},
&synced_peers,
&failed_peers,
&self.peers,
) {
Ok(request_id) => {
// inform the batch about the new request
@@ -953,14 +988,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
return Ok(KeepChain);
}
Err(e) => match e {
// TODO(das): Handle the NoPeer case explicitly and don't drop the batch. For
// sync to work properly it must be okay to have "stalled" batches in
// AwaitingDownload state. Currently it will error with invalid state if
// that happens. Sync manager must periodically prune stalled batches like
// we do for lookup sync. Then we can deprecate the redundant
// `good_peers_on_sampling_subnets` checks.
e
@ (RpcRequestSendError::NoPeer(_) | RpcRequestSendError::InternalError(_)) => {
RpcRequestSendError::InternalError(e) => {
// NOTE: under normal conditions this shouldn't happen but we handle it anyway
warn!(%batch_id, error = ?e, "batch_id" = %batch_id, %batch, "Could not send batch request");
// register the failed download and check if the batch can be retried
@@ -1019,11 +1047,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// check if we have the batch for our optimistic start. If not, request it first.
// We wait for this batch before requesting any other batches.
if let Some(epoch) = self.optimistic_start {
if !self.good_peers_on_sampling_subnets(epoch, network) {
debug!("Waiting for peers to be available on sampling column subnets");
return Ok(KeepChain);
}
if let Entry::Vacant(entry) = self.batches.entry(epoch) {
let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH);
entry.insert(optimistic_batch);
@@ -1046,35 +1069,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
Ok(KeepChain)
}
/// Checks all sampling column subnets for peers. Returns `true` if there is at least one peer in
/// every sampling column subnet.
///
/// When PeerDAS is not yet enabled for `epoch` there are no column subnets
/// to sample, so the check trivially passes.
fn good_peers_on_sampling_subnets(
&self,
epoch: Epoch,
network: &SyncNetworkContext<T>,
) -> bool {
if network.chain.spec.is_peer_das_enabled_for_epoch(epoch) {
// Require peers on all sampling column subnets before sending batches
let peers_on_all_custody_subnets = network
.network_globals()
.sampling_subnets
.iter()
.all(|subnet_id| {
// Count peers considered good custody peers for this subnet;
// any non-zero count satisfies the subnet.
let peer_count = network
.network_globals()
.peers
.read()
.good_custody_subnet_peer(*subnet_id)
.count();
peer_count > 0
});
peers_on_all_custody_subnets
} else {
true
}
}
/// Creates the next required batch from the chain. If there are no more batches required,
/// `false` is returned.
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)]
@@ -1107,15 +1101,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
return None;
}
// don't send batch requests until we have peers on sampling subnets
// TODO(das): this is a workaround to avoid sending out excessive block requests because
// block and data column requests are currently coupled. This can be removed once we find a
// way to decouple the requests and do retries individually, see issue #6258.
if !self.good_peers_on_sampling_subnets(self.to_be_downloaded, network) {
debug!("Waiting for peers to be available on custody column subnets");
return None;
}
// If no batch needs a retry, attempt to send the batch of the next epoch to download
let next_batch_id = self.to_be_downloaded;
// this batch could have been included already being an optimistic batch

View File

@@ -54,6 +54,13 @@ pub struct ChainCollection<T: BeaconChainTypes> {
}
impl<T: BeaconChainTypes> ChainCollection<T> {
/// Test-only iterator over every syncing chain in the collection:
/// finalized chains first, then head chains.
#[cfg(test)]
pub(crate) fn iter(&self) -> impl Iterator<Item = &SyncingChain<T>> {
self.finalized_chains
.values()
.chain(self.head_chains.values())
}
pub fn new(beacon_chain: Arc<BeaconChain<T>>) -> Self {
ChainCollection {
beacon_chain,

View File

@@ -9,10 +9,9 @@ mod sync_type;
pub use batch::{
BatchConfig, BatchInfo, BatchOperationOutcome, BatchPeers, BatchProcessingResult, BatchState,
ByRangeRequestType,
};
pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH};
#[cfg(test)]
pub use chain_collection::SyncChainStatus;
pub use chain::BatchStateSummary;
pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH};
pub use range::RangeSync;
pub use sync_type::RangeSyncType;

View File

@@ -39,6 +39,8 @@
//! Each chain is downloaded in batches of blocks. The batched blocks are processed sequentially
//! and further batches are requested as current blocks are being processed.
#[cfg(test)]
use super::chain::BatchStateSummary;
use super::chain::{BatchId, ChainId, RemoveChain, SyncingChain};
use super::chain_collection::{ChainCollection, SyncChainStatus};
use super::sync_type::RangeSyncType;
@@ -100,10 +102,23 @@ where
}
#[cfg(test)]
pub(crate) fn __failed_chains(&mut self) -> Vec<Hash256> {
pub(crate) fn failed_chains(&mut self) -> Vec<Hash256> {
self.failed_chains.keys().copied().collect()
}
/// Test-only snapshot of every batch's state across all syncing chains,
/// tagging each `(BatchId, BatchStateSummary)` with the owning chain's id.
#[cfg(test)]
pub(crate) fn batches_state(&self) -> Vec<(ChainId, BatchId, BatchStateSummary)> {
self.chains
.iter()
.flat_map(|chain| {
chain
.batches_state()
.into_iter()
.map(|(batch_id, state)| (chain.id(), batch_id, state))
})
.collect()
}
#[instrument(parent = None,
level = "info",
fields(component = "range_sync"),
@@ -344,7 +359,6 @@ where
pub fn inject_error(
&mut self,
network: &mut SyncNetworkContext<T>,
peer_id: PeerId,
batch_id: BatchId,
chain_id: ChainId,
request_id: Id,
@@ -352,7 +366,7 @@ where
) {
// check that this request is pending
match self.chains.call_by_id(chain_id, |chain| {
chain.inject_error(network, batch_id, &peer_id, request_id, err)
chain.inject_error(network, batch_id, request_id, err)
}) {
Ok((removed_chain, sync_type)) => {
if let Some((removed_chain, remove_reason)) = removed_chain {